Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

1_128_e3_3e-5/adapter_config.json +39 -0
1_128_e3_3e-5/adapter_model.safetensors +3 -0
1_128_e3_3e-5/added_tokens.json +9 -0
1_128_e3_3e-5/all_results.json +9 -0
1_128_e3_3e-5/chat_template.jinja +62 -0
1_128_e3_3e-5/config.json +32 -0
1_128_e3_3e-5/merges.txt +0 -0
1_128_e3_3e-5/special_tokens_map.json +33 -0
1_128_e3_3e-5/tokenizer.json +0 -0
1_128_e3_3e-5/tokenizer_config.json +234 -0
1_128_e3_3e-5/train_results.json +9 -0
1_128_e3_3e-5/trainer_state.json +2003 -0
1_128_e3_3e-5/training_args.bin +3 -0
1_128_e3_3e-5/vocab.json +0 -0

1_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "up_proj",
+    "o_proj",
+    "q_proj",
+    "k_proj",
+    "down_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

1_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c90e6d84ddb72a130ea4600019e8e502791a1fc3fe13972d49120d64224d307f
+size 791751704

1_128_e3_3e-5/added_tokens.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "<|end_of_cite|>": 49156,
+  "<|end_of_plugin|>": 49158,
+  "<|end_of_role|>": 49153,
+  "<|start_of_cite|>": 49155,
+  "<|start_of_plugin|>": 49157,
+  "<|start_of_role|>": 49152,
+  "<|tool_call|>": 49154
+}

1_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 2.0918875763098255e+18,
+    "train_loss": 0.5697678685996637,
+    "train_runtime": 1013.0729,
+    "train_samples": 14919,
+    "train_samples_per_second": 44.179,
+    "train_steps_per_second": 1.383
+}

1_128_e3_3e-5/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,62 @@

+{# Alias tools -> available_tools #}
+{%- if tools and not available_tools -%}
+    {%- set available_tools = tools -%}
+{%- endif -%}
+{%- if messages[0]['role'] == 'system' %}
+     {%- set system_message = messages[0]['content'] %}
+     {%- set loop_messages = messages[1:] %}
+ {%- else %}
+     {%- set system_message = "Knowledge Cutoff Date: April 2024.
+Today's Date: " + strftime_now('%B %d, %Y') + ".
+You are Granite, developed by IBM." %}
+     {%- if available_tools and documents %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
+Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+     {%- elif available_tools %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request." %}
+     {%- elif documents %}
+         {%- set system_message = system_message + " Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+    {%- elif thinking %}
+    {%- set system_message = system_message + " You are a helpful AI assistant.
+Respond to every user query in a comprehensive and detailed way. You can write down your thoughts and reasoning process before responding. In the thought process, engage in a comprehensive cycle of analysis, summarization, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. In the response section, based on various attempts, explorations, and reflections from the thoughts section, systematically present the final solution that you deem correct. The response should summarize the thought process. Write your thoughts between <think></think> and write your response between <response></response> for each user query." %}
+     {%- else %}
+         {%- set system_message = system_message + " You are a helpful AI assistant." %}
+     {%- endif %}
+     {%- if 'citations' in controls and documents %}
+         {%- set system_message = system_message + '
+Use the symbols <|start_of_cite|> and <|end_of_cite|> to indicate when a fact comes from a document in the search result, e.g <|start_of_cite|> {document_id: 1}my fact <|end_of_cite|> for a fact from document 1. Afterwards, list all the citations with their corresponding documents in an ordered list.' %}
+     {%- endif %}
+     {%- if 'hallucinations' in controls and documents %}
+         {%- set system_message = system_message + '
+Finally, after the response is written, include a numbered list of sentences from the response with a corresponding risk value that are hallucinated and not based in the documents.' %}
+     {%- endif %}
+     {%- set loop_messages = messages %}
+ {%- endif %}
+ {{- '<|start_of_role|>system<|end_of_role|>' + system_message + '<|end_of_text|>
+' }}
+ {%- if available_tools %}
+     {{- '<|start_of_role|>available_tools<|end_of_role|>' }}
+     {{- available_tools | tojson(indent=4) }}
+     {{- '<|end_of_text|>
+' }}
+ {%- endif %}
+ {%- if documents %}
+     {%- for document in documents %}
+         {{- '<|start_of_role|>document {"document_id": "' + document['doc_id'] | string + '"}<|end_of_role|>
+' }}
+         {{- document['text'] }}
+         {{- '<|end_of_text|>
+' }}
+              {%- endfor %}
+ {%- endif %}
+ {%- for message in loop_messages %}
+     {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>
+' }}
+     {%- if loop.last and add_generation_prompt %}
+         {{- '<|start_of_role|>assistant' }}
+             {%- if controls %}
+                 {{- ' ' + controls | tojson()}}
+             {%- endif %}
+         {{- '<|end_of_role|>' }}
+     {%- endif %}
+ {%- endfor %}

1_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49159
+}

1_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

1_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|end_of_plugin|>",
+  "unk_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

1_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

1_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,234 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49152": {
+      "content": "<|start_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49153": {
+      "content": "<|end_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49154": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49155": {
+      "content": "<|start_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49156": {
+      "content": "<|end_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49157": {
+      "content": "<|start_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49158": {
+      "content": "<|end_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": "<|end_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|end_of_plugin|>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|end_of_text|>",
+  "vocab_size": 49152
+}

1_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 2.0918875763098255e+18,
+    "train_loss": 0.5697678685996637,
+    "train_runtime": 1013.0729,
+    "train_samples": 14919,
+    "train_samples_per_second": 44.179,
+    "train_steps_per_second": 1.383
+}

1_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2003 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1401,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.010718113612004287,
+      "grad_norm": 3.0887272357940674,
+      "learning_rate": 1.6901408450704227e-06,
+      "loss": 1.5277,
+      "step": 5
+    },
+    {
+      "epoch": 0.021436227224008574,
+      "grad_norm": 3.003511905670166,
+      "learning_rate": 3.8028169014084508e-06,
+      "loss": 1.5744,
+      "step": 10
+    },
+    {
+      "epoch": 0.03215434083601286,
+      "grad_norm": 1.1555603742599487,
+      "learning_rate": 5.915492957746478e-06,
+      "loss": 1.4557,
+      "step": 15
+    },
+    {
+      "epoch": 0.04287245444801715,
+      "grad_norm": 0.6423429846763611,
+      "learning_rate": 8.028169014084507e-06,
+      "loss": 1.3571,
+      "step": 20
+    },
+    {
+      "epoch": 0.05359056806002144,
+      "grad_norm": 0.5051082968711853,
+      "learning_rate": 1.0140845070422535e-05,
+      "loss": 1.3377,
+      "step": 25
+    },
+    {
+      "epoch": 0.06430868167202572,
+      "grad_norm": 0.4673773944377899,
+      "learning_rate": 1.2253521126760564e-05,
+      "loss": 1.3373,
+      "step": 30
+    },
+    {
+      "epoch": 0.07502679528403002,
+      "grad_norm": 0.5021124482154846,
+      "learning_rate": 1.436619718309859e-05,
+      "loss": 1.326,
+      "step": 35
+    },
+    {
+      "epoch": 0.0857449088960343,
+      "grad_norm": 0.4011324346065521,
+      "learning_rate": 1.6478873239436623e-05,
+      "loss": 1.3277,
+      "step": 40
+    },
+    {
+      "epoch": 0.09646302250803858,
+      "grad_norm": 0.3723503053188324,
+      "learning_rate": 1.8591549295774646e-05,
+      "loss": 1.3597,
+      "step": 45
+    },
+    {
+      "epoch": 0.10718113612004287,
+      "grad_norm": 0.3821893036365509,
+      "learning_rate": 2.0704225352112676e-05,
+      "loss": 1.248,
+      "step": 50
+    },
+    {
+      "epoch": 0.11789924973204716,
+      "grad_norm": 0.3873383104801178,
+      "learning_rate": 2.2816901408450703e-05,
+      "loss": 1.2794,
+      "step": 55
+    },
+    {
+      "epoch": 0.12861736334405144,
+      "grad_norm": 0.3538777232170105,
+      "learning_rate": 2.4929577464788733e-05,
+      "loss": 1.2397,
+      "step": 60
+    },
+    {
+      "epoch": 0.13933547695605572,
+      "grad_norm": 0.364370733499527,
+      "learning_rate": 2.7042253521126763e-05,
+      "loss": 1.2362,
+      "step": 65
+    },
+    {
+      "epoch": 0.15005359056806003,
+      "grad_norm": 0.3963067829608917,
+      "learning_rate": 2.915492957746479e-05,
+      "loss": 1.1983,
+      "step": 70
+    },
+    {
+      "epoch": 0.1607717041800643,
+      "grad_norm": 0.3728579580783844,
+      "learning_rate": 2.9999623384301385e-05,
+      "loss": 1.1624,
+      "step": 75
+    },
+    {
+      "epoch": 0.1714898177920686,
+      "grad_norm": 0.3975795805454254,
+      "learning_rate": 2.9997321912408203e-05,
+      "loss": 1.186,
+      "step": 80
+    },
+    {
+      "epoch": 0.18220793140407288,
+      "grad_norm": 0.3780006766319275,
+      "learning_rate": 2.999292852020009e-05,
+      "loss": 1.1793,
+      "step": 85
+    },
+    {
+      "epoch": 0.19292604501607716,
+      "grad_norm": 0.40317609906196594,
+      "learning_rate": 2.9986443820494872e-05,
+      "loss": 1.1755,
+      "step": 90
+    },
+    {
+      "epoch": 0.20364415862808147,
+      "grad_norm": 0.3940582573413849,
+      "learning_rate": 2.9977868717819053e-05,
+      "loss": 1.1347,
+      "step": 95
+    },
+    {
+      "epoch": 0.21436227224008575,
+      "grad_norm": 0.4203241765499115,
+      "learning_rate": 2.9967204408281618e-05,
+      "loss": 1.1216,
+      "step": 100
+    },
+    {
+      "epoch": 0.22508038585209003,
+      "grad_norm": 0.3849189579486847,
+      "learning_rate": 2.99544523794072e-05,
+      "loss": 1.1345,
+      "step": 105
+    },
+    {
+      "epoch": 0.2357984994640943,
+      "grad_norm": 0.5029954314231873,
+      "learning_rate": 2.993961440992859e-05,
+      "loss": 1.1428,
+      "step": 110
+    },
+    {
+      "epoch": 0.2465166130760986,
+      "grad_norm": 0.4647366404533386,
+      "learning_rate": 2.992269256953862e-05,
+      "loss": 1.0825,
+      "step": 115
+    },
+    {
+      "epoch": 0.2572347266881029,
+      "grad_norm": 0.5049901604652405,
+      "learning_rate": 2.9903689218601497e-05,
+      "loss": 1.0887,
+      "step": 120
+    },
+    {
+      "epoch": 0.2679528403001072,
+      "grad_norm": 0.4278225600719452,
+      "learning_rate": 2.9882607007823525e-05,
+      "loss": 1.023,
+      "step": 125
+    },
+    {
+      "epoch": 0.27867095391211144,
+      "grad_norm": 0.5523571372032166,
+      "learning_rate": 2.9859448877883407e-05,
+      "loss": 1.0886,
+      "step": 130
+    },
+    {
+      "epoch": 0.28938906752411575,
+      "grad_norm": 0.4693784713745117,
+      "learning_rate": 2.9834218059022027e-05,
+      "loss": 1.1008,
+      "step": 135
+    },
+    {
+      "epoch": 0.30010718113612006,
+      "grad_norm": 0.4722091555595398,
+      "learning_rate": 2.98069180705919e-05,
+      "loss": 1.0736,
+      "step": 140
+    },
+    {
+      "epoch": 0.3108252947481243,
+      "grad_norm": 0.5182387828826904,
+      "learning_rate": 2.9777552720566256e-05,
+      "loss": 1.0933,
+      "step": 145
+    },
+    {
+      "epoch": 0.3215434083601286,
+      "grad_norm": 0.5157468318939209,
+      "learning_rate": 2.9746126105007885e-05,
+      "loss": 1.0064,
+      "step": 150
+    },
+    {
+      "epoch": 0.3322615219721329,
+      "grad_norm": 0.6086942553520203,
+      "learning_rate": 2.9712642607497795e-05,
+      "loss": 1.013,
+      "step": 155
+    },
+    {
+      "epoch": 0.3429796355841372,
+      "grad_norm": 0.5298618674278259,
+      "learning_rate": 2.967710689852377e-05,
+      "loss": 1.0182,
+      "step": 160
+    },
+    {
+      "epoch": 0.3536977491961415,
+      "grad_norm": 0.5256823897361755,
+      "learning_rate": 2.9639523934828877e-05,
+      "loss": 0.9936,
+      "step": 165
+    },
+    {
+      "epoch": 0.36441586280814575,
+      "grad_norm": 0.5341989994049072,
+      "learning_rate": 2.9599898958720088e-05,
+      "loss": 0.9879,
+      "step": 170
+    },
+    {
+      "epoch": 0.37513397642015006,
+      "grad_norm": 0.5580112934112549,
+      "learning_rate": 2.9558237497337054e-05,
+      "loss": 0.9816,
+      "step": 175
+    },
+    {
+      "epoch": 0.3858520900321543,
+      "grad_norm": 0.5546361207962036,
+      "learning_rate": 2.951454536188112e-05,
+      "loss": 1.0271,
+      "step": 180
+    },
+    {
+      "epoch": 0.3965702036441586,
+      "grad_norm": 0.5890166759490967,
+      "learning_rate": 2.946882864680478e-05,
+      "loss": 1.0203,
+      "step": 185
+    },
+    {
+      "epoch": 0.40728831725616294,
+      "grad_norm": 0.5653389692306519,
+      "learning_rate": 2.9421093728961545e-05,
+      "loss": 0.9802,
+      "step": 190
+    },
+    {
+      "epoch": 0.4180064308681672,
+      "grad_norm": 0.6572825312614441,
+      "learning_rate": 2.9371347266716485e-05,
+      "loss": 0.9758,
+      "step": 195
+    },
+    {
+      "epoch": 0.4287245444801715,
+      "grad_norm": 0.5875082612037659,
+      "learning_rate": 2.9319596199017478e-05,
+      "loss": 0.9279,
+      "step": 200
+    },
+    {
+      "epoch": 0.43944265809217575,
+      "grad_norm": 0.680169939994812,
+      "learning_rate": 2.9265847744427305e-05,
+      "loss": 0.9596,
+      "step": 205
+    },
+    {
+      "epoch": 0.45016077170418006,
+      "grad_norm": 0.6569834351539612,
+      "learning_rate": 2.9210109400116773e-05,
+      "loss": 0.9432,
+      "step": 210
+    },
+    {
+      "epoch": 0.4608788853161844,
+      "grad_norm": 0.6418343782424927,
+      "learning_rate": 2.9152388940818966e-05,
+      "loss": 0.9818,
+      "step": 215
+    },
+    {
+      "epoch": 0.4715969989281886,
+      "grad_norm": 0.6328263878822327,
+      "learning_rate": 2.9092694417744764e-05,
+      "loss": 0.9972,
+      "step": 220
+    },
+    {
+      "epoch": 0.48231511254019294,
+      "grad_norm": 0.671103298664093,
+      "learning_rate": 2.903103415745982e-05,
+      "loss": 0.8934,
+      "step": 225
+    },
+    {
+      "epoch": 0.4930332261521972,
+      "grad_norm": 0.63865727186203,
+      "learning_rate": 2.8967416760723113e-05,
+      "loss": 0.8765,
+      "step": 230
+    },
+    {
+      "epoch": 0.5037513397642015,
+      "grad_norm": 0.655642569065094,
+      "learning_rate": 2.890185110128727e-05,
+      "loss": 0.8878,
+      "step": 235
+    },
+    {
+      "epoch": 0.5144694533762058,
+      "grad_norm": 0.7347690463066101,
+      "learning_rate": 2.883434632466077e-05,
+      "loss": 0.875,
+      "step": 240
+    },
+    {
+      "epoch": 0.5251875669882101,
+      "grad_norm": 0.695117712020874,
+      "learning_rate": 2.8764911846832326e-05,
+      "loss": 0.8948,
+      "step": 245
+    },
+    {
+      "epoch": 0.5359056806002144,
+      "grad_norm": 0.7315117120742798,
+      "learning_rate": 2.8693557352957418e-05,
+      "loss": 0.9388,
+      "step": 250
+    },
+    {
+      "epoch": 0.5466237942122186,
+      "grad_norm": 0.6662325263023376,
+      "learning_rate": 2.8620292796007404e-05,
+      "loss": 0.8706,
+      "step": 255
+    },
+    {
+      "epoch": 0.5573419078242229,
+      "grad_norm": 0.7294781804084778,
+      "learning_rate": 2.8545128395381175e-05,
+      "loss": 0.8584,
+      "step": 260
+    },
+    {
+      "epoch": 0.5680600214362272,
+      "grad_norm": 0.643936038017273,
+      "learning_rate": 2.8468074635479725e-05,
+      "loss": 0.8689,
+      "step": 265
+    },
+    {
+      "epoch": 0.5787781350482315,
+      "grad_norm": 0.7401193976402283,
+      "learning_rate": 2.8389142264243692e-05,
+      "loss": 0.9149,
+      "step": 270
+    },
+    {
+      "epoch": 0.5894962486602358,
+      "grad_norm": 0.7772344946861267,
+      "learning_rate": 2.830834229165418e-05,
+      "loss": 0.8159,
+      "step": 275
+    },
+    {
+      "epoch": 0.6002143622722401,
+      "grad_norm": 0.7441154718399048,
+      "learning_rate": 2.8225685988197018e-05,
+      "loss": 0.8516,
+      "step": 280
+    },
+    {
+      "epoch": 0.6109324758842444,
+      "grad_norm": 0.7265426516532898,
+      "learning_rate": 2.8141184883290684e-05,
+      "loss": 0.8496,
+      "step": 285
+    },
+    {
+      "epoch": 0.6216505894962486,
+      "grad_norm": 0.7406579256057739,
+      "learning_rate": 2.8054850763678107e-05,
+      "loss": 0.8442,
+      "step": 290
+    },
+    {
+      "epoch": 0.632368703108253,
+      "grad_norm": 0.7769656181335449,
+      "learning_rate": 2.796669567178258e-05,
+      "loss": 0.8525,
+      "step": 295
+    },
+    {
+      "epoch": 0.6430868167202572,
+      "grad_norm": 0.7615723013877869,
+      "learning_rate": 2.7876731904027994e-05,
+      "loss": 0.8502,
+      "step": 300
+    },
+    {
+      "epoch": 0.6538049303322615,
+      "grad_norm": 0.7499276995658875,
+      "learning_rate": 2.7784972009123682e-05,
+      "loss": 0.8079,
+      "step": 305
+    },
+    {
+      "epoch": 0.6645230439442658,
+      "grad_norm": 0.8167960047721863,
+      "learning_rate": 2.769142878631403e-05,
+      "loss": 0.7955,
+      "step": 310
+    },
+    {
+      "epoch": 0.6752411575562701,
+      "grad_norm": 0.7172262072563171,
+      "learning_rate": 2.759611528359316e-05,
+      "loss": 0.7914,
+      "step": 315
+    },
+    {
+      "epoch": 0.6859592711682744,
+      "grad_norm": 0.821480393409729,
+      "learning_rate": 2.7499044795884903e-05,
+      "loss": 0.7418,
+      "step": 320
+    },
+    {
+      "epoch": 0.6966773847802786,
+      "grad_norm": 0.7819322347640991,
+      "learning_rate": 2.740023086318836e-05,
+      "loss": 0.8093,
+      "step": 325
+    },
+    {
+      "epoch": 0.707395498392283,
+      "grad_norm": 0.7632163166999817,
+      "learning_rate": 2.729968726868926e-05,
+      "loss": 0.8056,
+      "step": 330
+    },
+    {
+      "epoch": 0.7181136120042872,
+      "grad_norm": 0.8228573799133301,
+      "learning_rate": 2.719742803683737e-05,
+      "loss": 0.7894,
+      "step": 335
+    },
+    {
+      "epoch": 0.7288317256162915,
+      "grad_norm": 0.8302276730537415,
+      "learning_rate": 2.7093467431390292e-05,
+      "loss": 0.7707,
+      "step": 340
+    },
+    {
+      "epoch": 0.7395498392282959,
+      "grad_norm": 0.8125790953636169,
+      "learning_rate": 2.698781995342387e-05,
+      "loss": 0.7816,
+      "step": 345
+    },
+    {
+      "epoch": 0.7502679528403001,
+      "grad_norm": 0.747488796710968,
+      "learning_rate": 2.6880500339309487e-05,
+      "loss": 0.7731,
+      "step": 350
+    },
+    {
+      "epoch": 0.7609860664523044,
+      "grad_norm": 0.7809923887252808,
+      "learning_rate": 2.6771523558658536e-05,
+      "loss": 0.779,
+      "step": 355
+    },
+    {
+      "epoch": 0.7717041800643086,
+      "grad_norm": 0.8691802620887756,
+      "learning_rate": 2.6660904812234362e-05,
+      "loss": 0.735,
+      "step": 360
+    },
+    {
+      "epoch": 0.782422293676313,
+      "grad_norm": 0.893549919128418,
+      "learning_rate": 2.6548659529831983e-05,
+      "loss": 0.8121,
+      "step": 365
+    },
+    {
+      "epoch": 0.7931404072883173,
+      "grad_norm": 0.9593718647956848,
+      "learning_rate": 2.6434803368125823e-05,
+      "loss": 0.7941,
+      "step": 370
+    },
+    {
+      "epoch": 0.8038585209003215,
+      "grad_norm": 0.8554571270942688,
+      "learning_rate": 2.631935220848585e-05,
+      "loss": 0.7689,
+      "step": 375
+    },
+    {
+      "epoch": 0.8145766345123259,
+      "grad_norm": 0.9226988554000854,
+      "learning_rate": 2.620232215476231e-05,
+      "loss": 0.7713,
+      "step": 380
+    },
+    {
+      "epoch": 0.8252947481243301,
+      "grad_norm": 0.8403868079185486,
+      "learning_rate": 2.6083729531039478e-05,
+      "loss": 0.7161,
+      "step": 385
+    },
+    {
+      "epoch": 0.8360128617363344,
+      "grad_norm": 0.8821732401847839,
+      "learning_rate": 2.5963590879358675e-05,
+      "loss": 0.7215,
+      "step": 390
+    },
+    {
+      "epoch": 0.8467309753483387,
+      "grad_norm": 0.8705371618270874,
+      "learning_rate": 2.5841922957410875e-05,
+      "loss": 0.7329,
+      "step": 395
+    },
+    {
+      "epoch": 0.857449088960343,
+      "grad_norm": 0.873972475528717,
+      "learning_rate": 2.5718742736199245e-05,
+      "loss": 0.744,
+      "step": 400
+    },
+    {
+      "epoch": 0.8681672025723473,
+      "grad_norm": 0.8475459218025208,
+      "learning_rate": 2.5594067397671902e-05,
+      "loss": 0.7254,
+      "step": 405
+    },
+    {
+      "epoch": 0.8788853161843515,
+      "grad_norm": 0.9166616797447205,
+      "learning_rate": 2.5467914332325312e-05,
+      "loss": 0.6777,
+      "step": 410
+    },
+    {
+      "epoch": 0.8896034297963559,
+      "grad_norm": 0.9241714477539062,
+      "learning_rate": 2.534030113677849e-05,
+      "loss": 0.7371,
+      "step": 415
+    },
+    {
+      "epoch": 0.9003215434083601,
+      "grad_norm": 0.9209508895874023,
+      "learning_rate": 2.5211245611318572e-05,
+      "loss": 0.7243,
+      "step": 420
+    },
+    {
+      "epoch": 0.9110396570203644,
+      "grad_norm": 0.9531940817832947,
+      "learning_rate": 2.5080765757417888e-05,
+      "loss": 0.693,
+      "step": 425
+    },
+    {
+      "epoch": 0.9217577706323687,
+      "grad_norm": 0.9694628119468689,
+      "learning_rate": 2.4948879775223015e-05,
+      "loss": 0.7073,
+      "step": 430
+    },
+    {
+      "epoch": 0.932475884244373,
+      "grad_norm": 1.0857443809509277,
+      "learning_rate": 2.4815606061016113e-05,
+      "loss": 0.684,
+      "step": 435
+    },
+    {
+      "epoch": 0.9431939978563773,
+      "grad_norm": 0.9990167021751404,
+      "learning_rate": 2.4680963204648868e-05,
+      "loss": 0.6846,
+      "step": 440
+    },
+    {
+      "epoch": 0.9539121114683816,
+      "grad_norm": 0.9203422665596008,
+      "learning_rate": 2.454496998694949e-05,
+      "loss": 0.6949,
+      "step": 445
+    },
+    {
+      "epoch": 0.9646302250803859,
+      "grad_norm": 0.9075927138328552,
+      "learning_rate": 2.4407645377103056e-05,
+      "loss": 0.6958,
+      "step": 450
+    },
+    {
+      "epoch": 0.9753483386923901,
+      "grad_norm": 0.9560758471488953,
+      "learning_rate": 2.4269008530005523e-05,
+      "loss": 0.6689,
+      "step": 455
+    },
+    {
+      "epoch": 0.9860664523043944,
+      "grad_norm": 0.9292932152748108,
+      "learning_rate": 2.4129078783591937e-05,
+      "loss": 0.6569,
+      "step": 460
+    },
+    {
+      "epoch": 0.9967845659163987,
+      "grad_norm": 0.9375683665275574,
+      "learning_rate": 2.3987875656139015e-05,
+      "loss": 0.6234,
+      "step": 465
+    },
+    {
+      "epoch": 1.0064308681672025,
+      "grad_norm": 0.9741694927215576,
+      "learning_rate": 2.3845418843542636e-05,
+      "loss": 0.5924,
+      "step": 470
+    },
+    {
+      "epoch": 1.0171489817792068,
+      "grad_norm": 0.8870618939399719,
+      "learning_rate": 2.3701728216570526e-05,
+      "loss": 0.574,
+      "step": 475
+    },
+    {
+      "epoch": 1.0278670953912112,
+      "grad_norm": 0.926276445388794,
+      "learning_rate": 2.3556823818090574e-05,
+      "loss": 0.6145,
+      "step": 480
+    },
+    {
+      "epoch": 1.0385852090032155,
+      "grad_norm": 1.0420550107955933,
+      "learning_rate": 2.3410725860275092e-05,
+      "loss": 0.5587,
+      "step": 485
+    },
+    {
+      "epoch": 1.0493033226152197,
+      "grad_norm": 0.9387816190719604,
+      "learning_rate": 2.3263454721781537e-05,
+      "loss": 0.5542,
+      "step": 490
+    },
+    {
+      "epoch": 1.060021436227224,
+      "grad_norm": 0.9477695822715759,
+      "learning_rate": 2.3115030944909946e-05,
+      "loss": 0.5623,
+      "step": 495
+    },
+    {
+      "epoch": 1.0707395498392283,
+      "grad_norm": 1.0408620834350586,
+      "learning_rate": 2.296547523273756e-05,
+      "loss": 0.5846,
+      "step": 500
+    },
+    {
+      "epoch": 1.0814576634512325,
+      "grad_norm": 0.8958512544631958,
+      "learning_rate": 2.2814808446231047e-05,
+      "loss": 0.5283,
+      "step": 505
+    },
+    {
+      "epoch": 1.092175777063237,
+      "grad_norm": 0.9695740938186646,
+      "learning_rate": 2.266305160133668e-05,
+      "loss": 0.5534,
+      "step": 510
+    },
+    {
+      "epoch": 1.1028938906752412,
+      "grad_norm": 0.98930823802948,
+      "learning_rate": 2.2510225866048883e-05,
+      "loss": 0.5456,
+      "step": 515
+    },
+    {
+      "epoch": 1.1136120042872455,
+      "grad_norm": 0.9378038644790649,
+      "learning_rate": 2.2356352557457624e-05,
+      "loss": 0.5921,
+      "step": 520
+    },
+    {
+      "epoch": 1.1243301178992497,
+      "grad_norm": 0.9949179291725159,
+      "learning_rate": 2.220145313877493e-05,
+      "loss": 0.5601,
+      "step": 525
+    },
+    {
+      "epoch": 1.135048231511254,
+      "grad_norm": 0.9769328832626343,
+      "learning_rate": 2.2045549216341093e-05,
+      "loss": 0.5052,
+      "step": 530
+    },
+    {
+      "epoch": 1.1457663451232583,
+      "grad_norm": 0.9389580488204956,
+      "learning_rate": 2.1888662536610878e-05,
+      "loss": 0.5548,
+      "step": 535
+    },
+    {
+      "epoch": 1.1564844587352625,
+      "grad_norm": 0.9294964075088501,
+      "learning_rate": 2.1730814983120185e-05,
+      "loss": 0.5304,
+      "step": 540
+    },
+    {
+      "epoch": 1.167202572347267,
+      "grad_norm": 1.0978158712387085,
+      "learning_rate": 2.1572028573433597e-05,
+      "loss": 0.5773,
+      "step": 545
+    },
+    {
+      "epoch": 1.1779206859592712,
+      "grad_norm": 1.0091325044631958,
+      "learning_rate": 2.1412325456073242e-05,
+      "loss": 0.5252,
+      "step": 550
+    },
+    {
+      "epoch": 1.1886387995712755,
+      "grad_norm": 1.0558208227157593,
+      "learning_rate": 2.1251727907429357e-05,
+      "loss": 0.5077,
+      "step": 555
+    },
+    {
+      "epoch": 1.1993569131832797,
+      "grad_norm": 1.1253631114959717,
+      "learning_rate": 2.1090258328653068e-05,
+      "loss": 0.5602,
+      "step": 560
+    },
+    {
+      "epoch": 1.210075026795284,
+      "grad_norm": 0.9343472123146057,
+      "learning_rate": 2.092793924253171e-05,
+      "loss": 0.532,
+      "step": 565
+    },
+    {
+      "epoch": 1.2207931404072883,
+      "grad_norm": 1.0394799709320068,
+      "learning_rate": 2.0764793290347223e-05,
+      "loss": 0.5257,
+      "step": 570
+    },
+    {
+      "epoch": 1.2315112540192925,
+      "grad_norm": 0.9338717460632324,
+      "learning_rate": 2.0600843228718007e-05,
+      "loss": 0.5452,
+      "step": 575
+    },
+    {
+      "epoch": 1.242229367631297,
+      "grad_norm": 0.9719412326812744,
+      "learning_rate": 2.0436111926424664e-05,
+      "loss": 0.5261,
+      "step": 580
+    },
+    {
+      "epoch": 1.2529474812433012,
+      "grad_norm": 1.0428558588027954,
+      "learning_rate": 2.0270622361220143e-05,
+      "loss": 0.5194,
+      "step": 585
+    },
+    {
+      "epoch": 1.2636655948553055,
+      "grad_norm": 0.9435470700263977,
+      "learning_rate": 2.0104397616624646e-05,
+      "loss": 0.488,
+      "step": 590
+    },
+    {
+      "epoch": 1.2743837084673098,
+      "grad_norm": 1.222658395767212,
+      "learning_rate": 1.9937460878705804e-05,
+      "loss": 0.4917,
+      "step": 595
+    },
+    {
+      "epoch": 1.285101822079314,
+      "grad_norm": 0.9916781187057495,
+      "learning_rate": 1.9769835432844523e-05,
+      "loss": 0.569,
+      "step": 600
+    },
+    {
+      "epoch": 1.2958199356913183,
+      "grad_norm": 1.005410075187683,
+      "learning_rate": 1.9601544660487004e-05,
+      "loss": 0.529,
+      "step": 605
+    },
+    {
+      "epoch": 1.3065380493033225,
+      "grad_norm": 0.9904552102088928,
+      "learning_rate": 1.9432612035883365e-05,
+      "loss": 0.5502,
+      "step": 610
+    },
+    {
+      "epoch": 1.317256162915327,
+      "grad_norm": 0.9880082607269287,
+      "learning_rate": 1.9263061122813267e-05,
+      "loss": 0.5261,
+      "step": 615
+    },
+    {
+      "epoch": 1.3279742765273312,
+      "grad_norm": 0.973067045211792,
+      "learning_rate": 1.9092915571299145e-05,
+      "loss": 0.5278,
+      "step": 620
+    },
+    {
+      "epoch": 1.3386923901393355,
+      "grad_norm": 1.0253452062606812,
+      "learning_rate": 1.8922199114307297e-05,
+      "loss": 0.4605,
+      "step": 625
+    },
+    {
+      "epoch": 1.3494105037513398,
+      "grad_norm": 1.1278691291809082,
+      "learning_rate": 1.875093556443751e-05,
+      "loss": 0.5462,
+      "step": 630
+    },
+    {
+      "epoch": 1.360128617363344,
+      "grad_norm": 1.059043288230896,
+      "learning_rate": 1.8579148810601506e-05,
+      "loss": 0.4737,
+      "step": 635
+    },
+    {
+      "epoch": 1.3708467309753483,
+      "grad_norm": 1.0425230264663696,
+      "learning_rate": 1.840686281469076e-05,
+      "loss": 0.457,
+      "step": 640
+    },
+    {
+      "epoch": 1.3815648445873525,
+      "grad_norm": 1.047011375427246,
+      "learning_rate": 1.8234101608234164e-05,
+      "loss": 0.5136,
+      "step": 645
+    },
+    {
+      "epoch": 1.392282958199357,
+      "grad_norm": 1.019850730895996,
+      "learning_rate": 1.8060889289045953e-05,
+      "loss": 0.4722,
+      "step": 650
+    },
+    {
+      "epoch": 1.4030010718113612,
+      "grad_norm": 0.8991992473602295,
+      "learning_rate": 1.788725001786438e-05,
+      "loss": 0.5307,
+      "step": 655
+    },
+    {
+      "epoch": 1.4137191854233655,
+      "grad_norm": 1.1788007020950317,
+      "learning_rate": 1.771320801498165e-05,
+      "loss": 0.4754,
+      "step": 660
+    },
+    {
+      "epoch": 1.4244372990353698,
+      "grad_norm": 1.0433146953582764,
+      "learning_rate": 1.7538787556865487e-05,
+      "loss": 0.4966,
+      "step": 665
+    },
+    {
+      "epoch": 1.435155412647374,
+      "grad_norm": 1.0864899158477783,
+      "learning_rate": 1.736401297277293e-05,
+      "loss": 0.436,
+      "step": 670
+    },
+    {
+      "epoch": 1.4458735262593785,
+      "grad_norm": 1.0527467727661133,
+      "learning_rate": 1.718890864135672e-05,
+      "loss": 0.4955,
+      "step": 675
+    },
+    {
+      "epoch": 1.4565916398713825,
+      "grad_norm": 0.9378105401992798,
+      "learning_rate": 1.7013498987264832e-05,
+      "loss": 0.4454,
+      "step": 680
+    },
+    {
+      "epoch": 1.467309753483387,
+      "grad_norm": 1.0318783521652222,
+      "learning_rate": 1.683780847773354e-05,
+      "loss": 0.4757,
+      "step": 685
+    },
+    {
+      "epoch": 1.4780278670953912,
+      "grad_norm": 1.0822546482086182,
+      "learning_rate": 1.6661861619174603e-05,
+      "loss": 0.4646,
+      "step": 690
+    },
+    {
+      "epoch": 1.4887459807073955,
+      "grad_norm": 1.0907413959503174,
+      "learning_rate": 1.6485682953756945e-05,
+      "loss": 0.4508,
+      "step": 695
+    },
+    {
+      "epoch": 1.4994640943193998,
+      "grad_norm": 1.0934693813323975,
+      "learning_rate": 1.6309297055983353e-05,
+      "loss": 0.4696,
+      "step": 700
+    },
+    {
+      "epoch": 1.510182207931404,
+      "grad_norm": 1.2902297973632812,
+      "learning_rate": 1.6132728529262695e-05,
+      "loss": 0.453,
+      "step": 705
+    },
+    {
+      "epoch": 1.5209003215434085,
+      "grad_norm": 1.0547277927398682,
+      "learning_rate": 1.5956002002478062e-05,
+      "loss": 0.4546,
+      "step": 710
+    },
+    {
+      "epoch": 1.5316184351554125,
+      "grad_norm": 1.058367133140564,
+      "learning_rate": 1.5779142126551372e-05,
+      "loss": 0.5181,
+      "step": 715
+    },
+    {
+      "epoch": 1.542336548767417,
+      "grad_norm": 0.9945155382156372,
+      "learning_rate": 1.5602173571004934e-05,
+      "loss": 0.4595,
+      "step": 720
+    },
+    {
+      "epoch": 1.5530546623794212,
+      "grad_norm": 1.053295373916626,
+      "learning_rate": 1.542512102052035e-05,
+      "loss": 0.4461,
+      "step": 725
+    },
+    {
+      "epoch": 1.5637727759914255,
+      "grad_norm": 1.1239274740219116,
+      "learning_rate": 1.524800917149538e-05,
+      "loss": 0.483,
+      "step": 730
+    },
+    {
+      "epoch": 1.5744908896034298,
+      "grad_norm": 1.2165591716766357,
+      "learning_rate": 1.5070862728599102e-05,
+      "loss": 0.4554,
+      "step": 735
+    },
+    {
+      "epoch": 1.585209003215434,
+      "grad_norm": 1.1676536798477173,
+      "learning_rate": 1.4893706401325978e-05,
+      "loss": 0.449,
+      "step": 740
+    },
+    {
+      "epoch": 1.5959271168274385,
+      "grad_norm": 1.1839724779129028,
+      "learning_rate": 1.4716564900549194e-05,
+      "loss": 0.457,
+      "step": 745
+    },
+    {
+      "epoch": 1.6066452304394425,
+      "grad_norm": 0.9654883146286011,
+      "learning_rate": 1.4539462935073841e-05,
+      "loss": 0.4448,
+      "step": 750
+    },
+    {
+      "epoch": 1.617363344051447,
+      "grad_norm": 1.1264455318450928,
+      "learning_rate": 1.4362425208190388e-05,
+      "loss": 0.425,
+      "step": 755
+    },
+    {
+      "epoch": 1.6280814576634512,
+      "grad_norm": 1.0239930152893066,
+      "learning_rate": 1.4185476414228896e-05,
+      "loss": 0.4439,
+      "step": 760
+    },
+    {
+      "epoch": 1.6387995712754555,
+      "grad_norm": 1.0360668897628784,
+      "learning_rate": 1.400864123511451e-05,
+      "loss": 0.4323,
+      "step": 765
+    },
+    {
+      "epoch": 1.6495176848874598,
+      "grad_norm": 1.0796645879745483,
+      "learning_rate": 1.3831944336924664e-05,
+      "loss": 0.4695,
+      "step": 770
+    },
+    {
+      "epoch": 1.660235798499464,
+      "grad_norm": 1.2254120111465454,
+      "learning_rate": 1.36554103664485e-05,
+      "loss": 0.4084,
+      "step": 775
+    },
+    {
+      "epoch": 1.6709539121114685,
+      "grad_norm": 1.1006470918655396,
+      "learning_rate": 1.3479063947748985e-05,
+      "loss": 0.4321,
+      "step": 780
+    },
+    {
+      "epoch": 1.6816720257234725,
+      "grad_norm": 1.0820367336273193,
+      "learning_rate": 1.33029296787282e-05,
+      "loss": 0.4448,
+      "step": 785
+    },
+    {
+      "epoch": 1.692390139335477,
+      "grad_norm": 0.962052583694458,
+      "learning_rate": 1.3127032127696238e-05,
+      "loss": 0.3765,
+      "step": 790
+    },
+    {
+      "epoch": 1.7031082529474812,
+      "grad_norm": 1.1583343744277954,
+      "learning_rate": 1.2951395829944309e-05,
+      "loss": 0.4514,
+      "step": 795
+    },
+    {
+      "epoch": 1.7138263665594855,
+      "grad_norm": 1.0421403646469116,
+      "learning_rate": 1.277604528432237e-05,
+      "loss": 0.4345,
+      "step": 800
+    },
+    {
+      "epoch": 1.72454448017149,
+      "grad_norm": 1.0921709537506104,
+      "learning_rate": 1.2601004949821887e-05,
+      "loss": 0.3865,
+      "step": 805
+    },
+    {
+      "epoch": 1.735262593783494,
+      "grad_norm": 1.1072250604629517,
+      "learning_rate": 1.2426299242164125e-05,
+      "loss": 0.4062,
+      "step": 810
+    },
+    {
+      "epoch": 1.7459807073954985,
+      "grad_norm": 0.9601326584815979,
+      "learning_rate": 1.2251952530394523e-05,
+      "loss": 0.3729,
+      "step": 815
+    },
+    {
+      "epoch": 1.7566988210075025,
+      "grad_norm": 1.044776439666748,
+      "learning_rate": 1.2077989133483506e-05,
+      "loss": 0.3919,
+      "step": 820
+    },
+    {
+      "epoch": 1.767416934619507,
+      "grad_norm": 1.214371919631958,
+      "learning_rate": 1.1904433316934363e-05,
+      "loss": 0.4077,
+      "step": 825
+    },
+    {
+      "epoch": 1.7781350482315113,
+      "grad_norm": 1.1787620782852173,
+      "learning_rate": 1.1731309289398509e-05,
+      "loss": 0.4086,
+      "step": 830
+    },
+    {
+      "epoch": 1.7888531618435155,
+      "grad_norm": 1.2586784362792969,
+      "learning_rate": 1.1558641199298728e-05,
+      "loss": 0.3981,
+      "step": 835
+    },
+    {
+      "epoch": 1.79957127545552,
+      "grad_norm": 1.162297248840332,
+      "learning_rate": 1.1386453131460803e-05,
+      "loss": 0.3964,
+      "step": 840
+    },
+    {
+      "epoch": 1.810289389067524,
+      "grad_norm": 1.202858567237854,
+      "learning_rate": 1.1214769103754011e-05,
+      "loss": 0.4095,
+      "step": 845
+    },
+    {
+      "epoch": 1.8210075026795285,
+      "grad_norm": 1.3784211874008179,
+      "learning_rate": 1.104361306374094e-05,
+      "loss": 0.405,
+      "step": 850
+    },
+    {
+      "epoch": 1.8317256162915327,
+      "grad_norm": 1.1413683891296387,
+      "learning_rate": 1.0873008885337162e-05,
+      "loss": 0.3835,
+      "step": 855
+    },
+    {
+      "epoch": 1.842443729903537,
+      "grad_norm": 1.101449728012085,
+      "learning_rate": 1.0702980365481143e-05,
+      "loss": 0.39,
+      "step": 860
+    },
+    {
+      "epoch": 1.8531618435155413,
+      "grad_norm": 1.161899447441101,
+      "learning_rate": 1.0533551220814889e-05,
+      "loss": 0.4405,
+      "step": 865
+    },
+    {
+      "epoch": 1.8638799571275455,
+      "grad_norm": 1.1374167203903198,
+      "learning_rate": 1.036474508437579e-05,
+      "loss": 0.3693,
+      "step": 870
+    },
+    {
+      "epoch": 1.87459807073955,
+      "grad_norm": 1.0448237657546997,
+      "learning_rate": 1.0196585502300169e-05,
+      "loss": 0.4154,
+      "step": 875
+    },
+    {
+      "epoch": 1.885316184351554,
+      "grad_norm": 1.2724683284759521,
+      "learning_rate": 1.0029095930538886e-05,
+      "loss": 0.3727,
+      "step": 880
+    },
+    {
+      "epoch": 1.8960342979635585,
+      "grad_norm": 1.1472162008285522,
+      "learning_rate": 9.862299731585581e-06,
+      "loss": 0.4096,
+      "step": 885
+    },
+    {
+      "epoch": 1.9067524115755627,
+      "grad_norm": 1.1890227794647217,
+      "learning_rate": 9.696220171217907e-06,
+      "loss": 0.4087,
+      "step": 890
+    },
+    {
+      "epoch": 1.917470525187567,
+      "grad_norm": 1.1401933431625366,
+      "learning_rate": 9.530880415252282e-06,
+      "loss": 0.3603,
+      "step": 895
+    },
+    {
+      "epoch": 1.9281886387995713,
+      "grad_norm": 1.0883911848068237,
+      "learning_rate": 9.366303526312582e-06,
+      "loss": 0.3789,
+      "step": 900
+    },
+    {
+      "epoch": 1.9389067524115755,
+      "grad_norm": 1.1235276460647583,
+      "learning_rate": 9.20251246061322e-06,
+      "loss": 0.3796,
+      "step": 905
+    },
+    {
+      "epoch": 1.94962486602358,
+      "grad_norm": 1.133574366569519,
+      "learning_rate": 9.039530064757047e-06,
+      "loss": 0.3886,
+      "step": 910
+    },
+    {
+      "epoch": 1.960342979635584,
+      "grad_norm": 1.122624158859253,
+      "learning_rate": 8.877379072548599e-06,
+      "loss": 0.4236,
+      "step": 915
+    },
+    {
+      "epoch": 1.9710610932475885,
+      "grad_norm": 1.1602848768234253,
+      "learning_rate": 8.71608210182303e-06,
+      "loss": 0.3688,
+      "step": 920
+    },
+    {
+      "epoch": 1.9817792068595927,
+      "grad_norm": 1.1969410181045532,
+      "learning_rate": 8.555661651291226e-06,
+      "loss": 0.3748,
+      "step": 925
+    },
+    {
+      "epoch": 1.992497320471597,
+      "grad_norm": 1.0584759712219238,
+      "learning_rate": 8.39614009740155e-06,
+      "loss": 0.4203,
+      "step": 930
+    },
+    {
+      "epoch": 2.002143622722401,
+      "grad_norm": 1.0526671409606934,
+      "learning_rate": 8.237539691218637e-06,
+      "loss": 0.3389,
+      "step": 935
+    },
+    {
+      "epoch": 2.012861736334405,
+      "grad_norm": 1.169420599937439,
+      "learning_rate": 8.079882555319685e-06,
+      "loss": 0.2913,
+      "step": 940
+    },
+    {
+      "epoch": 2.0235798499464095,
+      "grad_norm": 1.2719544172286987,
+      "learning_rate": 7.923190680708636e-06,
+      "loss": 0.3158,
+      "step": 945
+    },
+    {
+      "epoch": 2.0342979635584135,
+      "grad_norm": 0.9886611700057983,
+      "learning_rate": 7.767485923748754e-06,
+      "loss": 0.3127,
+      "step": 950
+    },
+    {
+      "epoch": 2.045016077170418,
+      "grad_norm": 1.1227387189865112,
+      "learning_rate": 7.612790003113949e-06,
+      "loss": 0.2765,
+      "step": 955
+    },
+    {
+      "epoch": 2.0557341907824225,
+      "grad_norm": 1.1321218013763428,
+      "learning_rate": 7.459124496759343e-06,
+      "loss": 0.3418,
+      "step": 960
+    },
+    {
+      "epoch": 2.0664523043944265,
+      "grad_norm": 1.1055247783660889,
+      "learning_rate": 7.30651083891141e-06,
+      "loss": 0.3159,
+      "step": 965
+    },
+    {
+      "epoch": 2.077170418006431,
+      "grad_norm": 1.1450848579406738,
+      "learning_rate": 7.154970317078214e-06,
+      "loss": 0.3193,
+      "step": 970
+    },
+    {
+      "epoch": 2.087888531618435,
+      "grad_norm": 1.1955426931381226,
+      "learning_rate": 7.0045240690800975e-06,
+      "loss": 0.3228,
+      "step": 975
+    },
+    {
+      "epoch": 2.0986066452304395,
+      "grad_norm": 1.1087688207626343,
+      "learning_rate": 6.85519308010123e-06,
+      "loss": 0.3161,
+      "step": 980
+    },
+    {
+      "epoch": 2.1093247588424435,
+      "grad_norm": 1.0273271799087524,
+      "learning_rate": 6.7069981797625005e-06,
+      "loss": 0.3124,
+      "step": 985
+    },
+    {
+      "epoch": 2.120042872454448,
+      "grad_norm": 1.1378201246261597,
+      "learning_rate": 6.559960039215999e-06,
+      "loss": 0.3218,
+      "step": 990
+    },
+    {
+      "epoch": 2.1307609860664525,
+      "grad_norm": 1.1623541116714478,
+      "learning_rate": 6.4140991682617676e-06,
+      "loss": 0.3021,
+      "step": 995
+    },
+    {
+      "epoch": 2.1414790996784565,
+      "grad_norm": 1.109506368637085,
+      "learning_rate": 6.2694359124868825e-06,
+      "loss": 0.2786,
+      "step": 1000
+    },
+    {
+      "epoch": 2.152197213290461,
+      "grad_norm": 1.1443182229995728,
+      "learning_rate": 6.12599045042759e-06,
+      "loss": 0.3026,
+      "step": 1005
+    },
+    {
+      "epoch": 2.162915326902465,
+      "grad_norm": 1.078079104423523,
+      "learning_rate": 5.983782790754624e-06,
+      "loss": 0.3064,
+      "step": 1010
+    },
+    {
+      "epoch": 2.1736334405144695,
+      "grad_norm": 1.052649974822998,
+      "learning_rate": 5.842832769482306e-06,
+      "loss": 0.3002,
+      "step": 1015
+    },
+    {
+      "epoch": 2.184351554126474,
+      "grad_norm": 1.3214424848556519,
+      "learning_rate": 5.703160047201675e-06,
+      "loss": 0.2909,
+      "step": 1020
+    },
+    {
+      "epoch": 2.195069667738478,
+      "grad_norm": 1.2034846544265747,
+      "learning_rate": 5.564784106338131e-06,
+      "loss": 0.3251,
+      "step": 1025
+    },
+    {
+      "epoch": 2.2057877813504825,
+      "grad_norm": 1.2458229064941406,
+      "learning_rate": 5.427724248433868e-06,
+      "loss": 0.3009,
+      "step": 1030
+    },
+    {
+      "epoch": 2.2165058949624865,
+      "grad_norm": 1.1213204860687256,
+      "learning_rate": 5.291999591455599e-06,
+      "loss": 0.2865,
+      "step": 1035
+    },
+    {
+      "epoch": 2.227224008574491,
+      "grad_norm": 1.0876282453536987,
+      "learning_rate": 5.157629067127857e-06,
+      "loss": 0.2958,
+      "step": 1040
+    },
+    {
+      "epoch": 2.237942122186495,
+      "grad_norm": 1.2595003843307495,
+      "learning_rate": 5.024631418292275e-06,
+      "loss": 0.3109,
+      "step": 1045
+    },
+    {
+      "epoch": 2.2486602357984995,
+      "grad_norm": 1.1336579322814941,
+      "learning_rate": 4.893025196293236e-06,
+      "loss": 0.3117,
+      "step": 1050
+    },
+    {
+      "epoch": 2.259378349410504,
+      "grad_norm": 1.175622582435608,
+      "learning_rate": 4.7628287583901695e-06,
+      "loss": 0.316,
+      "step": 1055
+    },
+    {
+      "epoch": 2.270096463022508,
+      "grad_norm": 1.1397731304168701,
+      "learning_rate": 4.6340602651970304e-06,
+      "loss": 0.3321,
+      "step": 1060
+    },
+    {
+      "epoch": 2.2808145766345125,
+      "grad_norm": 1.2002718448638916,
+      "learning_rate": 4.5067376781490855e-06,
+      "loss": 0.3078,
+      "step": 1065
+    },
+    {
+      "epoch": 2.2915326902465165,
+      "grad_norm": 1.1895092725753784,
+      "learning_rate": 4.380878756997584e-06,
+      "loss": 0.265,
+      "step": 1070
+    },
+    {
+      "epoch": 2.302250803858521,
+      "grad_norm": 1.1748250722885132,
+      "learning_rate": 4.256501057332468e-06,
+      "loss": 0.2896,
+      "step": 1075
+    },
+    {
+      "epoch": 2.312968917470525,
+      "grad_norm": 1.0516598224639893,
+      "learning_rate": 4.133621928133666e-06,
+      "loss": 0.2785,
+      "step": 1080
+    },
+    {
+      "epoch": 2.3236870310825295,
+      "grad_norm": 1.0863022804260254,
+      "learning_rate": 4.012258509351105e-06,
+      "loss": 0.2786,
+      "step": 1085
+    },
+    {
+      "epoch": 2.334405144694534,
+      "grad_norm": 1.1100490093231201,
+      "learning_rate": 3.892427729513937e-06,
+      "loss": 0.2645,
+      "step": 1090
+    },
+    {
+      "epoch": 2.345123258306538,
+      "grad_norm": 1.0852088928222656,
+      "learning_rate": 3.7741463033692487e-06,
+      "loss": 0.2874,
+      "step": 1095
+    },
+    {
+      "epoch": 2.3558413719185425,
+      "grad_norm": 1.2049472332000732,
+      "learning_rate": 3.657430729550567e-06,
+      "loss": 0.3039,
+      "step": 1100
+    },
+    {
+      "epoch": 2.3665594855305465,
+      "grad_norm": 1.1067779064178467,
+      "learning_rate": 3.5422972882765557e-06,
+      "loss": 0.2861,
+      "step": 1105
+    },
+    {
+      "epoch": 2.377277599142551,
+      "grad_norm": 1.2369863986968994,
+      "learning_rate": 3.428762039080115e-06,
+      "loss": 0.2849,
+      "step": 1110
+    },
+    {
+      "epoch": 2.387995712754555,
+      "grad_norm": 1.100407600402832,
+      "learning_rate": 3.3168408185683153e-06,
+      "loss": 0.3062,
+      "step": 1115
+    },
+    {
+      "epoch": 2.3987138263665595,
+      "grad_norm": 1.0821059942245483,
+      "learning_rate": 3.206549238213403e-06,
+      "loss": 0.2769,
+      "step": 1120
+    },
+    {
+      "epoch": 2.409431939978564,
+      "grad_norm": 1.2019745111465454,
+      "learning_rate": 3.0979026821752297e-06,
+      "loss": 0.2872,
+      "step": 1125
+    },
+    {
+      "epoch": 2.420150053590568,
+      "grad_norm": 0.9468026161193848,
+      "learning_rate": 2.9909163051553473e-06,
+      "loss": 0.295,
+      "step": 1130
+    },
+    {
+      "epoch": 2.4308681672025725,
+      "grad_norm": 1.1377627849578857,
+      "learning_rate": 2.8856050302831527e-06,
+      "loss": 0.2722,
+      "step": 1135
+    },
+    {
+      "epoch": 2.4415862808145765,
+      "grad_norm": 1.1497541666030884,
+      "learning_rate": 2.781983547034307e-06,
+      "loss": 0.2657,
+      "step": 1140
+    },
+    {
+      "epoch": 2.452304394426581,
+      "grad_norm": 1.1820542812347412,
+      "learning_rate": 2.6800663091817767e-06,
+      "loss": 0.2529,
+      "step": 1145
+    },
+    {
+      "epoch": 2.463022508038585,
+      "grad_norm": 1.1529977321624756,
+      "learning_rate": 2.5798675327796994e-06,
+      "loss": 0.2759,
+      "step": 1150
+    },
+    {
+      "epoch": 2.4737406216505895,
+      "grad_norm": 1.1461573839187622,
+      "learning_rate": 2.4814011941804603e-06,
+      "loss": 0.2752,
+      "step": 1155
+    },
+    {
+      "epoch": 2.484458735262594,
+      "grad_norm": 0.9810699820518494,
+      "learning_rate": 2.384681028085181e-06,
+      "loss": 0.2762,
+      "step": 1160
+    },
+    {
+      "epoch": 2.495176848874598,
+      "grad_norm": 1.044024109840393,
+      "learning_rate": 2.2897205256279048e-06,
+      "loss": 0.2872,
+      "step": 1165
+    },
+    {
+      "epoch": 2.5058949624866025,
+      "grad_norm": 1.1893457174301147,
+      "learning_rate": 2.1965329324938e-06,
+      "loss": 0.2854,
+      "step": 1170
+    },
+    {
+      "epoch": 2.5166130760986065,
+      "grad_norm": 1.0847933292388916,
+      "learning_rate": 2.1051312470715175e-06,
+      "loss": 0.2922,
+      "step": 1175
+    },
+    {
+      "epoch": 2.527331189710611,
+      "grad_norm": 1.1810137033462524,
+      "learning_rate": 2.015528218640149e-06,
+      "loss": 0.265,
+      "step": 1180
+    },
+    {
+      "epoch": 2.538049303322615,
+      "grad_norm": 1.0281888246536255,
+      "learning_rate": 1.927736345590839e-06,
+      "loss": 0.2838,
+      "step": 1185
+    },
+    {
+      "epoch": 2.5487674169346195,
+      "grad_norm": 1.0966765880584717,
+      "learning_rate": 1.8417678736834604e-06,
+      "loss": 0.3181,
+      "step": 1190
+    },
+    {
+      "epoch": 2.559485530546624,
+      "grad_norm": 1.1331520080566406,
+      "learning_rate": 1.757634794338459e-06,
+      "loss": 0.2779,
+      "step": 1195
+    },
+    {
+      "epoch": 2.570203644158628,
+      "grad_norm": 1.120964765548706,
+      "learning_rate": 1.6753488429642638e-06,
+      "loss": 0.2884,
+      "step": 1200
+    },
+    {
+      "epoch": 2.5809217577706325,
+      "grad_norm": 1.1420046091079712,
+      "learning_rate": 1.594921497320324e-06,
+      "loss": 0.2797,
+      "step": 1205
+    },
+    {
+      "epoch": 2.5916398713826365,
+      "grad_norm": 1.1691547632217407,
+      "learning_rate": 1.516363975916128e-06,
+      "loss": 0.2825,
+      "step": 1210
+    },
+    {
+      "epoch": 2.602357984994641,
+      "grad_norm": 1.0935206413269043,
+      "learning_rate": 1.4396872364463853e-06,
+      "loss": 0.2599,
+      "step": 1215
+    },
+    {
+      "epoch": 2.613076098606645,
+      "grad_norm": 1.1766819953918457,
+      "learning_rate": 1.3649019742625624e-06,
+      "loss": 0.3039,
+      "step": 1220
+    },
+    {
+      "epoch": 2.6237942122186495,
+      "grad_norm": 1.1887208223342896,
+      "learning_rate": 1.292018620881048e-06,
+      "loss": 0.2869,
+      "step": 1225
+    },
+    {
+      "epoch": 2.634512325830654,
+      "grad_norm": 1.1644409894943237,
+      "learning_rate": 1.2210473425280783e-06,
+      "loss": 0.2885,
+      "step": 1230
+    },
+    {
+      "epoch": 2.645230439442658,
+      "grad_norm": 1.2318623065948486,
+      "learning_rate": 1.1519980387217032e-06,
+      "loss": 0.2926,
+      "step": 1235
+    },
+    {
+      "epoch": 2.6559485530546625,
+      "grad_norm": 1.1559414863586426,
+      "learning_rate": 1.0848803408909309e-06,
+      "loss": 0.3042,
+      "step": 1240
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "grad_norm": 1.075410008430481,
+      "learning_rate": 1.019703611032292e-06,
+      "loss": 0.2863,
+      "step": 1245
+    },
+    {
+      "epoch": 2.677384780278671,
+      "grad_norm": 1.1436070203781128,
+      "learning_rate": 9.56476940403942e-07,
+      "loss": 0.275,
+      "step": 1250
+    },
+    {
+      "epoch": 2.688102893890675,
+      "grad_norm": 1.185698390007019,
+      "learning_rate": 8.952091482575825e-07,
+      "loss": 0.2804,
+      "step": 1255
+    },
+    {
+      "epoch": 2.6988210075026795,
+      "grad_norm": 1.2179303169250488,
+      "learning_rate": 8.359087806082761e-07,
+      "loss": 0.3004,
+      "step": 1260
+    },
+    {
+      "epoch": 2.709539121114684,
+      "grad_norm": 1.2889785766601562,
+      "learning_rate": 7.785841090424151e-07,
+      "loss": 0.2774,
+      "step": 1265
+    },
+    {
+      "epoch": 2.720257234726688,
+      "grad_norm": 1.172799825668335,
+      "learning_rate": 7.232431295639303e-07,
+      "loss": 0.258,
+      "step": 1270
+    },
+    {
+      "epoch": 2.7309753483386925,
+      "grad_norm": 1.1804120540618896,
+      "learning_rate": 6.698935614789676e-07,
+      "loss": 0.2829,
+      "step": 1275
+    },
+    {
+      "epoch": 2.7416934619506965,
+      "grad_norm": 1.1490246057510376,
+      "learning_rate": 6.185428463191512e-07,
+      "loss": 0.2829,
+      "step": 1280
+    },
+    {
+      "epoch": 2.752411575562701,
+      "grad_norm": 1.0342978239059448,
+      "learning_rate": 5.691981468035873e-07,
+      "loss": 0.2558,
+      "step": 1285
+    },
+    {
+      "epoch": 2.763129689174705,
+      "grad_norm": 0.9746224880218506,
+      "learning_rate": 5.218663458397716e-07,
+      "loss": 0.2701,
+      "step": 1290
+    },
+    {
+      "epoch": 2.7738478027867095,
+      "grad_norm": 1.1627681255340576,
+      "learning_rate": 4.765540455635037e-07,
+      "loss": 0.3193,
+      "step": 1295
+    },
+    {
+      "epoch": 2.784565916398714,
+      "grad_norm": 1.17820405960083,
+      "learning_rate": 4.332675664179975e-07,
+      "loss": 0.2819,
+      "step": 1300
+    },
+    {
+      "epoch": 2.795284030010718,
+      "grad_norm": 0.9921414852142334,
+      "learning_rate": 3.9201294627225605e-07,
+      "loss": 0.2562,
+      "step": 1305
+    },
+    {
+      "epoch": 2.8060021436227225,
+      "grad_norm": 1.149877905845642,
+      "learning_rate": 3.5279593957887213e-07,
+      "loss": 0.2906,
+      "step": 1310
+    },
+    {
+      "epoch": 2.816720257234727,
+      "grad_norm": 1.0398290157318115,
+      "learning_rate": 3.156220165713603e-07,
+      "loss": 0.2893,
+      "step": 1315
+    },
+    {
+      "epoch": 2.827438370846731,
+      "grad_norm": 1.1763765811920166,
+      "learning_rate": 2.8049636250114195e-07,
+      "loss": 0.2905,
+      "step": 1320
+    },
+    {
+      "epoch": 2.838156484458735,
+      "grad_norm": 1.200230360031128,
+      "learning_rate": 2.474238769142645e-07,
+      "loss": 0.2506,
+      "step": 1325
+    },
+    {
+      "epoch": 2.8488745980707395,
+      "grad_norm": 1.0414636135101318,
+      "learning_rate": 2.1640917296799069e-07,
+      "loss": 0.2513,
+      "step": 1330
+    },
+    {
+      "epoch": 2.859592711682744,
+      "grad_norm": 1.1210410594940186,
+      "learning_rate": 1.8745657678731875e-07,
+      "loss": 0.2751,
+      "step": 1335
+    },
+    {
+      "epoch": 2.870310825294748,
+      "grad_norm": 1.1431525945663452,
+      "learning_rate": 1.6057012686154836e-07,
+      "loss": 0.2908,
+      "step": 1340
+    },
+    {
+      "epoch": 2.8810289389067525,
+      "grad_norm": 1.0242949724197388,
+      "learning_rate": 1.357535734809795e-07,
+      "loss": 0.2896,
+      "step": 1345
+    },
+    {
+      "epoch": 2.891747052518757,
+      "grad_norm": 1.0368810892105103,
+      "learning_rate": 1.1301037821377758e-07,
+      "loss": 0.2749,
+      "step": 1350
+    },
+    {
+      "epoch": 2.902465166130761,
+      "grad_norm": 1.0779223442077637,
+      "learning_rate": 9.234371342314807e-08,
+      "loss": 0.2734,
+      "step": 1355
+    },
+    {
+      "epoch": 2.913183279742765,
+      "grad_norm": 1.1738801002502441,
+      "learning_rate": 7.375646182482875e-08,
+      "loss": 0.3016,
+      "step": 1360
+    },
+    {
+      "epoch": 2.9239013933547695,
+      "grad_norm": 0.9701231122016907,
+      "learning_rate": 5.725121608499639e-08,
+      "loss": 0.2292,
+      "step": 1365
+    },
+    {
+      "epoch": 2.934619506966774,
+      "grad_norm": 1.090383529663086,
+      "learning_rate": 4.283027845861598e-08,
+      "loss": 0.2655,
+      "step": 1370
+    },
+    {
+      "epoch": 2.945337620578778,
+      "grad_norm": 1.1737515926361084,
+      "learning_rate": 3.0495660468315975e-08,
+      "loss": 0.2342,
+      "step": 1375
+    },
+    {
+      "epoch": 2.9560557341907825,
+      "grad_norm": 1.060263752937317,
+      "learning_rate": 2.0249082623802762e-08,
+      "loss": 0.2814,
+      "step": 1380
+    },
+    {
+      "epoch": 2.966773847802787,
+      "grad_norm": 1.067541241645813,
+      "learning_rate": 1.2091974181880905e-08,
+      "loss": 0.2686,
+      "step": 1385
+    },
+    {
+      "epoch": 2.977491961414791,
+      "grad_norm": 1.249548077583313,
+      "learning_rate": 6.025472947082667e-09,
+      "loss": 0.2855,
+      "step": 1390
+    },
+    {
+      "epoch": 2.988210075026795,
+      "grad_norm": 1.3081293106079102,
+      "learning_rate": 2.0504251129649378e-09,
+      "loss": 0.2527,
+      "step": 1395
+    },
+    {
+      "epoch": 2.9989281886387995,
+      "grad_norm": 1.1735353469848633,
+      "learning_rate": 1.6738514407699246e-10,
+      "loss": 0.2382,
+      "step": 1400
+    },
+    {
+      "epoch": 3.0,
+      "step": 1401,
+      "total_flos": 2.0918875763098255e+18,
+      "train_loss": 0.5697678685996637,
+      "train_runtime": 1013.0729,
+      "train_samples_per_second": 44.179,
+      "train_steps_per_second": 1.383
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1401,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.0918875763098255e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

1_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b8d41448c15f291508ace36349b25b9dc9f2192192f986d94bbe9a1558f10bc6
+size 8209

1_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff