Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

instruct/12_128_e3_3e-5/adapter_config.json +39 -0
instruct/12_128_e3_3e-5/adapter_model.safetensors +3 -0
instruct/12_128_e3_3e-5/added_tokens.json +9 -0
instruct/12_128_e3_3e-5/all_results.json +9 -0
instruct/12_128_e3_3e-5/chat_template.jinja +62 -0
instruct/12_128_e3_3e-5/config.json +32 -0
instruct/12_128_e3_3e-5/merges.txt +0 -0
instruct/12_128_e3_3e-5/special_tokens_map.json +33 -0
instruct/12_128_e3_3e-5/tokenizer.json +0 -0
instruct/12_128_e3_3e-5/tokenizer_config.json +234 -0
instruct/12_128_e3_3e-5/train_results.json +9 -0
instruct/12_128_e3_3e-5/trainer_state.json +2276 -0
instruct/12_128_e3_3e-5/training_args.bin +3 -0
instruct/12_128_e3_3e-5/vocab.json +0 -0

instruct/12_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "q_proj",
+    "up_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

instruct/12_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e247ec697ab43df592aa8e8ff6e38c3d08fc0cfcefd406dbcb60ef580ac4f2fe
+size 791751704

instruct/12_128_e3_3e-5/added_tokens.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "<|end_of_cite|>": 49156,
+  "<|end_of_plugin|>": 49158,
+  "<|end_of_role|>": 49153,
+  "<|start_of_cite|>": 49155,
+  "<|start_of_plugin|>": 49157,
+  "<|start_of_role|>": 49152,
+  "<|tool_call|>": 49154
+}

instruct/12_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 2.0415991178298655e+18,
+    "train_loss": 0.36059863125452574,
+    "train_runtime": 988.9937,
+    "train_samples": 17017,
+    "train_samples_per_second": 51.619,
+    "train_steps_per_second": 1.614
+}

instruct/12_128_e3_3e-5/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,62 @@

+{# Alias tools -> available_tools #}
+{%- if tools and not available_tools -%}
+    {%- set available_tools = tools -%}
+{%- endif -%}
+{%- if messages[0]['role'] == 'system' %}
+     {%- set system_message = messages[0]['content'] %}
+     {%- set loop_messages = messages[1:] %}
+ {%- else %}
+     {%- set system_message = "Knowledge Cutoff Date: April 2024.
+Today's Date: " + strftime_now('%B %d, %Y') + ".
+You are Granite, developed by IBM." %}
+     {%- if available_tools and documents %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
+Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+     {%- elif available_tools %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request." %}
+     {%- elif documents %}
+         {%- set system_message = system_message + " Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+    {%- elif thinking %}
+    {%- set system_message = system_message + " You are a helpful AI assistant.
+Respond to every user query in a comprehensive and detailed way. You can write down your thoughts and reasoning process before responding. In the thought process, engage in a comprehensive cycle of analysis, summarization, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. In the response section, based on various attempts, explorations, and reflections from the thoughts section, systematically present the final solution that you deem correct. The response should summarize the thought process. Write your thoughts between <think></think> and write your response between <response></response> for each user query." %}
+     {%- else %}
+         {%- set system_message = system_message + " You are a helpful AI assistant." %}
+     {%- endif %}
+     {%- if 'citations' in controls and documents %}
+         {%- set system_message = system_message + '
+Use the symbols <|start_of_cite|> and <|end_of_cite|> to indicate when a fact comes from a document in the search result, e.g <|start_of_cite|> {document_id: 1}my fact <|end_of_cite|> for a fact from document 1. Afterwards, list all the citations with their corresponding documents in an ordered list.' %}
+     {%- endif %}
+     {%- if 'hallucinations' in controls and documents %}
+         {%- set system_message = system_message + '
+Finally, after the response is written, include a numbered list of sentences from the response with a corresponding risk value that are hallucinated and not based in the documents.' %}
+     {%- endif %}
+     {%- set loop_messages = messages %}
+ {%- endif %}
+ {{- '<|start_of_role|>system<|end_of_role|>' + system_message + '<|end_of_text|>
+' }}
+ {%- if available_tools %}
+     {{- '<|start_of_role|>available_tools<|end_of_role|>' }}
+     {{- available_tools | tojson(indent=4) }}
+     {{- '<|end_of_text|>
+' }}
+ {%- endif %}
+ {%- if documents %}
+     {%- for document in documents %}
+         {{- '<|start_of_role|>document {"document_id": "' + document['doc_id'] | string + '"}<|end_of_role|>
+' }}
+         {{- document['text'] }}
+         {{- '<|end_of_text|>
+' }}
+              {%- endfor %}
+ {%- endif %}
+ {%- for message in loop_messages %}
+     {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>
+' }}
+     {%- if loop.last and add_generation_prompt %}
+         {{- '<|start_of_role|>assistant' }}
+             {%- if controls %}
+                 {{- ' ' + controls | tojson()}}
+             {%- endif %}
+         {{- '<|end_of_role|>' }}
+     {%- endif %}
+ {%- endfor %}

instruct/12_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49159
+}

instruct/12_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

instruct/12_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|end_of_plugin|>",
+  "unk_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

instruct/12_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

instruct/12_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,234 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49152": {
+      "content": "<|start_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49153": {
+      "content": "<|end_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49154": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49155": {
+      "content": "<|start_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49156": {
+      "content": "<|end_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49157": {
+      "content": "<|start_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49158": {
+      "content": "<|end_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": "<|end_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|end_of_plugin|>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|end_of_text|>",
+  "vocab_size": 49152
+}

instruct/12_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 2.0415991178298655e+18,
+    "train_loss": 0.36059863125452574,
+    "train_runtime": 988.9937,
+    "train_samples": 17017,
+    "train_samples_per_second": 51.619,
+    "train_steps_per_second": 1.614
+}

instruct/12_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2276 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1596,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.009398496240601503,
+      "grad_norm": 2.502014398574829,
+      "learning_rate": 1.5e-06,
+      "loss": 1.5469,
+      "step": 5
+    },
+    {
+      "epoch": 0.018796992481203006,
+      "grad_norm": 2.0200090408325195,
+      "learning_rate": 3.3750000000000003e-06,
+      "loss": 1.5636,
+      "step": 10
+    },
+    {
+      "epoch": 0.02819548872180451,
+      "grad_norm": 1.2822911739349365,
+      "learning_rate": 5.25e-06,
+      "loss": 1.4923,
+      "step": 15
+    },
+    {
+      "epoch": 0.03759398496240601,
+      "grad_norm": 0.7315253019332886,
+      "learning_rate": 7.1249999999999995e-06,
+      "loss": 1.4596,
+      "step": 20
+    },
+    {
+      "epoch": 0.046992481203007516,
+      "grad_norm": 0.5335997343063354,
+      "learning_rate": 9e-06,
+      "loss": 1.3887,
+      "step": 25
+    },
+    {
+      "epoch": 0.05639097744360902,
+      "grad_norm": 0.6000903248786926,
+      "learning_rate": 1.0875e-05,
+      "loss": 1.415,
+      "step": 30
+    },
+    {
+      "epoch": 0.06578947368421052,
+      "grad_norm": 0.4487023651599884,
+      "learning_rate": 1.275e-05,
+      "loss": 1.38,
+      "step": 35
+    },
+    {
+      "epoch": 0.07518796992481203,
+      "grad_norm": 0.41073116660118103,
+      "learning_rate": 1.4625e-05,
+      "loss": 1.3887,
+      "step": 40
+    },
+    {
+      "epoch": 0.08458646616541353,
+      "grad_norm": 0.4798469841480255,
+      "learning_rate": 1.65e-05,
+      "loss": 1.3778,
+      "step": 45
+    },
+    {
+      "epoch": 0.09398496240601503,
+      "grad_norm": 0.42918503284454346,
+      "learning_rate": 1.8375000000000003e-05,
+      "loss": 1.4056,
+      "step": 50
+    },
+    {
+      "epoch": 0.10338345864661654,
+      "grad_norm": 0.5621869564056396,
+      "learning_rate": 2.025e-05,
+      "loss": 1.3789,
+      "step": 55
+    },
+    {
+      "epoch": 0.11278195488721804,
+      "grad_norm": 0.3957989513874054,
+      "learning_rate": 2.2125000000000002e-05,
+      "loss": 1.2872,
+      "step": 60
+    },
+    {
+      "epoch": 0.12218045112781954,
+      "grad_norm": 0.4491111636161804,
+      "learning_rate": 2.4e-05,
+      "loss": 1.332,
+      "step": 65
+    },
+    {
+      "epoch": 0.13157894736842105,
+      "grad_norm": 0.378256618976593,
+      "learning_rate": 2.5875000000000002e-05,
+      "loss": 1.2599,
+      "step": 70
+    },
+    {
+      "epoch": 0.14097744360902256,
+      "grad_norm": 0.4281750023365021,
+      "learning_rate": 2.7750000000000004e-05,
+      "loss": 1.3005,
+      "step": 75
+    },
+    {
+      "epoch": 0.15037593984962405,
+      "grad_norm": 0.4786468744277954,
+      "learning_rate": 2.9625000000000002e-05,
+      "loss": 1.2266,
+      "step": 80
+    },
+    {
+      "epoch": 0.15977443609022557,
+      "grad_norm": 0.46316206455230713,
+      "learning_rate": 2.999948467631686e-05,
+      "loss": 1.2327,
+      "step": 85
+    },
+    {
+      "epoch": 0.16917293233082706,
+      "grad_norm": 0.5043375492095947,
+      "learning_rate": 2.999739123453822e-05,
+      "loss": 1.2683,
+      "step": 90
+    },
+    {
+      "epoch": 0.17857142857142858,
+      "grad_norm": 0.5216848850250244,
+      "learning_rate": 2.99936876915103e-05,
+      "loss": 1.1855,
+      "step": 95
+    },
+    {
+      "epoch": 0.18796992481203006,
+      "grad_norm": 0.5720008611679077,
+      "learning_rate": 2.9988374444840858e-05,
+      "loss": 1.1781,
+      "step": 100
+    },
+    {
+      "epoch": 0.19736842105263158,
+      "grad_norm": 0.5401681661605835,
+      "learning_rate": 2.9981452064953454e-05,
+      "loss": 1.1784,
+      "step": 105
+    },
+    {
+      "epoch": 0.20676691729323307,
+      "grad_norm": 0.5164614915847778,
+      "learning_rate": 2.997292129502616e-05,
+      "loss": 1.1571,
+      "step": 110
+    },
+    {
+      "epoch": 0.2161654135338346,
+      "grad_norm": 0.5654135942459106,
+      "learning_rate": 2.9962783050911824e-05,
+      "loss": 1.1502,
+      "step": 115
+    },
+    {
+      "epoch": 0.22556390977443608,
+      "grad_norm": 0.6060947179794312,
+      "learning_rate": 2.9951038421039704e-05,
+      "loss": 1.1131,
+      "step": 120
+    },
+    {
+      "epoch": 0.2349624060150376,
+      "grad_norm": 0.5892205834388733,
+      "learning_rate": 2.9937688666298648e-05,
+      "loss": 1.0911,
+      "step": 125
+    },
+    {
+      "epoch": 0.24436090225563908,
+      "grad_norm": 0.5846742987632751,
+      "learning_rate": 2.9922735219901693e-05,
+      "loss": 1.0838,
+      "step": 130
+    },
+    {
+      "epoch": 0.25375939849624063,
+      "grad_norm": 0.711112916469574,
+      "learning_rate": 2.990617968723223e-05,
+      "loss": 1.0234,
+      "step": 135
+    },
+    {
+      "epoch": 0.2631578947368421,
+      "grad_norm": 0.631570041179657,
+      "learning_rate": 2.9888023845671632e-05,
+      "loss": 1.1093,
+      "step": 140
+    },
+    {
+      "epoch": 0.2725563909774436,
+      "grad_norm": 0.6685044765472412,
+      "learning_rate": 2.9868269644408445e-05,
+      "loss": 1.0641,
+      "step": 145
+    },
+    {
+      "epoch": 0.2819548872180451,
+      "grad_norm": 0.7012262940406799,
+      "learning_rate": 2.984691920422911e-05,
+      "loss": 1.0143,
+      "step": 150
+    },
+    {
+      "epoch": 0.29135338345864664,
+      "grad_norm": 0.8340694308280945,
+      "learning_rate": 2.9823974817290317e-05,
+      "loss": 0.99,
+      "step": 155
+    },
+    {
+      "epoch": 0.3007518796992481,
+      "grad_norm": 0.8734436631202698,
+      "learning_rate": 2.9799438946872865e-05,
+      "loss": 0.9719,
+      "step": 160
+    },
+    {
+      "epoch": 0.3101503759398496,
+      "grad_norm": 0.8449086546897888,
+      "learning_rate": 2.9773314227117265e-05,
+      "loss": 0.9679,
+      "step": 165
+    },
+    {
+      "epoch": 0.31954887218045114,
+      "grad_norm": 0.8639029860496521,
+      "learning_rate": 2.9745603462740886e-05,
+      "loss": 0.9634,
+      "step": 170
+    },
+    {
+      "epoch": 0.32894736842105265,
+      "grad_norm": 0.7578688263893127,
+      "learning_rate": 2.97163096287369e-05,
+      "loss": 0.9581,
+      "step": 175
+    },
+    {
+      "epoch": 0.3383458646616541,
+      "grad_norm": 0.7792065739631653,
+      "learning_rate": 2.9685435870054843e-05,
+      "loss": 0.9393,
+      "step": 180
+    },
+    {
+      "epoch": 0.34774436090225563,
+      "grad_norm": 1.0152794122695923,
+      "learning_rate": 2.9652985501263013e-05,
+      "loss": 0.8903,
+      "step": 185
+    },
+    {
+      "epoch": 0.35714285714285715,
+      "grad_norm": 0.8871893882751465,
+      "learning_rate": 2.9618962006192595e-05,
+      "loss": 0.9388,
+      "step": 190
+    },
+    {
+      "epoch": 0.36654135338345867,
+      "grad_norm": 0.8481462597846985,
+      "learning_rate": 2.9583369037563655e-05,
+      "loss": 0.8553,
+      "step": 195
+    },
+    {
+      "epoch": 0.37593984962406013,
+      "grad_norm": 0.9429571628570557,
+      "learning_rate": 2.9546210416592988e-05,
+      "loss": 0.9058,
+      "step": 200
+    },
+    {
+      "epoch": 0.38533834586466165,
+      "grad_norm": 0.9846692085266113,
+      "learning_rate": 2.9507490132583873e-05,
+      "loss": 0.8096,
+      "step": 205
+    },
+    {
+      "epoch": 0.39473684210526316,
+      "grad_norm": 0.980230987071991,
+      "learning_rate": 2.946721234249779e-05,
+      "loss": 0.87,
+      "step": 210
+    },
+    {
+      "epoch": 0.4041353383458647,
+      "grad_norm": 1.0099208354949951,
+      "learning_rate": 2.9425381370508138e-05,
+      "loss": 0.8291,
+      "step": 215
+    },
+    {
+      "epoch": 0.41353383458646614,
+      "grad_norm": 0.939041256904602,
+      "learning_rate": 2.9382001707535976e-05,
+      "loss": 0.8555,
+      "step": 220
+    },
+    {
+      "epoch": 0.42293233082706766,
+      "grad_norm": 1.0197454690933228,
+      "learning_rate": 2.9337078010767914e-05,
+      "loss": 0.8518,
+      "step": 225
+    },
+    {
+      "epoch": 0.4323308270676692,
+      "grad_norm": 1.0676138401031494,
+      "learning_rate": 2.9290615103156095e-05,
+      "loss": 0.836,
+      "step": 230
+    },
+    {
+      "epoch": 0.4417293233082707,
+      "grad_norm": 1.1104499101638794,
+      "learning_rate": 2.924261797290043e-05,
+      "loss": 0.8404,
+      "step": 235
+    },
+    {
+      "epoch": 0.45112781954887216,
+      "grad_norm": 0.8881465196609497,
+      "learning_rate": 2.9193091772913064e-05,
+      "loss": 0.8172,
+      "step": 240
+    },
+    {
+      "epoch": 0.4605263157894737,
+      "grad_norm": 1.028106927871704,
+      "learning_rate": 2.9142041820265163e-05,
+      "loss": 0.7831,
+      "step": 245
+    },
+    {
+      "epoch": 0.4699248120300752,
+      "grad_norm": 0.9798471331596375,
+      "learning_rate": 2.908947359561607e-05,
+      "loss": 0.7552,
+      "step": 250
+    },
+    {
+      "epoch": 0.4793233082706767,
+      "grad_norm": 1.0129481554031372,
+      "learning_rate": 2.9035392742624943e-05,
+      "loss": 0.7903,
+      "step": 255
+    },
+    {
+      "epoch": 0.48872180451127817,
+      "grad_norm": 1.1476976871490479,
+      "learning_rate": 2.8979805067344808e-05,
+      "loss": 0.7736,
+      "step": 260
+    },
+    {
+      "epoch": 0.4981203007518797,
+      "grad_norm": 1.058274269104004,
+      "learning_rate": 2.8922716537599273e-05,
+      "loss": 0.7195,
+      "step": 265
+    },
+    {
+      "epoch": 0.5075187969924813,
+      "grad_norm": 1.0607396364212036,
+      "learning_rate": 2.8864133282341818e-05,
+      "loss": 0.7259,
+      "step": 270
+    },
+    {
+      "epoch": 0.5169172932330827,
+      "grad_norm": 1.0076299905776978,
+      "learning_rate": 2.8804061590997775e-05,
+      "loss": 0.7034,
+      "step": 275
+    },
+    {
+      "epoch": 0.5263157894736842,
+      "grad_norm": 0.9910840392112732,
+      "learning_rate": 2.874250791278913e-05,
+      "loss": 0.6738,
+      "step": 280
+    },
+    {
+      "epoch": 0.5357142857142857,
+      "grad_norm": 1.1556156873703003,
+      "learning_rate": 2.8679478856042137e-05,
+      "loss": 0.6827,
+      "step": 285
+    },
+    {
+      "epoch": 0.5451127819548872,
+      "grad_norm": 0.9208582043647766,
+      "learning_rate": 2.8614981187477845e-05,
+      "loss": 0.7085,
+      "step": 290
+    },
+    {
+      "epoch": 0.5545112781954887,
+      "grad_norm": 1.012955904006958,
+      "learning_rate": 2.8549021831485645e-05,
+      "loss": 0.6869,
+      "step": 295
+    },
+    {
+      "epoch": 0.5639097744360902,
+      "grad_norm": 1.103227972984314,
+      "learning_rate": 2.8481607869379873e-05,
+      "loss": 0.6031,
+      "step": 300
+    },
+    {
+      "epoch": 0.5733082706766918,
+      "grad_norm": 1.2159794569015503,
+      "learning_rate": 2.8412746538639556e-05,
+      "loss": 0.7008,
+      "step": 305
+    },
+    {
+      "epoch": 0.5827067669172933,
+      "grad_norm": 1.063978672027588,
+      "learning_rate": 2.8342445232131427e-05,
+      "loss": 0.665,
+      "step": 310
+    },
+    {
+      "epoch": 0.5921052631578947,
+      "grad_norm": 1.1001124382019043,
+      "learning_rate": 2.8270711497316208e-05,
+      "loss": 0.6327,
+      "step": 315
+    },
+    {
+      "epoch": 0.6015037593984962,
+      "grad_norm": 1.1076452732086182,
+      "learning_rate": 2.8197553035438365e-05,
+      "loss": 0.6637,
+      "step": 320
+    },
+    {
+      "epoch": 0.6109022556390977,
+      "grad_norm": 1.1827822923660278,
+      "learning_rate": 2.8122977700699263e-05,
+      "loss": 0.6087,
+      "step": 325
+    },
+    {
+      "epoch": 0.6203007518796992,
+      "grad_norm": 1.1983767747879028,
+      "learning_rate": 2.8046993499413982e-05,
+      "loss": 0.5684,
+      "step": 330
+    },
+    {
+      "epoch": 0.6296992481203008,
+      "grad_norm": 1.2548726797103882,
+      "learning_rate": 2.796960858915177e-05,
+      "loss": 0.6433,
+      "step": 335
+    },
+    {
+      "epoch": 0.6390977443609023,
+      "grad_norm": 1.2525805234909058,
+      "learning_rate": 2.7890831277860243e-05,
+      "loss": 0.6509,
+      "step": 340
+    },
+    {
+      "epoch": 0.6484962406015038,
+      "grad_norm": 1.1672039031982422,
+      "learning_rate": 2.781067002297344e-05,
+      "loss": 0.6039,
+      "step": 345
+    },
+    {
+      "epoch": 0.6578947368421053,
+      "grad_norm": 1.2225556373596191,
+      "learning_rate": 2.7729133430503884e-05,
+      "loss": 0.622,
+      "step": 350
+    },
+    {
+      "epoch": 0.6672932330827067,
+      "grad_norm": 1.2113149166107178,
+      "learning_rate": 2.7646230254118617e-05,
+      "loss": 0.6003,
+      "step": 355
+    },
+    {
+      "epoch": 0.6766917293233082,
+      "grad_norm": 1.285935640335083,
+      "learning_rate": 2.756196939419943e-05,
+      "loss": 0.5416,
+      "step": 360
+    },
+    {
+      "epoch": 0.6860902255639098,
+      "grad_norm": 1.180495023727417,
+      "learning_rate": 2.747635989688733e-05,
+      "loss": 0.604,
+      "step": 365
+    },
+    {
+      "epoch": 0.6954887218045113,
+      "grad_norm": 1.1116900444030762,
+      "learning_rate": 2.738941095311135e-05,
+      "loss": 0.556,
+      "step": 370
+    },
+    {
+      "epoch": 0.7048872180451128,
+      "grad_norm": 1.1285786628723145,
+      "learning_rate": 2.730113189760183e-05,
+      "loss": 0.6079,
+      "step": 375
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 1.2646759748458862,
+      "learning_rate": 2.721153220788826e-05,
+      "loss": 0.5678,
+      "step": 380
+    },
+    {
+      "epoch": 0.7236842105263158,
+      "grad_norm": 1.1523957252502441,
+      "learning_rate": 2.7120621503281756e-05,
+      "loss": 0.5305,
+      "step": 385
+    },
+    {
+      "epoch": 0.7330827067669173,
+      "grad_norm": 1.3809316158294678,
+      "learning_rate": 2.7028409543842378e-05,
+      "loss": 0.561,
+      "step": 390
+    },
+    {
+      "epoch": 0.7424812030075187,
+      "grad_norm": 1.1742485761642456,
+      "learning_rate": 2.6934906229331262e-05,
+      "loss": 0.5361,
+      "step": 395
+    },
+    {
+      "epoch": 0.7518796992481203,
+      "grad_norm": 1.324890375137329,
+      "learning_rate": 2.6840121598147845e-05,
+      "loss": 0.5614,
+      "step": 400
+    },
+    {
+      "epoch": 0.7612781954887218,
+      "grad_norm": 1.161738395690918,
+      "learning_rate": 2.6744065826252103e-05,
+      "loss": 0.5056,
+      "step": 405
+    },
+    {
+      "epoch": 0.7706766917293233,
+      "grad_norm": 1.1795762777328491,
+      "learning_rate": 2.6646749226072105e-05,
+      "loss": 0.4927,
+      "step": 410
+    },
+    {
+      "epoch": 0.7800751879699248,
+      "grad_norm": 1.2287588119506836,
+      "learning_rate": 2.6548182245396885e-05,
+      "loss": 0.5048,
+      "step": 415
+    },
+    {
+      "epoch": 0.7894736842105263,
+      "grad_norm": 1.1695654392242432,
+      "learning_rate": 2.6448375466254745e-05,
+      "loss": 0.5191,
+      "step": 420
+    },
+    {
+      "epoch": 0.7988721804511278,
+      "grad_norm": 1.2173978090286255,
+      "learning_rate": 2.6347339603777236e-05,
+      "loss": 0.505,
+      "step": 425
+    },
+    {
+      "epoch": 0.8082706766917294,
+      "grad_norm": 1.170729160308838,
+      "learning_rate": 2.624508550504874e-05,
+      "loss": 0.4839,
+      "step": 430
+    },
+    {
+      "epoch": 0.8176691729323309,
+      "grad_norm": 1.1502288579940796,
+      "learning_rate": 2.614162414794198e-05,
+      "loss": 0.4501,
+      "step": 435
+    },
+    {
+      "epoch": 0.8270676691729323,
+      "grad_norm": 1.3801743984222412,
+      "learning_rate": 2.6036966639939438e-05,
+      "loss": 0.4682,
+      "step": 440
+    },
+    {
+      "epoch": 0.8364661654135338,
+      "grad_norm": 1.235448956489563,
+      "learning_rate": 2.5931124216940854e-05,
+      "loss": 0.4795,
+      "step": 445
+    },
+    {
+      "epoch": 0.8458646616541353,
+      "grad_norm": 1.4729183912277222,
+      "learning_rate": 2.5824108242056976e-05,
+      "loss": 0.4714,
+      "step": 450
+    },
+    {
+      "epoch": 0.8552631578947368,
+      "grad_norm": 1.2395603656768799,
+      "learning_rate": 2.5715930204389617e-05,
+      "loss": 0.4653,
+      "step": 455
+    },
+    {
+      "epoch": 0.8646616541353384,
+      "grad_norm": 1.1458605527877808,
+      "learning_rate": 2.5606601717798212e-05,
+      "loss": 0.4444,
+      "step": 460
+    },
+    {
+      "epoch": 0.8740601503759399,
+      "grad_norm": 1.2719547748565674,
+      "learning_rate": 2.549613451965295e-05,
+      "loss": 0.4349,
+      "step": 465
+    },
+    {
+      "epoch": 0.8834586466165414,
+      "grad_norm": 1.2125513553619385,
+      "learning_rate": 2.538454046957468e-05,
+      "loss": 0.4706,
+      "step": 470
+    },
+    {
+      "epoch": 0.8928571428571429,
+      "grad_norm": 1.3109755516052246,
+      "learning_rate": 2.5271831548161667e-05,
+      "loss": 0.4087,
+      "step": 475
+    },
+    {
+      "epoch": 0.9022556390977443,
+      "grad_norm": 1.2260974645614624,
+      "learning_rate": 2.5158019855703377e-05,
+      "loss": 0.4173,
+      "step": 480
+    },
+    {
+      "epoch": 0.9116541353383458,
+      "grad_norm": 1.4752731323242188,
+      "learning_rate": 2.5043117610881402e-05,
+      "loss": 0.4099,
+      "step": 485
+    },
+    {
+      "epoch": 0.9210526315789473,
+      "grad_norm": 1.258771300315857,
+      "learning_rate": 2.4927137149457685e-05,
+      "loss": 0.412,
+      "step": 490
+    },
+    {
+      "epoch": 0.9304511278195489,
+      "grad_norm": 1.2981860637664795,
+      "learning_rate": 2.4810090922950143e-05,
+      "loss": 0.406,
+      "step": 495
+    },
+    {
+      "epoch": 0.9398496240601504,
+      "grad_norm": 1.2472333908081055,
+      "learning_rate": 2.4691991497295922e-05,
+      "loss": 0.4052,
+      "step": 500
+    },
+    {
+      "epoch": 0.9492481203007519,
+      "grad_norm": 1.4154011011123657,
+      "learning_rate": 2.457285155150231e-05,
+      "loss": 0.3895,
+      "step": 505
+    },
+    {
+      "epoch": 0.9586466165413534,
+      "grad_norm": 1.2483034133911133,
+      "learning_rate": 2.4452683876285546e-05,
+      "loss": 0.4271,
+      "step": 510
+    },
+    {
+      "epoch": 0.9680451127819549,
+      "grad_norm": 1.471336007118225,
+      "learning_rate": 2.433150137269762e-05,
+      "loss": 0.4334,
+      "step": 515
+    },
+    {
+      "epoch": 0.9774436090225563,
+      "grad_norm": 1.2480738162994385,
+      "learning_rate": 2.420931705074122e-05,
+      "loss": 0.3566,
+      "step": 520
+    },
+    {
+      "epoch": 0.9868421052631579,
+      "grad_norm": 1.4982995986938477,
+      "learning_rate": 2.408614402797302e-05,
+      "loss": 0.4002,
+      "step": 525
+    },
+    {
+      "epoch": 0.9962406015037594,
+      "grad_norm": 1.1962337493896484,
+      "learning_rate": 2.3961995528095385e-05,
+      "loss": 0.3889,
+      "step": 530
+    },
+    {
+      "epoch": 1.005639097744361,
+      "grad_norm": 1.140385389328003,
+      "learning_rate": 2.3836884879536676e-05,
+      "loss": 0.3202,
+      "step": 535
+    },
+    {
+      "epoch": 1.0150375939849625,
+      "grad_norm": 1.1931862831115723,
+      "learning_rate": 2.371082551402034e-05,
+      "loss": 0.3586,
+      "step": 540
+    },
+    {
+      "epoch": 1.0244360902255638,
+      "grad_norm": 1.0671947002410889,
+      "learning_rate": 2.3583830965122904e-05,
+      "loss": 0.3112,
+      "step": 545
+    },
+    {
+      "epoch": 1.0338345864661653,
+      "grad_norm": 1.3327282667160034,
+      "learning_rate": 2.345591486682101e-05,
+      "loss": 0.3376,
+      "step": 550
+    },
+    {
+      "epoch": 1.0432330827067668,
+      "grad_norm": 1.3870916366577148,
+      "learning_rate": 2.3327090952027704e-05,
+      "loss": 0.3209,
+      "step": 555
+    },
+    {
+      "epoch": 1.0526315789473684,
+      "grad_norm": 1.2349412441253662,
+      "learning_rate": 2.3197373051118074e-05,
+      "loss": 0.2813,
+      "step": 560
+    },
+    {
+      "epoch": 1.0620300751879699,
+      "grad_norm": 1.1286367177963257,
+      "learning_rate": 2.3066775090444446e-05,
+      "loss": 0.2833,
+      "step": 565
+    },
+    {
+      "epoch": 1.0714285714285714,
+      "grad_norm": 1.31594979763031,
+      "learning_rate": 2.2935311090841265e-05,
+      "loss": 0.295,
+      "step": 570
+    },
+    {
+      "epoch": 1.080827067669173,
+      "grad_norm": 1.274774432182312,
+      "learning_rate": 2.2802995166119846e-05,
+      "loss": 0.2885,
+      "step": 575
+    },
+    {
+      "epoch": 1.0902255639097744,
+      "grad_norm": 1.1666953563690186,
+      "learning_rate": 2.266984152155311e-05,
+      "loss": 0.34,
+      "step": 580
+    },
+    {
+      "epoch": 1.099624060150376,
+      "grad_norm": 1.193224310874939,
+      "learning_rate": 2.2535864452350543e-05,
+      "loss": 0.2888,
+      "step": 585
+    },
+    {
+      "epoch": 1.1090225563909775,
+      "grad_norm": 1.3207679986953735,
+      "learning_rate": 2.2401078342123483e-05,
+      "loss": 0.2788,
+      "step": 590
+    },
+    {
+      "epoch": 1.118421052631579,
+      "grad_norm": 1.2950669527053833,
+      "learning_rate": 2.2265497661340893e-05,
+      "loss": 0.3238,
+      "step": 595
+    },
+    {
+      "epoch": 1.1278195488721805,
+      "grad_norm": 1.1799949407577515,
+      "learning_rate": 2.212913696577585e-05,
+      "loss": 0.303,
+      "step": 600
+    },
+    {
+      "epoch": 1.137218045112782,
+      "grad_norm": 1.327563762664795,
+      "learning_rate": 2.1992010894942845e-05,
+      "loss": 0.2545,
+      "step": 605
+    },
+    {
+      "epoch": 1.1466165413533835,
+      "grad_norm": 1.3011080026626587,
+      "learning_rate": 2.1854134170526094e-05,
+      "loss": 0.2662,
+      "step": 610
+    },
+    {
+      "epoch": 1.156015037593985,
+      "grad_norm": 1.568489909172058,
+      "learning_rate": 2.1715521594799065e-05,
+      "loss": 0.2689,
+      "step": 615
+    },
+    {
+      "epoch": 1.1654135338345863,
+      "grad_norm": 1.2697765827178955,
+      "learning_rate": 2.15761880490353e-05,
+      "loss": 0.2719,
+      "step": 620
+    },
+    {
+      "epoch": 1.1748120300751879,
+      "grad_norm": 1.260827898979187,
+      "learning_rate": 2.1436148491910773e-05,
+      "loss": 0.2451,
+      "step": 625
+    },
+    {
+      "epoch": 1.1842105263157894,
+      "grad_norm": 1.2346687316894531,
+      "learning_rate": 2.1295417957897982e-05,
+      "loss": 0.2639,
+      "step": 630
+    },
+    {
+      "epoch": 1.193609022556391,
+      "grad_norm": 1.3587771654129028,
+      "learning_rate": 2.1154011555651822e-05,
+      "loss": 0.2428,
+      "step": 635
+    },
+    {
+      "epoch": 1.2030075187969924,
+      "grad_norm": 1.3083690404891968,
+      "learning_rate": 2.1011944466387577e-05,
+      "loss": 0.2329,
+      "step": 640
+    },
+    {
+      "epoch": 1.212406015037594,
+      "grad_norm": 1.4359098672866821,
+      "learning_rate": 2.086923194225105e-05,
+      "loss": 0.2309,
+      "step": 645
+    },
+    {
+      "epoch": 1.2218045112781954,
+      "grad_norm": 1.353179931640625,
+      "learning_rate": 2.0725889304681143e-05,
+      "loss": 0.2491,
+      "step": 650
+    },
+    {
+      "epoch": 1.231203007518797,
+      "grad_norm": 1.1572445631027222,
+      "learning_rate": 2.0581931942764945e-05,
+      "loss": 0.2541,
+      "step": 655
+    },
+    {
+      "epoch": 1.2406015037593985,
+      "grad_norm": 1.3108141422271729,
+      "learning_rate": 2.043737531158559e-05,
+      "loss": 0.2584,
+      "step": 660
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.0516366958618164,
+      "learning_rate": 2.0292234930563e-05,
+      "loss": 0.262,
+      "step": 665
+    },
+    {
+      "epoch": 1.2593984962406015,
+      "grad_norm": 1.211906909942627,
+      "learning_rate": 2.014652638178777e-05,
+      "loss": 0.226,
+      "step": 670
+    },
+    {
+      "epoch": 1.268796992481203,
+      "grad_norm": 1.2837917804718018,
+      "learning_rate": 2.0000265308348273e-05,
+      "loss": 0.2544,
+      "step": 675
+    },
+    {
+      "epoch": 1.2781954887218046,
+      "grad_norm": 1.1363428831100464,
+      "learning_rate": 1.9853467412651235e-05,
+      "loss": 0.2386,
+      "step": 680
+    },
+    {
+      "epoch": 1.287593984962406,
+      "grad_norm": 1.3048768043518066,
+      "learning_rate": 1.970614845473596e-05,
+      "loss": 0.2479,
+      "step": 685
+    },
+    {
+      "epoch": 1.2969924812030076,
+      "grad_norm": 1.232675313949585,
+      "learning_rate": 1.9558324250582335e-05,
+      "loss": 0.237,
+      "step": 690
+    },
+    {
+      "epoch": 1.306390977443609,
+      "grad_norm": 1.1638017892837524,
+      "learning_rate": 1.941001067041286e-05,
+      "loss": 0.2307,
+      "step": 695
+    },
+    {
+      "epoch": 1.3157894736842106,
+      "grad_norm": 1.0934059619903564,
+      "learning_rate": 1.926122363698883e-05,
+      "loss": 0.2282,
+      "step": 700
+    },
+    {
+      "epoch": 1.3251879699248121,
+      "grad_norm": 1.7193068265914917,
+      "learning_rate": 1.91119791239009e-05,
+      "loss": 0.2133,
+      "step": 705
+    },
+    {
+      "epoch": 1.3345864661654137,
+      "grad_norm": 1.1320679187774658,
+      "learning_rate": 1.8962293153854164e-05,
+      "loss": 0.2297,
+      "step": 710
+    },
+    {
+      "epoch": 1.3439849624060152,
+      "grad_norm": 1.2152717113494873,
+      "learning_rate": 1.8812181796947988e-05,
+      "loss": 0.2106,
+      "step": 715
+    },
+    {
+      "epoch": 1.3533834586466165,
+      "grad_norm": 1.231295108795166,
+      "learning_rate": 1.8661661168950745e-05,
+      "loss": 0.2395,
+      "step": 720
+    },
+    {
+      "epoch": 1.362781954887218,
+      "grad_norm": 1.0955288410186768,
+      "learning_rate": 1.8510747429569633e-05,
+      "loss": 0.2355,
+      "step": 725
+    },
+    {
+      "epoch": 1.3721804511278195,
+      "grad_norm": 1.0871318578720093,
+      "learning_rate": 1.835945678071581e-05,
+      "loss": 0.2428,
+      "step": 730
+    },
+    {
+      "epoch": 1.381578947368421,
+      "grad_norm": 1.29008150100708,
+      "learning_rate": 1.8207805464764958e-05,
+      "loss": 0.2168,
+      "step": 735
+    },
+    {
+      "epoch": 1.3909774436090225,
+      "grad_norm": 1.2296003103256226,
+      "learning_rate": 1.8055809762813537e-05,
+      "loss": 0.2093,
+      "step": 740
+    },
+    {
+      "epoch": 1.400375939849624,
+      "grad_norm": 1.2095301151275635,
+      "learning_rate": 1.790348599293085e-05,
+      "loss": 0.1981,
+      "step": 745
+    },
+    {
+      "epoch": 1.4097744360902256,
+      "grad_norm": 1.045202374458313,
+      "learning_rate": 1.7750850508407172e-05,
+      "loss": 0.2143,
+      "step": 750
+    },
+    {
+      "epoch": 1.419172932330827,
+      "grad_norm": 1.2133156061172485,
+      "learning_rate": 1.7597919695998067e-05,
+      "loss": 0.1857,
+      "step": 755
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 1.293715476989746,
+      "learning_rate": 1.7444709974165143e-05,
+      "loss": 0.2074,
+      "step": 760
+    },
+    {
+      "epoch": 1.4379699248120301,
+      "grad_norm": 1.0929676294326782,
+      "learning_rate": 1.729123779131336e-05,
+      "loss": 0.1796,
+      "step": 765
+    },
+    {
+      "epoch": 1.4473684210526316,
+      "grad_norm": 1.2607976198196411,
+      "learning_rate": 1.7137519624025193e-05,
+      "loss": 0.1813,
+      "step": 770
+    },
+    {
+      "epoch": 1.4567669172932332,
+      "grad_norm": 1.0812962055206299,
+      "learning_rate": 1.6983571975291667e-05,
+      "loss": 0.1881,
+      "step": 775
+    },
+    {
+      "epoch": 1.4661654135338344,
+      "grad_norm": 1.092307448387146,
+      "learning_rate": 1.682941137274068e-05,
+      "loss": 0.1872,
+      "step": 780
+    },
+    {
+      "epoch": 1.475563909774436,
+      "grad_norm": 1.804694414138794,
+      "learning_rate": 1.6675054366862553e-05,
+      "loss": 0.2035,
+      "step": 785
+    },
+    {
+      "epoch": 1.4849624060150375,
+      "grad_norm": 1.174047827720642,
+      "learning_rate": 1.6520517529233265e-05,
+      "loss": 0.1928,
+      "step": 790
+    },
+    {
+      "epoch": 1.494360902255639,
+      "grad_norm": 1.1136754751205444,
+      "learning_rate": 1.6365817450735273e-05,
+      "loss": 0.1903,
+      "step": 795
+    },
+    {
+      "epoch": 1.5037593984962405,
+      "grad_norm": 1.1049115657806396,
+      "learning_rate": 1.6210970739776386e-05,
+      "loss": 0.1856,
+      "step": 800
+    },
+    {
+      "epoch": 1.513157894736842,
+      "grad_norm": 1.2019330263137817,
+      "learning_rate": 1.605599402050669e-05,
+      "loss": 0.1805,
+      "step": 805
+    },
+    {
+      "epoch": 1.5225563909774436,
+      "grad_norm": 1.2262581586837769,
+      "learning_rate": 1.5900903931033795e-05,
+      "loss": 0.1851,
+      "step": 810
+    },
+    {
+      "epoch": 1.531954887218045,
+      "grad_norm": 1.0285981893539429,
+      "learning_rate": 1.574571712163661e-05,
+      "loss": 0.2051,
+      "step": 815
+    },
+    {
+      "epoch": 1.5413533834586466,
+      "grad_norm": 1.1294852495193481,
+      "learning_rate": 1.559045025297775e-05,
+      "loss": 0.1817,
+      "step": 820
+    },
+    {
+      "epoch": 1.550751879699248,
+      "grad_norm": 1.038309097290039,
+      "learning_rate": 1.5435119994314924e-05,
+      "loss": 0.1748,
+      "step": 825
+    },
+    {
+      "epoch": 1.5601503759398496,
+      "grad_norm": 1.0437440872192383,
+      "learning_rate": 1.5279743021711284e-05,
+      "loss": 0.1768,
+      "step": 830
+    },
+    {
+      "epoch": 1.5695488721804511,
+      "grad_norm": 1.1250826120376587,
+      "learning_rate": 1.5124336016245149e-05,
+      "loss": 0.1727,
+      "step": 835
+    },
+    {
+      "epoch": 1.5789473684210527,
+      "grad_norm": 1.0569233894348145,
+      "learning_rate": 1.4968915662219128e-05,
+      "loss": 0.1636,
+      "step": 840
+    },
+    {
+      "epoch": 1.5883458646616542,
+      "grad_norm": 1.0236541032791138,
+      "learning_rate": 1.4813498645368903e-05,
+      "loss": 0.1836,
+      "step": 845
+    },
+    {
+      "epoch": 1.5977443609022557,
+      "grad_norm": 1.0962340831756592,
+      "learning_rate": 1.4658101651071892e-05,
+      "loss": 0.1648,
+      "step": 850
+    },
+    {
+      "epoch": 1.6071428571428572,
+      "grad_norm": 1.1532505750656128,
+      "learning_rate": 1.4502741362555917e-05,
+      "loss": 0.1644,
+      "step": 855
+    },
+    {
+      "epoch": 1.6165413533834587,
+      "grad_norm": 1.4584366083145142,
+      "learning_rate": 1.4347434459108116e-05,
+      "loss": 0.1696,
+      "step": 860
+    },
+    {
+      "epoch": 1.6259398496240602,
+      "grad_norm": 1.0582867860794067,
+      "learning_rate": 1.4192197614284248e-05,
+      "loss": 0.1779,
+      "step": 865
+    },
+    {
+      "epoch": 1.6353383458646618,
+      "grad_norm": 1.1785125732421875,
+      "learning_rate": 1.4037047494118694e-05,
+      "loss": 0.1793,
+      "step": 870
+    },
+    {
+      "epoch": 1.6447368421052633,
+      "grad_norm": 1.038996696472168,
+      "learning_rate": 1.388200075533518e-05,
+      "loss": 0.1548,
+      "step": 875
+    },
+    {
+      "epoch": 1.6541353383458648,
+      "grad_norm": 1.4645986557006836,
+      "learning_rate": 1.3727074043558517e-05,
+      "loss": 0.1695,
+      "step": 880
+    },
+    {
+      "epoch": 1.6635338345864663,
+      "grad_norm": 1.1247239112854004,
+      "learning_rate": 1.3572283991527582e-05,
+      "loss": 0.1708,
+      "step": 885
+    },
+    {
+      "epoch": 1.6729323308270678,
+      "grad_norm": 1.0306785106658936,
+      "learning_rate": 1.3417647217309632e-05,
+      "loss": 0.1476,
+      "step": 890
+    },
+    {
+      "epoch": 1.6823308270676691,
+      "grad_norm": 1.0363950729370117,
+      "learning_rate": 1.3263180322516205e-05,
+      "loss": 0.1491,
+      "step": 895
+    },
+    {
+      "epoch": 1.6917293233082706,
+      "grad_norm": 0.8831355571746826,
+      "learning_rate": 1.3108899890520786e-05,
+      "loss": 0.1467,
+      "step": 900
+    },
+    {
+      "epoch": 1.7011278195488722,
+      "grad_norm": 1.2344099283218384,
+      "learning_rate": 1.295482248467846e-05,
+      "loss": 0.1594,
+      "step": 905
+    },
+    {
+      "epoch": 1.7105263157894737,
+      "grad_norm": 1.039624810218811,
+      "learning_rate": 1.2800964646547674e-05,
+      "loss": 0.1437,
+      "step": 910
+    },
+    {
+      "epoch": 1.7199248120300752,
+      "grad_norm": 0.8950397372245789,
+      "learning_rate": 1.2647342894114357e-05,
+      "loss": 0.1331,
+      "step": 915
+    },
+    {
+      "epoch": 1.7293233082706767,
+      "grad_norm": 1.0810532569885254,
+      "learning_rate": 1.2493973720018577e-05,
+      "loss": 0.157,
+      "step": 920
+    },
+    {
+      "epoch": 1.7387218045112782,
+      "grad_norm": 1.3350157737731934,
+      "learning_rate": 1.2340873589783888e-05,
+      "loss": 0.1443,
+      "step": 925
+    },
+    {
+      "epoch": 1.7481203007518797,
+      "grad_norm": 1.0185054540634155,
+      "learning_rate": 1.2188058940049651e-05,
+      "loss": 0.1345,
+      "step": 930
+    },
+    {
+      "epoch": 1.7575187969924813,
+      "grad_norm": 1.1121907234191895,
+      "learning_rate": 1.2035546176806386e-05,
+      "loss": 0.1129,
+      "step": 935
+    },
+    {
+      "epoch": 1.7669172932330826,
+      "grad_norm": 1.2006617784500122,
+      "learning_rate": 1.1883351673634457e-05,
+      "loss": 0.1364,
+      "step": 940
+    },
+    {
+      "epoch": 1.776315789473684,
+      "grad_norm": 1.0103201866149902,
+      "learning_rate": 1.1731491769946225e-05,
+      "loss": 0.1416,
+      "step": 945
+    },
+    {
+      "epoch": 1.7857142857142856,
+      "grad_norm": 1.0189690589904785,
+      "learning_rate": 1.157998276923187e-05,
+      "loss": 0.1431,
+      "step": 950
+    },
+    {
+      "epoch": 1.795112781954887,
+      "grad_norm": 1.2393074035644531,
+      "learning_rate": 1.1428840937309047e-05,
+      "loss": 0.1282,
+      "step": 955
+    },
+    {
+      "epoch": 1.8045112781954886,
+      "grad_norm": 1.021828532218933,
+      "learning_rate": 1.127808250057665e-05,
+      "loss": 0.1403,
+      "step": 960
+    },
+    {
+      "epoch": 1.8139097744360901,
+      "grad_norm": 1.1605874300003052,
+      "learning_rate": 1.1127723644272728e-05,
+      "loss": 0.1423,
+      "step": 965
+    },
+    {
+      "epoch": 1.8233082706766917,
+      "grad_norm": 0.9515864253044128,
+      "learning_rate": 1.0977780510736885e-05,
+      "loss": 0.1321,
+      "step": 970
+    },
+    {
+      "epoch": 1.8327067669172932,
+      "grad_norm": 1.0644917488098145,
+      "learning_rate": 1.0828269197677237e-05,
+      "loss": 0.1259,
+      "step": 975
+    },
+    {
+      "epoch": 1.8421052631578947,
+      "grad_norm": 1.02317214012146,
+      "learning_rate": 1.067920575644219e-05,
+      "loss": 0.1383,
+      "step": 980
+    },
+    {
+      "epoch": 1.8515037593984962,
+      "grad_norm": 1.0989339351654053,
+      "learning_rate": 1.0530606190297192e-05,
+      "loss": 0.1262,
+      "step": 985
+    },
+    {
+      "epoch": 1.8609022556390977,
+      "grad_norm": 0.9936410784721375,
+      "learning_rate": 1.0382486452706643e-05,
+      "loss": 0.1206,
+      "step": 990
+    },
+    {
+      "epoch": 1.8703007518796992,
+      "grad_norm": 0.9912272095680237,
+      "learning_rate": 1.0234862445621142e-05,
+      "loss": 0.1201,
+      "step": 995
+    },
+    {
+      "epoch": 1.8796992481203008,
+      "grad_norm": 0.9345570206642151,
+      "learning_rate": 1.0087750017770274e-05,
+      "loss": 0.1234,
+      "step": 1000
+    },
+    {
+      "epoch": 1.8890977443609023,
+      "grad_norm": 1.1206716299057007,
+      "learning_rate": 9.941164962961124e-06,
+      "loss": 0.1219,
+      "step": 1005
+    },
+    {
+      "epoch": 1.8984962406015038,
+      "grad_norm": 0.9936740398406982,
+      "learning_rate": 9.795123018382661e-06,
+      "loss": 0.1243,
+      "step": 1010
+    },
+    {
+      "epoch": 1.9078947368421053,
+      "grad_norm": 0.935027003288269,
+      "learning_rate": 9.649639862916213e-06,
+      "loss": 0.1192,
+      "step": 1015
+    },
+    {
+      "epoch": 1.9172932330827068,
+      "grad_norm": 0.939355194568634,
+      "learning_rate": 9.5047311154522e-06,
+      "loss": 0.1202,
+      "step": 1020
+    },
+    {
+      "epoch": 1.9266917293233083,
+      "grad_norm": 0.952413022518158,
+      "learning_rate": 9.360412333213324e-06,
+      "loss": 0.1108,
+      "step": 1025
+    },
+    {
+      "epoch": 1.9360902255639099,
+      "grad_norm": 0.9872532486915588,
+      "learning_rate": 9.216699010084356e-06,
+      "loss": 0.1135,
+      "step": 1030
+    },
+    {
+      "epoch": 1.9454887218045114,
+      "grad_norm": 0.8481807112693787,
+      "learning_rate": 9.073606574948716e-06,
+      "loss": 0.1151,
+      "step": 1035
+    },
+    {
+      "epoch": 1.954887218045113,
+      "grad_norm": 1.110603928565979,
+      "learning_rate": 8.931150390032087e-06,
+      "loss": 0.1186,
+      "step": 1040
+    },
+    {
+      "epoch": 1.9642857142857144,
+      "grad_norm": 1.1130671501159668,
+      "learning_rate": 8.789345749253089e-06,
+      "loss": 0.1239,
+      "step": 1045
+    },
+    {
+      "epoch": 1.973684210526316,
+      "grad_norm": 0.9827004075050354,
+      "learning_rate": 8.648207876581394e-06,
+      "loss": 0.112,
+      "step": 1050
+    },
+    {
+      "epoch": 1.9830827067669174,
+      "grad_norm": 1.004602074623108,
+      "learning_rate": 8.50775192440329e-06,
+      "loss": 0.1095,
+      "step": 1055
+    },
+    {
+      "epoch": 1.9924812030075187,
+      "grad_norm": 0.9740102291107178,
+      "learning_rate": 8.367992971894906e-06,
+      "loss": 0.107,
+      "step": 1060
+    },
+    {
+      "epoch": 2.0018796992481205,
+      "grad_norm": 0.6189488172531128,
+      "learning_rate": 8.228946023403364e-06,
+      "loss": 0.1023,
+      "step": 1065
+    },
+    {
+      "epoch": 2.011278195488722,
+      "grad_norm": 1.0207630395889282,
+      "learning_rate": 8.09062600683593e-06,
+      "loss": 0.0881,
+      "step": 1070
+    },
+    {
+      "epoch": 2.0206766917293235,
+      "grad_norm": 0.904458224773407,
+      "learning_rate": 7.95304777205736e-06,
+      "loss": 0.0882,
+      "step": 1075
+    },
+    {
+      "epoch": 2.030075187969925,
+      "grad_norm": 0.9616421461105347,
+      "learning_rate": 7.816226089295627e-06,
+      "loss": 0.091,
+      "step": 1080
+    },
+    {
+      "epoch": 2.039473684210526,
+      "grad_norm": 0.6990841627120972,
+      "learning_rate": 7.680175647556236e-06,
+      "loss": 0.0814,
+      "step": 1085
+    },
+    {
+      "epoch": 2.0488721804511276,
+      "grad_norm": 0.7550905346870422,
+      "learning_rate": 7.54491105304521e-06,
+      "loss": 0.0908,
+      "step": 1090
+    },
+    {
+      "epoch": 2.058270676691729,
+      "grad_norm": 0.8111321330070496,
+      "learning_rate": 7.4104468276009995e-06,
+      "loss": 0.0878,
+      "step": 1095
+    },
+    {
+      "epoch": 2.0676691729323307,
+      "grad_norm": 0.848985493183136,
+      "learning_rate": 7.276797407135425e-06,
+      "loss": 0.0927,
+      "step": 1100
+    },
+    {
+      "epoch": 2.077067669172932,
+      "grad_norm": 0.8092796802520752,
+      "learning_rate": 7.143977140083848e-06,
+      "loss": 0.0906,
+      "step": 1105
+    },
+    {
+      "epoch": 2.0864661654135337,
+      "grad_norm": 0.7931380867958069,
+      "learning_rate": 7.012000285864764e-06,
+      "loss": 0.0883,
+      "step": 1110
+    },
+    {
+      "epoch": 2.095864661654135,
+      "grad_norm": 0.7709981203079224,
+      "learning_rate": 6.880881013348917e-06,
+      "loss": 0.0732,
+      "step": 1115
+    },
+    {
+      "epoch": 2.1052631578947367,
+      "grad_norm": 0.7526527047157288,
+      "learning_rate": 6.750633399338142e-06,
+      "loss": 0.0803,
+      "step": 1120
+    },
+    {
+      "epoch": 2.1146616541353382,
+      "grad_norm": 0.7257946729660034,
+      "learning_rate": 6.621271427054106e-06,
+      "loss": 0.0762,
+      "step": 1125
+    },
+    {
+      "epoch": 2.1240601503759398,
+      "grad_norm": 0.9319130778312683,
+      "learning_rate": 6.492808984637086e-06,
+      "loss": 0.0914,
+      "step": 1130
+    },
+    {
+      "epoch": 2.1334586466165413,
+      "grad_norm": 0.8819976449012756,
+      "learning_rate": 6.3652598636549365e-06,
+      "loss": 0.0773,
+      "step": 1135
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 0.7389273047447205,
+      "learning_rate": 6.238637757622476e-06,
+      "loss": 0.089,
+      "step": 1140
+    },
+    {
+      "epoch": 2.1522556390977443,
+      "grad_norm": 0.6320111155509949,
+      "learning_rate": 6.112956260531351e-06,
+      "loss": 0.0791,
+      "step": 1145
+    },
+    {
+      "epoch": 2.161654135338346,
+      "grad_norm": 0.6096066832542419,
+      "learning_rate": 5.988228865390596e-06,
+      "loss": 0.0769,
+      "step": 1150
+    },
+    {
+      "epoch": 2.1710526315789473,
+      "grad_norm": 0.7631605267524719,
+      "learning_rate": 5.864468962778054e-06,
+      "loss": 0.0743,
+      "step": 1155
+    },
+    {
+      "epoch": 2.180451127819549,
+      "grad_norm": 0.823680579662323,
+      "learning_rate": 5.7416898394027766e-06,
+      "loss": 0.0777,
+      "step": 1160
+    },
+    {
+      "epoch": 2.1898496240601504,
+      "grad_norm": 0.9240273237228394,
+      "learning_rate": 5.619904676678565e-06,
+      "loss": 0.0817,
+      "step": 1165
+    },
+    {
+      "epoch": 2.199248120300752,
+      "grad_norm": 0.6772187352180481,
+      "learning_rate": 5.4991265493088545e-06,
+      "loss": 0.0745,
+      "step": 1170
+    },
+    {
+      "epoch": 2.2086466165413534,
+      "grad_norm": 0.8896387219429016,
+      "learning_rate": 5.3793684238830175e-06,
+      "loss": 0.0874,
+      "step": 1175
+    },
+    {
+      "epoch": 2.218045112781955,
+      "grad_norm": 0.6756446361541748,
+      "learning_rate": 5.260643157484284e-06,
+      "loss": 0.0727,
+      "step": 1180
+    },
+    {
+      "epoch": 2.2274436090225564,
+      "grad_norm": 0.8986202478408813,
+      "learning_rate": 5.142963496309428e-06,
+      "loss": 0.0738,
+      "step": 1185
+    },
+    {
+      "epoch": 2.236842105263158,
+      "grad_norm": 0.6743974685668945,
+      "learning_rate": 5.02634207430035e-06,
+      "loss": 0.0848,
+      "step": 1190
+    },
+    {
+      "epoch": 2.2462406015037595,
+      "grad_norm": 0.7321045994758606,
+      "learning_rate": 4.9107914117877e-06,
+      "loss": 0.0757,
+      "step": 1195
+    },
+    {
+      "epoch": 2.255639097744361,
+      "grad_norm": 0.7953357100486755,
+      "learning_rate": 4.796323914146734e-06,
+      "loss": 0.081,
+      "step": 1200
+    },
+    {
+      "epoch": 2.2650375939849625,
+      "grad_norm": 0.7724120020866394,
+      "learning_rate": 4.682951870465474e-06,
+      "loss": 0.0826,
+      "step": 1205
+    },
+    {
+      "epoch": 2.274436090225564,
+      "grad_norm": 0.7581201195716858,
+      "learning_rate": 4.570687452225367e-06,
+      "loss": 0.0755,
+      "step": 1210
+    },
+    {
+      "epoch": 2.2838345864661656,
+      "grad_norm": 0.8605563640594482,
+      "learning_rate": 4.459542711994568e-06,
+      "loss": 0.0849,
+      "step": 1215
+    },
+    {
+      "epoch": 2.293233082706767,
+      "grad_norm": 0.748431384563446,
+      "learning_rate": 4.349529582134008e-06,
+      "loss": 0.076,
+      "step": 1220
+    },
+    {
+      "epoch": 2.3026315789473686,
+      "grad_norm": 0.7182886004447937,
+      "learning_rate": 4.240659873516319e-06,
+      "loss": 0.0833,
+      "step": 1225
+    },
+    {
+      "epoch": 2.31203007518797,
+      "grad_norm": 0.6337144374847412,
+      "learning_rate": 4.132945274257862e-06,
+      "loss": 0.0807,
+      "step": 1230
+    },
+    {
+      "epoch": 2.3214285714285716,
+      "grad_norm": 0.7233930230140686,
+      "learning_rate": 4.026397348463898e-06,
+      "loss": 0.0815,
+      "step": 1235
+    },
+    {
+      "epoch": 2.3308270676691727,
+      "grad_norm": 0.7005705237388611,
+      "learning_rate": 3.921027534987075e-06,
+      "loss": 0.0668,
+      "step": 1240
+    },
+    {
+      "epoch": 2.340225563909774,
+      "grad_norm": 0.7019994854927063,
+      "learning_rate": 3.816847146199372e-06,
+      "loss": 0.0742,
+      "step": 1245
+    },
+    {
+      "epoch": 2.3496240601503757,
+      "grad_norm": 0.6627863049507141,
+      "learning_rate": 3.7138673667776056e-06,
+      "loss": 0.0714,
+      "step": 1250
+    },
+    {
+      "epoch": 2.3590225563909772,
+      "grad_norm": 0.799941897392273,
+      "learning_rate": 3.612099252502672e-06,
+      "loss": 0.082,
+      "step": 1255
+    },
+    {
+      "epoch": 2.3684210526315788,
+      "grad_norm": 0.6937321424484253,
+      "learning_rate": 3.5115537290726074e-06,
+      "loss": 0.0732,
+      "step": 1260
+    },
+    {
+      "epoch": 2.3778195488721803,
+      "grad_norm": 0.9581036567687988,
+      "learning_rate": 3.4122415909296157e-06,
+      "loss": 0.0814,
+      "step": 1265
+    },
+    {
+      "epoch": 2.387218045112782,
+      "grad_norm": 0.6360145807266235,
+      "learning_rate": 3.3141735001011908e-06,
+      "loss": 0.068,
+      "step": 1270
+    },
+    {
+      "epoch": 2.3966165413533833,
+      "grad_norm": 0.694480836391449,
+      "learning_rate": 3.217359985055462e-06,
+      "loss": 0.0694,
+      "step": 1275
+    },
+    {
+      "epoch": 2.406015037593985,
+      "grad_norm": 0.716654360294342,
+      "learning_rate": 3.1218114395708502e-06,
+      "loss": 0.0733,
+      "step": 1280
+    },
+    {
+      "epoch": 2.4154135338345863,
+      "grad_norm": 0.7194705605506897,
+      "learning_rate": 3.0275381216202334e-06,
+      "loss": 0.0718,
+      "step": 1285
+    },
+    {
+      "epoch": 2.424812030075188,
+      "grad_norm": 0.6594160199165344,
+      "learning_rate": 2.934550152269649e-06,
+      "loss": 0.0707,
+      "step": 1290
+    },
+    {
+      "epoch": 2.4342105263157894,
+      "grad_norm": 0.9902744293212891,
+      "learning_rate": 2.8428575145916946e-06,
+      "loss": 0.0727,
+      "step": 1295
+    },
+    {
+      "epoch": 2.443609022556391,
+      "grad_norm": 0.8495991230010986,
+      "learning_rate": 2.7524700525937884e-06,
+      "loss": 0.0753,
+      "step": 1300
+    },
+    {
+      "epoch": 2.4530075187969924,
+      "grad_norm": 0.6580012440681458,
+      "learning_rate": 2.6633974701613057e-06,
+      "loss": 0.0765,
+      "step": 1305
+    },
+    {
+      "epoch": 2.462406015037594,
+      "grad_norm": 0.7875741720199585,
+      "learning_rate": 2.575649330015794e-06,
+      "loss": 0.0726,
+      "step": 1310
+    },
+    {
+      "epoch": 2.4718045112781954,
+      "grad_norm": 0.7627713680267334,
+      "learning_rate": 2.489235052688314e-06,
+      "loss": 0.0822,
+      "step": 1315
+    },
+    {
+      "epoch": 2.481203007518797,
+      "grad_norm": 0.6470919251441956,
+      "learning_rate": 2.4041639155080854e-06,
+      "loss": 0.0718,
+      "step": 1320
+    },
+    {
+      "epoch": 2.4906015037593985,
+      "grad_norm": 0.6367564797401428,
+      "learning_rate": 2.320445051606474e-06,
+      "loss": 0.0753,
+      "step": 1325
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.6063794493675232,
+      "learning_rate": 2.2380874489364657e-06,
+      "loss": 0.0599,
+      "step": 1330
+    },
+    {
+      "epoch": 2.5093984962406015,
+      "grad_norm": 0.6223397254943848,
+      "learning_rate": 2.157099949307741e-06,
+      "loss": 0.0681,
+      "step": 1335
+    },
+    {
+      "epoch": 2.518796992481203,
+      "grad_norm": 0.640089213848114,
+      "learning_rate": 2.0774912474374147e-06,
+      "loss": 0.074,
+      "step": 1340
+    },
+    {
+      "epoch": 2.5281954887218046,
+      "grad_norm": 0.7845536470413208,
+      "learning_rate": 1.9992698900165984e-06,
+      "loss": 0.0673,
+      "step": 1345
+    },
+    {
+      "epoch": 2.537593984962406,
+      "grad_norm": 0.7514573931694031,
+      "learning_rate": 1.922444274792831e-06,
+      "loss": 0.065,
+      "step": 1350
+    },
+    {
+      "epoch": 2.5469924812030076,
+      "grad_norm": 0.6213982105255127,
+      "learning_rate": 1.8470226496685055e-06,
+      "loss": 0.0628,
+      "step": 1355
+    },
+    {
+      "epoch": 2.556390977443609,
+      "grad_norm": 0.6003556847572327,
+      "learning_rate": 1.773013111815383e-06,
+      "loss": 0.0705,
+      "step": 1360
+    },
+    {
+      "epoch": 2.5657894736842106,
+      "grad_norm": 0.7004033327102661,
+      "learning_rate": 1.7004236068053025e-06,
+      "loss": 0.064,
+      "step": 1365
+    },
+    {
+      "epoch": 2.575187969924812,
+      "grad_norm": 0.6858503818511963,
+      "learning_rate": 1.6292619277571292e-06,
+      "loss": 0.0639,
+      "step": 1370
+    },
+    {
+      "epoch": 2.5845864661654137,
+      "grad_norm": 0.5193300843238831,
+      "learning_rate": 1.559535714500127e-06,
+      "loss": 0.0591,
+      "step": 1375
+    },
+    {
+      "epoch": 2.593984962406015,
+      "grad_norm": 0.599259614944458,
+      "learning_rate": 1.4912524527537307e-06,
+      "loss": 0.0742,
+      "step": 1380
+    },
+    {
+      "epoch": 2.6033834586466167,
+      "grad_norm": 0.6666296124458313,
+      "learning_rate": 1.4244194733239008e-06,
+      "loss": 0.0714,
+      "step": 1385
+    },
+    {
+      "epoch": 2.612781954887218,
+      "grad_norm": 0.6025199294090271,
+      "learning_rate": 1.3590439513160935e-06,
+      "loss": 0.0719,
+      "step": 1390
+    },
+    {
+      "epoch": 2.6221804511278197,
+      "grad_norm": 0.4907478988170624,
+      "learning_rate": 1.295132905364954e-06,
+      "loss": 0.0635,
+      "step": 1395
+    },
+    {
+      "epoch": 2.6315789473684212,
+      "grad_norm": 0.7650317549705505,
+      "learning_rate": 1.2326931968807908e-06,
+      "loss": 0.0706,
+      "step": 1400
+    },
+    {
+      "epoch": 2.6409774436090228,
+      "grad_norm": 0.7375085949897766,
+      "learning_rate": 1.171731529312966e-06,
+      "loss": 0.0786,
+      "step": 1405
+    },
+    {
+      "epoch": 2.6503759398496243,
+      "grad_norm": 0.7329378128051758,
+      "learning_rate": 1.1122544474302048e-06,
+      "loss": 0.0714,
+      "step": 1410
+    },
+    {
+      "epoch": 2.659774436090226,
+      "grad_norm": 0.5412716865539551,
+      "learning_rate": 1.0542683366179613e-06,
+      "loss": 0.0607,
+      "step": 1415
+    },
+    {
+      "epoch": 2.6691729323308273,
+      "grad_norm": 0.7059949636459351,
+      "learning_rate": 9.977794221928905e-07,
+      "loss": 0.0736,
+      "step": 1420
+    },
+    {
+      "epoch": 2.678571428571429,
+      "grad_norm": 0.6849631071090698,
+      "learning_rate": 9.427937687345145e-07,
+      "loss": 0.0645,
+      "step": 1425
+    },
+    {
+      "epoch": 2.6879699248120303,
+      "grad_norm": 0.6165528297424316,
+      "learning_rate": 8.893172794341159e-07,
+      "loss": 0.074,
+      "step": 1430
+    },
+    {
+      "epoch": 2.6973684210526314,
+      "grad_norm": 0.5059214234352112,
+      "learning_rate": 8.373556954610079e-07,
+      "loss": 0.0654,
+      "step": 1435
+    },
+    {
+      "epoch": 2.706766917293233,
+      "grad_norm": 0.6148324608802795,
+      "learning_rate": 7.869145953461443e-07,
+      "loss": 0.0614,
+      "step": 1440
+    },
+    {
+      "epoch": 2.7161654135338344,
+      "grad_norm": 0.7251229882240295,
+      "learning_rate": 7.379993943832269e-07,
+      "loss": 0.0773,
+      "step": 1445
+    },
+    {
+      "epoch": 2.725563909774436,
+      "grad_norm": 0.6811131834983826,
+      "learning_rate": 6.90615344047329e-07,
+      "loss": 0.0742,
+      "step": 1450
+    },
+    {
+      "epoch": 2.7349624060150375,
+      "grad_norm": 0.6007992029190063,
+      "learning_rate": 6.447675314310941e-07,
+      "loss": 0.072,
+      "step": 1455
+    },
+    {
+      "epoch": 2.744360902255639,
+      "grad_norm": 0.6286112070083618,
+      "learning_rate": 6.004608786985955e-07,
+      "loss": 0.0744,
+      "step": 1460
+    },
+    {
+      "epoch": 2.7537593984962405,
+      "grad_norm": 0.5636522769927979,
+      "learning_rate": 5.577001425569061e-07,
+      "loss": 0.0603,
+      "step": 1465
+    },
+    {
+      "epoch": 2.763157894736842,
+      "grad_norm": 0.5101714134216309,
+      "learning_rate": 5.164899137454149e-07,
+      "loss": 0.062,
+      "step": 1470
+    },
+    {
+      "epoch": 2.7725563909774436,
+      "grad_norm": 0.6965596079826355,
+      "learning_rate": 4.768346165429749e-07,
+      "loss": 0.071,
+      "step": 1475
+    },
+    {
+      "epoch": 2.781954887218045,
+      "grad_norm": 0.6466209292411804,
+      "learning_rate": 4.387385082929174e-07,
+      "loss": 0.0635,
+      "step": 1480
+    },
+    {
+      "epoch": 2.7913533834586466,
+      "grad_norm": 0.5297590494155884,
+      "learning_rate": 4.022056789459921e-07,
+      "loss": 0.059,
+      "step": 1485
+    },
+    {
+      "epoch": 2.800751879699248,
+      "grad_norm": 0.5080699920654297,
+      "learning_rate": 3.6724005062126534e-07,
+      "loss": 0.0644,
+      "step": 1490
+    },
+    {
+      "epoch": 2.8101503759398496,
+      "grad_norm": 0.6745559573173523,
+      "learning_rate": 3.3384537718506135e-07,
+      "loss": 0.0687,
+      "step": 1495
+    },
+    {
+      "epoch": 2.819548872180451,
+      "grad_norm": 0.510647714138031,
+      "learning_rate": 3.0202524384793706e-07,
+      "loss": 0.0675,
+      "step": 1500
+    },
+    {
+      "epoch": 2.8289473684210527,
+      "grad_norm": 0.5886790156364441,
+      "learning_rate": 2.717830667797877e-07,
+      "loss": 0.0607,
+      "step": 1505
+    },
+    {
+      "epoch": 2.838345864661654,
+      "grad_norm": 0.537045419216156,
+      "learning_rate": 2.431220927430905e-07,
+      "loss": 0.0558,
+      "step": 1510
+    },
+    {
+      "epoch": 2.8477443609022557,
+      "grad_norm": 0.6089495420455933,
+      "learning_rate": 2.1604539874433006e-07,
+      "loss": 0.0585,
+      "step": 1515
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.7006784677505493,
+      "learning_rate": 1.9055589170365762e-07,
+      "loss": 0.0737,
+      "step": 1520
+    },
+    {
+      "epoch": 2.8665413533834587,
+      "grad_norm": 0.611802339553833,
+      "learning_rate": 1.6665630814281575e-07,
+      "loss": 0.0683,
+      "step": 1525
+    },
+    {
+      "epoch": 2.8759398496240602,
+      "grad_norm": 0.5512478351593018,
+      "learning_rate": 1.4434921389133494e-07,
+      "loss": 0.0656,
+      "step": 1530
+    },
+    {
+      "epoch": 2.8853383458646618,
+      "grad_norm": 0.5502645373344421,
+      "learning_rate": 1.2363700381107901e-07,
+      "loss": 0.0612,
+      "step": 1535
+    },
+    {
+      "epoch": 2.8947368421052633,
+      "grad_norm": 0.5304490923881531,
+      "learning_rate": 1.0452190153913243e-07,
+      "loss": 0.0687,
+      "step": 1540
+    },
+    {
+      "epoch": 2.904135338345865,
+      "grad_norm": 0.6445665955543518,
+      "learning_rate": 8.700595924907629e-08,
+      "loss": 0.065,
+      "step": 1545
+    },
+    {
+      "epoch": 2.9135338345864663,
+      "grad_norm": 0.6678323149681091,
+      "learning_rate": 7.10910574306628e-08,
+      "loss": 0.0743,
+      "step": 1550
+    },
+    {
+      "epoch": 2.922932330827068,
+      "grad_norm": 0.7011875510215759,
+      "learning_rate": 5.67789046879369e-08,
+      "loss": 0.0697,
+      "step": 1555
+    },
+    {
+      "epoch": 2.932330827067669,
+      "grad_norm": 0.5663780570030212,
+      "learning_rate": 4.4071037555796226e-08,
+      "loss": 0.0626,
+      "step": 1560
+    },
+    {
+      "epoch": 2.9417293233082704,
+      "grad_norm": 0.48351287841796875,
+      "learning_rate": 3.296882033503312e-08,
+      "loss": 0.0598,
+      "step": 1565
+    },
+    {
+      "epoch": 2.951127819548872,
+      "grad_norm": 0.6389005184173584,
+      "learning_rate": 2.3473444945863455e-08,
+      "loss": 0.0671,
+      "step": 1570
+    },
+    {
+      "epoch": 2.9605263157894735,
+      "grad_norm": 0.6084789633750916,
+      "learning_rate": 1.558593079996895e-08,
+      "loss": 0.0668,
+      "step": 1575
+    },
+    {
+      "epoch": 2.969924812030075,
+      "grad_norm": 0.5656338334083557,
+      "learning_rate": 9.307124691044778e-09,
+      "loss": 0.0579,
+      "step": 1580
+    },
+    {
+      "epoch": 2.9793233082706765,
+      "grad_norm": 0.575596809387207,
+      "learning_rate": 4.63770070389724e-09,
+      "loss": 0.0661,
+      "step": 1585
+    },
+    {
+      "epoch": 2.988721804511278,
+      "grad_norm": 0.6059058308601379,
+      "learning_rate": 1.5781601420733483e-09,
+      "loss": 0.0706,
+      "step": 1590
+    },
+    {
+      "epoch": 2.9981203007518795,
+      "grad_norm": 0.5142336487770081,
+      "learning_rate": 1.288314740371943e-10,
+      "loss": 0.0739,
+      "step": 1595
+    },
+    {
+      "epoch": 3.0,
+      "step": 1596,
+      "total_flos": 2.0415991178298655e+18,
+      "train_loss": 0.36059863125452574,
+      "train_runtime": 988.9937,
+      "train_samples_per_second": 51.619,
+      "train_steps_per_second": 1.614
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1596,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.0415991178298655e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

instruct/12_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a8154637e4c88b70cc4102a58d8c3026c5c563e0bd391a459b70475f191e00e
+size 8273

instruct/12_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff