Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

23_128_e3_3e-5/adapter_config.json +39 -0
23_128_e3_3e-5/adapter_model.safetensors +3 -0
23_128_e3_3e-5/added_tokens.json +9 -0
23_128_e3_3e-5/all_results.json +9 -0
23_128_e3_3e-5/chat_template.jinja +62 -0
23_128_e3_3e-5/config.json +32 -0
23_128_e3_3e-5/merges.txt +0 -0
23_128_e3_3e-5/special_tokens_map.json +33 -0
23_128_e3_3e-5/tokenizer.json +0 -0
23_128_e3_3e-5/tokenizer_config.json +234 -0
23_128_e3_3e-5/train_results.json +9 -0
23_128_e3_3e-5/trainer_state.json +1555 -0
23_128_e3_3e-5/training_args.bin +3 -0
23_128_e3_3e-5/vocab.json +0 -0

23_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj",
+    "down_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

23_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4522332be862a3855544fbe709e0aeacc3c362bdc8c7cd3cff1e62fc6be7239b
+size 791751704

23_128_e3_3e-5/added_tokens.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "<|end_of_cite|>": 49156,
+  "<|end_of_plugin|>": 49158,
+  "<|end_of_role|>": 49153,
+  "<|start_of_cite|>": 49155,
+  "<|start_of_plugin|>": 49157,
+  "<|start_of_role|>": 49152,
+  "<|tool_call|>": 49154
+}

23_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.570176137525461e+18,
+    "train_loss": 0.5801369934751276,
+    "train_runtime": 770.5931,
+    "train_samples": 11523,
+    "train_samples_per_second": 44.86,
+    "train_steps_per_second": 1.405
+}

23_128_e3_3e-5/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,62 @@

+{# Alias tools -> available_tools #}
+{%- if tools and not available_tools -%}
+    {%- set available_tools = tools -%}
+{%- endif -%}
+{%- if messages[0]['role'] == 'system' %}
+     {%- set system_message = messages[0]['content'] %}
+     {%- set loop_messages = messages[1:] %}
+ {%- else %}
+     {%- set system_message = "Knowledge Cutoff Date: April 2024.
+Today's Date: " + strftime_now('%B %d, %Y') + ".
+You are Granite, developed by IBM." %}
+     {%- if available_tools and documents %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
+Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+     {%- elif available_tools %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request." %}
+     {%- elif documents %}
+         {%- set system_message = system_message + " Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+    {%- elif thinking %}
+    {%- set system_message = system_message + " You are a helpful AI assistant.
+Respond to every user query in a comprehensive and detailed way. You can write down your thoughts and reasoning process before responding. In the thought process, engage in a comprehensive cycle of analysis, summarization, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. In the response section, based on various attempts, explorations, and reflections from the thoughts section, systematically present the final solution that you deem correct. The response should summarize the thought process. Write your thoughts between <think></think> and write your response between <response></response> for each user query." %}
+     {%- else %}
+         {%- set system_message = system_message + " You are a helpful AI assistant." %}
+     {%- endif %}
+     {%- if 'citations' in controls and documents %}
+         {%- set system_message = system_message + '
+Use the symbols <|start_of_cite|> and <|end_of_cite|> to indicate when a fact comes from a document in the search result, e.g <|start_of_cite|> {document_id: 1}my fact <|end_of_cite|> for a fact from document 1. Afterwards, list all the citations with their corresponding documents in an ordered list.' %}
+     {%- endif %}
+     {%- if 'hallucinations' in controls and documents %}
+         {%- set system_message = system_message + '
+Finally, after the response is written, include a numbered list of sentences from the response with a corresponding risk value that are hallucinated and not based in the documents.' %}
+     {%- endif %}
+     {%- set loop_messages = messages %}
+ {%- endif %}
+ {{- '<|start_of_role|>system<|end_of_role|>' + system_message + '<|end_of_text|>
+' }}
+ {%- if available_tools %}
+     {{- '<|start_of_role|>available_tools<|end_of_role|>' }}
+     {{- available_tools | tojson(indent=4) }}
+     {{- '<|end_of_text|>
+' }}
+ {%- endif %}
+ {%- if documents %}
+     {%- for document in documents %}
+         {{- '<|start_of_role|>document {"document_id": "' + document['doc_id'] | string + '"}<|end_of_role|>
+' }}
+         {{- document['text'] }}
+         {{- '<|end_of_text|>
+' }}
+              {%- endfor %}
+ {%- endif %}
+ {%- for message in loop_messages %}
+     {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>
+' }}
+     {%- if loop.last and add_generation_prompt %}
+         {{- '<|start_of_role|>assistant' }}
+             {%- if controls %}
+                 {{- ' ' + controls | tojson()}}
+             {%- endif %}
+         {{- '<|end_of_role|>' }}
+     {%- endif %}
+ {%- endfor %}

23_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49159
+}

23_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

23_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|end_of_plugin|>",
+  "unk_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

23_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

23_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,234 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49152": {
+      "content": "<|start_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49153": {
+      "content": "<|end_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49154": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49155": {
+      "content": "<|start_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49156": {
+      "content": "<|end_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49157": {
+      "content": "<|start_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49158": {
+      "content": "<|end_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": "<|end_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|end_of_plugin|>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|end_of_text|>",
+  "vocab_size": 49152
+}

23_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.570176137525461e+18,
+    "train_loss": 0.5801369934751276,
+    "train_runtime": 770.5931,
+    "train_samples": 11523,
+    "train_samples_per_second": 44.86,
+    "train_steps_per_second": 1.405
+}

23_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1555 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1083,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.013869625520110958,
+      "grad_norm": 3.5365381240844727,
+      "learning_rate": 2.181818181818182e-06,
+      "loss": 1.5557,
+      "step": 5
+    },
+    {
+      "epoch": 0.027739251040221916,
+      "grad_norm": 2.3829522132873535,
+      "learning_rate": 4.90909090909091e-06,
+      "loss": 1.5375,
+      "step": 10
+    },
+    {
+      "epoch": 0.04160887656033287,
+      "grad_norm": 1.0610990524291992,
+      "learning_rate": 7.636363636363636e-06,
+      "loss": 1.4319,
+      "step": 15
+    },
+    {
+      "epoch": 0.05547850208044383,
+      "grad_norm": 0.4579370617866516,
+      "learning_rate": 1.0363636363636364e-05,
+      "loss": 1.3718,
+      "step": 20
+    },
+    {
+      "epoch": 0.06934812760055478,
+      "grad_norm": 0.5088579058647156,
+      "learning_rate": 1.309090909090909e-05,
+      "loss": 1.3552,
+      "step": 25
+    },
+    {
+      "epoch": 0.08321775312066575,
+      "grad_norm": 0.4326176345348358,
+      "learning_rate": 1.5818181818181818e-05,
+      "loss": 1.3219,
+      "step": 30
+    },
+    {
+      "epoch": 0.0970873786407767,
+      "grad_norm": 0.3682642877101898,
+      "learning_rate": 1.8545454545454545e-05,
+      "loss": 1.3039,
+      "step": 35
+    },
+    {
+      "epoch": 0.11095700416088766,
+      "grad_norm": 0.34265801310539246,
+      "learning_rate": 2.1272727272727273e-05,
+      "loss": 1.2671,
+      "step": 40
+    },
+    {
+      "epoch": 0.12482662968099861,
+      "grad_norm": 0.3712596297264099,
+      "learning_rate": 2.4e-05,
+      "loss": 1.2873,
+      "step": 45
+    },
+    {
+      "epoch": 0.13869625520110956,
+      "grad_norm": 0.39635661244392395,
+      "learning_rate": 2.6727272727272728e-05,
+      "loss": 1.2484,
+      "step": 50
+    },
+    {
+      "epoch": 0.15256588072122051,
+      "grad_norm": 0.4278065860271454,
+      "learning_rate": 2.9454545454545456e-05,
+      "loss": 1.1698,
+      "step": 55
+    },
+    {
+      "epoch": 0.1664355062413315,
+      "grad_norm": 0.42952272295951843,
+      "learning_rate": 2.999887930004599e-05,
+      "loss": 1.1787,
+      "step": 60
+    },
+    {
+      "epoch": 0.18030513176144244,
+      "grad_norm": 0.39158961176872253,
+      "learning_rate": 2.9994326743489698e-05,
+      "loss": 1.2162,
+      "step": 65
+    },
+    {
+      "epoch": 0.1941747572815534,
+      "grad_norm": 0.4061707854270935,
+      "learning_rate": 2.998627334868592e-05,
+      "loss": 1.2209,
+      "step": 70
+    },
+    {
+      "epoch": 0.20804438280166435,
+      "grad_norm": 0.42585429549217224,
+      "learning_rate": 2.997472099592132e-05,
+      "loss": 1.1855,
+      "step": 75
+    },
+    {
+      "epoch": 0.22191400832177532,
+      "grad_norm": 0.40908199548721313,
+      "learning_rate": 2.9959672382410582e-05,
+      "loss": 1.1561,
+      "step": 80
+    },
+    {
+      "epoch": 0.23578363384188628,
+      "grad_norm": 0.36770713329315186,
+      "learning_rate": 2.9941131021666705e-05,
+      "loss": 1.1599,
+      "step": 85
+    },
+    {
+      "epoch": 0.24965325936199723,
+      "grad_norm": 0.4124533236026764,
+      "learning_rate": 2.991910124268063e-05,
+      "loss": 1.1141,
+      "step": 90
+    },
+    {
+      "epoch": 0.2635228848821082,
+      "grad_norm": 0.4241284728050232,
+      "learning_rate": 2.989358818891057e-05,
+      "loss": 1.1246,
+      "step": 95
+    },
+    {
+      "epoch": 0.27739251040221913,
+      "grad_norm": 0.407598614692688,
+      "learning_rate": 2.9864597817081083e-05,
+      "loss": 1.084,
+      "step": 100
+    },
+    {
+      "epoch": 0.2912621359223301,
+      "grad_norm": 0.47541385889053345,
+      "learning_rate": 2.983213689579234e-05,
+      "loss": 1.1283,
+      "step": 105
+    },
+    {
+      "epoch": 0.30513176144244103,
+      "grad_norm": 0.5389872789382935,
+      "learning_rate": 2.9796213003939798e-05,
+      "loss": 1.0712,
+      "step": 110
+    },
+    {
+      "epoch": 0.31900138696255204,
+      "grad_norm": 0.4110129475593567,
+      "learning_rate": 2.975683452894469e-05,
+      "loss": 1.054,
+      "step": 115
+    },
+    {
+      "epoch": 0.332871012482663,
+      "grad_norm": 0.5302886366844177,
+      "learning_rate": 2.9714010664795768e-05,
+      "loss": 1.0475,
+      "step": 120
+    },
+    {
+      "epoch": 0.34674063800277394,
+      "grad_norm": 0.6185004115104675,
+      "learning_rate": 2.9667751409902703e-05,
+      "loss": 1.0688,
+      "step": 125
+    },
+    {
+      "epoch": 0.3606102635228849,
+      "grad_norm": 0.5345790386199951,
+      "learning_rate": 2.9618067564761684e-05,
+      "loss": 1.0575,
+      "step": 130
+    },
+    {
+      "epoch": 0.37447988904299584,
+      "grad_norm": 0.5369071960449219,
+      "learning_rate": 2.9564970729433746e-05,
+      "loss": 0.9853,
+      "step": 135
+    },
+    {
+      "epoch": 0.3883495145631068,
+      "grad_norm": 0.5191687345504761,
+      "learning_rate": 2.95084733008364e-05,
+      "loss": 1.0162,
+      "step": 140
+    },
+    {
+      "epoch": 0.40221914008321774,
+      "grad_norm": 0.5663520693778992,
+      "learning_rate": 2.9448588469849266e-05,
+      "loss": 1.0032,
+      "step": 145
+    },
+    {
+      "epoch": 0.4160887656033287,
+      "grad_norm": 0.5394409894943237,
+      "learning_rate": 2.9385330218234265e-05,
+      "loss": 0.9938,
+      "step": 150
+    },
+    {
+      "epoch": 0.42995839112343964,
+      "grad_norm": 0.586039125919342,
+      "learning_rate": 2.9318713315371213e-05,
+      "loss": 0.9669,
+      "step": 155
+    },
+    {
+      "epoch": 0.44382801664355065,
+      "grad_norm": 0.5684604644775391,
+      "learning_rate": 2.9248753314809497e-05,
+      "loss": 1.0014,
+      "step": 160
+    },
+    {
+      "epoch": 0.4576976421636616,
+      "grad_norm": 0.5540727376937866,
+      "learning_rate": 2.9175466550636657e-05,
+      "loss": 0.954,
+      "step": 165
+    },
+    {
+      "epoch": 0.47156726768377255,
+      "grad_norm": 0.609226644039154,
+      "learning_rate": 2.909887013366477e-05,
+      "loss": 0.9896,
+      "step": 170
+    },
+    {
+      "epoch": 0.4854368932038835,
+      "grad_norm": 0.7475889921188354,
+      "learning_rate": 2.9018981947435438e-05,
+      "loss": 0.9608,
+      "step": 175
+    },
+    {
+      "epoch": 0.49930651872399445,
+      "grad_norm": 0.6841046810150146,
+      "learning_rate": 2.8935820644044398e-05,
+      "loss": 0.9364,
+      "step": 180
+    },
+    {
+      "epoch": 0.5131761442441054,
+      "grad_norm": 0.6144030094146729,
+      "learning_rate": 2.8849405639786668e-05,
+      "loss": 0.9908,
+      "step": 185
+    },
+    {
+      "epoch": 0.5270457697642164,
+      "grad_norm": 0.6701822280883789,
+      "learning_rate": 2.8759757110623273e-05,
+      "loss": 0.9376,
+      "step": 190
+    },
+    {
+      "epoch": 0.5409153952843273,
+      "grad_norm": 0.7286859750747681,
+      "learning_rate": 2.8666895987470625e-05,
+      "loss": 0.9144,
+      "step": 195
+    },
+    {
+      "epoch": 0.5547850208044383,
+      "grad_norm": 0.7248000502586365,
+      "learning_rate": 2.8570843951313625e-05,
+      "loss": 0.96,
+      "step": 200
+    },
+    {
+      "epoch": 0.5686546463245492,
+      "grad_norm": 0.7224289178848267,
+      "learning_rate": 2.8471623428143625e-05,
+      "loss": 0.8985,
+      "step": 205
+    },
+    {
+      "epoch": 0.5825242718446602,
+      "grad_norm": 0.76520174741745,
+      "learning_rate": 2.836925758372247e-05,
+      "loss": 0.933,
+      "step": 210
+    },
+    {
+      "epoch": 0.5963938973647711,
+      "grad_norm": 0.7152209281921387,
+      "learning_rate": 2.8263770318173835e-05,
+      "loss": 0.8749,
+      "step": 215
+    },
+    {
+      "epoch": 0.6102635228848821,
+      "grad_norm": 0.7630525827407837,
+      "learning_rate": 2.815518626040304e-05,
+      "loss": 0.8956,
+      "step": 220
+    },
+    {
+      "epoch": 0.624133148404993,
+      "grad_norm": 0.7709964513778687,
+      "learning_rate": 2.804353076234679e-05,
+      "loss": 0.8379,
+      "step": 225
+    },
+    {
+      "epoch": 0.6380027739251041,
+      "grad_norm": 0.77178555727005,
+      "learning_rate": 2.792882989305405e-05,
+      "loss": 0.8558,
+      "step": 230
+    },
+    {
+      "epoch": 0.651872399445215,
+      "grad_norm": 0.7530989050865173,
+      "learning_rate": 2.7811110432599526e-05,
+      "loss": 0.8173,
+      "step": 235
+    },
+    {
+      "epoch": 0.665742024965326,
+      "grad_norm": 0.7799367904663086,
+      "learning_rate": 2.7690399865831098e-05,
+      "loss": 0.8386,
+      "step": 240
+    },
+    {
+      "epoch": 0.6796116504854369,
+      "grad_norm": 0.7575778365135193,
+      "learning_rate": 2.7566726375952737e-05,
+      "loss": 0.7842,
+      "step": 245
+    },
+    {
+      "epoch": 0.6934812760055479,
+      "grad_norm": 0.737149178981781,
+      "learning_rate": 2.7440118837944356e-05,
+      "loss": 0.833,
+      "step": 250
+    },
+    {
+      "epoch": 0.7073509015256588,
+      "grad_norm": 0.8471119403839111,
+      "learning_rate": 2.7310606811820153e-05,
+      "loss": 0.8197,
+      "step": 255
+    },
+    {
+      "epoch": 0.7212205270457698,
+      "grad_norm": 0.7941174507141113,
+      "learning_rate": 2.7178220535727026e-05,
+      "loss": 0.7819,
+      "step": 260
+    },
+    {
+      "epoch": 0.7350901525658807,
+      "grad_norm": 0.8318464756011963,
+      "learning_rate": 2.7042990918884612e-05,
+      "loss": 0.795,
+      "step": 265
+    },
+    {
+      "epoch": 0.7489597780859917,
+      "grad_norm": 0.8417139053344727,
+      "learning_rate": 2.6904949534368724e-05,
+      "loss": 0.795,
+      "step": 270
+    },
+    {
+      "epoch": 0.7628294036061026,
+      "grad_norm": 0.8773959875106812,
+      "learning_rate": 2.6764128611739704e-05,
+      "loss": 0.806,
+      "step": 275
+    },
+    {
+      "epoch": 0.7766990291262136,
+      "grad_norm": 0.6922762989997864,
+      "learning_rate": 2.662056102951756e-05,
+      "loss": 0.7634,
+      "step": 280
+    },
+    {
+      "epoch": 0.7905686546463245,
+      "grad_norm": 0.8595651388168335,
+      "learning_rate": 2.6474280307505595e-05,
+      "loss": 0.7602,
+      "step": 285
+    },
+    {
+      "epoch": 0.8044382801664355,
+      "grad_norm": 0.88811856508255,
+      "learning_rate": 2.6325320598964256e-05,
+      "loss": 0.794,
+      "step": 290
+    },
+    {
+      "epoch": 0.8183079056865464,
+      "grad_norm": 0.787564218044281,
+      "learning_rate": 2.617371668263712e-05,
+      "loss": 0.7853,
+      "step": 295
+    },
+    {
+      "epoch": 0.8321775312066574,
+      "grad_norm": 0.9549404978752136,
+      "learning_rate": 2.6019503954630847e-05,
+      "loss": 0.7034,
+      "step": 300
+    },
+    {
+      "epoch": 0.8460471567267683,
+      "grad_norm": 0.8131564259529114,
+      "learning_rate": 2.5862718420150993e-05,
+      "loss": 0.725,
+      "step": 305
+    },
+    {
+      "epoch": 0.8599167822468793,
+      "grad_norm": 0.9315174221992493,
+      "learning_rate": 2.570339668509558e-05,
+      "loss": 0.7695,
+      "step": 310
+    },
+    {
+      "epoch": 0.8737864077669902,
+      "grad_norm": 0.9266698360443115,
+      "learning_rate": 2.5541575947508464e-05,
+      "loss": 0.7054,
+      "step": 315
+    },
+    {
+      "epoch": 0.8876560332871013,
+      "grad_norm": 0.9440236687660217,
+      "learning_rate": 2.5377293988894407e-05,
+      "loss": 0.7321,
+      "step": 320
+    },
+    {
+      "epoch": 0.9015256588072122,
+      "grad_norm": 0.9590432047843933,
+      "learning_rate": 2.5210589165397978e-05,
+      "loss": 0.765,
+      "step": 325
+    },
+    {
+      "epoch": 0.9153952843273232,
+      "grad_norm": 0.9622156023979187,
+      "learning_rate": 2.5041500398848205e-05,
+      "loss": 0.7398,
+      "step": 330
+    },
+    {
+      "epoch": 0.9292649098474342,
+      "grad_norm": 0.9615751504898071,
+      "learning_rate": 2.487006716767126e-05,
+      "loss": 0.7344,
+      "step": 335
+    },
+    {
+      "epoch": 0.9431345353675451,
+      "grad_norm": 0.9424192309379578,
+      "learning_rate": 2.469632949767309e-05,
+      "loss": 0.6964,
+      "step": 340
+    },
+    {
+      "epoch": 0.957004160887656,
+      "grad_norm": 0.9014398455619812,
+      "learning_rate": 2.4520327952694307e-05,
+      "loss": 0.6889,
+      "step": 345
+    },
+    {
+      "epoch": 0.970873786407767,
+      "grad_norm": 1.1078535318374634,
+      "learning_rate": 2.4342103625139453e-05,
+      "loss": 0.6639,
+      "step": 350
+    },
+    {
+      "epoch": 0.984743411927878,
+      "grad_norm": 0.9644288420677185,
+      "learning_rate": 2.4161698126382822e-05,
+      "loss": 0.6666,
+      "step": 355
+    },
+    {
+      "epoch": 0.9986130374479889,
+      "grad_norm": 0.9670777320861816,
+      "learning_rate": 2.397915357705317e-05,
+      "loss": 0.6418,
+      "step": 360
+    },
+    {
+      "epoch": 1.0110957004160888,
+      "grad_norm": 0.8129168152809143,
+      "learning_rate": 2.379451259719947e-05,
+      "loss": 0.5865,
+      "step": 365
+    },
+    {
+      "epoch": 1.0249653259361997,
+      "grad_norm": 1.0520267486572266,
+      "learning_rate": 2.36078182963401e-05,
+      "loss": 0.5779,
+      "step": 370
+    },
+    {
+      "epoch": 1.0388349514563107,
+      "grad_norm": 0.9031832218170166,
+      "learning_rate": 2.341911426339774e-05,
+      "loss": 0.5917,
+      "step": 375
+    },
+    {
+      "epoch": 1.0527045769764216,
+      "grad_norm": 0.9084670543670654,
+      "learning_rate": 2.3228444556522334e-05,
+      "loss": 0.5727,
+      "step": 380
+    },
+    {
+      "epoch": 1.0665742024965326,
+      "grad_norm": 0.9397030472755432,
+      "learning_rate": 2.3035853692804503e-05,
+      "loss": 0.6012,
+      "step": 385
+    },
+    {
+      "epoch": 1.0804438280166435,
+      "grad_norm": 1.0912103652954102,
+      "learning_rate": 2.2841386637881806e-05,
+      "loss": 0.5945,
+      "step": 390
+    },
+    {
+      "epoch": 1.0943134535367545,
+      "grad_norm": 0.9629794359207153,
+      "learning_rate": 2.2645088795440284e-05,
+      "loss": 0.5772,
+      "step": 395
+    },
+    {
+      "epoch": 1.1081830790568654,
+      "grad_norm": 0.9905003905296326,
+      "learning_rate": 2.24470059966137e-05,
+      "loss": 0.566,
+      "step": 400
+    },
+    {
+      "epoch": 1.1220527045769764,
+      "grad_norm": 1.0485916137695312,
+      "learning_rate": 2.224718448928301e-05,
+      "loss": 0.5995,
+      "step": 405
+    },
+    {
+      "epoch": 1.1359223300970873,
+      "grad_norm": 1.044230341911316,
+      "learning_rate": 2.2045670927278534e-05,
+      "loss": 0.5589,
+      "step": 410
+    },
+    {
+      "epoch": 1.1497919556171983,
+      "grad_norm": 1.0527745485305786,
+      "learning_rate": 2.184251235948731e-05,
+      "loss": 0.5777,
+      "step": 415
+    },
+    {
+      "epoch": 1.1636615811373092,
+      "grad_norm": 1.0752713680267334,
+      "learning_rate": 2.1637756218868253e-05,
+      "loss": 0.5754,
+      "step": 420
+    },
+    {
+      "epoch": 1.1775312066574202,
+      "grad_norm": 1.028911828994751,
+      "learning_rate": 2.1431450311377632e-05,
+      "loss": 0.5502,
+      "step": 425
+    },
+    {
+      "epoch": 1.1914008321775311,
+      "grad_norm": 1.1634910106658936,
+      "learning_rate": 2.1223642804807436e-05,
+      "loss": 0.5309,
+      "step": 430
+    },
+    {
+      "epoch": 1.205270457697642,
+      "grad_norm": 0.9192670583724976,
+      "learning_rate": 2.1014382217539285e-05,
+      "loss": 0.5303,
+      "step": 435
+    },
+    {
+      "epoch": 1.219140083217753,
+      "grad_norm": 1.0747705698013306,
+      "learning_rate": 2.080371740721649e-05,
+      "loss": 0.5364,
+      "step": 440
+    },
+    {
+      "epoch": 1.233009708737864,
+      "grad_norm": 1.1691886186599731,
+      "learning_rate": 2.059169755933686e-05,
+      "loss": 0.5474,
+      "step": 445
+    },
+    {
+      "epoch": 1.246879334257975,
+      "grad_norm": 1.0536645650863647,
+      "learning_rate": 2.0378372175769038e-05,
+      "loss": 0.5593,
+      "step": 450
+    },
+    {
+      "epoch": 1.2607489597780859,
+      "grad_norm": 1.0917811393737793,
+      "learning_rate": 2.0163791063194886e-05,
+      "loss": 0.5813,
+      "step": 455
+    },
+    {
+      "epoch": 1.2746185852981968,
+      "grad_norm": 1.064706563949585,
+      "learning_rate": 1.994800432148077e-05,
+      "loss": 0.4771,
+      "step": 460
+    },
+    {
+      "epoch": 1.2884882108183078,
+      "grad_norm": 0.9680957198143005,
+      "learning_rate": 1.9731062331980365e-05,
+      "loss": 0.489,
+      "step": 465
+    },
+    {
+      "epoch": 1.3023578363384187,
+      "grad_norm": 1.2708544731140137,
+      "learning_rate": 1.9513015745771726e-05,
+      "loss": 0.5409,
+      "step": 470
+    },
+    {
+      "epoch": 1.31622746185853,
+      "grad_norm": 1.0526955127716064,
+      "learning_rate": 1.9293915471831415e-05,
+      "loss": 0.4968,
+      "step": 475
+    },
+    {
+      "epoch": 1.3300970873786409,
+      "grad_norm": 1.0505400896072388,
+      "learning_rate": 1.9073812665148393e-05,
+      "loss": 0.5261,
+      "step": 480
+    },
+    {
+      "epoch": 1.3439667128987518,
+      "grad_norm": 1.1070711612701416,
+      "learning_rate": 1.8852758714780477e-05,
+      "loss": 0.5514,
+      "step": 485
+    },
+    {
+      "epoch": 1.3578363384188628,
+      "grad_norm": 1.0973165035247803,
+      "learning_rate": 1.863080523185617e-05,
+      "loss": 0.4621,
+      "step": 490
+    },
+    {
+      "epoch": 1.3717059639389737,
+      "grad_norm": 1.1057963371276855,
+      "learning_rate": 1.8408004037524584e-05,
+      "loss": 0.51,
+      "step": 495
+    },
+    {
+      "epoch": 1.3855755894590847,
+      "grad_norm": 1.1990114450454712,
+      "learning_rate": 1.8184407150856406e-05,
+      "loss": 0.4617,
+      "step": 500
+    },
+    {
+      "epoch": 1.3994452149791956,
+      "grad_norm": 1.1677286624908447,
+      "learning_rate": 1.7960066776698592e-05,
+      "loss": 0.506,
+      "step": 505
+    },
+    {
+      "epoch": 1.4133148404993066,
+      "grad_norm": 1.134766936302185,
+      "learning_rate": 1.773503529348572e-05,
+      "loss": 0.4288,
+      "step": 510
+    },
+    {
+      "epoch": 1.4271844660194175,
+      "grad_norm": 1.0339175462722778,
+      "learning_rate": 1.7509365241010787e-05,
+      "loss": 0.4849,
+      "step": 515
+    },
+    {
+      "epoch": 1.4410540915395285,
+      "grad_norm": 1.139906644821167,
+      "learning_rate": 1.7283109308158364e-05,
+      "loss": 0.4622,
+      "step": 520
+    },
+    {
+      "epoch": 1.4549237170596394,
+      "grad_norm": 1.0931806564331055,
+      "learning_rate": 1.7056320320602925e-05,
+      "loss": 0.4767,
+      "step": 525
+    },
+    {
+      "epoch": 1.4687933425797504,
+      "grad_norm": 1.250899076461792,
+      "learning_rate": 1.6829051228475236e-05,
+      "loss": 0.5131,
+      "step": 530
+    },
+    {
+      "epoch": 1.4826629680998613,
+      "grad_norm": 1.1392693519592285,
+      "learning_rate": 1.660135509399967e-05,
+      "loss": 0.4452,
+      "step": 535
+    },
+    {
+      "epoch": 1.4965325936199723,
+      "grad_norm": 1.2967441082000732,
+      "learning_rate": 1.6373285079105413e-05,
+      "loss": 0.4756,
+      "step": 540
+    },
+    {
+      "epoch": 1.5104022191400832,
+      "grad_norm": 1.1215041875839233,
+      "learning_rate": 1.6144894433014314e-05,
+      "loss": 0.4751,
+      "step": 545
+    },
+    {
+      "epoch": 1.5242718446601942,
+      "grad_norm": 1.2440824508666992,
+      "learning_rate": 1.59162364798084e-05,
+      "loss": 0.451,
+      "step": 550
+    },
+    {
+      "epoch": 1.5381414701803051,
+      "grad_norm": 1.1779048442840576,
+      "learning_rate": 1.5687364605979893e-05,
+      "loss": 0.4538,
+      "step": 555
+    },
+    {
+      "epoch": 1.552011095700416,
+      "grad_norm": 1.144081473350525,
+      "learning_rate": 1.545833224796668e-05,
+      "loss": 0.4338,
+      "step": 560
+    },
+    {
+      "epoch": 1.565880721220527,
+      "grad_norm": 1.4217594861984253,
+      "learning_rate": 1.522919287967611e-05,
+      "loss": 0.4848,
+      "step": 565
+    },
+    {
+      "epoch": 1.579750346740638,
+      "grad_norm": 1.1857529878616333,
+      "learning_rate": 1.5e-05,
+      "loss": 0.4477,
+      "step": 570
+    },
+    {
+      "epoch": 1.593619972260749,
+      "grad_norm": 1.076365351676941,
+      "learning_rate": 1.4770807120323892e-05,
+      "loss": 0.4583,
+      "step": 575
+    },
+    {
+      "epoch": 1.6074895977808599,
+      "grad_norm": 1.0323117971420288,
+      "learning_rate": 1.4541667752033318e-05,
+      "loss": 0.449,
+      "step": 580
+    },
+    {
+      "epoch": 1.6213592233009708,
+      "grad_norm": 1.0973401069641113,
+      "learning_rate": 1.4312635394020115e-05,
+      "loss": 0.4645,
+      "step": 585
+    },
+    {
+      "epoch": 1.635228848821082,
+      "grad_norm": 1.111819863319397,
+      "learning_rate": 1.4083763520191603e-05,
+      "loss": 0.4419,
+      "step": 590
+    },
+    {
+      "epoch": 1.649098474341193,
+      "grad_norm": 1.1761705875396729,
+      "learning_rate": 1.3855105566985689e-05,
+      "loss": 0.4248,
+      "step": 595
+    },
+    {
+      "epoch": 1.662968099861304,
+      "grad_norm": 1.0141918659210205,
+      "learning_rate": 1.3626714920894587e-05,
+      "loss": 0.4308,
+      "step": 600
+    },
+    {
+      "epoch": 1.6768377253814148,
+      "grad_norm": 1.1948453187942505,
+      "learning_rate": 1.3398644906000333e-05,
+      "loss": 0.3951,
+      "step": 605
+    },
+    {
+      "epoch": 1.6907073509015258,
+      "grad_norm": 1.2271469831466675,
+      "learning_rate": 1.317094877152477e-05,
+      "loss": 0.4426,
+      "step": 610
+    },
+    {
+      "epoch": 1.7045769764216367,
+      "grad_norm": 1.2232234477996826,
+      "learning_rate": 1.2943679679397079e-05,
+      "loss": 0.4182,
+      "step": 615
+    },
+    {
+      "epoch": 1.7184466019417477,
+      "grad_norm": 1.0209096670150757,
+      "learning_rate": 1.2716890691841635e-05,
+      "loss": 0.414,
+      "step": 620
+    },
+    {
+      "epoch": 1.7323162274618586,
+      "grad_norm": 1.1485804319381714,
+      "learning_rate": 1.2490634758989217e-05,
+      "loss": 0.3952,
+      "step": 625
+    },
+    {
+      "epoch": 1.7461858529819696,
+      "grad_norm": 1.2227532863616943,
+      "learning_rate": 1.2264964706514284e-05,
+      "loss": 0.4378,
+      "step": 630
+    },
+    {
+      "epoch": 1.7600554785020806,
+      "grad_norm": 1.1173542737960815,
+      "learning_rate": 1.203993322330141e-05,
+      "loss": 0.4062,
+      "step": 635
+    },
+    {
+      "epoch": 1.7739251040221915,
+      "grad_norm": 1.14585280418396,
+      "learning_rate": 1.1815592849143598e-05,
+      "loss": 0.4399,
+      "step": 640
+    },
+    {
+      "epoch": 1.7877947295423025,
+      "grad_norm": 1.1070749759674072,
+      "learning_rate": 1.159199596247542e-05,
+      "loss": 0.3936,
+      "step": 645
+    },
+    {
+      "epoch": 1.8016643550624134,
+      "grad_norm": 1.1440484523773193,
+      "learning_rate": 1.1369194768143839e-05,
+      "loss": 0.3736,
+      "step": 650
+    },
+    {
+      "epoch": 1.8155339805825244,
+      "grad_norm": 1.1345036029815674,
+      "learning_rate": 1.114724128521952e-05,
+      "loss": 0.4742,
+      "step": 655
+    },
+    {
+      "epoch": 1.8294036061026353,
+      "grad_norm": 1.1359888315200806,
+      "learning_rate": 1.092618733485161e-05,
+      "loss": 0.4327,
+      "step": 660
+    },
+    {
+      "epoch": 1.8432732316227463,
+      "grad_norm": 1.1387335062026978,
+      "learning_rate": 1.0706084528168589e-05,
+      "loss": 0.4248,
+      "step": 665
+    },
+    {
+      "epoch": 1.8571428571428572,
+      "grad_norm": 1.254227638244629,
+      "learning_rate": 1.0486984254228275e-05,
+      "loss": 0.3961,
+      "step": 670
+    },
+    {
+      "epoch": 1.8710124826629682,
+      "grad_norm": 1.0847305059432983,
+      "learning_rate": 1.0268937668019636e-05,
+      "loss": 0.4528,
+      "step": 675
+    },
+    {
+      "epoch": 1.884882108183079,
+      "grad_norm": 1.1612433195114136,
+      "learning_rate": 1.0051995678519233e-05,
+      "loss": 0.3747,
+      "step": 680
+    },
+    {
+      "epoch": 1.89875173370319,
+      "grad_norm": 1.1275752782821655,
+      "learning_rate": 9.836208936805113e-06,
+      "loss": 0.3912,
+      "step": 685
+    },
+    {
+      "epoch": 1.912621359223301,
+      "grad_norm": 1.2543262243270874,
+      "learning_rate": 9.621627824230963e-06,
+      "loss": 0.3948,
+      "step": 690
+    },
+    {
+      "epoch": 1.926490984743412,
+      "grad_norm": 1.1024737358093262,
+      "learning_rate": 9.408302440663143e-06,
+      "loss": 0.3871,
+      "step": 695
+    },
+    {
+      "epoch": 1.940360610263523,
+      "grad_norm": 1.3132935762405396,
+      "learning_rate": 9.196282592783514e-06,
+      "loss": 0.4236,
+      "step": 700
+    },
+    {
+      "epoch": 1.9542302357836339,
+      "grad_norm": 1.0697059631347656,
+      "learning_rate": 8.985617782460714e-06,
+      "loss": 0.3414,
+      "step": 705
+    },
+    {
+      "epoch": 1.9680998613037448,
+      "grad_norm": 1.2118357419967651,
+      "learning_rate": 8.776357195192566e-06,
+      "loss": 0.3902,
+      "step": 710
+    },
+    {
+      "epoch": 1.9819694868238558,
+      "grad_norm": 1.2814265489578247,
+      "learning_rate": 8.568549688622365e-06,
+      "loss": 0.3818,
+      "step": 715
+    },
+    {
+      "epoch": 1.9958391123439667,
+      "grad_norm": 1.2795014381408691,
+      "learning_rate": 8.362243781131748e-06,
+      "loss": 0.3773,
+      "step": 720
+    },
+    {
+      "epoch": 2.0083217753120666,
+      "grad_norm": 1.116100788116455,
+      "learning_rate": 8.157487640512692e-06,
+      "loss": 0.3354,
+      "step": 725
+    },
+    {
+      "epoch": 2.0221914008321775,
+      "grad_norm": 1.1351886987686157,
+      "learning_rate": 7.954329072721467e-06,
+      "loss": 0.2944,
+      "step": 730
+    },
+    {
+      "epoch": 2.0360610263522885,
+      "grad_norm": 1.110550045967102,
+      "learning_rate": 7.752815510716992e-06,
+      "loss": 0.3261,
+      "step": 735
+    },
+    {
+      "epoch": 2.0499306518723994,
+      "grad_norm": 1.0575944185256958,
+      "learning_rate": 7.552994003386302e-06,
+      "loss": 0.3348,
+      "step": 740
+    },
+    {
+      "epoch": 2.0638002773925104,
+      "grad_norm": 1.5008807182312012,
+      "learning_rate": 7.354911204559719e-06,
+      "loss": 0.3089,
+      "step": 745
+    },
+    {
+      "epoch": 2.0776699029126213,
+      "grad_norm": 1.1893939971923828,
+      "learning_rate": 7.158613362118194e-06,
+      "loss": 0.3377,
+      "step": 750
+    },
+    {
+      "epoch": 2.0915395284327323,
+      "grad_norm": 1.2049462795257568,
+      "learning_rate": 6.964146307195498e-06,
+      "loss": 0.3468,
+      "step": 755
+    },
+    {
+      "epoch": 2.1054091539528432,
+      "grad_norm": 1.0416041612625122,
+      "learning_rate": 6.7715554434776715e-06,
+      "loss": 0.2835,
+      "step": 760
+    },
+    {
+      "epoch": 2.119278779472954,
+      "grad_norm": 1.0391777753829956,
+      "learning_rate": 6.5808857366022615e-06,
+      "loss": 0.2931,
+      "step": 765
+    },
+    {
+      "epoch": 2.133148404993065,
+      "grad_norm": 0.995266318321228,
+      "learning_rate": 6.392181703659902e-06,
+      "loss": 0.3276,
+      "step": 770
+    },
+    {
+      "epoch": 2.147018030513176,
+      "grad_norm": 1.0826492309570312,
+      "learning_rate": 6.205487402800536e-06,
+      "loss": 0.2717,
+      "step": 775
+    },
+    {
+      "epoch": 2.160887656033287,
+      "grad_norm": 1.1436688899993896,
+      "learning_rate": 6.020846422946834e-06,
+      "loss": 0.3191,
+      "step": 780
+    },
+    {
+      "epoch": 2.174757281553398,
+      "grad_norm": 1.0932196378707886,
+      "learning_rate": 5.838301873617179e-06,
+      "loss": 0.2892,
+      "step": 785
+    },
+    {
+      "epoch": 2.188626907073509,
+      "grad_norm": 1.093634009361267,
+      "learning_rate": 5.657896374860552e-06,
+      "loss": 0.308,
+      "step": 790
+    },
+    {
+      "epoch": 2.20249653259362,
+      "grad_norm": 1.2233773469924927,
+      "learning_rate": 5.4796720473056935e-06,
+      "loss": 0.2932,
+      "step": 795
+    },
+    {
+      "epoch": 2.216366158113731,
+      "grad_norm": 0.9919722676277161,
+      "learning_rate": 5.303670502326913e-06,
+      "loss": 0.3041,
+      "step": 800
+    },
+    {
+      "epoch": 2.230235783633842,
+      "grad_norm": 1.2569876909255981,
+      "learning_rate": 5.129932832328745e-06,
+      "loss": 0.2834,
+      "step": 805
+    },
+    {
+      "epoch": 2.2441054091539527,
+      "grad_norm": 1.0792068243026733,
+      "learning_rate": 4.958499601151797e-06,
+      "loss": 0.3329,
+      "step": 810
+    },
+    {
+      "epoch": 2.2579750346740637,
+      "grad_norm": 1.1449778079986572,
+      "learning_rate": 4.789410834602026e-06,
+      "loss": 0.276,
+      "step": 815
+    },
+    {
+      "epoch": 2.2718446601941746,
+      "grad_norm": 1.1770884990692139,
+      "learning_rate": 4.622706011105595e-06,
+      "loss": 0.3361,
+      "step": 820
+    },
+    {
+      "epoch": 2.2857142857142856,
+      "grad_norm": 1.098443627357483,
+      "learning_rate": 4.458424052491538e-06,
+      "loss": 0.2724,
+      "step": 825
+    },
+    {
+      "epoch": 2.2995839112343965,
+      "grad_norm": 1.2141444683074951,
+      "learning_rate": 4.296603314904423e-06,
+      "loss": 0.2852,
+      "step": 830
+    },
+    {
+      "epoch": 2.3134535367545075,
+      "grad_norm": 1.0824936628341675,
+      "learning_rate": 4.137281579849013e-06,
+      "loss": 0.3002,
+      "step": 835
+    },
+    {
+      "epoch": 2.3273231622746184,
+      "grad_norm": 1.0559806823730469,
+      "learning_rate": 3.980496045369155e-06,
+      "loss": 0.2937,
+      "step": 840
+    },
+    {
+      "epoch": 2.3411927877947294,
+      "grad_norm": 1.2606571912765503,
+      "learning_rate": 3.826283317362884e-06,
+      "loss": 0.3022,
+      "step": 845
+    },
+    {
+      "epoch": 2.3550624133148403,
+      "grad_norm": 1.0621739625930786,
+      "learning_rate": 3.674679401035749e-06,
+      "loss": 0.2795,
+      "step": 850
+    },
+    {
+      "epoch": 2.3689320388349513,
+      "grad_norm": 1.1661829948425293,
+      "learning_rate": 3.5257196924944045e-06,
+      "loss": 0.3178,
+      "step": 855
+    },
+    {
+      "epoch": 2.3828016643550622,
+      "grad_norm": 1.266578197479248,
+      "learning_rate": 3.379438970482443e-06,
+      "loss": 0.2827,
+      "step": 860
+    },
+    {
+      "epoch": 2.396671289875173,
+      "grad_norm": 1.097712516784668,
+      "learning_rate": 3.2358713882603036e-06,
+      "loss": 0.3157,
+      "step": 865
+    },
+    {
+      "epoch": 2.410540915395284,
+      "grad_norm": 1.1444973945617676,
+      "learning_rate": 3.095050465631278e-06,
+      "loss": 0.2888,
+      "step": 870
+    },
+    {
+      "epoch": 2.424410540915395,
+      "grad_norm": 1.0758591890335083,
+      "learning_rate": 2.957009081115389e-06,
+      "loss": 0.3349,
+      "step": 875
+    },
+    {
+      "epoch": 2.438280166435506,
+      "grad_norm": 1.0919588804244995,
+      "learning_rate": 2.821779464272977e-06,
+      "loss": 0.2807,
+      "step": 880
+    },
+    {
+      "epoch": 2.452149791955617,
+      "grad_norm": 1.2054446935653687,
+      "learning_rate": 2.6893931881798466e-06,
+      "loss": 0.2977,
+      "step": 885
+    },
+    {
+      "epoch": 2.466019417475728,
+      "grad_norm": 1.2223763465881348,
+      "learning_rate": 2.5598811620556495e-06,
+      "loss": 0.2835,
+      "step": 890
+    },
+    {
+      "epoch": 2.479889042995839,
+      "grad_norm": 1.2282813787460327,
+      "learning_rate": 2.4332736240472654e-06,
+      "loss": 0.2841,
+      "step": 895
+    },
+    {
+      "epoch": 2.49375866851595,
+      "grad_norm": 1.2218164205551147,
+      "learning_rate": 2.3096001341689036e-06,
+      "loss": 0.275,
+      "step": 900
+    },
+    {
+      "epoch": 2.507628294036061,
+      "grad_norm": 1.124374270439148,
+      "learning_rate": 2.188889567400477e-06,
+      "loss": 0.2744,
+      "step": 905
+    },
+    {
+      "epoch": 2.5214979195561718,
+      "grad_norm": 1.2298548221588135,
+      "learning_rate": 2.071170106945951e-06,
+      "loss": 0.2852,
+      "step": 910
+    },
+    {
+      "epoch": 2.5353675450762827,
+      "grad_norm": 1.0848749876022339,
+      "learning_rate": 1.9564692376532127e-06,
+      "loss": 0.2991,
+      "step": 915
+    },
+    {
+      "epoch": 2.5492371705963937,
+      "grad_norm": 1.07148277759552,
+      "learning_rate": 1.844813739596964e-06,
+      "loss": 0.3067,
+      "step": 920
+    },
+    {
+      "epoch": 2.5631067961165046,
+      "grad_norm": 1.166430950164795,
+      "learning_rate": 1.7362296818261652e-06,
+      "loss": 0.2818,
+      "step": 925
+    },
+    {
+      "epoch": 2.5769764216366156,
+      "grad_norm": 1.1408696174621582,
+      "learning_rate": 1.6307424162775287e-06,
+      "loss": 0.298,
+      "step": 930
+    },
+    {
+      "epoch": 2.5908460471567265,
+      "grad_norm": 1.0408048629760742,
+      "learning_rate": 1.5283765718563809e-06,
+      "loss": 0.3117,
+      "step": 935
+    },
+    {
+      "epoch": 2.6047156726768375,
+      "grad_norm": 1.1565581560134888,
+      "learning_rate": 1.4291560486863752e-06,
+      "loss": 0.3101,
+      "step": 940
+    },
+    {
+      "epoch": 2.6185852981969484,
+      "grad_norm": 1.0862784385681152,
+      "learning_rate": 1.3331040125293758e-06,
+      "loss": 0.2711,
+      "step": 945
+    },
+    {
+      "epoch": 2.63245492371706,
+      "grad_norm": 1.2783355712890625,
+      "learning_rate": 1.2402428893767315e-06,
+      "loss": 0.2767,
+      "step": 950
+    },
+    {
+      "epoch": 2.6463245492371708,
+      "grad_norm": 1.2085648775100708,
+      "learning_rate": 1.1505943602133345e-06,
+      "loss": 0.2795,
+      "step": 955
+    },
+    {
+      "epoch": 2.6601941747572817,
+      "grad_norm": 1.0757157802581787,
+      "learning_rate": 1.0641793559556017e-06,
+      "loss": 0.2807,
+      "step": 960
+    },
+    {
+      "epoch": 2.6740638002773927,
+      "grad_norm": 1.159952998161316,
+      "learning_rate": 9.810180525645634e-07,
+      "loss": 0.2989,
+      "step": 965
+    },
+    {
+      "epoch": 2.6879334257975036,
+      "grad_norm": 1.2329380512237549,
+      "learning_rate": 9.011298663352329e-07,
+      "loss": 0.2703,
+      "step": 970
+    },
+    {
+      "epoch": 2.7018030513176146,
+      "grad_norm": 1.0350621938705444,
+      "learning_rate": 8.24533449363345e-07,
+      "loss": 0.2726,
+      "step": 975
+    },
+    {
+      "epoch": 2.7156726768377255,
+      "grad_norm": 1.0657778978347778,
+      "learning_rate": 7.51246685190507e-07,
+      "loss": 0.292,
+      "step": 980
+    },
+    {
+      "epoch": 2.7295423023578365,
+      "grad_norm": 1.1591492891311646,
+      "learning_rate": 6.812866846287862e-07,
+      "loss": 0.246,
+      "step": 985
+    },
+    {
+      "epoch": 2.7434119278779474,
+      "grad_norm": 1.0214314460754395,
+      "learning_rate": 6.14669781765737e-07,
+      "loss": 0.2584,
+      "step": 990
+    },
+    {
+      "epoch": 2.7572815533980584,
+      "grad_norm": 1.1055316925048828,
+      "learning_rate": 5.514115301507378e-07,
+      "loss": 0.2678,
+      "step": 995
+    },
+    {
+      "epoch": 2.7711511789181693,
+      "grad_norm": 1.3228343725204468,
+      "learning_rate": 4.915266991636025e-07,
+      "loss": 0.278,
+      "step": 1000
+    },
+    {
+      "epoch": 2.7850208044382803,
+      "grad_norm": 1.067517876625061,
+      "learning_rate": 4.3502927056625783e-07,
+      "loss": 0.268,
+      "step": 1005
+    },
+    {
+      "epoch": 2.798890429958391,
+      "grad_norm": 1.1793876886367798,
+      "learning_rate": 3.8193243523831576e-07,
+      "loss": 0.2579,
+      "step": 1010
+    },
+    {
+      "epoch": 2.812760055478502,
+      "grad_norm": 1.1702144145965576,
+      "learning_rate": 3.322485900972955e-07,
+      "loss": 0.2861,
+      "step": 1015
+    },
+    {
+      "epoch": 2.826629680998613,
+      "grad_norm": 1.1405779123306274,
+      "learning_rate": 2.859893352042336e-07,
+      "loss": 0.2912,
+      "step": 1020
+    },
+    {
+      "epoch": 2.840499306518724,
+      "grad_norm": 1.1083163022994995,
+      "learning_rate": 2.4316547105531105e-07,
+      "loss": 0.2774,
+      "step": 1025
+    },
+    {
+      "epoch": 2.854368932038835,
+      "grad_norm": 1.222824215888977,
+      "learning_rate": 2.0378699606020457e-07,
+      "loss": 0.266,
+      "step": 1030
+    },
+    {
+      "epoch": 2.868238557558946,
+      "grad_norm": 1.1668026447296143,
+      "learning_rate": 1.678631042076595e-07,
+      "loss": 0.3207,
+      "step": 1035
+    },
+    {
+      "epoch": 2.882108183079057,
+      "grad_norm": 1.1818103790283203,
+      "learning_rate": 1.354021829189167e-07,
+      "loss": 0.2624,
+      "step": 1040
+    },
+    {
+      "epoch": 2.895977808599168,
+      "grad_norm": 1.205928921699524,
+      "learning_rate": 1.0641181108943076e-07,
+      "loss": 0.2645,
+      "step": 1045
+    },
+    {
+      "epoch": 2.909847434119279,
+      "grad_norm": 1.0949934720993042,
+      "learning_rate": 8.089875731937035e-08,
+      "loss": 0.2728,
+      "step": 1050
+    },
+    {
+      "epoch": 2.9237170596393898,
+      "grad_norm": 1.2244797945022583,
+      "learning_rate": 5.8868978333299076e-08,
+      "loss": 0.2727,
+      "step": 1055
+    },
+    {
+      "epoch": 2.9375866851595007,
+      "grad_norm": 1.1624984741210938,
+      "learning_rate": 4.0327617589417944e-08,
+      "loss": 0.2553,
+      "step": 1060
+    },
+    {
+      "epoch": 2.9514563106796117,
+      "grad_norm": 1.124219298362732,
+      "learning_rate": 2.5279004078681002e-08,
+      "loss": 0.2623,
+      "step": 1065
+    },
+    {
+      "epoch": 2.9653259361997226,
+      "grad_norm": 1.1501754522323608,
+      "learning_rate": 1.3726651314078797e-08,
+      "loss": 0.2648,
+      "step": 1070
+    },
+    {
+      "epoch": 2.9791955617198336,
+      "grad_norm": 1.1457092761993408,
+      "learning_rate": 5.673256510301261e-09,
+      "loss": 0.2896,
+      "step": 1075
+    },
+    {
+      "epoch": 2.9930651872399445,
+      "grad_norm": 1.1434216499328613,
+      "learning_rate": 1.1206999540114282e-09,
+      "loss": 0.2839,
+      "step": 1080
+    },
+    {
+      "epoch": 3.0,
+      "step": 1083,
+      "total_flos": 1.570176137525461e+18,
+      "train_loss": 0.5801369934751276,
+      "train_runtime": 770.5931,
+      "train_samples_per_second": 44.86,
+      "train_steps_per_second": 1.405
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1083,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.570176137525461e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

23_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7c3dec8dd32725b07f8693b60d0d894db74049cc367f8cd165a488137f59372
+size 8209

23_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff