Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

25_128_e3_3e-5/adapter_config.json +39 -0
25_128_e3_3e-5/adapter_model.safetensors +3 -0
25_128_e3_3e-5/added_tokens.json +9 -0
25_128_e3_3e-5/all_results.json +9 -0
25_128_e3_3e-5/chat_template.jinja +62 -0
25_128_e3_3e-5/config.json +32 -0
25_128_e3_3e-5/merges.txt +0 -0
25_128_e3_3e-5/special_tokens_map.json +33 -0
25_128_e3_3e-5/tokenizer.json +0 -0
25_128_e3_3e-5/tokenizer_config.json +234 -0
25_128_e3_3e-5/train_results.json +9 -0
25_128_e3_3e-5/trainer_state.json +1849 -0
25_128_e3_3e-5/training_args.bin +3 -0
25_128_e3_3e-5/vocab.json +0 -0

25_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "o_proj",
+    "q_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

25_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d3cf93e027f1ee9e3522a63700eb55cc5ad270dac5cbabda195be29aa4ccfb53
+size 791751704

25_128_e3_3e-5/added_tokens.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "<|end_of_cite|>": 49156,
+  "<|end_of_plugin|>": 49158,
+  "<|end_of_role|>": 49153,
+  "<|start_of_cite|>": 49155,
+  "<|start_of_plugin|>": 49157,
+  "<|start_of_role|>": 49152,
+  "<|tool_call|>": 49154
+}

25_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 2.0359870555944387e+18,
+    "train_loss": 0.5806684772450151,
+    "train_runtime": 1005.3243,
+    "train_samples": 13773,
+    "train_samples_per_second": 41.1,
+    "train_steps_per_second": 1.286
+}

25_128_e3_3e-5/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,62 @@

+{# Alias tools -> available_tools #}
+{%- if tools and not available_tools -%}
+    {%- set available_tools = tools -%}
+{%- endif -%}
+{%- if messages[0]['role'] == 'system' %}
+     {%- set system_message = messages[0]['content'] %}
+     {%- set loop_messages = messages[1:] %}
+ {%- else %}
+     {%- set system_message = "Knowledge Cutoff Date: April 2024.
+Today's Date: " + strftime_now('%B %d, %Y') + ".
+You are Granite, developed by IBM." %}
+     {%- if available_tools and documents %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
+Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+     {%- elif available_tools %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request." %}
+     {%- elif documents %}
+         {%- set system_message = system_message + " Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+    {%- elif thinking %}
+    {%- set system_message = system_message + " You are a helpful AI assistant.
+Respond to every user query in a comprehensive and detailed way. You can write down your thoughts and reasoning process before responding. In the thought process, engage in a comprehensive cycle of analysis, summarization, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. In the response section, based on various attempts, explorations, and reflections from the thoughts section, systematically present the final solution that you deem correct. The response should summarize the thought process. Write your thoughts between <think></think> and write your response between <response></response> for each user query." %}
+     {%- else %}
+         {%- set system_message = system_message + " You are a helpful AI assistant." %}
+     {%- endif %}
+     {%- if 'citations' in controls and documents %}
+         {%- set system_message = system_message + '
+Use the symbols <|start_of_cite|> and <|end_of_cite|> to indicate when a fact comes from a document in the search result, e.g <|start_of_cite|> {document_id: 1}my fact <|end_of_cite|> for a fact from document 1. Afterwards, list all the citations with their corresponding documents in an ordered list.' %}
+     {%- endif %}
+     {%- if 'hallucinations' in controls and documents %}
+         {%- set system_message = system_message + '
+Finally, after the response is written, include a numbered list of sentences from the response with a corresponding risk value that are hallucinated and not based in the documents.' %}
+     {%- endif %}
+     {%- set loop_messages = messages %}
+ {%- endif %}
+ {{- '<|start_of_role|>system<|end_of_role|>' + system_message + '<|end_of_text|>
+' }}
+ {%- if available_tools %}
+     {{- '<|start_of_role|>available_tools<|end_of_role|>' }}
+     {{- available_tools | tojson(indent=4) }}
+     {{- '<|end_of_text|>
+' }}
+ {%- endif %}
+ {%- if documents %}
+     {%- for document in documents %}
+         {{- '<|start_of_role|>document {"document_id": "' + document['doc_id'] | string + '"}<|end_of_role|>
+' }}
+         {{- document['text'] }}
+         {{- '<|end_of_text|>
+' }}
+              {%- endfor %}
+ {%- endif %}
+ {%- for message in loop_messages %}
+     {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>
+' }}
+     {%- if loop.last and add_generation_prompt %}
+         {{- '<|start_of_role|>assistant' }}
+             {%- if controls %}
+                 {{- ' ' + controls | tojson()}}
+             {%- endif %}
+         {{- '<|end_of_role|>' }}
+     {%- endif %}
+ {%- endfor %}

25_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49159
+}

25_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

25_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|end_of_plugin|>",
+  "unk_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

25_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

25_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,234 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49152": {
+      "content": "<|start_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49153": {
+      "content": "<|end_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49154": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49155": {
+      "content": "<|start_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49156": {
+      "content": "<|end_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49157": {
+      "content": "<|start_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49158": {
+      "content": "<|end_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": "<|end_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|end_of_plugin|>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|end_of_text|>",
+  "vocab_size": 49152
+}

25_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 2.0359870555944387e+18,
+    "train_loss": 0.5806684772450151,
+    "train_runtime": 1005.3243,
+    "train_samples": 13773,
+    "train_samples_per_second": 41.1,
+    "train_steps_per_second": 1.286
+}

25_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1849 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1293,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.011614401858304297,
+      "grad_norm": 3.7239036560058594,
+      "learning_rate": 1.8461538461538462e-06,
+      "loss": 1.5388,
+      "step": 5
+    },
+    {
+      "epoch": 0.023228803716608595,
+      "grad_norm": 2.49280047416687,
+      "learning_rate": 4.1538461538461545e-06,
+      "loss": 1.5279,
+      "step": 10
+    },
+    {
+      "epoch": 0.03484320557491289,
+      "grad_norm": 1.060119390487671,
+      "learning_rate": 6.461538461538462e-06,
+      "loss": 1.4208,
+      "step": 15
+    },
+    {
+      "epoch": 0.04645760743321719,
+      "grad_norm": 0.6498854756355286,
+      "learning_rate": 8.76923076923077e-06,
+      "loss": 1.3651,
+      "step": 20
+    },
+    {
+      "epoch": 0.05807200929152149,
+      "grad_norm": 0.4009834825992584,
+      "learning_rate": 1.1076923076923077e-05,
+      "loss": 1.3718,
+      "step": 25
+    },
+    {
+      "epoch": 0.06968641114982578,
+      "grad_norm": 0.6162559390068054,
+      "learning_rate": 1.3384615384615386e-05,
+      "loss": 1.3552,
+      "step": 30
+    },
+    {
+      "epoch": 0.08130081300813008,
+      "grad_norm": 0.33406174182891846,
+      "learning_rate": 1.5692307692307693e-05,
+      "loss": 1.2759,
+      "step": 35
+    },
+    {
+      "epoch": 0.09291521486643438,
+      "grad_norm": 0.41755595803260803,
+      "learning_rate": 1.8e-05,
+      "loss": 1.26,
+      "step": 40
+    },
+    {
+      "epoch": 0.10452961672473868,
+      "grad_norm": 0.331603467464447,
+      "learning_rate": 2.0307692307692308e-05,
+      "loss": 1.2596,
+      "step": 45
+    },
+    {
+      "epoch": 0.11614401858304298,
+      "grad_norm": 0.3664747476577759,
+      "learning_rate": 2.2615384615384615e-05,
+      "loss": 1.2887,
+      "step": 50
+    },
+    {
+      "epoch": 0.12775842044134728,
+      "grad_norm": 0.32520970702171326,
+      "learning_rate": 2.4923076923076926e-05,
+      "loss": 1.269,
+      "step": 55
+    },
+    {
+      "epoch": 0.13937282229965156,
+      "grad_norm": 0.3528161346912384,
+      "learning_rate": 2.7230769230769233e-05,
+      "loss": 1.2601,
+      "step": 60
+    },
+    {
+      "epoch": 0.15098722415795587,
+      "grad_norm": 0.3476129174232483,
+      "learning_rate": 2.953846153846154e-05,
+      "loss": 1.1839,
+      "step": 65
+    },
+    {
+      "epoch": 0.16260162601626016,
+      "grad_norm": 0.3519430458545685,
+      "learning_rate": 2.9999214618860097e-05,
+      "loss": 1.2509,
+      "step": 70
+    },
+    {
+      "epoch": 0.17421602787456447,
+      "grad_norm": 0.38617995381355286,
+      "learning_rate": 2.9996024148932534e-05,
+      "loss": 1.2605,
+      "step": 75
+    },
+    {
+      "epoch": 0.18583042973286876,
+      "grad_norm": 0.44555604457855225,
+      "learning_rate": 2.999038002552003e-05,
+      "loss": 1.166,
+      "step": 80
+    },
+    {
+      "epoch": 0.19744483159117304,
+      "grad_norm": 0.35843759775161743,
+      "learning_rate": 2.9982283172115566e-05,
+      "loss": 1.1625,
+      "step": 85
+    },
+    {
+      "epoch": 0.20905923344947736,
+      "grad_norm": 0.3689218759536743,
+      "learning_rate": 2.9971734913528465e-05,
+      "loss": 1.1718,
+      "step": 90
+    },
+    {
+      "epoch": 0.22067363530778164,
+      "grad_norm": 0.3973144292831421,
+      "learning_rate": 2.9958736975667613e-05,
+      "loss": 1.1546,
+      "step": 95
+    },
+    {
+      "epoch": 0.23228803716608595,
+      "grad_norm": 0.42336413264274597,
+      "learning_rate": 2.9943291485259094e-05,
+      "loss": 1.1256,
+      "step": 100
+    },
+    {
+      "epoch": 0.24390243902439024,
+      "grad_norm": 0.4604174494743347,
+      "learning_rate": 2.9925400969498173e-05,
+      "loss": 1.1204,
+      "step": 105
+    },
+    {
+      "epoch": 0.25551684088269455,
+      "grad_norm": 0.46538224816322327,
+      "learning_rate": 2.990506835563583e-05,
+      "loss": 1.0944,
+      "step": 110
+    },
+    {
+      "epoch": 0.26713124274099886,
+      "grad_norm": 0.3979334831237793,
+      "learning_rate": 2.988229697049979e-05,
+      "loss": 1.1457,
+      "step": 115
+    },
+    {
+      "epoch": 0.2787456445993031,
+      "grad_norm": 0.40539079904556274,
+      "learning_rate": 2.9857090539950177e-05,
+      "loss": 1.0839,
+      "step": 120
+    },
+    {
+      "epoch": 0.29036004645760743,
+      "grad_norm": 0.4645063579082489,
+      "learning_rate": 2.982945318826991e-05,
+      "loss": 1.0477,
+      "step": 125
+    },
+    {
+      "epoch": 0.30197444831591175,
+      "grad_norm": 0.4794134795665741,
+      "learning_rate": 2.979938943748987e-05,
+      "loss": 1.085,
+      "step": 130
+    },
+    {
+      "epoch": 0.313588850174216,
+      "grad_norm": 0.43697139620780945,
+      "learning_rate": 2.9766904206648997e-05,
+      "loss": 1.035,
+      "step": 135
+    },
+    {
+      "epoch": 0.3252032520325203,
+      "grad_norm": 0.47847357392311096,
+      "learning_rate": 2.9732002810989464e-05,
+      "loss": 1.0612,
+      "step": 140
+    },
+    {
+      "epoch": 0.33681765389082463,
+      "grad_norm": 0.5038856267929077,
+      "learning_rate": 2.969469096108697e-05,
+      "loss": 1.0862,
+      "step": 145
+    },
+    {
+      "epoch": 0.34843205574912894,
+      "grad_norm": 0.4716856777667999,
+      "learning_rate": 2.9654974761916377e-05,
+      "loss": 1.1585,
+      "step": 150
+    },
+    {
+      "epoch": 0.3600464576074332,
+      "grad_norm": 0.5352954864501953,
+      "learning_rate": 2.9612860711852828e-05,
+      "loss": 0.9909,
+      "step": 155
+    },
+    {
+      "epoch": 0.3716608594657375,
+      "grad_norm": 0.5641031861305237,
+      "learning_rate": 2.9568355701608463e-05,
+      "loss": 1.0579,
+      "step": 160
+    },
+    {
+      "epoch": 0.3832752613240418,
+      "grad_norm": 0.5172345042228699,
+      "learning_rate": 2.9521467013104973e-05,
+      "loss": 1.0648,
+      "step": 165
+    },
+    {
+      "epoch": 0.3948896631823461,
+      "grad_norm": 0.5852975249290466,
+      "learning_rate": 2.947220231828212e-05,
+      "loss": 1.0146,
+      "step": 170
+    },
+    {
+      "epoch": 0.4065040650406504,
+      "grad_norm": 0.6303030848503113,
+      "learning_rate": 2.9420569677842456e-05,
+      "loss": 0.9513,
+      "step": 175
+    },
+    {
+      "epoch": 0.4181184668989547,
+      "grad_norm": 0.6142143607139587,
+      "learning_rate": 2.9366577539932433e-05,
+      "loss": 0.9699,
+      "step": 180
+    },
+    {
+      "epoch": 0.429732868757259,
+      "grad_norm": 0.5549263954162598,
+      "learning_rate": 2.93102347387601e-05,
+      "loss": 0.9979,
+      "step": 185
+    },
+    {
+      "epoch": 0.4413472706155633,
+      "grad_norm": 0.6228923201560974,
+      "learning_rate": 2.925155049314967e-05,
+      "loss": 1.0035,
+      "step": 190
+    },
+    {
+      "epoch": 0.4529616724738676,
+      "grad_norm": 0.6589254140853882,
+      "learning_rate": 2.9190534405033108e-05,
+      "loss": 0.9252,
+      "step": 195
+    },
+    {
+      "epoch": 0.4645760743321719,
+      "grad_norm": 0.5856587290763855,
+      "learning_rate": 2.9127196457879096e-05,
+      "loss": 0.9695,
+      "step": 200
+    },
+    {
+      "epoch": 0.47619047619047616,
+      "grad_norm": 0.6193470358848572,
+      "learning_rate": 2.906154701505949e-05,
+      "loss": 0.9641,
+      "step": 205
+    },
+    {
+      "epoch": 0.4878048780487805,
+      "grad_norm": 0.6732288002967834,
+      "learning_rate": 2.8993596818153703e-05,
+      "loss": 0.9541,
+      "step": 210
+    },
+    {
+      "epoch": 0.4994192799070848,
+      "grad_norm": 0.6615795493125916,
+      "learning_rate": 2.8923356985191134e-05,
+      "loss": 0.9326,
+      "step": 215
+    },
+    {
+      "epoch": 0.5110336817653891,
+      "grad_norm": 0.6849666237831116,
+      "learning_rate": 2.885083900883205e-05,
+      "loss": 0.9057,
+      "step": 220
+    },
+    {
+      "epoch": 0.5226480836236934,
+      "grad_norm": 0.7001377940177917,
+      "learning_rate": 2.877605475448716e-05,
+      "loss": 0.9211,
+      "step": 225
+    },
+    {
+      "epoch": 0.5342624854819977,
+      "grad_norm": 0.6646133661270142,
+      "learning_rate": 2.8699016458376173e-05,
+      "loss": 0.9311,
+      "step": 230
+    },
+    {
+      "epoch": 0.5458768873403019,
+      "grad_norm": 0.6363524794578552,
+      "learning_rate": 2.861973672552571e-05,
+      "loss": 0.8821,
+      "step": 235
+    },
+    {
+      "epoch": 0.5574912891986062,
+      "grad_norm": 0.6724256277084351,
+      "learning_rate": 2.853822852770689e-05,
+      "loss": 0.9109,
+      "step": 240
+    },
+    {
+      "epoch": 0.5691056910569106,
+      "grad_norm": 0.6587971448898315,
+      "learning_rate": 2.845450520131285e-05,
+      "loss": 0.9068,
+      "step": 245
+    },
+    {
+      "epoch": 0.5807200929152149,
+      "grad_norm": 0.7322813868522644,
+      "learning_rate": 2.8368580445176668e-05,
+      "loss": 0.8869,
+      "step": 250
+    },
+    {
+      "epoch": 0.5923344947735192,
+      "grad_norm": 0.7255988717079163,
+      "learning_rate": 2.8280468318329934e-05,
+      "loss": 0.8568,
+      "step": 255
+    },
+    {
+      "epoch": 0.6039488966318235,
+      "grad_norm": 0.7958986759185791,
+      "learning_rate": 2.8190183237702433e-05,
+      "loss": 0.8255,
+      "step": 260
+    },
+    {
+      "epoch": 0.6155632984901278,
+      "grad_norm": 0.7678819298744202,
+      "learning_rate": 2.809773997576322e-05,
+      "loss": 0.8558,
+      "step": 265
+    },
+    {
+      "epoch": 0.627177700348432,
+      "grad_norm": 0.7441889047622681,
+      "learning_rate": 2.8003153658103547e-05,
+      "loss": 0.9033,
+      "step": 270
+    },
+    {
+      "epoch": 0.6387921022067363,
+      "grad_norm": 0.7110655307769775,
+      "learning_rate": 2.790643976096204e-05,
+      "loss": 0.8529,
+      "step": 275
+    },
+    {
+      "epoch": 0.6504065040650406,
+      "grad_norm": 0.7925458550453186,
+      "learning_rate": 2.7807614108692426e-05,
+      "loss": 0.8516,
+      "step": 280
+    },
+    {
+      "epoch": 0.662020905923345,
+      "grad_norm": 0.7953664064407349,
+      "learning_rate": 2.770669287117438e-05,
+      "loss": 0.8712,
+      "step": 285
+    },
+    {
+      "epoch": 0.6736353077816493,
+      "grad_norm": 0.8666931986808777,
+      "learning_rate": 2.7603692561167807e-05,
+      "loss": 0.769,
+      "step": 290
+    },
+    {
+      "epoch": 0.6852497096399536,
+      "grad_norm": 0.79376620054245,
+      "learning_rate": 2.7498630031610985e-05,
+      "loss": 0.863,
+      "step": 295
+    },
+    {
+      "epoch": 0.6968641114982579,
+      "grad_norm": 0.7933903336524963,
+      "learning_rate": 2.7391522472863123e-05,
+      "loss": 0.8059,
+      "step": 300
+    },
+    {
+      "epoch": 0.7084785133565621,
+      "grad_norm": 0.7302671074867249,
+      "learning_rate": 2.7282387409891653e-05,
+      "loss": 0.818,
+      "step": 305
+    },
+    {
+      "epoch": 0.7200929152148664,
+      "grad_norm": 0.8621637225151062,
+      "learning_rate": 2.7171242699404788e-05,
+      "loss": 0.7648,
+      "step": 310
+    },
+    {
+      "epoch": 0.7317073170731707,
+      "grad_norm": 0.7623394131660461,
+      "learning_rate": 2.705810652692981e-05,
+      "loss": 0.8269,
+      "step": 315
+    },
+    {
+      "epoch": 0.743321718931475,
+      "grad_norm": 0.784396767616272,
+      "learning_rate": 2.694299740383753e-05,
+      "loss": 0.8105,
+      "step": 320
+    },
+    {
+      "epoch": 0.7549361207897793,
+      "grad_norm": 0.8236751556396484,
+      "learning_rate": 2.6825934164313492e-05,
+      "loss": 0.824,
+      "step": 325
+    },
+    {
+      "epoch": 0.7665505226480837,
+      "grad_norm": 0.8309207558631897,
+      "learning_rate": 2.6706935962276268e-05,
+      "loss": 0.8326,
+      "step": 330
+    },
+    {
+      "epoch": 0.778164924506388,
+      "grad_norm": 0.8117490410804749,
+      "learning_rate": 2.6586022268243526e-05,
+      "loss": 0.7619,
+      "step": 335
+    },
+    {
+      "epoch": 0.7897793263646922,
+      "grad_norm": 0.8275243639945984,
+      "learning_rate": 2.6463212866146246e-05,
+      "loss": 0.7847,
+      "step": 340
+    },
+    {
+      "epoch": 0.8013937282229965,
+      "grad_norm": 0.7706731557846069,
+      "learning_rate": 2.633852785009168e-05,
+      "loss": 0.7217,
+      "step": 345
+    },
+    {
+      "epoch": 0.8130081300813008,
+      "grad_norm": 0.8583183288574219,
+      "learning_rate": 2.621198762107551e-05,
+      "loss": 0.7879,
+      "step": 350
+    },
+    {
+      "epoch": 0.8246225319396051,
+      "grad_norm": 0.8016027808189392,
+      "learning_rate": 2.6083612883643888e-05,
+      "loss": 0.7647,
+      "step": 355
+    },
+    {
+      "epoch": 0.8362369337979094,
+      "grad_norm": 0.8635758757591248,
+      "learning_rate": 2.595342464250571e-05,
+      "loss": 0.7656,
+      "step": 360
+    },
+    {
+      "epoch": 0.8478513356562137,
+      "grad_norm": 0.8534408211708069,
+      "learning_rate": 2.5821444199095833e-05,
+      "loss": 0.765,
+      "step": 365
+    },
+    {
+      "epoch": 0.859465737514518,
+      "grad_norm": 0.8564404249191284,
+      "learning_rate": 2.568769314808973e-05,
+      "loss": 0.743,
+      "step": 370
+    },
+    {
+      "epoch": 0.8710801393728222,
+      "grad_norm": 0.828381359577179,
+      "learning_rate": 2.5552193373870175e-05,
+      "loss": 0.6896,
+      "step": 375
+    },
+    {
+      "epoch": 0.8826945412311266,
+      "grad_norm": 0.8003427386283875,
+      "learning_rate": 2.5414967046946482e-05,
+      "loss": 0.7801,
+      "step": 380
+    },
+    {
+      "epoch": 0.8943089430894309,
+      "grad_norm": 0.9103115797042847,
+      "learning_rate": 2.5276036620327e-05,
+      "loss": 0.687,
+      "step": 385
+    },
+    {
+      "epoch": 0.9059233449477352,
+      "grad_norm": 0.899299681186676,
+      "learning_rate": 2.513542482584531e-05,
+      "loss": 0.7451,
+      "step": 390
+    },
+    {
+      "epoch": 0.9175377468060395,
+      "grad_norm": 0.9081295728683472,
+      "learning_rate": 2.4993154670440866e-05,
+      "loss": 0.718,
+      "step": 395
+    },
+    {
+      "epoch": 0.9291521486643438,
+      "grad_norm": 0.9310486912727356,
+      "learning_rate": 2.4849249432394568e-05,
+      "loss": 0.6786,
+      "step": 400
+    },
+    {
+      "epoch": 0.9407665505226481,
+      "grad_norm": 1.0035141706466675,
+      "learning_rate": 2.4703732657519984e-05,
+      "loss": 0.6999,
+      "step": 405
+    },
+    {
+      "epoch": 0.9523809523809523,
+      "grad_norm": 0.884985625743866,
+      "learning_rate": 2.4556628155310766e-05,
+      "loss": 0.7024,
+      "step": 410
+    },
+    {
+      "epoch": 0.9639953542392566,
+      "grad_norm": 0.8092682361602783,
+      "learning_rate": 2.4407959995044943e-05,
+      "loss": 0.667,
+      "step": 415
+    },
+    {
+      "epoch": 0.975609756097561,
+      "grad_norm": 0.9335992932319641,
+      "learning_rate": 2.425775250184668e-05,
+      "loss": 0.7228,
+      "step": 420
+    },
+    {
+      "epoch": 0.9872241579558653,
+      "grad_norm": 0.8802455067634583,
+      "learning_rate": 2.4106030252706223e-05,
+      "loss": 0.6878,
+      "step": 425
+    },
+    {
+      "epoch": 0.9988385598141696,
+      "grad_norm": 0.9185948967933655,
+      "learning_rate": 2.3952818072458588e-05,
+      "loss": 0.6512,
+      "step": 430
+    },
+    {
+      "epoch": 1.0092915214866434,
+      "grad_norm": 0.885767936706543,
+      "learning_rate": 2.3798141029721706e-05,
+      "loss": 0.5856,
+      "step": 435
+    },
+    {
+      "epoch": 1.0209059233449478,
+      "grad_norm": 0.91026771068573,
+      "learning_rate": 2.3642024432794714e-05,
+      "loss": 0.5936,
+      "step": 440
+    },
+    {
+      "epoch": 1.032520325203252,
+      "grad_norm": 0.9716700911521912,
+      "learning_rate": 2.3484493825516985e-05,
+      "loss": 0.5798,
+      "step": 445
+    },
+    {
+      "epoch": 1.0441347270615564,
+      "grad_norm": 0.9666943550109863,
+      "learning_rate": 2.3325574983088652e-05,
+      "loss": 0.5409,
+      "step": 450
+    },
+    {
+      "epoch": 1.0557491289198606,
+      "grad_norm": 0.991735577583313,
+      "learning_rate": 2.3165293907853227e-05,
+      "loss": 0.5806,
+      "step": 455
+    },
+    {
+      "epoch": 1.0673635307781648,
+      "grad_norm": 0.9840120673179626,
+      "learning_rate": 2.3003676825043165e-05,
+      "loss": 0.5449,
+      "step": 460
+    },
+    {
+      "epoch": 1.0789779326364692,
+      "grad_norm": 0.908309817314148,
+      "learning_rate": 2.28407501784888e-05,
+      "loss": 0.5399,
+      "step": 465
+    },
+    {
+      "epoch": 1.0905923344947734,
+      "grad_norm": 0.895395815372467,
+      "learning_rate": 2.2676540626291643e-05,
+      "loss": 0.5555,
+      "step": 470
+    },
+    {
+      "epoch": 1.1022067363530779,
+      "grad_norm": 0.9629302620887756,
+      "learning_rate": 2.2511075036462583e-05,
+      "loss": 0.5488,
+      "step": 475
+    },
+    {
+      "epoch": 1.113821138211382,
+      "grad_norm": 0.9868640303611755,
+      "learning_rate": 2.2344380482525716e-05,
+      "loss": 0.6052,
+      "step": 480
+    },
+    {
+      "epoch": 1.1254355400696865,
+      "grad_norm": 0.9448416233062744,
+      "learning_rate": 2.217648423908857e-05,
+      "loss": 0.6069,
+      "step": 485
+    },
+    {
+      "epoch": 1.1370499419279907,
+      "grad_norm": 1.0542551279067993,
+      "learning_rate": 2.200741377737943e-05,
+      "loss": 0.5707,
+      "step": 490
+    },
+    {
+      "epoch": 1.1486643437862951,
+      "grad_norm": 0.9438780546188354,
+      "learning_rate": 2.18371967607525e-05,
+      "loss": 0.5595,
+      "step": 495
+    },
+    {
+      "epoch": 1.1602787456445993,
+      "grad_norm": 0.959209680557251,
+      "learning_rate": 2.1665861040161598e-05,
+      "loss": 0.582,
+      "step": 500
+    },
+    {
+      "epoch": 1.1718931475029035,
+      "grad_norm": 1.056666374206543,
+      "learning_rate": 2.149343464960318e-05,
+      "loss": 0.5928,
+      "step": 505
+    },
+    {
+      "epoch": 1.183507549361208,
+      "grad_norm": 0.9853843450546265,
+      "learning_rate": 2.1319945801529425e-05,
+      "loss": 0.5713,
+      "step": 510
+    },
+    {
+      "epoch": 1.1951219512195121,
+      "grad_norm": 1.1093785762786865,
+      "learning_rate": 2.1145422882232085e-05,
+      "loss": 0.5636,
+      "step": 515
+    },
+    {
+      "epoch": 1.2067363530778166,
+      "grad_norm": 1.055412769317627,
+      "learning_rate": 2.0969894447197927e-05,
+      "loss": 0.5334,
+      "step": 520
+    },
+    {
+      "epoch": 1.2183507549361208,
+      "grad_norm": 0.921072781085968,
+      "learning_rate": 2.0793389216436477e-05,
+      "loss": 0.5418,
+      "step": 525
+    },
+    {
+      "epoch": 1.229965156794425,
+      "grad_norm": 0.9304271340370178,
+      "learning_rate": 2.0615936069780826e-05,
+      "loss": 0.496,
+      "step": 530
+    },
+    {
+      "epoch": 1.2415795586527294,
+      "grad_norm": 0.9475392699241638,
+      "learning_rate": 2.043756404216233e-05,
+      "loss": 0.5214,
+      "step": 535
+    },
+    {
+      "epoch": 1.2531939605110336,
+      "grad_norm": 0.9725027084350586,
+      "learning_rate": 2.0258302318859882e-05,
+      "loss": 0.5317,
+      "step": 540
+    },
+    {
+      "epoch": 1.264808362369338,
+      "grad_norm": 1.0067145824432373,
+      "learning_rate": 2.0078180230724645e-05,
+      "loss": 0.4972,
+      "step": 545
+    },
+    {
+      "epoch": 1.2764227642276422,
+      "grad_norm": 1.0448566675186157,
+      "learning_rate": 1.9897227249380873e-05,
+      "loss": 0.5188,
+      "step": 550
+    },
+    {
+      "epoch": 1.2880371660859466,
+      "grad_norm": 0.9800556302070618,
+      "learning_rate": 1.971547298240381e-05,
+      "loss": 0.5429,
+      "step": 555
+    },
+    {
+      "epoch": 1.2996515679442509,
+      "grad_norm": 1.0093374252319336,
+      "learning_rate": 1.953294716847527e-05,
+      "loss": 0.518,
+      "step": 560
+    },
+    {
+      "epoch": 1.3112659698025553,
+      "grad_norm": 0.8734726309776306,
+      "learning_rate": 1.9349679672517778e-05,
+      "loss": 0.4876,
+      "step": 565
+    },
+    {
+      "epoch": 1.3228803716608595,
+      "grad_norm": 1.1090798377990723,
+      "learning_rate": 1.9165700480808073e-05,
+      "loss": 0.5526,
+      "step": 570
+    },
+    {
+      "epoch": 1.3344947735191637,
+      "grad_norm": 1.0246576070785522,
+      "learning_rate": 1.8981039696070744e-05,
+      "loss": 0.5061,
+      "step": 575
+    },
+    {
+      "epoch": 1.346109175377468,
+      "grad_norm": 1.0585824251174927,
+      "learning_rate": 1.879572753255282e-05,
+      "loss": 0.48,
+      "step": 580
+    },
+    {
+      "epoch": 1.3577235772357723,
+      "grad_norm": 0.9529991745948792,
+      "learning_rate": 1.8609794311080093e-05,
+      "loss": 0.5082,
+      "step": 585
+    },
+    {
+      "epoch": 1.3693379790940767,
+      "grad_norm": 0.991818368434906,
+      "learning_rate": 1.842327045409602e-05,
+      "loss": 0.4937,
+      "step": 590
+    },
+    {
+      "epoch": 1.380952380952381,
+      "grad_norm": 0.9719640016555786,
+      "learning_rate": 1.8236186480684006e-05,
+      "loss": 0.4753,
+      "step": 595
+    },
+    {
+      "epoch": 1.3925667828106851,
+      "grad_norm": 1.0128206014633179,
+      "learning_rate": 1.8048573001573856e-05,
+      "loss": 0.5529,
+      "step": 600
+    },
+    {
+      "epoch": 1.4041811846689896,
+      "grad_norm": 0.9569838047027588,
+      "learning_rate": 1.786046071413324e-05,
+      "loss": 0.4864,
+      "step": 605
+    },
+    {
+      "epoch": 1.415795586527294,
+      "grad_norm": 0.9988173842430115,
+      "learning_rate": 1.7671880397344973e-05,
+      "loss": 0.451,
+      "step": 610
+    },
+    {
+      "epoch": 1.4274099883855982,
+      "grad_norm": 1.0235010385513306,
+      "learning_rate": 1.7482862906770957e-05,
+      "loss": 0.5122,
+      "step": 615
+    },
+    {
+      "epoch": 1.4390243902439024,
+      "grad_norm": 1.152029275894165,
+      "learning_rate": 1.72934391695036e-05,
+      "loss": 0.4664,
+      "step": 620
+    },
+    {
+      "epoch": 1.4506387921022068,
+      "grad_norm": 1.1890580654144287,
+      "learning_rate": 1.710364017910549e-05,
+      "loss": 0.4704,
+      "step": 625
+    },
+    {
+      "epoch": 1.462253193960511,
+      "grad_norm": 1.0152846574783325,
+      "learning_rate": 1.6913496990538227e-05,
+      "loss": 0.5124,
+      "step": 630
+    },
+    {
+      "epoch": 1.4738675958188154,
+      "grad_norm": 1.0904698371887207,
+      "learning_rate": 1.6723040715081228e-05,
+      "loss": 0.4861,
+      "step": 635
+    },
+    {
+      "epoch": 1.4854819976771196,
+      "grad_norm": 1.0022042989730835,
+      "learning_rate": 1.6532302515241254e-05,
+      "loss": 0.4891,
+      "step": 640
+    },
+    {
+      "epoch": 1.4970963995354238,
+      "grad_norm": 0.9874012470245361,
+      "learning_rate": 1.634131359965362e-05,
+      "loss": 0.5167,
+      "step": 645
+    },
+    {
+      "epoch": 1.5087108013937283,
+      "grad_norm": 0.9787576794624329,
+      "learning_rate": 1.6150105217975794e-05,
+      "loss": 0.5184,
+      "step": 650
+    },
+    {
+      "epoch": 1.5203252032520327,
+      "grad_norm": 0.9576313495635986,
+      "learning_rate": 1.5958708655774387e-05,
+      "loss": 0.4847,
+      "step": 655
+    },
+    {
+      "epoch": 1.5319396051103369,
+      "grad_norm": 1.0970077514648438,
+      "learning_rate": 1.576715522940612e-05,
+      "loss": 0.4991,
+      "step": 660
+    },
+    {
+      "epoch": 1.543554006968641,
+      "grad_norm": 1.065991997718811,
+      "learning_rate": 1.557547628089389e-05,
+      "loss": 0.4761,
+      "step": 665
+    },
+    {
+      "epoch": 1.5551684088269453,
+      "grad_norm": 1.136781930923462,
+      "learning_rate": 1.538370317279855e-05,
+      "loss": 0.4695,
+      "step": 670
+    },
+    {
+      "epoch": 1.5667828106852497,
+      "grad_norm": 1.0636974573135376,
+      "learning_rate": 1.5191867283087384e-05,
+      "loss": 0.4477,
+      "step": 675
+    },
+    {
+      "epoch": 1.5783972125435541,
+      "grad_norm": 1.1356184482574463,
+      "learning_rate": 1.5e-05,
+      "loss": 0.4324,
+      "step": 680
+    },
+    {
+      "epoch": 1.5900116144018583,
+      "grad_norm": 1.1115050315856934,
+      "learning_rate": 1.480813271691262e-05,
+      "loss": 0.3999,
+      "step": 685
+    },
+    {
+      "epoch": 1.6016260162601625,
+      "grad_norm": 1.0832542181015015,
+      "learning_rate": 1.4616296827201453e-05,
+      "loss": 0.4461,
+      "step": 690
+    },
+    {
+      "epoch": 1.6132404181184667,
+      "grad_norm": 1.0706605911254883,
+      "learning_rate": 1.4424523719106112e-05,
+      "loss": 0.4653,
+      "step": 695
+    },
+    {
+      "epoch": 1.6248548199767712,
+      "grad_norm": 1.0427998304367065,
+      "learning_rate": 1.4232844770593881e-05,
+      "loss": 0.4318,
+      "step": 700
+    },
+    {
+      "epoch": 1.6364692218350756,
+      "grad_norm": 1.127982258796692,
+      "learning_rate": 1.4041291344225615e-05,
+      "loss": 0.4475,
+      "step": 705
+    },
+    {
+      "epoch": 1.6480836236933798,
+      "grad_norm": 1.1579025983810425,
+      "learning_rate": 1.3849894782024207e-05,
+      "loss": 0.4715,
+      "step": 710
+    },
+    {
+      "epoch": 1.659698025551684,
+      "grad_norm": 1.0416160821914673,
+      "learning_rate": 1.3658686400346386e-05,
+      "loss": 0.46,
+      "step": 715
+    },
+    {
+      "epoch": 1.6713124274099884,
+      "grad_norm": 0.9889193177223206,
+      "learning_rate": 1.3467697484758746e-05,
+      "loss": 0.4903,
+      "step": 720
+    },
+    {
+      "epoch": 1.6829268292682928,
+      "grad_norm": 0.9814087152481079,
+      "learning_rate": 1.3276959284918774e-05,
+      "loss": 0.4239,
+      "step": 725
+    },
+    {
+      "epoch": 1.694541231126597,
+      "grad_norm": 1.0258443355560303,
+      "learning_rate": 1.3086503009461775e-05,
+      "loss": 0.4665,
+      "step": 730
+    },
+    {
+      "epoch": 1.7061556329849012,
+      "grad_norm": 1.0588767528533936,
+      "learning_rate": 1.2896359820894514e-05,
+      "loss": 0.4374,
+      "step": 735
+    },
+    {
+      "epoch": 1.7177700348432055,
+      "grad_norm": 1.1612567901611328,
+      "learning_rate": 1.2706560830496401e-05,
+      "loss": 0.4354,
+      "step": 740
+    },
+    {
+      "epoch": 1.7293844367015099,
+      "grad_norm": 1.0602537393569946,
+      "learning_rate": 1.2517137093229043e-05,
+      "loss": 0.3714,
+      "step": 745
+    },
+    {
+      "epoch": 1.7409988385598143,
+      "grad_norm": 1.03526771068573,
+      "learning_rate": 1.2328119602655031e-05,
+      "loss": 0.4183,
+      "step": 750
+    },
+    {
+      "epoch": 1.7526132404181185,
+      "grad_norm": 1.3858877420425415,
+      "learning_rate": 1.2139539285866758e-05,
+      "loss": 0.4428,
+      "step": 755
+    },
+    {
+      "epoch": 1.7642276422764227,
+      "grad_norm": 1.0921659469604492,
+      "learning_rate": 1.1951426998426143e-05,
+      "loss": 0.4248,
+      "step": 760
+    },
+    {
+      "epoch": 1.775842044134727,
+      "grad_norm": 1.0866457223892212,
+      "learning_rate": 1.1763813519315994e-05,
+      "loss": 0.4604,
+      "step": 765
+    },
+    {
+      "epoch": 1.7874564459930313,
+      "grad_norm": 1.1888844966888428,
+      "learning_rate": 1.1576729545903983e-05,
+      "loss": 0.4036,
+      "step": 770
+    },
+    {
+      "epoch": 1.7990708478513358,
+      "grad_norm": 1.0188145637512207,
+      "learning_rate": 1.1390205688919908e-05,
+      "loss": 0.3986,
+      "step": 775
+    },
+    {
+      "epoch": 1.81068524970964,
+      "grad_norm": 1.1359866857528687,
+      "learning_rate": 1.1204272467447177e-05,
+      "loss": 0.428,
+      "step": 780
+    },
+    {
+      "epoch": 1.8222996515679442,
+      "grad_norm": 1.0662692785263062,
+      "learning_rate": 1.1018960303929253e-05,
+      "loss": 0.4188,
+      "step": 785
+    },
+    {
+      "epoch": 1.8339140534262486,
+      "grad_norm": 1.0792381763458252,
+      "learning_rate": 1.0834299519191928e-05,
+      "loss": 0.3942,
+      "step": 790
+    },
+    {
+      "epoch": 1.845528455284553,
+      "grad_norm": 1.0623815059661865,
+      "learning_rate": 1.0650320327482223e-05,
+      "loss": 0.3978,
+      "step": 795
+    },
+    {
+      "epoch": 1.8571428571428572,
+      "grad_norm": 1.1844643354415894,
+      "learning_rate": 1.046705283152473e-05,
+      "loss": 0.4163,
+      "step": 800
+    },
+    {
+      "epoch": 1.8687572590011614,
+      "grad_norm": 1.0871316194534302,
+      "learning_rate": 1.028452701759619e-05,
+      "loss": 0.436,
+      "step": 805
+    },
+    {
+      "epoch": 1.8803716608594656,
+      "grad_norm": 1.1240311861038208,
+      "learning_rate": 1.010277275061913e-05,
+      "loss": 0.4137,
+      "step": 810
+    },
+    {
+      "epoch": 1.89198606271777,
+      "grad_norm": 1.019579291343689,
+      "learning_rate": 9.921819769275356e-06,
+      "loss": 0.338,
+      "step": 815
+    },
+    {
+      "epoch": 1.9036004645760745,
+      "grad_norm": 1.1016135215759277,
+      "learning_rate": 9.741697681140113e-06,
+      "loss": 0.3778,
+      "step": 820
+    },
+    {
+      "epoch": 1.9152148664343787,
+      "grad_norm": 1.0139800310134888,
+      "learning_rate": 9.562435957837673e-06,
+      "loss": 0.3907,
+      "step": 825
+    },
+    {
+      "epoch": 1.9268292682926829,
+      "grad_norm": 1.1818102598190308,
+      "learning_rate": 9.384063930219178e-06,
+      "loss": 0.3602,
+      "step": 830
+    },
+    {
+      "epoch": 1.938443670150987,
+      "grad_norm": 0.9864258170127869,
+      "learning_rate": 9.20661078356353e-06,
+      "loss": 0.343,
+      "step": 835
+    },
+    {
+      "epoch": 1.9500580720092915,
+      "grad_norm": 1.038926601409912,
+      "learning_rate": 9.030105552802077e-06,
+      "loss": 0.39,
+      "step": 840
+    },
+    {
+      "epoch": 1.961672473867596,
+      "grad_norm": 0.9714798927307129,
+      "learning_rate": 8.854577117767922e-06,
+      "loss": 0.3602,
+      "step": 845
+    },
+    {
+      "epoch": 1.9732868757259001,
+      "grad_norm": 1.0614789724349976,
+      "learning_rate": 8.680054198470581e-06,
+      "loss": 0.3859,
+      "step": 850
+    },
+    {
+      "epoch": 1.9849012775842043,
+      "grad_norm": 1.1027911901474,
+      "learning_rate": 8.506565350396824e-06,
+      "loss": 0.4069,
+      "step": 855
+    },
+    {
+      "epoch": 1.9965156794425087,
+      "grad_norm": 1.1654589176177979,
+      "learning_rate": 8.334138959838404e-06,
+      "loss": 0.3802,
+      "step": 860
+    },
+    {
+      "epoch": 2.0069686411149825,
+      "grad_norm": 0.8810543417930603,
+      "learning_rate": 8.162803239247503e-06,
+      "loss": 0.3578,
+      "step": 865
+    },
+    {
+      "epoch": 2.0185830429732867,
+      "grad_norm": 1.1703990697860718,
+      "learning_rate": 7.99258622262057e-06,
+      "loss": 0.3419,
+      "step": 870
+    },
+    {
+      "epoch": 2.0301974448315914,
+      "grad_norm": 1.094256043434143,
+      "learning_rate": 7.823515760911436e-06,
+      "loss": 0.3105,
+      "step": 875
+    },
+    {
+      "epoch": 2.0418118466898956,
+      "grad_norm": 1.0097821950912476,
+      "learning_rate": 7.655619517474288e-06,
+      "loss": 0.3196,
+      "step": 880
+    },
+    {
+      "epoch": 2.0534262485481998,
+      "grad_norm": 1.013162612915039,
+      "learning_rate": 7.488924963537418e-06,
+      "loss": 0.3148,
+      "step": 885
+    },
+    {
+      "epoch": 2.065040650406504,
+      "grad_norm": 1.051941156387329,
+      "learning_rate": 7.323459373708364e-06,
+      "loss": 0.3155,
+      "step": 890
+    },
+    {
+      "epoch": 2.076655052264808,
+      "grad_norm": 1.172757863998413,
+      "learning_rate": 7.1592498215112075e-06,
+      "loss": 0.3207,
+      "step": 895
+    },
+    {
+      "epoch": 2.088269454123113,
+      "grad_norm": 1.209959626197815,
+      "learning_rate": 6.996323174956836e-06,
+      "loss": 0.3182,
+      "step": 900
+    },
+    {
+      "epoch": 2.099883855981417,
+      "grad_norm": 0.9459628462791443,
+      "learning_rate": 6.8347060921467735e-06,
+      "loss": 0.314,
+      "step": 905
+    },
+    {
+      "epoch": 2.1114982578397212,
+      "grad_norm": 0.980540931224823,
+      "learning_rate": 6.674425016911355e-06,
+      "loss": 0.2989,
+      "step": 910
+    },
+    {
+      "epoch": 2.1231126596980254,
+      "grad_norm": 1.1647413969039917,
+      "learning_rate": 6.515506174483018e-06,
+      "loss": 0.2964,
+      "step": 915
+    },
+    {
+      "epoch": 2.1347270615563296,
+      "grad_norm": 1.052589774131775,
+      "learning_rate": 6.3579755672052885e-06,
+      "loss": 0.3124,
+      "step": 920
+    },
+    {
+      "epoch": 2.1463414634146343,
+      "grad_norm": 1.0232487916946411,
+      "learning_rate": 6.201858970278294e-06,
+      "loss": 0.358,
+      "step": 925
+    },
+    {
+      "epoch": 2.1579558652729385,
+      "grad_norm": 1.2013530731201172,
+      "learning_rate": 6.047181927541417e-06,
+      "loss": 0.3343,
+      "step": 930
+    },
+    {
+      "epoch": 2.1695702671312427,
+      "grad_norm": 0.9949542880058289,
+      "learning_rate": 5.893969747293777e-06,
+      "loss": 0.3133,
+      "step": 935
+    },
+    {
+      "epoch": 2.181184668989547,
+      "grad_norm": 1.1188600063323975,
+      "learning_rate": 5.742247498153319e-06,
+      "loss": 0.2977,
+      "step": 940
+    },
+    {
+      "epoch": 2.1927990708478515,
+      "grad_norm": 1.1514487266540527,
+      "learning_rate": 5.592040004955061e-06,
+      "loss": 0.326,
+      "step": 945
+    },
+    {
+      "epoch": 2.2044134727061557,
+      "grad_norm": 1.2033545970916748,
+      "learning_rate": 5.4433718446892334e-06,
+      "loss": 0.2963,
+      "step": 950
+    },
+    {
+      "epoch": 2.21602787456446,
+      "grad_norm": 1.0538955926895142,
+      "learning_rate": 5.29626734248002e-06,
+      "loss": 0.2924,
+      "step": 955
+    },
+    {
+      "epoch": 2.227642276422764,
+      "grad_norm": 1.198915719985962,
+      "learning_rate": 5.150750567605435e-06,
+      "loss": 0.3074,
+      "step": 960
+    },
+    {
+      "epoch": 2.2392566782810683,
+      "grad_norm": 1.136529564857483,
+      "learning_rate": 5.0068453295591346e-06,
+      "loss": 0.3147,
+      "step": 965
+    },
+    {
+      "epoch": 2.250871080139373,
+      "grad_norm": 1.2167537212371826,
+      "learning_rate": 4.864575174154692e-06,
+      "loss": 0.2943,
+      "step": 970
+    },
+    {
+      "epoch": 2.262485481997677,
+      "grad_norm": 1.0729284286499023,
+      "learning_rate": 4.723963379673002e-06,
+      "loss": 0.291,
+      "step": 975
+    },
+    {
+      "epoch": 2.2740998838559814,
+      "grad_norm": 1.017930030822754,
+      "learning_rate": 4.585032953053515e-06,
+      "loss": 0.3062,
+      "step": 980
+    },
+    {
+      "epoch": 2.2857142857142856,
+      "grad_norm": 1.1780880689620972,
+      "learning_rate": 4.447806626129828e-06,
+      "loss": 0.2673,
+      "step": 985
+    },
+    {
+      "epoch": 2.2973286875725902,
+      "grad_norm": 1.2414166927337646,
+      "learning_rate": 4.31230685191027e-06,
+      "loss": 0.3006,
+      "step": 990
+    },
+    {
+      "epoch": 2.3089430894308944,
+      "grad_norm": 1.160487174987793,
+      "learning_rate": 4.178555800904174e-06,
+      "loss": 0.3004,
+      "step": 995
+    },
+    {
+      "epoch": 2.3205574912891986,
+      "grad_norm": 1.17868173122406,
+      "learning_rate": 4.0465753574942935e-06,
+      "loss": 0.2868,
+      "step": 1000
+    },
+    {
+      "epoch": 2.332171893147503,
+      "grad_norm": 1.1315648555755615,
+      "learning_rate": 3.916387116356113e-06,
+      "loss": 0.2965,
+      "step": 1005
+    },
+    {
+      "epoch": 2.343786295005807,
+      "grad_norm": 1.1295381784439087,
+      "learning_rate": 3.788012378924493e-06,
+      "loss": 0.2668,
+      "step": 1010
+    },
+    {
+      "epoch": 2.3554006968641117,
+      "grad_norm": 1.0476168394088745,
+      "learning_rate": 3.6614721499083235e-06,
+      "loss": 0.2998,
+      "step": 1015
+    },
+    {
+      "epoch": 2.367015098722416,
+      "grad_norm": 1.119430661201477,
+      "learning_rate": 3.5367871338537503e-06,
+      "loss": 0.2722,
+      "step": 1020
+    },
+    {
+      "epoch": 2.37862950058072,
+      "grad_norm": 1.212213397026062,
+      "learning_rate": 3.4139777317564763e-06,
+      "loss": 0.2914,
+      "step": 1025
+    },
+    {
+      "epoch": 2.3902439024390243,
+      "grad_norm": 1.0197581052780151,
+      "learning_rate": 3.2930640377237343e-06,
+      "loss": 0.2821,
+      "step": 1030
+    },
+    {
+      "epoch": 2.4018583042973285,
+      "grad_norm": 1.0621126890182495,
+      "learning_rate": 3.174065835686511e-06,
+      "loss": 0.3137,
+      "step": 1035
+    },
+    {
+      "epoch": 2.413472706155633,
+      "grad_norm": 1.1943347454071045,
+      "learning_rate": 3.0570025961624666e-06,
+      "loss": 0.3059,
+      "step": 1040
+    },
+    {
+      "epoch": 2.4250871080139373,
+      "grad_norm": 1.1790246963500977,
+      "learning_rate": 2.9418934730701903e-06,
+      "loss": 0.2983,
+      "step": 1045
+    },
+    {
+      "epoch": 2.4367015098722415,
+      "grad_norm": 1.0878653526306152,
+      "learning_rate": 2.8287573005952118e-06,
+      "loss": 0.2866,
+      "step": 1050
+    },
+    {
+      "epoch": 2.4483159117305457,
+      "grad_norm": 1.1005710363388062,
+      "learning_rate": 2.717612590108347e-06,
+      "loss": 0.3107,
+      "step": 1055
+    },
+    {
+      "epoch": 2.45993031358885,
+      "grad_norm": 1.1504753828048706,
+      "learning_rate": 2.6084775271368805e-06,
+      "loss": 0.2875,
+      "step": 1060
+    },
+    {
+      "epoch": 2.4715447154471546,
+      "grad_norm": 1.1118510961532593,
+      "learning_rate": 2.501369968389019e-06,
+      "loss": 0.2843,
+      "step": 1065
+    },
+    {
+      "epoch": 2.483159117305459,
+      "grad_norm": 1.0834742784500122,
+      "learning_rate": 2.396307438832195e-06,
+      "loss": 0.2993,
+      "step": 1070
+    },
+    {
+      "epoch": 2.494773519163763,
+      "grad_norm": 1.2306410074234009,
+      "learning_rate": 2.2933071288256193e-06,
+      "loss": 0.3025,
+      "step": 1075
+    },
+    {
+      "epoch": 2.506387921022067,
+      "grad_norm": 0.9859291315078735,
+      "learning_rate": 2.1923858913075735e-06,
+      "loss": 0.2913,
+      "step": 1080
+    },
+    {
+      "epoch": 2.5180023228803714,
+      "grad_norm": 1.0358084440231323,
+      "learning_rate": 2.093560239037959e-06,
+      "loss": 0.2918,
+      "step": 1085
+    },
+    {
+      "epoch": 2.529616724738676,
+      "grad_norm": 1.0371265411376953,
+      "learning_rate": 1.996846341896452e-06,
+      "loss": 0.2764,
+      "step": 1090
+    },
+    {
+      "epoch": 2.5412311265969802,
+      "grad_norm": 1.1168688535690308,
+      "learning_rate": 1.9022600242367843e-06,
+      "loss": 0.3061,
+      "step": 1095
+    },
+    {
+      "epoch": 2.5528455284552845,
+      "grad_norm": 1.000887393951416,
+      "learning_rate": 1.8098167622975693e-06,
+      "loss": 0.3254,
+      "step": 1100
+    },
+    {
+      "epoch": 2.564459930313589,
+      "grad_norm": 1.1483852863311768,
+      "learning_rate": 1.7195316816700662e-06,
+      "loss": 0.2878,
+      "step": 1105
+    },
+    {
+      "epoch": 2.5760743321718933,
+      "grad_norm": 1.1495416164398193,
+      "learning_rate": 1.6314195548233319e-06,
+      "loss": 0.284,
+      "step": 1110
+    },
+    {
+      "epoch": 2.5876887340301975,
+      "grad_norm": 1.0184448957443237,
+      "learning_rate": 1.5454947986871509e-06,
+      "loss": 0.2905,
+      "step": 1115
+    },
+    {
+      "epoch": 2.5993031358885017,
+      "grad_norm": 1.090733528137207,
+      "learning_rate": 1.4617714722931109e-06,
+      "loss": 0.2635,
+      "step": 1120
+    },
+    {
+      "epoch": 2.610917537746806,
+      "grad_norm": 0.9690793752670288,
+      "learning_rate": 1.3802632744742878e-06,
+      "loss": 0.3299,
+      "step": 1125
+    },
+    {
+      "epoch": 2.6225319396051106,
+      "grad_norm": 1.1170541048049927,
+      "learning_rate": 1.3009835416238296e-06,
+      "loss": 0.2845,
+      "step": 1130
+    },
+    {
+      "epoch": 2.6341463414634148,
+      "grad_norm": 1.1148786544799805,
+      "learning_rate": 1.2239452455128392e-06,
+      "loss": 0.3209,
+      "step": 1135
+    },
+    {
+      "epoch": 2.645760743321719,
+      "grad_norm": 1.1871299743652344,
+      "learning_rate": 1.1491609911679484e-06,
+      "loss": 0.2956,
+      "step": 1140
+    },
+    {
+      "epoch": 2.657375145180023,
+      "grad_norm": 1.153961181640625,
+      "learning_rate": 1.0766430148088686e-06,
+      "loss": 0.2666,
+      "step": 1145
+    },
+    {
+      "epoch": 2.6689895470383274,
+      "grad_norm": 1.2079453468322754,
+      "learning_rate": 1.0064031818462982e-06,
+      "loss": 0.2883,
+      "step": 1150
+    },
+    {
+      "epoch": 2.680603948896632,
+      "grad_norm": 1.0874303579330444,
+      "learning_rate": 9.384529849405077e-07,
+      "loss": 0.2431,
+      "step": 1155
+    },
+    {
+      "epoch": 2.692218350754936,
+      "grad_norm": 1.0171637535095215,
+      "learning_rate": 8.728035421209058e-07,
+      "loss": 0.2774,
+      "step": 1160
+    },
+    {
+      "epoch": 2.7038327526132404,
+      "grad_norm": 0.9877467155456543,
+      "learning_rate": 8.094655949668917e-07,
+      "loss": 0.2673,
+      "step": 1165
+    },
+    {
+      "epoch": 2.7154471544715446,
+      "grad_norm": 1.1348854303359985,
+      "learning_rate": 7.484495068503361e-07,
+      "loss": 0.2752,
+      "step": 1170
+    },
+    {
+      "epoch": 2.727061556329849,
+      "grad_norm": 1.0911859273910522,
+      "learning_rate": 6.897652612399024e-07,
+      "loss": 0.2725,
+      "step": 1175
+    },
+    {
+      "epoch": 2.7386759581881535,
+      "grad_norm": 1.1613843441009521,
+      "learning_rate": 6.334224600675687e-07,
+      "loss": 0.2817,
+      "step": 1180
+    },
+    {
+      "epoch": 2.7502903600464577,
+      "grad_norm": 1.0359234809875488,
+      "learning_rate": 5.794303221575437e-07,
+      "loss": 0.2707,
+      "step": 1185
+    },
+    {
+      "epoch": 2.761904761904762,
+      "grad_norm": 1.0068219900131226,
+      "learning_rate": 5.277976817178793e-07,
+      "loss": 0.2909,
+      "step": 1190
+    },
+    {
+      "epoch": 2.773519163763066,
+      "grad_norm": 0.9488641023635864,
+      "learning_rate": 4.785329868950278e-07,
+      "loss": 0.284,
+      "step": 1195
+    },
+    {
+      "epoch": 2.7851335656213703,
+      "grad_norm": 1.1597959995269775,
+      "learning_rate": 4.316442983915364e-07,
+      "loss": 0.2874,
+      "step": 1200
+    },
+    {
+      "epoch": 2.796747967479675,
+      "grad_norm": 1.0681912899017334,
+      "learning_rate": 3.871392881471736e-07,
+      "loss": 0.2937,
+      "step": 1205
+    },
+    {
+      "epoch": 2.808362369337979,
+      "grad_norm": 0.9509957432746887,
+      "learning_rate": 3.4502523808362497e-07,
+      "loss": 0.2643,
+      "step": 1210
+    },
+    {
+      "epoch": 2.8199767711962833,
+      "grad_norm": 1.2249250411987305,
+      "learning_rate": 3.053090389130314e-07,
+      "loss": 0.2851,
+      "step": 1215
+    },
+    {
+      "epoch": 2.831591173054588,
+      "grad_norm": 1.1051902770996094,
+      "learning_rate": 2.6799718901053596e-07,
+      "loss": 0.2843,
+      "step": 1220
+    },
+    {
+      "epoch": 2.8432055749128917,
+      "grad_norm": 1.2920292615890503,
+      "learning_rate": 2.3309579335100251e-07,
+      "loss": 0.3155,
+      "step": 1225
+    },
+    {
+      "epoch": 2.8548199767711964,
+      "grad_norm": 1.1401209831237793,
+      "learning_rate": 2.0061056251013175e-07,
+      "loss": 0.2531,
+      "step": 1230
+    },
+    {
+      "epoch": 2.8664343786295006,
+      "grad_norm": 1.1830666065216064,
+      "learning_rate": 1.7054681173009012e-07,
+      "loss": 0.2571,
+      "step": 1235
+    },
+    {
+      "epoch": 2.8780487804878048,
+      "grad_norm": 1.0032836198806763,
+      "learning_rate": 1.4290946004982375e-07,
+      "loss": 0.2523,
+      "step": 1240
+    },
+    {
+      "epoch": 2.8896631823461094,
+      "grad_norm": 0.9659789204597473,
+      "learning_rate": 1.1770302950021239e-07,
+      "loss": 0.2761,
+      "step": 1245
+    },
+    {
+      "epoch": 2.9012775842044136,
+      "grad_norm": 1.0413161516189575,
+      "learning_rate": 9.493164436417124e-08,
+      "loss": 0.3343,
+      "step": 1250
+    },
+    {
+      "epoch": 2.912891986062718,
+      "grad_norm": 0.995083212852478,
+      "learning_rate": 7.459903050182903e-08,
+      "loss": 0.2817,
+      "step": 1255
+    },
+    {
+      "epoch": 2.924506387921022,
+      "grad_norm": 1.0436125993728638,
+      "learning_rate": 5.6708514740907434e-08,
+      "loss": 0.2674,
+      "step": 1260
+    },
+    {
+      "epoch": 2.9361207897793262,
+      "grad_norm": 1.1036518812179565,
+      "learning_rate": 4.126302433238638e-08,
+      "loss": 0.3021,
+      "step": 1265
+    },
+    {
+      "epoch": 2.947735191637631,
+      "grad_norm": 1.0590194463729858,
+      "learning_rate": 2.826508647153725e-08,
+      "loss": 0.2548,
+      "step": 1270
+    },
+    {
+      "epoch": 2.959349593495935,
+      "grad_norm": 1.1428706645965576,
+      "learning_rate": 1.7716827884435295e-08,
+      "loss": 0.2704,
+      "step": 1275
+    },
+    {
+      "epoch": 2.9709639953542393,
+      "grad_norm": 1.0792584419250488,
+      "learning_rate": 9.61997447996965e-09,
+      "loss": 0.2867,
+      "step": 1280
+    },
+    {
+      "epoch": 2.9825783972125435,
+      "grad_norm": 1.0899779796600342,
+      "learning_rate": 3.975851067464231e-09,
+      "loss": 0.2644,
+      "step": 1285
+    },
+    {
+      "epoch": 2.9941927990708477,
+      "grad_norm": 1.174023151397705,
+      "learning_rate": 7.853811399027854e-10,
+      "loss": 0.2955,
+      "step": 1290
+    },
+    {
+      "epoch": 3.0,
+      "step": 1293,
+      "total_flos": 2.0359870555944387e+18,
+      "train_loss": 0.5806684772450151,
+      "train_runtime": 1005.3243,
+      "train_samples_per_second": 41.1,
+      "train_steps_per_second": 1.286
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1293,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.0359870555944387e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

25_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:698efa641c4b78c546e5f7c4fc4ef0bc2e87aae8ac996cd858aca4ba2b4e39d6
+size 8209

25_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff