Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

123_128_e3_3e-5/adapter_config.json +39 -0
123_128_e3_3e-5/adapter_model.safetensors +3 -0
123_128_e3_3e-5/added_tokens.json +9 -0
123_128_e3_3e-5/all_results.json +9 -0
123_128_e3_3e-5/chat_template.jinja +62 -0
123_128_e3_3e-5/config.json +32 -0
123_128_e3_3e-5/merges.txt +0 -0
123_128_e3_3e-5/special_tokens_map.json +33 -0
123_128_e3_3e-5/tokenizer.json +0 -0
123_128_e3_3e-5/tokenizer_config.json +234 -0
123_128_e3_3e-5/train_results.json +9 -0
123_128_e3_3e-5/trainer_state.json +1562 -0
123_128_e3_3e-5/training_args.bin +3 -0
123_128_e3_3e-5/vocab.json +0 -0

123_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "gate_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

123_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce747423e776117de18d0c7302258a4ceade889081fc441eccff175d40bfe43f
+size 791751704

123_128_e3_3e-5/added_tokens.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "<|end_of_cite|>": 49156,
+  "<|end_of_plugin|>": 49158,
+  "<|end_of_role|>": 49153,
+  "<|start_of_cite|>": 49155,
+  "<|start_of_plugin|>": 49157,
+  "<|start_of_role|>": 49152,
+  "<|tool_call|>": 49154
+}

123_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.5435133944793661e+18,
+    "train_loss": 0.5637503340224589,
+    "train_runtime": 727.9191,
+    "train_samples": 11598,
+    "train_samples_per_second": 47.799,
+    "train_steps_per_second": 1.496
+}

123_128_e3_3e-5/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,62 @@

+{# Alias tools -> available_tools #}
+{%- if tools and not available_tools -%}
+    {%- set available_tools = tools -%}
+{%- endif -%}
+{%- if messages[0]['role'] == 'system' %}
+     {%- set system_message = messages[0]['content'] %}
+     {%- set loop_messages = messages[1:] %}
+ {%- else %}
+     {%- set system_message = "Knowledge Cutoff Date: April 2024.
+Today's Date: " + strftime_now('%B %d, %Y') + ".
+You are Granite, developed by IBM." %}
+     {%- if available_tools and documents %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
+Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+     {%- elif available_tools %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request." %}
+     {%- elif documents %}
+         {%- set system_message = system_message + " Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+    {%- elif thinking %}
+    {%- set system_message = system_message + " You are a helpful AI assistant.
+Respond to every user query in a comprehensive and detailed way. You can write down your thoughts and reasoning process before responding. In the thought process, engage in a comprehensive cycle of analysis, summarization, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. In the response section, based on various attempts, explorations, and reflections from the thoughts section, systematically present the final solution that you deem correct. The response should summarize the thought process. Write your thoughts between <think></think> and write your response between <response></response> for each user query." %}
+     {%- else %}
+         {%- set system_message = system_message + " You are a helpful AI assistant." %}
+     {%- endif %}
+     {%- if 'citations' in controls and documents %}
+         {%- set system_message = system_message + '
+Use the symbols <|start_of_cite|> and <|end_of_cite|> to indicate when a fact comes from a document in the search result, e.g <|start_of_cite|> {document_id: 1}my fact <|end_of_cite|> for a fact from document 1. Afterwards, list all the citations with their corresponding documents in an ordered list.' %}
+     {%- endif %}
+     {%- if 'hallucinations' in controls and documents %}
+         {%- set system_message = system_message + '
+Finally, after the response is written, include a numbered list of sentences from the response with a corresponding risk value that are hallucinated and not based in the documents.' %}
+     {%- endif %}
+     {%- set loop_messages = messages %}
+ {%- endif %}
+ {{- '<|start_of_role|>system<|end_of_role|>' + system_message + '<|end_of_text|>
+' }}
+ {%- if available_tools %}
+     {{- '<|start_of_role|>available_tools<|end_of_role|>' }}
+     {{- available_tools | tojson(indent=4) }}
+     {{- '<|end_of_text|>
+' }}
+ {%- endif %}
+ {%- if documents %}
+     {%- for document in documents %}
+         {{- '<|start_of_role|>document {"document_id": "' + document['doc_id'] | string + '"}<|end_of_role|>
+' }}
+         {{- document['text'] }}
+         {{- '<|end_of_text|>
+' }}
+              {%- endfor %}
+ {%- endif %}
+ {%- for message in loop_messages %}
+     {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>
+' }}
+     {%- if loop.last and add_generation_prompt %}
+         {{- '<|start_of_role|>assistant' }}
+             {%- if controls %}
+                 {{- ' ' + controls | tojson()}}
+             {%- endif %}
+         {{- '<|end_of_role|>' }}
+     {%- endif %}
+ {%- endfor %}

123_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49159
+}

123_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

123_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|end_of_plugin|>",
+  "unk_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

123_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

123_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,234 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49152": {
+      "content": "<|start_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49153": {
+      "content": "<|end_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49154": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49155": {
+      "content": "<|start_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49156": {
+      "content": "<|end_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49157": {
+      "content": "<|start_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49158": {
+      "content": "<|end_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": "<|end_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|end_of_plugin|>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|end_of_text|>",
+  "vocab_size": 49152
+}

123_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.5435133944793661e+18,
+    "train_loss": 0.5637503340224589,
+    "train_runtime": 727.9191,
+    "train_samples": 11598,
+    "train_samples_per_second": 47.799,
+    "train_steps_per_second": 1.496
+}

123_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1562 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1089,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.013793103448275862,
+      "grad_norm": 3.789069414138794,
+      "learning_rate": 2.181818181818182e-06,
+      "loss": 1.5156,
+      "step": 5
+    },
+    {
+      "epoch": 0.027586206896551724,
+      "grad_norm": 2.1134283542633057,
+      "learning_rate": 4.90909090909091e-06,
+      "loss": 1.551,
+      "step": 10
+    },
+    {
+      "epoch": 0.041379310344827586,
+      "grad_norm": 0.8664379119873047,
+      "learning_rate": 7.636363636363636e-06,
+      "loss": 1.3763,
+      "step": 15
+    },
+    {
+      "epoch": 0.05517241379310345,
+      "grad_norm": 0.4911561906337738,
+      "learning_rate": 1.0363636363636364e-05,
+      "loss": 1.3496,
+      "step": 20
+    },
+    {
+      "epoch": 0.06896551724137931,
+      "grad_norm": 0.4065496623516083,
+      "learning_rate": 1.309090909090909e-05,
+      "loss": 1.3766,
+      "step": 25
+    },
+    {
+      "epoch": 0.08275862068965517,
+      "grad_norm": 0.3920210301876068,
+      "learning_rate": 1.5818181818181818e-05,
+      "loss": 1.2896,
+      "step": 30
+    },
+    {
+      "epoch": 0.09655172413793103,
+      "grad_norm": 0.393510639667511,
+      "learning_rate": 1.8545454545454545e-05,
+      "loss": 1.3081,
+      "step": 35
+    },
+    {
+      "epoch": 0.1103448275862069,
+      "grad_norm": 0.438605934381485,
+      "learning_rate": 2.1272727272727273e-05,
+      "loss": 1.2669,
+      "step": 40
+    },
+    {
+      "epoch": 0.12413793103448276,
+      "grad_norm": 0.4716624915599823,
+      "learning_rate": 2.4e-05,
+      "loss": 1.2727,
+      "step": 45
+    },
+    {
+      "epoch": 0.13793103448275862,
+      "grad_norm": 0.35739514231681824,
+      "learning_rate": 2.6727272727272728e-05,
+      "loss": 1.2108,
+      "step": 50
+    },
+    {
+      "epoch": 0.15172413793103448,
+      "grad_norm": 0.4633024334907532,
+      "learning_rate": 2.9454545454545456e-05,
+      "loss": 1.1722,
+      "step": 55
+    },
+    {
+      "epoch": 0.16551724137931034,
+      "grad_norm": 0.37168559432029724,
+      "learning_rate": 2.9998892268339834e-05,
+      "loss": 1.1924,
+      "step": 60
+    },
+    {
+      "epoch": 0.1793103448275862,
+      "grad_norm": 0.3861393630504608,
+      "learning_rate": 2.999439238887347e-05,
+      "loss": 1.1988,
+      "step": 65
+    },
+    {
+      "epoch": 0.19310344827586207,
+      "grad_norm": 0.391494482755661,
+      "learning_rate": 2.998643216603669e-05,
+      "loss": 1.192,
+      "step": 70
+    },
+    {
+      "epoch": 0.20689655172413793,
+      "grad_norm": 0.41428250074386597,
+      "learning_rate": 2.9975013436856537e-05,
+      "loss": 1.1524,
+      "step": 75
+    },
+    {
+      "epoch": 0.2206896551724138,
+      "grad_norm": 0.38968023657798767,
+      "learning_rate": 2.996013883649973e-05,
+      "loss": 1.125,
+      "step": 80
+    },
+    {
+      "epoch": 0.23448275862068965,
+      "grad_norm": 0.4461952745914459,
+      "learning_rate": 2.9941811797664532e-05,
+      "loss": 1.187,
+      "step": 85
+    },
+    {
+      "epoch": 0.2482758620689655,
+      "grad_norm": 0.4274137616157532,
+      "learning_rate": 2.992003654978857e-05,
+      "loss": 1.1226,
+      "step": 90
+    },
+    {
+      "epoch": 0.2620689655172414,
+      "grad_norm": 0.4409297704696655,
+      "learning_rate": 2.989481811807278e-05,
+      "loss": 1.1529,
+      "step": 95
+    },
+    {
+      "epoch": 0.27586206896551724,
+      "grad_norm": 0.47845369577407837,
+      "learning_rate": 2.9866162322321703e-05,
+      "loss": 1.0957,
+      "step": 100
+    },
+    {
+      "epoch": 0.2896551724137931,
+      "grad_norm": 0.42301008105278015,
+      "learning_rate": 2.9834075775600434e-05,
+      "loss": 1.064,
+      "step": 105
+    },
+    {
+      "epoch": 0.30344827586206896,
+      "grad_norm": 0.43675681948661804,
+      "learning_rate": 2.979856588270846e-05,
+      "loss": 1.1521,
+      "step": 110
+    },
+    {
+      "epoch": 0.31724137931034485,
+      "grad_norm": 0.4185957908630371,
+      "learning_rate": 2.9759640838470855e-05,
+      "loss": 1.0869,
+      "step": 115
+    },
+    {
+      "epoch": 0.3310344827586207,
+      "grad_norm": 0.44748929142951965,
+      "learning_rate": 2.9717309625847053e-05,
+      "loss": 1.0377,
+      "step": 120
+    },
+    {
+      "epoch": 0.3448275862068966,
+      "grad_norm": 0.5275693535804749,
+      "learning_rate": 2.9671582013857852e-05,
+      "loss": 1.0273,
+      "step": 125
+    },
+    {
+      "epoch": 0.3586206896551724,
+      "grad_norm": 0.4910759925842285,
+      "learning_rate": 2.9622468555330916e-05,
+      "loss": 1.0,
+      "step": 130
+    },
+    {
+      "epoch": 0.3724137931034483,
+      "grad_norm": 0.5396916270256042,
+      "learning_rate": 2.9569980584465485e-05,
+      "loss": 1.0143,
+      "step": 135
+    },
+    {
+      "epoch": 0.38620689655172413,
+      "grad_norm": 0.5080529451370239,
+      "learning_rate": 2.9514130214216667e-05,
+      "loss": 0.9708,
+      "step": 140
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5421094298362732,
+      "learning_rate": 2.9454930333500094e-05,
+      "loss": 1.0351,
+      "step": 145
+    },
+    {
+      "epoch": 0.41379310344827586,
+      "grad_norm": 0.580910861492157,
+      "learning_rate": 2.939239460421746e-05,
+      "loss": 1.0137,
+      "step": 150
+    },
+    {
+      "epoch": 0.42758620689655175,
+      "grad_norm": 0.5520555973052979,
+      "learning_rate": 2.9326537458103687e-05,
+      "loss": 1.0253,
+      "step": 155
+    },
+    {
+      "epoch": 0.4413793103448276,
+      "grad_norm": 0.6470196843147278,
+      "learning_rate": 2.9257374093396422e-05,
+      "loss": 0.9606,
+      "step": 160
+    },
+    {
+      "epoch": 0.45517241379310347,
+      "grad_norm": 0.5860880017280579,
+      "learning_rate": 2.918492047132866e-05,
+      "loss": 0.9201,
+      "step": 165
+    },
+    {
+      "epoch": 0.4689655172413793,
+      "grad_norm": 0.5931556820869446,
+      "learning_rate": 2.9109193312445277e-05,
+      "loss": 0.9308,
+      "step": 170
+    },
+    {
+      "epoch": 0.4827586206896552,
+      "grad_norm": 0.5834107995033264,
+      "learning_rate": 2.9030210092744324e-05,
+      "loss": 0.8885,
+      "step": 175
+    },
+    {
+      "epoch": 0.496551724137931,
+      "grad_norm": 0.6210553050041199,
+      "learning_rate": 2.8947989039644e-05,
+      "loss": 0.9306,
+      "step": 180
+    },
+    {
+      "epoch": 0.5103448275862069,
+      "grad_norm": 0.6053234338760376,
+      "learning_rate": 2.886254912777619e-05,
+      "loss": 0.9501,
+      "step": 185
+    },
+    {
+      "epoch": 0.5241379310344828,
+      "grad_norm": 0.615418553352356,
+      "learning_rate": 2.8773910074607604e-05,
+      "loss": 0.8837,
+      "step": 190
+    },
+    {
+      "epoch": 0.5379310344827586,
+      "grad_norm": 0.6646992564201355,
+      "learning_rate": 2.868209233588942e-05,
+      "loss": 0.9339,
+      "step": 195
+    },
+    {
+      "epoch": 0.5517241379310345,
+      "grad_norm": 0.6801667809486389,
+      "learning_rate": 2.8587117100936643e-05,
+      "loss": 0.8631,
+      "step": 200
+    },
+    {
+      "epoch": 0.5655172413793104,
+      "grad_norm": 0.7012519836425781,
+      "learning_rate": 2.848900628773808e-05,
+      "loss": 0.895,
+      "step": 205
+    },
+    {
+      "epoch": 0.5793103448275863,
+      "grad_norm": 0.6613981127738953,
+      "learning_rate": 2.838778253789822e-05,
+      "loss": 0.8927,
+      "step": 210
+    },
+    {
+      "epoch": 0.593103448275862,
+      "grad_norm": 0.713719367980957,
+      "learning_rate": 2.8283469211412095e-05,
+      "loss": 0.8145,
+      "step": 215
+    },
+    {
+      "epoch": 0.6068965517241379,
+      "grad_norm": 0.6679086089134216,
+      "learning_rate": 2.8176090381274353e-05,
+      "loss": 0.8243,
+      "step": 220
+    },
+    {
+      "epoch": 0.6206896551724138,
+      "grad_norm": 0.6847007870674133,
+      "learning_rate": 2.80656708279238e-05,
+      "loss": 0.8398,
+      "step": 225
+    },
+    {
+      "epoch": 0.6344827586206897,
+      "grad_norm": 0.7790941596031189,
+      "learning_rate": 2.7952236033524658e-05,
+      "loss": 0.8294,
+      "step": 230
+    },
+    {
+      "epoch": 0.6482758620689655,
+      "grad_norm": 0.8283791542053223,
+      "learning_rate": 2.7835812176085937e-05,
+      "loss": 0.8102,
+      "step": 235
+    },
+    {
+      "epoch": 0.6620689655172414,
+      "grad_norm": 0.7615893483161926,
+      "learning_rate": 2.7716426123420116e-05,
+      "loss": 0.8524,
+      "step": 240
+    },
+    {
+      "epoch": 0.6758620689655173,
+      "grad_norm": 0.8538187742233276,
+      "learning_rate": 2.7594105426942774e-05,
+      "loss": 0.8682,
+      "step": 245
+    },
+    {
+      "epoch": 0.6896551724137931,
+      "grad_norm": 0.8054585456848145,
+      "learning_rate": 2.7468878315314337e-05,
+      "loss": 0.8117,
+      "step": 250
+    },
+    {
+      "epoch": 0.7034482758620689,
+      "grad_norm": 0.7562645077705383,
+      "learning_rate": 2.7340773687925615e-05,
+      "loss": 0.7848,
+      "step": 255
+    },
+    {
+      "epoch": 0.7172413793103448,
+      "grad_norm": 0.8034841418266296,
+      "learning_rate": 2.72098211082285e-05,
+      "loss": 0.8063,
+      "step": 260
+    },
+    {
+      "epoch": 0.7310344827586207,
+      "grad_norm": 0.7713001370429993,
+      "learning_rate": 2.7076050796913445e-05,
+      "loss": 0.7708,
+      "step": 265
+    },
+    {
+      "epoch": 0.7448275862068966,
+      "grad_norm": 0.7349215745925903,
+      "learning_rate": 2.6939493624935273e-05,
+      "loss": 0.7648,
+      "step": 270
+    },
+    {
+      "epoch": 0.7586206896551724,
+      "grad_norm": 0.8297277688980103,
+      "learning_rate": 2.6800181106388882e-05,
+      "loss": 0.7709,
+      "step": 275
+    },
+    {
+      "epoch": 0.7724137931034483,
+      "grad_norm": 0.8245553970336914,
+      "learning_rate": 2.665814539123657e-05,
+      "loss": 0.7516,
+      "step": 280
+    },
+    {
+      "epoch": 0.7862068965517242,
+      "grad_norm": 0.8287628293037415,
+      "learning_rate": 2.6513419257888598e-05,
+      "loss": 0.7406,
+      "step": 285
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.7908574938774109,
+      "learning_rate": 2.636603610563872e-05,
+      "loss": 0.7457,
+      "step": 290
+    },
+    {
+      "epoch": 0.8137931034482758,
+      "grad_norm": 0.8348268866539001,
+      "learning_rate": 2.6216029946956425e-05,
+      "loss": 0.7558,
+      "step": 295
+    },
+    {
+      "epoch": 0.8275862068965517,
+      "grad_norm": 0.8399357199668884,
+      "learning_rate": 2.6063435399637718e-05,
+      "loss": 0.7692,
+      "step": 300
+    },
+    {
+      "epoch": 0.8413793103448276,
+      "grad_norm": 0.8294626474380493,
+      "learning_rate": 2.590828767881612e-05,
+      "loss": 0.7554,
+      "step": 305
+    },
+    {
+      "epoch": 0.8551724137931035,
+      "grad_norm": 0.8007012009620667,
+      "learning_rate": 2.5750622588835903e-05,
+      "loss": 0.7334,
+      "step": 310
+    },
+    {
+      "epoch": 0.8689655172413793,
+      "grad_norm": 0.9622930884361267,
+      "learning_rate": 2.5590476514989292e-05,
+      "loss": 0.7006,
+      "step": 315
+    },
+    {
+      "epoch": 0.8827586206896552,
+      "grad_norm": 0.8566970825195312,
+      "learning_rate": 2.5427886415119635e-05,
+      "loss": 0.7081,
+      "step": 320
+    },
+    {
+      "epoch": 0.896551724137931,
+      "grad_norm": 0.9438672661781311,
+      "learning_rate": 2.5262889811092413e-05,
+      "loss": 0.7227,
+      "step": 325
+    },
+    {
+      "epoch": 0.9103448275862069,
+      "grad_norm": 0.9426260590553284,
+      "learning_rate": 2.5095524780136094e-05,
+      "loss": 0.7151,
+      "step": 330
+    },
+    {
+      "epoch": 0.9241379310344827,
+      "grad_norm": 0.8882335424423218,
+      "learning_rate": 2.4925829946054857e-05,
+      "loss": 0.6595,
+      "step": 335
+    },
+    {
+      "epoch": 0.9379310344827586,
+      "grad_norm": 0.9620512127876282,
+      "learning_rate": 2.4753844470315136e-05,
+      "loss": 0.6835,
+      "step": 340
+    },
+    {
+      "epoch": 0.9517241379310345,
+      "grad_norm": 0.9925925135612488,
+      "learning_rate": 2.4579608043008074e-05,
+      "loss": 0.6816,
+      "step": 345
+    },
+    {
+      "epoch": 0.9655172413793104,
+      "grad_norm": 1.0093214511871338,
+      "learning_rate": 2.4403160873690064e-05,
+      "loss": 0.6588,
+      "step": 350
+    },
+    {
+      "epoch": 0.9793103448275862,
+      "grad_norm": 1.0236927270889282,
+      "learning_rate": 2.4224543682103302e-05,
+      "loss": 0.6208,
+      "step": 355
+    },
+    {
+      "epoch": 0.993103448275862,
+      "grad_norm": 0.9697800278663635,
+      "learning_rate": 2.4043797688778682e-05,
+      "loss": 0.7097,
+      "step": 360
+    },
+    {
+      "epoch": 1.0055172413793103,
+      "grad_norm": 0.9961010217666626,
+      "learning_rate": 2.3860964605523097e-05,
+      "loss": 0.6101,
+      "step": 365
+    },
+    {
+      "epoch": 1.0193103448275862,
+      "grad_norm": 0.9595643877983093,
+      "learning_rate": 2.3676086625793353e-05,
+      "loss": 0.5552,
+      "step": 370
+    },
+    {
+      "epoch": 1.033103448275862,
+      "grad_norm": 0.975598931312561,
+      "learning_rate": 2.348920641495893e-05,
+      "loss": 0.568,
+      "step": 375
+    },
+    {
+      "epoch": 1.046896551724138,
+      "grad_norm": 1.0802812576293945,
+      "learning_rate": 2.330036710045586e-05,
+      "loss": 0.5319,
+      "step": 380
+    },
+    {
+      "epoch": 1.0606896551724139,
+      "grad_norm": 0.9727413058280945,
+      "learning_rate": 2.3109612261833967e-05,
+      "loss": 0.5434,
+      "step": 385
+    },
+    {
+      "epoch": 1.0744827586206895,
+      "grad_norm": 0.9035729765892029,
+      "learning_rate": 2.291698592069972e-05,
+      "loss": 0.5524,
+      "step": 390
+    },
+    {
+      "epoch": 1.0882758620689654,
+      "grad_norm": 1.0716482400894165,
+      "learning_rate": 2.272253253055716e-05,
+      "loss": 0.5655,
+      "step": 395
+    },
+    {
+      "epoch": 1.1020689655172413,
+      "grad_norm": 0.945554256439209,
+      "learning_rate": 2.2526296966549073e-05,
+      "loss": 0.57,
+      "step": 400
+    },
+    {
+      "epoch": 1.1158620689655172,
+      "grad_norm": 1.1737520694732666,
+      "learning_rate": 2.2328324515100895e-05,
+      "loss": 0.5429,
+      "step": 405
+    },
+    {
+      "epoch": 1.129655172413793,
+      "grad_norm": 0.9815288782119751,
+      "learning_rate": 2.212866086346971e-05,
+      "loss": 0.5188,
+      "step": 410
+    },
+    {
+      "epoch": 1.143448275862069,
+      "grad_norm": 1.0767860412597656,
+      "learning_rate": 2.192735208920074e-05,
+      "loss": 0.5632,
+      "step": 415
+    },
+    {
+      "epoch": 1.1572413793103449,
+      "grad_norm": 0.951698362827301,
+      "learning_rate": 2.1724444649493733e-05,
+      "loss": 0.5117,
+      "step": 420
+    },
+    {
+      "epoch": 1.1710344827586208,
+      "grad_norm": 1.0386687517166138,
+      "learning_rate": 2.151998537048179e-05,
+      "loss": 0.5415,
+      "step": 425
+    },
+    {
+      "epoch": 1.1848275862068967,
+      "grad_norm": 1.0586196184158325,
+      "learning_rate": 2.1314021436425026e-05,
+      "loss": 0.5628,
+      "step": 430
+    },
+    {
+      "epoch": 1.1986206896551723,
+      "grad_norm": 0.9947938919067383,
+      "learning_rate": 2.1106600378821566e-05,
+      "loss": 0.5179,
+      "step": 435
+    },
+    {
+      "epoch": 1.2124137931034482,
+      "grad_norm": 1.004014492034912,
+      "learning_rate": 2.089777006543844e-05,
+      "loss": 0.517,
+      "step": 440
+    },
+    {
+      "epoch": 1.226206896551724,
+      "grad_norm": 1.0360642671585083,
+      "learning_rate": 2.0687578689264846e-05,
+      "loss": 0.5087,
+      "step": 445
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.9804148077964783,
+      "learning_rate": 2.0476074757390377e-05,
+      "loss": 0.5337,
+      "step": 450
+    },
+    {
+      "epoch": 1.2537931034482759,
+      "grad_norm": 1.0974323749542236,
+      "learning_rate": 2.0263307079810774e-05,
+      "loss": 0.5384,
+      "step": 455
+    },
+    {
+      "epoch": 1.2675862068965518,
+      "grad_norm": 0.999039888381958,
+      "learning_rate": 2.0049324758163714e-05,
+      "loss": 0.5006,
+      "step": 460
+    },
+    {
+      "epoch": 1.2813793103448277,
+      "grad_norm": 1.0984313488006592,
+      "learning_rate": 1.9834177174397403e-05,
+      "loss": 0.4868,
+      "step": 465
+    },
+    {
+      "epoch": 1.2951724137931033,
+      "grad_norm": 0.9641954898834229,
+      "learning_rate": 1.961791397937437e-05,
+      "loss": 0.5264,
+      "step": 470
+    },
+    {
+      "epoch": 1.3089655172413792,
+      "grad_norm": 1.0755330324172974,
+      "learning_rate": 1.940058508141324e-05,
+      "loss": 0.4847,
+      "step": 475
+    },
+    {
+      "epoch": 1.322758620689655,
+      "grad_norm": 1.0769026279449463,
+      "learning_rate": 1.9182240634771143e-05,
+      "loss": 0.505,
+      "step": 480
+    },
+    {
+      "epoch": 1.336551724137931,
+      "grad_norm": 1.0196845531463623,
+      "learning_rate": 1.8962931028069292e-05,
+      "loss": 0.5788,
+      "step": 485
+    },
+    {
+      "epoch": 1.3503448275862069,
+      "grad_norm": 1.0171618461608887,
+      "learning_rate": 1.8742706872664516e-05,
+      "loss": 0.4761,
+      "step": 490
+    },
+    {
+      "epoch": 1.3641379310344828,
+      "grad_norm": 1.1664354801177979,
+      "learning_rate": 1.852161899096938e-05,
+      "loss": 0.5073,
+      "step": 495
+    },
+    {
+      "epoch": 1.3779310344827587,
+      "grad_norm": 1.1605677604675293,
+      "learning_rate": 1.8299718404723604e-05,
+      "loss": 0.4569,
+      "step": 500
+    },
+    {
+      "epoch": 1.3917241379310346,
+      "grad_norm": 1.0353474617004395,
+      "learning_rate": 1.8077056323219468e-05,
+      "loss": 0.4463,
+      "step": 505
+    },
+    {
+      "epoch": 1.4055172413793104,
+      "grad_norm": 1.0919967889785767,
+      "learning_rate": 1.7853684131483973e-05,
+      "loss": 0.4703,
+      "step": 510
+    },
+    {
+      "epoch": 1.4193103448275863,
+      "grad_norm": 1.0212621688842773,
+      "learning_rate": 1.7629653378420375e-05,
+      "loss": 0.4504,
+      "step": 515
+    },
+    {
+      "epoch": 1.433103448275862,
+      "grad_norm": 1.1534277200698853,
+      "learning_rate": 1.7405015764911986e-05,
+      "loss": 0.48,
+      "step": 520
+    },
+    {
+      "epoch": 1.446896551724138,
+      "grad_norm": 1.0325161218643188,
+      "learning_rate": 1.7179823131890833e-05,
+      "loss": 0.4862,
+      "step": 525
+    },
+    {
+      "epoch": 1.4606896551724138,
+      "grad_norm": 0.9828569293022156,
+      "learning_rate": 1.6954127448374036e-05,
+      "loss": 0.4161,
+      "step": 530
+    },
+    {
+      "epoch": 1.4744827586206897,
+      "grad_norm": 1.2551393508911133,
+      "learning_rate": 1.6727980799470606e-05,
+      "loss": 0.4713,
+      "step": 535
+    },
+    {
+      "epoch": 1.4882758620689656,
+      "grad_norm": 1.0653741359710693,
+      "learning_rate": 1.6501435374361478e-05,
+      "loss": 0.4317,
+      "step": 540
+    },
+    {
+      "epoch": 1.5020689655172412,
+      "grad_norm": 1.1682952642440796,
+      "learning_rate": 1.627454345425548e-05,
+      "loss": 0.4717,
+      "step": 545
+    },
+    {
+      "epoch": 1.5158620689655171,
+      "grad_norm": 1.0231668949127197,
+      "learning_rate": 1.6047357400324123e-05,
+      "loss": 0.4609,
+      "step": 550
+    },
+    {
+      "epoch": 1.529655172413793,
+      "grad_norm": 1.1539297103881836,
+      "learning_rate": 1.5819929641617903e-05,
+      "loss": 0.4365,
+      "step": 555
+    },
+    {
+      "epoch": 1.543448275862069,
+      "grad_norm": 1.0333889722824097,
+      "learning_rate": 1.5592312662966914e-05,
+      "loss": 0.4506,
+      "step": 560
+    },
+    {
+      "epoch": 1.5572413793103448,
+      "grad_norm": 1.0939481258392334,
+      "learning_rate": 1.536455899286866e-05,
+      "loss": 0.4436,
+      "step": 565
+    },
+    {
+      "epoch": 1.5710344827586207,
+      "grad_norm": 1.1466500759124756,
+      "learning_rate": 1.5136721191365721e-05,
+      "loss": 0.4328,
+      "step": 570
+    },
+    {
+      "epoch": 1.5848275862068966,
+      "grad_norm": 1.1545034646987915,
+      "learning_rate": 1.4908851837916164e-05,
+      "loss": 0.4178,
+      "step": 575
+    },
+    {
+      "epoch": 1.5986206896551725,
+      "grad_norm": 1.4069446325302124,
+      "learning_rate": 1.4681003519259503e-05,
+      "loss": 0.4417,
+      "step": 580
+    },
+    {
+      "epoch": 1.6124137931034483,
+      "grad_norm": 1.1030892133712769,
+      "learning_rate": 1.4453228817280906e-05,
+      "loss": 0.4325,
+      "step": 585
+    },
+    {
+      "epoch": 1.6262068965517242,
+      "grad_norm": 1.0566986799240112,
+      "learning_rate": 1.4225580296876608e-05,
+      "loss": 0.4095,
+      "step": 590
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 1.0491068363189697,
+      "learning_rate": 1.3998110493823178e-05,
+      "loss": 0.4327,
+      "step": 595
+    },
+    {
+      "epoch": 1.653793103448276,
+      "grad_norm": 1.0508897304534912,
+      "learning_rate": 1.3770871902653543e-05,
+      "loss": 0.4275,
+      "step": 600
+    },
+    {
+      "epoch": 1.6675862068965517,
+      "grad_norm": 1.1155571937561035,
+      "learning_rate": 1.354391696454251e-05,
+      "loss": 0.4075,
+      "step": 605
+    },
+    {
+      "epoch": 1.6813793103448276,
+      "grad_norm": 1.0396450757980347,
+      "learning_rate": 1.3317298055204635e-05,
+      "loss": 0.4039,
+      "step": 610
+    },
+    {
+      "epoch": 1.6951724137931035,
+      "grad_norm": 1.2020153999328613,
+      "learning_rate": 1.3091067472807119e-05,
+      "loss": 0.391,
+      "step": 615
+    },
+    {
+      "epoch": 1.7089655172413794,
+      "grad_norm": 1.1432818174362183,
+      "learning_rate": 1.2865277425900725e-05,
+      "loss": 0.4122,
+      "step": 620
+    },
+    {
+      "epoch": 1.722758620689655,
+      "grad_norm": 1.072961449623108,
+      "learning_rate": 1.2639980021371252e-05,
+      "loss": 0.3879,
+      "step": 625
+    },
+    {
+      "epoch": 1.736551724137931,
+      "grad_norm": 1.0933804512023926,
+      "learning_rate": 1.2415227252414554e-05,
+      "loss": 0.3905,
+      "step": 630
+    },
+    {
+      "epoch": 1.7503448275862068,
+      "grad_norm": 1.3119415044784546,
+      "learning_rate": 1.219107098653775e-05,
+      "loss": 0.4034,
+      "step": 635
+    },
+    {
+      "epoch": 1.7641379310344827,
+      "grad_norm": 1.0820121765136719,
+      "learning_rate": 1.196756295358948e-05,
+      "loss": 0.421,
+      "step": 640
+    },
+    {
+      "epoch": 1.7779310344827586,
+      "grad_norm": 1.1389496326446533,
+      "learning_rate": 1.174475473382186e-05,
+      "loss": 0.4228,
+      "step": 645
+    },
+    {
+      "epoch": 1.7917241379310345,
+      "grad_norm": 0.9890799522399902,
+      "learning_rate": 1.1522697745987076e-05,
+      "loss": 0.4245,
+      "step": 650
+    },
+    {
+      "epoch": 1.8055172413793104,
+      "grad_norm": 1.186639428138733,
+      "learning_rate": 1.1301443235471116e-05,
+      "loss": 0.3934,
+      "step": 655
+    },
+    {
+      "epoch": 1.8193103448275862,
+      "grad_norm": 1.17351233959198,
+      "learning_rate": 1.1081042262467602e-05,
+      "loss": 0.3634,
+      "step": 660
+    },
+    {
+      "epoch": 1.8331034482758621,
+      "grad_norm": 1.2075047492980957,
+      "learning_rate": 1.0861545690194335e-05,
+      "loss": 0.3863,
+      "step": 665
+    },
+    {
+      "epoch": 1.846896551724138,
+      "grad_norm": 1.1475646495819092,
+      "learning_rate": 1.0643004173155261e-05,
+      "loss": 0.4353,
+      "step": 670
+    },
+    {
+      "epoch": 1.860689655172414,
+      "grad_norm": 1.2288826704025269,
+      "learning_rate": 1.04254681454507e-05,
+      "loss": 0.3821,
+      "step": 675
+    },
+    {
+      "epoch": 1.8744827586206898,
+      "grad_norm": 1.0479319095611572,
+      "learning_rate": 1.0208987809138299e-05,
+      "loss": 0.3929,
+      "step": 680
+    },
+    {
+      "epoch": 1.8882758620689655,
+      "grad_norm": 1.2405054569244385,
+      "learning_rate": 9.993613122647663e-06,
+      "loss": 0.3905,
+      "step": 685
+    },
+    {
+      "epoch": 1.9020689655172414,
+      "grad_norm": 1.0721787214279175,
+      "learning_rate": 9.779393789251134e-06,
+      "loss": 0.4025,
+      "step": 690
+    },
+    {
+      "epoch": 1.9158620689655173,
+      "grad_norm": 1.1694467067718506,
+      "learning_rate": 9.566379245593498e-06,
+      "loss": 0.3968,
+      "step": 695
+    },
+    {
+      "epoch": 1.9296551724137931,
+      "grad_norm": 1.201874852180481,
+      "learning_rate": 9.35461865028316e-06,
+      "loss": 0.3792,
+      "step": 700
+    },
+    {
+      "epoch": 1.9434482758620688,
+      "grad_norm": 1.0933696031570435,
+      "learning_rate": 9.144160872547579e-06,
+      "loss": 0.3552,
+      "step": 705
+    },
+    {
+      "epoch": 1.9572413793103447,
+      "grad_norm": 1.077457070350647,
+      "learning_rate": 8.93505448095539e-06,
+      "loss": 0.3673,
+      "step": 710
+    },
+    {
+      "epoch": 1.9710344827586206,
+      "grad_norm": 1.1806907653808594,
+      "learning_rate": 8.727347732207974e-06,
+      "loss": 0.336,
+      "step": 715
+    },
+    {
+      "epoch": 1.9848275862068965,
+      "grad_norm": 1.081370234489441,
+      "learning_rate": 8.521088560002963e-06,
+      "loss": 0.3658,
+      "step": 720
+    },
+    {
+      "epoch": 1.9986206896551724,
+      "grad_norm": 1.1178879737854004,
+      "learning_rate": 8.316324563972315e-06,
+      "loss": 0.3683,
+      "step": 725
+    },
+    {
+      "epoch": 2.0110344827586206,
+      "grad_norm": 1.1146636009216309,
+      "learning_rate": 8.113102998697463e-06,
+      "loss": 0.2828,
+      "step": 730
+    },
+    {
+      "epoch": 2.0248275862068965,
+      "grad_norm": 1.2588555812835693,
+      "learning_rate": 7.911470762804104e-06,
+      "loss": 0.2985,
+      "step": 735
+    },
+    {
+      "epoch": 2.0386206896551724,
+      "grad_norm": 1.2606914043426514,
+      "learning_rate": 7.711474388139112e-06,
+      "loss": 0.3074,
+      "step": 740
+    },
+    {
+      "epoch": 2.0524137931034483,
+      "grad_norm": 1.11820387840271,
+      "learning_rate": 7.513160029032141e-06,
+      "loss": 0.294,
+      "step": 745
+    },
+    {
+      "epoch": 2.066206896551724,
+      "grad_norm": 1.0435316562652588,
+      "learning_rate": 7.316573451644302e-06,
+      "loss": 0.2875,
+      "step": 750
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 1.3059062957763672,
+      "learning_rate": 7.1217600234064315e-06,
+      "loss": 0.2812,
+      "step": 755
+    },
+    {
+      "epoch": 2.093793103448276,
+      "grad_norm": 1.2035919427871704,
+      "learning_rate": 6.928764702549411e-06,
+      "loss": 0.2788,
+      "step": 760
+    },
+    {
+      "epoch": 2.107586206896552,
+      "grad_norm": 1.2044228315353394,
+      "learning_rate": 6.737632027728874e-06,
+      "loss": 0.2729,
+      "step": 765
+    },
+    {
+      "epoch": 2.1213793103448277,
+      "grad_norm": 1.1182624101638794,
+      "learning_rate": 6.5484061077467714e-06,
+      "loss": 0.3289,
+      "step": 770
+    },
+    {
+      "epoch": 2.1351724137931036,
+      "grad_norm": 1.1793614625930786,
+      "learning_rate": 6.361130611372102e-06,
+      "loss": 0.3182,
+      "step": 775
+    },
+    {
+      "epoch": 2.148965517241379,
+      "grad_norm": 1.0818067789077759,
+      "learning_rate": 6.175848757263268e-06,
+      "loss": 0.2638,
+      "step": 780
+    },
+    {
+      "epoch": 2.162758620689655,
+      "grad_norm": 1.1334140300750732,
+      "learning_rate": 5.992603303994189e-06,
+      "loss": 0.2618,
+      "step": 785
+    },
+    {
+      "epoch": 2.176551724137931,
+      "grad_norm": 1.2197356224060059,
+      "learning_rate": 5.811436540186702e-06,
+      "loss": 0.2847,
+      "step": 790
+    },
+    {
+      "epoch": 2.1903448275862067,
+      "grad_norm": 1.2150832414627075,
+      "learning_rate": 5.632390274751355e-06,
+      "loss": 0.3038,
+      "step": 795
+    },
+    {
+      "epoch": 2.2041379310344826,
+      "grad_norm": 1.0440075397491455,
+      "learning_rate": 5.455505827238925e-06,
+      "loss": 0.2883,
+      "step": 800
+    },
+    {
+      "epoch": 2.2179310344827585,
+      "grad_norm": 1.1683392524719238,
+      "learning_rate": 5.280824018304839e-06,
+      "loss": 0.3094,
+      "step": 805
+    },
+    {
+      "epoch": 2.2317241379310344,
+      "grad_norm": 1.0837711095809937,
+      "learning_rate": 5.108385160288808e-06,
+      "loss": 0.2645,
+      "step": 810
+    },
+    {
+      "epoch": 2.2455172413793103,
+      "grad_norm": 1.3255565166473389,
+      "learning_rate": 4.938229047911652e-06,
+      "loss": 0.2691,
+      "step": 815
+    },
+    {
+      "epoch": 2.259310344827586,
+      "grad_norm": 1.1892286539077759,
+      "learning_rate": 4.770394949091679e-06,
+      "loss": 0.3056,
+      "step": 820
+    },
+    {
+      "epoch": 2.273103448275862,
+      "grad_norm": 1.0651421546936035,
+      "learning_rate": 4.604921595882591e-06,
+      "loss": 0.2741,
+      "step": 825
+    },
+    {
+      "epoch": 2.286896551724138,
+      "grad_norm": 1.3403918743133545,
+      "learning_rate": 4.441847175535054e-06,
+      "loss": 0.329,
+      "step": 830
+    },
+    {
+      "epoch": 2.300689655172414,
+      "grad_norm": 1.1145679950714111,
+      "learning_rate": 4.281209321684011e-06,
+      "loss": 0.2802,
+      "step": 835
+    },
+    {
+      "epoch": 2.3144827586206898,
+      "grad_norm": 1.1731306314468384,
+      "learning_rate": 4.123045105663743e-06,
+      "loss": 0.2901,
+      "step": 840
+    },
+    {
+      "epoch": 2.3282758620689656,
+      "grad_norm": 1.0816731452941895,
+      "learning_rate": 3.967391027952709e-06,
+      "loss": 0.2506,
+      "step": 845
+    },
+    {
+      "epoch": 2.3420689655172415,
+      "grad_norm": 1.2338043451309204,
+      "learning_rate": 3.8142830097500982e-06,
+      "loss": 0.2822,
+      "step": 850
+    },
+    {
+      "epoch": 2.3558620689655174,
+      "grad_norm": 1.0032469034194946,
+      "learning_rate": 3.6637563846861278e-06,
+      "loss": 0.2605,
+      "step": 855
+    },
+    {
+      "epoch": 2.3696551724137933,
+      "grad_norm": 1.2435439825057983,
+      "learning_rate": 3.515845890667835e-06,
+      "loss": 0.2887,
+      "step": 860
+    },
+    {
+      "epoch": 2.3834482758620688,
+      "grad_norm": 1.0096521377563477,
+      "learning_rate": 3.370585661862462e-06,
+      "loss": 0.2821,
+      "step": 865
+    },
+    {
+      "epoch": 2.3972413793103446,
+      "grad_norm": 1.1986134052276611,
+      "learning_rate": 3.2280092208200853e-06,
+      "loss": 0.2634,
+      "step": 870
+    },
+    {
+      "epoch": 2.4110344827586205,
+      "grad_norm": 1.4550834894180298,
+      "learning_rate": 3.0881494707374318e-06,
+      "loss": 0.2717,
+      "step": 875
+    },
+    {
+      "epoch": 2.4248275862068964,
+      "grad_norm": 1.0646744966506958,
+      "learning_rate": 2.951038687864607e-06,
+      "loss": 0.2631,
+      "step": 880
+    },
+    {
+      "epoch": 2.4386206896551723,
+      "grad_norm": 1.1810719966888428,
+      "learning_rate": 2.8167085140565257e-06,
+      "loss": 0.2866,
+      "step": 885
+    },
+    {
+      "epoch": 2.452413793103448,
+      "grad_norm": 1.0311094522476196,
+      "learning_rate": 2.68518994947074e-06,
+      "loss": 0.2762,
+      "step": 890
+    },
+    {
+      "epoch": 2.466206896551724,
+      "grad_norm": 1.103126049041748,
+      "learning_rate": 2.556513345413341e-06,
+      "loss": 0.2953,
+      "step": 895
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 1.0111137628555298,
+      "learning_rate": 2.4307083973346146e-06,
+      "loss": 0.2627,
+      "step": 900
+    },
+    {
+      "epoch": 2.493793103448276,
+      "grad_norm": 1.2547590732574463,
+      "learning_rate": 2.3078041379760562e-06,
+      "loss": 0.2782,
+      "step": 905
+    },
+    {
+      "epoch": 2.5075862068965518,
+      "grad_norm": 1.2016406059265137,
+      "learning_rate": 2.187828930670299e-06,
+      "loss": 0.272,
+      "step": 910
+    },
+    {
+      "epoch": 2.5213793103448277,
+      "grad_norm": 1.184328317642212,
+      "learning_rate": 2.070810462795562e-06,
+      "loss": 0.2596,
+      "step": 915
+    },
+    {
+      "epoch": 2.5351724137931035,
+      "grad_norm": 1.1365069150924683,
+      "learning_rate": 1.9567757393860736e-06,
+      "loss": 0.2611,
+      "step": 920
+    },
+    {
+      "epoch": 2.5489655172413794,
+      "grad_norm": 1.1061021089553833,
+      "learning_rate": 1.8457510768999276e-06,
+      "loss": 0.2733,
+      "step": 925
+    },
+    {
+      "epoch": 2.5627586206896553,
+      "grad_norm": 1.1069626808166504,
+      "learning_rate": 1.737762097145925e-06,
+      "loss": 0.2682,
+      "step": 930
+    },
+    {
+      "epoch": 2.576551724137931,
+      "grad_norm": 1.1777106523513794,
+      "learning_rate": 1.6328337213706545e-06,
+      "loss": 0.238,
+      "step": 935
+    },
+    {
+      "epoch": 2.5903448275862067,
+      "grad_norm": 1.2002151012420654,
+      "learning_rate": 1.5309901645072776e-06,
+      "loss": 0.2872,
+      "step": 940
+    },
+    {
+      "epoch": 2.604137931034483,
+      "grad_norm": 1.1341054439544678,
+      "learning_rate": 1.4322549295873006e-06,
+      "loss": 0.276,
+      "step": 945
+    },
+    {
+      "epoch": 2.6179310344827584,
+      "grad_norm": 1.1705831289291382,
+      "learning_rate": 1.3366508023166619e-06,
+      "loss": 0.2582,
+      "step": 950
+    },
+    {
+      "epoch": 2.6317241379310343,
+      "grad_norm": 1.139310359954834,
+      "learning_rate": 1.2441998458173142e-06,
+      "loss": 0.2603,
+      "step": 955
+    },
+    {
+      "epoch": 2.64551724137931,
+      "grad_norm": 1.1581835746765137,
+      "learning_rate": 1.1549233955356143e-06,
+      "loss": 0.2684,
+      "step": 960
+    },
+    {
+      "epoch": 2.659310344827586,
+      "grad_norm": 1.1978882551193237,
+      "learning_rate": 1.0688420543186033e-06,
+      "loss": 0.225,
+      "step": 965
+    },
+    {
+      "epoch": 2.673103448275862,
+      "grad_norm": 1.1953506469726562,
+      "learning_rate": 9.859756876593723e-07,
+      "loss": 0.3167,
+      "step": 970
+    },
+    {
+      "epoch": 2.686896551724138,
+      "grad_norm": 1.2417023181915283,
+      "learning_rate": 9.063434191125808e-07,
+      "loss": 0.2754,
+      "step": 975
+    },
+    {
+      "epoch": 2.7006896551724138,
+      "grad_norm": 1.1282715797424316,
+      "learning_rate": 8.299636258812199e-07,
+      "loss": 0.2584,
+      "step": 980
+    },
+    {
+      "epoch": 2.7144827586206897,
+      "grad_norm": 1.0809553861618042,
+      "learning_rate": 7.568539345755692e-07,
+      "loss": 0.2556,
+      "step": 985
+    },
+    {
+      "epoch": 2.7282758620689656,
+      "grad_norm": 1.2238688468933105,
+      "learning_rate": 6.870312171454296e-07,
+      "loss": 0.261,
+      "step": 990
+    },
+    {
+      "epoch": 2.7420689655172414,
+      "grad_norm": 1.2479712963104248,
+      "learning_rate": 6.205115869864686e-07,
+      "loss": 0.2679,
+      "step": 995
+    },
+    {
+      "epoch": 2.7558620689655173,
+      "grad_norm": 1.1291388273239136,
+      "learning_rate": 5.573103952216457e-07,
+      "loss": 0.2724,
+      "step": 1000
+    },
+    {
+      "epoch": 2.769655172413793,
+      "grad_norm": 1.117020845413208,
+      "learning_rate": 4.974422271585327e-07,
+      "loss": 0.2859,
+      "step": 1005
+    },
+    {
+      "epoch": 2.783448275862069,
+      "grad_norm": 1.1507805585861206,
+      "learning_rate": 4.4092089892339427e-07,
+      "loss": 0.2696,
+      "step": 1010
+    },
+    {
+      "epoch": 2.7972413793103446,
+      "grad_norm": 1.1751760244369507,
+      "learning_rate": 3.877594542727503e-07,
+      "loss": 0.2369,
+      "step": 1015
+    },
+    {
+      "epoch": 2.811034482758621,
+      "grad_norm": 1.1543550491333008,
+      "learning_rate": 3.379701615831837e-07,
+      "loss": 0.2949,
+      "step": 1020
+    },
+    {
+      "epoch": 2.8248275862068963,
+      "grad_norm": 1.1876450777053833,
+      "learning_rate": 2.9156451102011704e-07,
+      "loss": 0.2649,
+      "step": 1025
+    },
+    {
+      "epoch": 2.8386206896551727,
+      "grad_norm": 1.0123815536499023,
+      "learning_rate": 2.4855321188614e-07,
+      "loss": 0.2511,
+      "step": 1030
+    },
+    {
+      "epoch": 2.852413793103448,
+      "grad_norm": 1.197932481765747,
+      "learning_rate": 2.089461901495715e-07,
+      "loss": 0.2389,
+      "step": 1035
+    },
+    {
+      "epoch": 2.866206896551724,
+      "grad_norm": 1.1045258045196533,
+      "learning_rate": 1.7275258615378375e-07,
+      "loss": 0.2871,
+      "step": 1040
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 1.2319388389587402,
+      "learning_rate": 1.399807525078345e-07,
+      "loss": 0.258,
+      "step": 1045
+    },
+    {
+      "epoch": 2.893793103448276,
+      "grad_norm": 1.1577198505401611,
+      "learning_rate": 1.1063825215887558e-07,
+      "loss": 0.2656,
+      "step": 1050
+    },
+    {
+      "epoch": 2.9075862068965517,
+      "grad_norm": 1.090755581855774,
+      "learning_rate": 8.473185664682415e-08,
+      "loss": 0.2644,
+      "step": 1055
+    },
+    {
+      "epoch": 2.9213793103448276,
+      "grad_norm": 1.038208246231079,
+      "learning_rate": 6.226754454164263e-08,
+      "loss": 0.2612,
+      "step": 1060
+    },
+    {
+      "epoch": 2.9351724137931035,
+      "grad_norm": 0.9841290712356567,
+      "learning_rate": 4.325050006363906e-08,
+      "loss": 0.2677,
+      "step": 1065
+    },
+    {
+      "epoch": 2.9489655172413793,
+      "grad_norm": 1.1243023872375488,
+      "learning_rate": 2.7685111887059134e-08,
+      "loss": 0.2704,
+      "step": 1070
+    },
+    {
+      "epoch": 2.9627586206896552,
+      "grad_norm": 1.0708945989608765,
+      "learning_rate": 1.5574972127302967e-08,
+      "loss": 0.2381,
+      "step": 1075
+    },
+    {
+      "epoch": 2.976551724137931,
+      "grad_norm": 1.0898057222366333,
+      "learning_rate": 6.922875511943261e-09,
+      "loss": 0.2489,
+      "step": 1080
+    },
+    {
+      "epoch": 2.990344827586207,
+      "grad_norm": 1.0777441263198853,
+      "learning_rate": 1.7308187357695237e-09,
+      "loss": 0.2486,
+      "step": 1085
+    },
+    {
+      "epoch": 3.0,
+      "step": 1089,
+      "total_flos": 1.5435133944793661e+18,
+      "train_loss": 0.5637503340224589,
+      "train_runtime": 727.9191,
+      "train_samples_per_second": 47.799,
+      "train_steps_per_second": 1.496
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1089,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.5435133944793661e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

123_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7968c31f2a463f159778c55978c05fcde1e70522277c869dcd01a4c0d77a1da2
+size 8273

123_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff