Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

138_128_e3_3e-5/adapter_config.json +39 -0
138_128_e3_3e-5/adapter_model.safetensors +3 -0
138_128_e3_3e-5/added_tokens.json +9 -0
138_128_e3_3e-5/all_results.json +9 -0
138_128_e3_3e-5/chat_template.jinja +62 -0
138_128_e3_3e-5/config.json +32 -0
138_128_e3_3e-5/merges.txt +0 -0
138_128_e3_3e-5/special_tokens_map.json +33 -0
138_128_e3_3e-5/tokenizer.json +0 -0
138_128_e3_3e-5/tokenizer_config.json +234 -0
138_128_e3_3e-5/train_results.json +9 -0
138_128_e3_3e-5/trainer_state.json +1730 -0
138_128_e3_3e-5/training_args.bin +3 -0
138_128_e3_3e-5/vocab.json +0 -0

138_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "up_proj",
+    "q_proj",
+    "gate_proj",
+    "v_proj",
+    "down_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

138_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e17bbf66c9bb06fee64eed8ac63b994041b5024a8261e419cfa55d9048e8cb00
+size 791751704

138_128_e3_3e-5/added_tokens.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "<|end_of_cite|>": 49156,
+  "<|end_of_plugin|>": 49158,
+  "<|end_of_role|>": 49153,
+  "<|start_of_cite|>": 49155,
+  "<|start_of_plugin|>": 49157,
+  "<|start_of_role|>": 49152,
+  "<|tool_call|>": 49154
+}

138_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.5419277279871631e+18,
+    "train_loss": 0.36778768146438384,
+    "train_runtime": 743.0643,
+    "train_samples": 12843,
+    "train_samples_per_second": 51.852,
+    "train_steps_per_second": 1.623
+}

138_128_e3_3e-5/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,62 @@

+{# Alias tools -> available_tools #}
+{%- if tools and not available_tools -%}
+    {%- set available_tools = tools -%}
+{%- endif -%}
+{%- if messages[0]['role'] == 'system' %}
+     {%- set system_message = messages[0]['content'] %}
+     {%- set loop_messages = messages[1:] %}
+ {%- else %}
+     {%- set system_message = "Knowledge Cutoff Date: April 2024.
+Today's Date: " + strftime_now('%B %d, %Y') + ".
+You are Granite, developed by IBM." %}
+     {%- if available_tools and documents %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
+Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+     {%- elif available_tools %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request." %}
+     {%- elif documents %}
+         {%- set system_message = system_message + " Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+    {%- elif thinking %}
+    {%- set system_message = system_message + " You are a helpful AI assistant.
+Respond to every user query in a comprehensive and detailed way. You can write down your thoughts and reasoning process before responding. In the thought process, engage in a comprehensive cycle of analysis, summarization, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. In the response section, based on various attempts, explorations, and reflections from the thoughts section, systematically present the final solution that you deem correct. The response should summarize the thought process. Write your thoughts between <think></think> and write your response between <response></response> for each user query." %}
+     {%- else %}
+         {%- set system_message = system_message + " You are a helpful AI assistant." %}
+     {%- endif %}
+     {%- if 'citations' in controls and documents %}
+         {%- set system_message = system_message + '
+Use the symbols <|start_of_cite|> and <|end_of_cite|> to indicate when a fact comes from a document in the search result, e.g <|start_of_cite|> {document_id: 1}my fact <|end_of_cite|> for a fact from document 1. Afterwards, list all the citations with their corresponding documents in an ordered list.' %}
+     {%- endif %}
+     {%- if 'hallucinations' in controls and documents %}
+         {%- set system_message = system_message + '
+Finally, after the response is written, include a numbered list of sentences from the response with a corresponding risk value that are hallucinated and not based in the documents.' %}
+     {%- endif %}
+     {%- set loop_messages = messages %}
+ {%- endif %}
+ {{- '<|start_of_role|>system<|end_of_role|>' + system_message + '<|end_of_text|>
+' }}
+ {%- if available_tools %}
+     {{- '<|start_of_role|>available_tools<|end_of_role|>' }}
+     {{- available_tools | tojson(indent=4) }}
+     {{- '<|end_of_text|>
+' }}
+ {%- endif %}
+ {%- if documents %}
+     {%- for document in documents %}
+         {{- '<|start_of_role|>document {"document_id": "' + document['doc_id'] | string + '"}<|end_of_role|>
+' }}
+         {{- document['text'] }}
+         {{- '<|end_of_text|>
+' }}
+              {%- endfor %}
+ {%- endif %}
+ {%- for message in loop_messages %}
+     {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>
+' }}
+     {%- if loop.last and add_generation_prompt %}
+         {{- '<|start_of_role|>assistant' }}
+             {%- if controls %}
+                 {{- ' ' + controls | tojson()}}
+             {%- endif %}
+         {{- '<|end_of_role|>' }}
+     {%- endif %}
+ {%- endfor %}

138_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49159
+}

138_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

138_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|end_of_plugin|>",
+  "unk_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

138_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

138_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,234 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49152": {
+      "content": "<|start_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49153": {
+      "content": "<|end_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49154": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49155": {
+      "content": "<|start_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49156": {
+      "content": "<|end_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49157": {
+      "content": "<|start_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49158": {
+      "content": "<|end_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": "<|end_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|end_of_plugin|>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|end_of_text|>",
+  "vocab_size": 49152
+}

138_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.5419277279871631e+18,
+    "train_loss": 0.36778768146438384,
+    "train_runtime": 743.0643,
+    "train_samples": 12843,
+    "train_samples_per_second": 51.852,
+    "train_steps_per_second": 1.623
+}

138_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1730 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1206,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.012453300124533,
+      "grad_norm": 2.323019027709961,
+      "learning_rate": 1.9672131147540985e-06,
+      "loss": 1.5186,
+      "step": 5
+    },
+    {
+      "epoch": 0.024906600249066,
+      "grad_norm": 1.8451788425445557,
+      "learning_rate": 4.426229508196722e-06,
+      "loss": 1.546,
+      "step": 10
+    },
+    {
+      "epoch": 0.037359900373599,
+      "grad_norm": 0.8245519995689392,
+      "learning_rate": 6.885245901639345e-06,
+      "loss": 1.4463,
+      "step": 15
+    },
+    {
+      "epoch": 0.049813200498132,
+      "grad_norm": 0.5792524814605713,
+      "learning_rate": 9.344262295081968e-06,
+      "loss": 1.4437,
+      "step": 20
+    },
+    {
+      "epoch": 0.062266500622665005,
+      "grad_norm": 0.5813663601875305,
+      "learning_rate": 1.180327868852459e-05,
+      "loss": 1.4231,
+      "step": 25
+    },
+    {
+      "epoch": 0.074719800747198,
+      "grad_norm": 0.46379655599594116,
+      "learning_rate": 1.4262295081967213e-05,
+      "loss": 1.3455,
+      "step": 30
+    },
+    {
+      "epoch": 0.08717310087173101,
+      "grad_norm": 0.5653128027915955,
+      "learning_rate": 1.6721311475409834e-05,
+      "loss": 1.3671,
+      "step": 35
+    },
+    {
+      "epoch": 0.099626400996264,
+      "grad_norm": 0.38007092475891113,
+      "learning_rate": 1.9180327868852462e-05,
+      "loss": 1.3294,
+      "step": 40
+    },
+    {
+      "epoch": 0.11207970112079702,
+      "grad_norm": 0.4134085774421692,
+      "learning_rate": 2.1639344262295084e-05,
+      "loss": 1.3512,
+      "step": 45
+    },
+    {
+      "epoch": 0.12453300124533001,
+      "grad_norm": 0.4001520574092865,
+      "learning_rate": 2.4098360655737705e-05,
+      "loss": 1.3662,
+      "step": 50
+    },
+    {
+      "epoch": 0.136986301369863,
+      "grad_norm": 0.3832428753376007,
+      "learning_rate": 2.6557377049180327e-05,
+      "loss": 1.2305,
+      "step": 55
+    },
+    {
+      "epoch": 0.149439601494396,
+      "grad_norm": 0.4886322319507599,
+      "learning_rate": 2.901639344262295e-05,
+      "loss": 1.3002,
+      "step": 60
+    },
+    {
+      "epoch": 0.16189290161892902,
+      "grad_norm": 0.3790438771247864,
+      "learning_rate": 2.9999491852149543e-05,
+      "loss": 1.2409,
+      "step": 65
+    },
+    {
+      "epoch": 0.17434620174346202,
+      "grad_norm": 0.4461272060871124,
+      "learning_rate": 2.999638662885322e-05,
+      "loss": 1.2484,
+      "step": 70
+    },
+    {
+      "epoch": 0.18679950186799502,
+      "grad_norm": 0.42311185598373413,
+      "learning_rate": 2.9990459070319718e-05,
+      "loss": 1.1704,
+      "step": 75
+    },
+    {
+      "epoch": 0.199252801992528,
+      "grad_norm": 0.49046602845191956,
+      "learning_rate": 2.9981710292121587e-05,
+      "loss": 1.2178,
+      "step": 80
+    },
+    {
+      "epoch": 0.21170610211706103,
+      "grad_norm": 0.4725804626941681,
+      "learning_rate": 2.9970141940787794e-05,
+      "loss": 1.1202,
+      "step": 85
+    },
+    {
+      "epoch": 0.22415940224159403,
+      "grad_norm": 0.5257315635681152,
+      "learning_rate": 2.9955756193493843e-05,
+      "loss": 1.1791,
+      "step": 90
+    },
+    {
+      "epoch": 0.23661270236612703,
+      "grad_norm": 0.5636014342308044,
+      "learning_rate": 2.9938555757652027e-05,
+      "loss": 1.1442,
+      "step": 95
+    },
+    {
+      "epoch": 0.24906600249066002,
+      "grad_norm": 0.541287899017334,
+      "learning_rate": 2.991854387040189e-05,
+      "loss": 1.1136,
+      "step": 100
+    },
+    {
+      "epoch": 0.261519302615193,
+      "grad_norm": 0.6065759658813477,
+      "learning_rate": 2.9895724298000995e-05,
+      "loss": 1.0232,
+      "step": 105
+    },
+    {
+      "epoch": 0.273972602739726,
+      "grad_norm": 0.596156120300293,
+      "learning_rate": 2.9870101335116107e-05,
+      "loss": 1.0837,
+      "step": 110
+    },
+    {
+      "epoch": 0.286425902864259,
+      "grad_norm": 0.6259438991546631,
+      "learning_rate": 2.9841679804014938e-05,
+      "loss": 1.0564,
+      "step": 115
+    },
+    {
+      "epoch": 0.298879202988792,
+      "grad_norm": 0.6514918804168701,
+      "learning_rate": 2.981046505365859e-05,
+      "loss": 1.0463,
+      "step": 120
+    },
+    {
+      "epoch": 0.31133250311332505,
+      "grad_norm": 0.6386065483093262,
+      "learning_rate": 2.9776462958694873e-05,
+      "loss": 0.9898,
+      "step": 125
+    },
+    {
+      "epoch": 0.32378580323785805,
+      "grad_norm": 0.6327990293502808,
+      "learning_rate": 2.9739679918352686e-05,
+      "loss": 0.9676,
+      "step": 130
+    },
+    {
+      "epoch": 0.33623910336239105,
+      "grad_norm": 0.7084152698516846,
+      "learning_rate": 2.9700122855237685e-05,
+      "loss": 0.9841,
+      "step": 135
+    },
+    {
+      "epoch": 0.34869240348692404,
+      "grad_norm": 0.720888614654541,
+      "learning_rate": 2.965779921402944e-05,
+      "loss": 0.9515,
+      "step": 140
+    },
+    {
+      "epoch": 0.36114570361145704,
+      "grad_norm": 0.7035543918609619,
+      "learning_rate": 2.961271696008033e-05,
+      "loss": 0.8851,
+      "step": 145
+    },
+    {
+      "epoch": 0.37359900373599003,
+      "grad_norm": 0.8451811671257019,
+      "learning_rate": 2.9564884577916463e-05,
+      "loss": 0.9972,
+      "step": 150
+    },
+    {
+      "epoch": 0.386052303860523,
+      "grad_norm": 0.8046403527259827,
+      "learning_rate": 2.951431106964088e-05,
+      "loss": 0.8648,
+      "step": 155
+    },
+    {
+      "epoch": 0.398505603985056,
+      "grad_norm": 0.7956090569496155,
+      "learning_rate": 2.9461005953239347e-05,
+      "loss": 0.9028,
+      "step": 160
+    },
+    {
+      "epoch": 0.410958904109589,
+      "grad_norm": 0.7928311228752136,
+      "learning_rate": 2.9404979260789064e-05,
+      "loss": 0.8463,
+      "step": 165
+    },
+    {
+      "epoch": 0.42341220423412207,
+      "grad_norm": 0.886175274848938,
+      "learning_rate": 2.934624153657061e-05,
+      "loss": 0.8589,
+      "step": 170
+    },
+    {
+      "epoch": 0.43586550435865506,
+      "grad_norm": 0.8902182579040527,
+      "learning_rate": 2.9284803835083507e-05,
+      "loss": 0.8596,
+      "step": 175
+    },
+    {
+      "epoch": 0.44831880448318806,
+      "grad_norm": 0.8826771378517151,
+      "learning_rate": 2.9220677718965747e-05,
+      "loss": 0.7837,
+      "step": 180
+    },
+    {
+      "epoch": 0.46077210460772106,
+      "grad_norm": 1.0196400880813599,
+      "learning_rate": 2.9153875256817696e-05,
+      "loss": 0.8106,
+      "step": 185
+    },
+    {
+      "epoch": 0.47322540473225405,
+      "grad_norm": 0.962088406085968,
+      "learning_rate": 2.9084409020930767e-05,
+      "loss": 0.8387,
+      "step": 190
+    },
+    {
+      "epoch": 0.48567870485678705,
+      "grad_norm": 0.8833624720573425,
+      "learning_rate": 2.9012292084921306e-05,
+      "loss": 0.7938,
+      "step": 195
+    },
+    {
+      "epoch": 0.49813200498132004,
+      "grad_norm": 1.0000145435333252,
+      "learning_rate": 2.893753802127012e-05,
+      "loss": 0.7249,
+      "step": 200
+    },
+    {
+      "epoch": 0.5105853051058531,
+      "grad_norm": 0.9227517247200012,
+      "learning_rate": 2.8860160898768123e-05,
+      "loss": 0.7459,
+      "step": 205
+    },
+    {
+      "epoch": 0.523038605230386,
+      "grad_norm": 1.0462753772735596,
+      "learning_rate": 2.8780175279868577e-05,
+      "loss": 0.7374,
+      "step": 210
+    },
+    {
+      "epoch": 0.5354919053549191,
+      "grad_norm": 1.087003469467163,
+      "learning_rate": 2.8697596217946426e-05,
+      "loss": 0.705,
+      "step": 215
+    },
+    {
+      "epoch": 0.547945205479452,
+      "grad_norm": 0.9915927648544312,
+      "learning_rate": 2.861243925446523e-05,
+      "loss": 0.7014,
+      "step": 220
+    },
+    {
+      "epoch": 0.5603985056039851,
+      "grad_norm": 1.0102894306182861,
+      "learning_rate": 2.8524720416052243e-05,
+      "loss": 0.715,
+      "step": 225
+    },
+    {
+      "epoch": 0.572851805728518,
+      "grad_norm": 1.088186264038086,
+      "learning_rate": 2.84344562114822e-05,
+      "loss": 0.697,
+      "step": 230
+    },
+    {
+      "epoch": 0.5853051058530511,
+      "grad_norm": 0.9590082764625549,
+      "learning_rate": 2.8341663628570328e-05,
+      "loss": 0.6556,
+      "step": 235
+    },
+    {
+      "epoch": 0.597758405977584,
+      "grad_norm": 0.9584166407585144,
+      "learning_rate": 2.824636013097524e-05,
+      "loss": 0.6216,
+      "step": 240
+    },
+    {
+      "epoch": 0.6102117061021171,
+      "grad_norm": 1.1738195419311523,
+      "learning_rate": 2.8148563654912257e-05,
+      "loss": 0.6812,
+      "step": 245
+    },
+    {
+      "epoch": 0.6226650062266501,
+      "grad_norm": 1.0906168222427368,
+      "learning_rate": 2.8048292605777766e-05,
+      "loss": 0.6138,
+      "step": 250
+    },
+    {
+      "epoch": 0.635118306351183,
+      "grad_norm": 1.1087127923965454,
+      "learning_rate": 2.7945565854685348e-05,
+      "loss": 0.6539,
+      "step": 255
+    },
+    {
+      "epoch": 0.6475716064757161,
+      "grad_norm": 1.215636968612671,
+      "learning_rate": 2.7840402734914182e-05,
+      "loss": 0.5769,
+      "step": 260
+    },
+    {
+      "epoch": 0.660024906600249,
+      "grad_norm": 1.24242103099823,
+      "learning_rate": 2.773282303827052e-05,
+      "loss": 0.6366,
+      "step": 265
+    },
+    {
+      "epoch": 0.6724782067247821,
+      "grad_norm": 1.1535454988479614,
+      "learning_rate": 2.762284701136283e-05,
+      "loss": 0.6169,
+      "step": 270
+    },
+    {
+      "epoch": 0.684931506849315,
+      "grad_norm": 1.2355749607086182,
+      "learning_rate": 2.7510495351791397e-05,
+      "loss": 0.6002,
+      "step": 275
+    },
+    {
+      "epoch": 0.6973848069738481,
+      "grad_norm": 1.3292158842086792,
+      "learning_rate": 2.739578920425297e-05,
+      "loss": 0.6211,
+      "step": 280
+    },
+    {
+      "epoch": 0.709838107098381,
+      "grad_norm": 1.1527949571609497,
+      "learning_rate": 2.727875015656135e-05,
+      "loss": 0.5783,
+      "step": 285
+    },
+    {
+      "epoch": 0.7222914072229141,
+      "grad_norm": 1.1179882287979126,
+      "learning_rate": 2.7159400235584507e-05,
+      "loss": 0.5671,
+      "step": 290
+    },
+    {
+      "epoch": 0.7347447073474471,
+      "grad_norm": 1.2739359140396118,
+      "learning_rate": 2.703776190309914e-05,
+      "loss": 0.5554,
+      "step": 295
+    },
+    {
+      "epoch": 0.7471980074719801,
+      "grad_norm": 0.9971826672554016,
+      "learning_rate": 2.691385805156329e-05,
+      "loss": 0.5924,
+      "step": 300
+    },
+    {
+      "epoch": 0.7596513075965131,
+      "grad_norm": 1.2298365831375122,
+      "learning_rate": 2.6787711999808026e-05,
+      "loss": 0.5327,
+      "step": 305
+    },
+    {
+      "epoch": 0.772104607721046,
+      "grad_norm": 1.261979103088379,
+      "learning_rate": 2.6659347488648763e-05,
+      "loss": 0.5346,
+      "step": 310
+    },
+    {
+      "epoch": 0.7845579078455791,
+      "grad_norm": 1.2995227575302124,
+      "learning_rate": 2.6528788676417238e-05,
+      "loss": 0.4872,
+      "step": 315
+    },
+    {
+      "epoch": 0.797011207970112,
+      "grad_norm": 1.2352875471115112,
+      "learning_rate": 2.6396060134414883e-05,
+      "loss": 0.5071,
+      "step": 320
+    },
+    {
+      "epoch": 0.8094645080946451,
+      "grad_norm": 1.1811374425888062,
+      "learning_rate": 2.6261186842288482e-05,
+      "loss": 0.4827,
+      "step": 325
+    },
+    {
+      "epoch": 0.821917808219178,
+      "grad_norm": 1.282472848892212,
+      "learning_rate": 2.6124194183328992e-05,
+      "loss": 0.4938,
+      "step": 330
+    },
+    {
+      "epoch": 0.8343711083437111,
+      "grad_norm": 1.451218843460083,
+      "learning_rate": 2.5985107939694346e-05,
+      "loss": 0.5276,
+      "step": 335
+    },
+    {
+      "epoch": 0.8468244084682441,
+      "grad_norm": 1.2093241214752197,
+      "learning_rate": 2.5843954287557253e-05,
+      "loss": 0.4534,
+      "step": 340
+    },
+    {
+      "epoch": 0.8592777085927771,
+      "grad_norm": 1.2220991849899292,
+      "learning_rate": 2.5700759792178813e-05,
+      "loss": 0.5149,
+      "step": 345
+    },
+    {
+      "epoch": 0.8717310087173101,
+      "grad_norm": 1.2086710929870605,
+      "learning_rate": 2.5555551402908896e-05,
+      "loss": 0.4665,
+      "step": 350
+    },
+    {
+      "epoch": 0.8841843088418431,
+      "grad_norm": 1.1186401844024658,
+      "learning_rate": 2.5408356448114255e-05,
+      "loss": 0.4481,
+      "step": 355
+    },
+    {
+      "epoch": 0.8966376089663761,
+      "grad_norm": 1.3668131828308105,
+      "learning_rate": 2.5259202630035296e-05,
+      "loss": 0.4252,
+      "step": 360
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 1.2898318767547607,
+      "learning_rate": 2.51081180195725e-05,
+      "loss": 0.4391,
+      "step": 365
+    },
+    {
+      "epoch": 0.9215442092154421,
+      "grad_norm": 1.1851130723953247,
+      "learning_rate": 2.4955131051003427e-05,
+      "loss": 0.445,
+      "step": 370
+    },
+    {
+      "epoch": 0.933997509339975,
+      "grad_norm": 1.300045371055603,
+      "learning_rate": 2.4800270516631376e-05,
+      "loss": 0.409,
+      "step": 375
+    },
+    {
+      "epoch": 0.9464508094645081,
+      "grad_norm": 1.2191306352615356,
+      "learning_rate": 2.4643565561366644e-05,
+      "loss": 0.4263,
+      "step": 380
+    },
+    {
+      "epoch": 0.958904109589041,
+      "grad_norm": 1.302882432937622,
+      "learning_rate": 2.4485045677241415e-05,
+      "loss": 0.3978,
+      "step": 385
+    },
+    {
+      "epoch": 0.9713574097135741,
+      "grad_norm": 1.3422915935516357,
+      "learning_rate": 2.4324740697859326e-05,
+      "loss": 0.4484,
+      "step": 390
+    },
+    {
+      "epoch": 0.9838107098381071,
+      "grad_norm": 1.2685996294021606,
+      "learning_rate": 2.4162680792780775e-05,
+      "loss": 0.3977,
+      "step": 395
+    },
+    {
+      "epoch": 0.9962640099626401,
+      "grad_norm": 1.2416200637817383,
+      "learning_rate": 2.399889646184494e-05,
+      "loss": 0.3922,
+      "step": 400
+    },
+    {
+      "epoch": 1.0074719800747198,
+      "grad_norm": 1.244289517402649,
+      "learning_rate": 2.3833418529429728e-05,
+      "loss": 0.3346,
+      "step": 405
+    },
+    {
+      "epoch": 1.0199252801992529,
+      "grad_norm": 1.170119047164917,
+      "learning_rate": 2.366627813865055e-05,
+      "loss": 0.2807,
+      "step": 410
+    },
+    {
+      "epoch": 1.0323785803237857,
+      "grad_norm": 1.3378503322601318,
+      "learning_rate": 2.349750674549918e-05,
+      "loss": 0.3212,
+      "step": 415
+    },
+    {
+      "epoch": 1.0448318804483188,
+      "grad_norm": 1.0091121196746826,
+      "learning_rate": 2.332713611292371e-05,
+      "loss": 0.3109,
+      "step": 420
+    },
+    {
+      "epoch": 1.0572851805728518,
+      "grad_norm": 1.1977916955947876,
+      "learning_rate": 2.3155198304850694e-05,
+      "loss": 0.2953,
+      "step": 425
+    },
+    {
+      "epoch": 1.0697384806973849,
+      "grad_norm": 1.1469180583953857,
+      "learning_rate": 2.2981725680150745e-05,
+      "loss": 0.2876,
+      "step": 430
+    },
+    {
+      "epoch": 1.0821917808219177,
+      "grad_norm": 1.2730803489685059,
+      "learning_rate": 2.2806750886548508e-05,
+      "loss": 0.323,
+      "step": 435
+    },
+    {
+      "epoch": 1.0946450809464507,
+      "grad_norm": 1.2842358350753784,
+      "learning_rate": 2.2630306854478335e-05,
+      "loss": 0.312,
+      "step": 440
+    },
+    {
+      "epoch": 1.1070983810709838,
+      "grad_norm": 1.187873125076294,
+      "learning_rate": 2.245242679088679e-05,
+      "loss": 0.2954,
+      "step": 445
+    },
+    {
+      "epoch": 1.1195516811955168,
+      "grad_norm": 1.6430953741073608,
+      "learning_rate": 2.2273144172982985e-05,
+      "loss": 0.301,
+      "step": 450
+    },
+    {
+      "epoch": 1.13200498132005,
+      "grad_norm": 1.2372331619262695,
+      "learning_rate": 2.2092492741938222e-05,
+      "loss": 0.2978,
+      "step": 455
+    },
+    {
+      "epoch": 1.1444582814445827,
+      "grad_norm": 1.7149838209152222,
+      "learning_rate": 2.1910506496535816e-05,
+      "loss": 0.2622,
+      "step": 460
+    },
+    {
+      "epoch": 1.1569115815691158,
+      "grad_norm": 1.116036057472229,
+      "learning_rate": 2.1727219686772494e-05,
+      "loss": 0.273,
+      "step": 465
+    },
+    {
+      "epoch": 1.1693648816936488,
+      "grad_norm": 1.1681700944900513,
+      "learning_rate": 2.154266680741253e-05,
+      "loss": 0.285,
+      "step": 470
+    },
+    {
+      "epoch": 1.1818181818181819,
+      "grad_norm": 1.3372962474822998,
+      "learning_rate": 2.1356882591495795e-05,
+      "loss": 0.2748,
+      "step": 475
+    },
+    {
+      "epoch": 1.1942714819427147,
+      "grad_norm": 1.126945972442627,
+      "learning_rate": 2.116990200380093e-05,
+      "loss": 0.3039,
+      "step": 480
+    },
+    {
+      "epoch": 1.2067247820672478,
+      "grad_norm": 1.4096333980560303,
+      "learning_rate": 2.0981760234264983e-05,
+      "loss": 0.2728,
+      "step": 485
+    },
+    {
+      "epoch": 1.2191780821917808,
+      "grad_norm": 1.2791043519973755,
+      "learning_rate": 2.07924926913606e-05,
+      "loss": 0.2793,
+      "step": 490
+    },
+    {
+      "epoch": 1.2316313823163139,
+      "grad_norm": 1.4474986791610718,
+      "learning_rate": 2.0602134995432124e-05,
+      "loss": 0.2218,
+      "step": 495
+    },
+    {
+      "epoch": 1.244084682440847,
+      "grad_norm": 1.5396515130996704,
+      "learning_rate": 2.0410722971991802e-05,
+      "loss": 0.2663,
+      "step": 500
+    },
+    {
+      "epoch": 1.25653798256538,
+      "grad_norm": 1.1788588762283325,
+      "learning_rate": 2.0218292644977396e-05,
+      "loss": 0.2495,
+      "step": 505
+    },
+    {
+      "epoch": 1.2689912826899128,
+      "grad_norm": 1.346476435661316,
+      "learning_rate": 2.002488022997244e-05,
+      "loss": 0.2553,
+      "step": 510
+    },
+    {
+      "epoch": 1.2814445828144458,
+      "grad_norm": 1.3967148065567017,
+      "learning_rate": 1.9830522127390428e-05,
+      "loss": 0.2351,
+      "step": 515
+    },
+    {
+      "epoch": 1.293897882938979,
+      "grad_norm": 1.3045532703399658,
+      "learning_rate": 1.963525491562421e-05,
+      "loss": 0.2415,
+      "step": 520
+    },
+    {
+      "epoch": 1.3063511830635117,
+      "grad_norm": 1.0762789249420166,
+      "learning_rate": 1.943911534416193e-05,
+      "loss": 0.2423,
+      "step": 525
+    },
+    {
+      "epoch": 1.3188044831880448,
+      "grad_norm": 1.443464756011963,
+      "learning_rate": 1.924214032667069e-05,
+      "loss": 0.2234,
+      "step": 530
+    },
+    {
+      "epoch": 1.3312577833125778,
+      "grad_norm": 1.273149013519287,
+      "learning_rate": 1.9044366934049408e-05,
+      "loss": 0.2725,
+      "step": 535
+    },
+    {
+      "epoch": 1.3437110834371109,
+      "grad_norm": 1.1968226432800293,
+      "learning_rate": 1.8845832387451995e-05,
+      "loss": 0.209,
+      "step": 540
+    },
+    {
+      "epoch": 1.356164383561644,
+      "grad_norm": 1.4278267621994019,
+      "learning_rate": 1.8646574051282337e-05,
+      "loss": 0.2271,
+      "step": 545
+    },
+    {
+      "epoch": 1.3686176836861768,
+      "grad_norm": 1.155118703842163,
+      "learning_rate": 1.844662942616224e-05,
+      "loss": 0.2143,
+      "step": 550
+    },
+    {
+      "epoch": 1.3810709838107098,
+      "grad_norm": 1.3056758642196655,
+      "learning_rate": 1.8246036141873786e-05,
+      "loss": 0.2258,
+      "step": 555
+    },
+    {
+      "epoch": 1.3935242839352429,
+      "grad_norm": 1.4385669231414795,
+      "learning_rate": 1.804483195027739e-05,
+      "loss": 0.2225,
+      "step": 560
+    },
+    {
+      "epoch": 1.405977584059776,
+      "grad_norm": 1.2106066942214966,
+      "learning_rate": 1.7843054718206818e-05,
+      "loss": 0.1869,
+      "step": 565
+    },
+    {
+      "epoch": 1.4184308841843087,
+      "grad_norm": 1.2372570037841797,
+      "learning_rate": 1.7640742420342672e-05,
+      "loss": 0.2307,
+      "step": 570
+    },
+    {
+      "epoch": 1.4308841843088418,
+      "grad_norm": 1.3531502485275269,
+      "learning_rate": 1.7437933132065452e-05,
+      "loss": 0.2345,
+      "step": 575
+    },
+    {
+      "epoch": 1.4433374844333748,
+      "grad_norm": 1.2152862548828125,
+      "learning_rate": 1.7234665022289777e-05,
+      "loss": 0.2211,
+      "step": 580
+    },
+    {
+      "epoch": 1.455790784557908,
+      "grad_norm": 1.3017277717590332,
+      "learning_rate": 1.7030976346280924e-05,
+      "loss": 0.1936,
+      "step": 585
+    },
+    {
+      "epoch": 1.468244084682441,
+      "grad_norm": 1.1975960731506348,
+      "learning_rate": 1.6826905438455174e-05,
+      "loss": 0.195,
+      "step": 590
+    },
+    {
+      "epoch": 1.4806973848069738,
+      "grad_norm": 1.2248194217681885,
+      "learning_rate": 1.662249070516523e-05,
+      "loss": 0.1987,
+      "step": 595
+    },
+    {
+      "epoch": 1.4931506849315068,
+      "grad_norm": 1.175775408744812,
+      "learning_rate": 1.641777061747209e-05,
+      "loss": 0.1816,
+      "step": 600
+    },
+    {
+      "epoch": 1.5056039850560399,
+      "grad_norm": 1.116485357284546,
+      "learning_rate": 1.621278370390476e-05,
+      "loss": 0.1969,
+      "step": 605
+    },
+    {
+      "epoch": 1.5180572851805727,
+      "grad_norm": 1.2391877174377441,
+      "learning_rate": 1.6007568543209153e-05,
+      "loss": 0.185,
+      "step": 610
+    },
+    {
+      "epoch": 1.5305105853051058,
+      "grad_norm": 1.2541625499725342,
+      "learning_rate": 1.5802163757087513e-05,
+      "loss": 0.1798,
+      "step": 615
+    },
+    {
+      "epoch": 1.5429638854296388,
+      "grad_norm": 1.1040685176849365,
+      "learning_rate": 1.5596608002929793e-05,
+      "loss": 0.1657,
+      "step": 620
+    },
+    {
+      "epoch": 1.5554171855541719,
+      "grad_norm": 1.1332441568374634,
+      "learning_rate": 1.539093996653829e-05,
+      "loss": 0.1842,
+      "step": 625
+    },
+    {
+      "epoch": 1.567870485678705,
+      "grad_norm": 1.1989057064056396,
+      "learning_rate": 1.518519835484691e-05,
+      "loss": 0.1726,
+      "step": 630
+    },
+    {
+      "epoch": 1.580323785803238,
+      "grad_norm": 1.2308235168457031,
+      "learning_rate": 1.4979421888636532e-05,
+      "loss": 0.1717,
+      "step": 635
+    },
+    {
+      "epoch": 1.592777085927771,
+      "grad_norm": 1.0793812274932861,
+      "learning_rate": 1.4773649295247668e-05,
+      "loss": 0.1885,
+      "step": 640
+    },
+    {
+      "epoch": 1.6052303860523038,
+      "grad_norm": 1.2486377954483032,
+      "learning_rate": 1.4567919301291976e-05,
+      "loss": 0.1866,
+      "step": 645
+    },
+    {
+      "epoch": 1.6176836861768369,
+      "grad_norm": 1.1979202032089233,
+      "learning_rate": 1.4362270625363852e-05,
+      "loss": 0.1766,
+      "step": 650
+    },
+    {
+      "epoch": 1.6301369863013697,
+      "grad_norm": 1.1210886240005493,
+      "learning_rate": 1.415674197075355e-05,
+      "loss": 0.1752,
+      "step": 655
+    },
+    {
+      "epoch": 1.6425902864259028,
+      "grad_norm": 1.15273916721344,
+      "learning_rate": 1.3951372018163197e-05,
+      "loss": 0.1524,
+      "step": 660
+    },
+    {
+      "epoch": 1.6550435865504358,
+      "grad_norm": 1.2123425006866455,
+      "learning_rate": 1.3746199418427044e-05,
+      "loss": 0.1681,
+      "step": 665
+    },
+    {
+      "epoch": 1.6674968866749689,
+      "grad_norm": 1.0621817111968994,
+      "learning_rate": 1.3541262785237321e-05,
+      "loss": 0.1727,
+      "step": 670
+    },
+    {
+      "epoch": 1.679950186799502,
+      "grad_norm": 1.2401793003082275,
+      "learning_rate": 1.3336600687877124e-05,
+      "loss": 0.1589,
+      "step": 675
+    },
+    {
+      "epoch": 1.692403486924035,
+      "grad_norm": 1.0771820545196533,
+      "learning_rate": 1.313225164396162e-05,
+      "loss": 0.1615,
+      "step": 680
+    },
+    {
+      "epoch": 1.704856787048568,
+      "grad_norm": 1.0819141864776611,
+      "learning_rate": 1.2928254112189e-05,
+      "loss": 0.1611,
+      "step": 685
+    },
+    {
+      "epoch": 1.7173100871731009,
+      "grad_norm": 1.2320258617401123,
+      "learning_rate": 1.272464648510251e-05,
+      "loss": 0.1418,
+      "step": 690
+    },
+    {
+      "epoch": 1.729763387297634,
+      "grad_norm": 1.4835878610610962,
+      "learning_rate": 1.2521467081864945e-05,
+      "loss": 0.1493,
+      "step": 695
+    },
+    {
+      "epoch": 1.7422166874221667,
+      "grad_norm": 1.1545857191085815,
+      "learning_rate": 1.2318754141046936e-05,
+      "loss": 0.145,
+      "step": 700
+    },
+    {
+      "epoch": 1.7546699875466998,
+      "grad_norm": 1.1143786907196045,
+      "learning_rate": 1.211654581343039e-05,
+      "loss": 0.1404,
+      "step": 705
+    },
+    {
+      "epoch": 1.7671232876712328,
+      "grad_norm": 1.0791460275650024,
+      "learning_rate": 1.1914880154828514e-05,
+      "loss": 0.1326,
+      "step": 710
+    },
+    {
+      "epoch": 1.7795765877957659,
+      "grad_norm": 1.2453776597976685,
+      "learning_rate": 1.1713795118923659e-05,
+      "loss": 0.1572,
+      "step": 715
+    },
+    {
+      "epoch": 1.792029887920299,
+      "grad_norm": 1.0310232639312744,
+      "learning_rate": 1.1513328550124379e-05,
+      "loss": 0.1338,
+      "step": 720
+    },
+    {
+      "epoch": 1.804483188044832,
+      "grad_norm": 1.0175222158432007,
+      "learning_rate": 1.1313518176443099e-05,
+      "loss": 0.1306,
+      "step": 725
+    },
+    {
+      "epoch": 1.816936488169365,
+      "grad_norm": 1.0287436246871948,
+      "learning_rate": 1.1114401602395647e-05,
+      "loss": 0.1513,
+      "step": 730
+    },
+    {
+      "epoch": 1.8293897882938979,
+      "grad_norm": 1.144381046295166,
+      "learning_rate": 1.0916016301924056e-05,
+      "loss": 0.1569,
+      "step": 735
+    },
+    {
+      "epoch": 1.841843088418431,
+      "grad_norm": 1.124574899673462,
+      "learning_rate": 1.071839961134393e-05,
+      "loss": 0.1287,
+      "step": 740
+    },
+    {
+      "epoch": 1.8542963885429637,
+      "grad_norm": 1.0954211950302124,
+      "learning_rate": 1.0521588722317707e-05,
+      "loss": 0.1388,
+      "step": 745
+    },
+    {
+      "epoch": 1.8667496886674968,
+      "grad_norm": 1.1026991605758667,
+      "learning_rate": 1.0325620674855147e-05,
+      "loss": 0.134,
+      "step": 750
+    },
+    {
+      "epoch": 1.8792029887920298,
+      "grad_norm": 0.9430744647979736,
+      "learning_rate": 1.0130532350342381e-05,
+      "loss": 0.1148,
+      "step": 755
+    },
+    {
+      "epoch": 1.891656288916563,
+      "grad_norm": 0.9424226880073547,
+      "learning_rate": 9.936360464600769e-06,
+      "loss": 0.1314,
+      "step": 760
+    },
+    {
+      "epoch": 1.904109589041096,
+      "grad_norm": 0.9517638683319092,
+      "learning_rate": 9.74314156097697e-06,
+      "loss": 0.1132,
+      "step": 765
+    },
+    {
+      "epoch": 1.916562889165629,
+      "grad_norm": 1.1850603818893433,
+      "learning_rate": 9.550912003465442e-06,
+      "loss": 0.1181,
+      "step": 770
+    },
+    {
+      "epoch": 1.929016189290162,
+      "grad_norm": 0.9696543216705322,
+      "learning_rate": 9.359707969864688e-06,
+      "loss": 0.1112,
+      "step": 775
+    },
+    {
+      "epoch": 1.9414694894146949,
+      "grad_norm": 1.0155787467956543,
+      "learning_rate": 9.16956544496857e-06,
+      "loss": 0.1213,
+      "step": 780
+    },
+    {
+      "epoch": 1.953922789539228,
+      "grad_norm": 1.0327125787734985,
+      "learning_rate": 8.980520213793934e-06,
+      "loss": 0.1327,
+      "step": 785
+    },
+    {
+      "epoch": 1.9663760896637608,
+      "grad_norm": 0.8658395409584045,
+      "learning_rate": 8.792607854845829e-06,
+      "loss": 0.1132,
+      "step": 790
+    },
+    {
+      "epoch": 1.9788293897882938,
+      "grad_norm": 0.8833078742027283,
+      "learning_rate": 8.605863733421594e-06,
+      "loss": 0.1127,
+      "step": 795
+    },
+    {
+      "epoch": 1.9912826899128269,
+      "grad_norm": 1.1244515180587769,
+      "learning_rate": 8.420322994955074e-06,
+      "loss": 0.1059,
+      "step": 800
+    },
+    {
+      "epoch": 2.0024906600249066,
+      "grad_norm": 0.8100209832191467,
+      "learning_rate": 8.236020558402222e-06,
+      "loss": 0.1078,
+      "step": 805
+    },
+    {
+      "epoch": 2.0149439601494397,
+      "grad_norm": 0.6915018558502197,
+      "learning_rate": 8.052991109669306e-06,
+      "loss": 0.0878,
+      "step": 810
+    },
+    {
+      "epoch": 2.0273972602739727,
+      "grad_norm": 0.9379139542579651,
+      "learning_rate": 7.87126909508499e-06,
+      "loss": 0.0898,
+      "step": 815
+    },
+    {
+      "epoch": 2.0398505603985058,
+      "grad_norm": 0.8109155297279358,
+      "learning_rate": 7.690888714917507e-06,
+      "loss": 0.0984,
+      "step": 820
+    },
+    {
+      "epoch": 2.052303860523039,
+      "grad_norm": 0.9438534379005432,
+      "learning_rate": 7.511883916938109e-06,
+      "loss": 0.0869,
+      "step": 825
+    },
+    {
+      "epoch": 2.0647571606475714,
+      "grad_norm": 0.8569890856742859,
+      "learning_rate": 7.334288390032098e-06,
+      "loss": 0.0882,
+      "step": 830
+    },
+    {
+      "epoch": 2.0772104607721045,
+      "grad_norm": 0.9603305459022522,
+      "learning_rate": 7.158135557858515e-06,
+      "loss": 0.0887,
+      "step": 835
+    },
+    {
+      "epoch": 2.0896637608966375,
+      "grad_norm": 0.829105019569397,
+      "learning_rate": 6.983458572559782e-06,
+      "loss": 0.0886,
+      "step": 840
+    },
+    {
+      "epoch": 2.1021170610211706,
+      "grad_norm": 0.6731156706809998,
+      "learning_rate": 6.81029030852244e-06,
+      "loss": 0.0807,
+      "step": 845
+    },
+    {
+      "epoch": 2.1145703611457036,
+      "grad_norm": 0.7731700539588928,
+      "learning_rate": 6.63866335619015e-06,
+      "loss": 0.0823,
+      "step": 850
+    },
+    {
+      "epoch": 2.1270236612702367,
+      "grad_norm": 0.8372194766998291,
+      "learning_rate": 6.468610015930143e-06,
+      "loss": 0.087,
+      "step": 855
+    },
+    {
+      "epoch": 2.1394769613947697,
+      "grad_norm": 0.7078593373298645,
+      "learning_rate": 6.3001622919542495e-06,
+      "loss": 0.0917,
+      "step": 860
+    },
+    {
+      "epoch": 2.151930261519303,
+      "grad_norm": 0.8801110982894897,
+      "learning_rate": 6.133351886295691e-06,
+      "loss": 0.0887,
+      "step": 865
+    },
+    {
+      "epoch": 2.1643835616438354,
+      "grad_norm": 0.6602491736412048,
+      "learning_rate": 5.9682101928426966e-06,
+      "loss": 0.0769,
+      "step": 870
+    },
+    {
+      "epoch": 2.1768368617683684,
+      "grad_norm": 0.8794203996658325,
+      "learning_rate": 5.804768291430174e-06,
+      "loss": 0.089,
+      "step": 875
+    },
+    {
+      "epoch": 2.1892901618929015,
+      "grad_norm": 0.712538480758667,
+      "learning_rate": 5.643056941990433e-06,
+      "loss": 0.0779,
+      "step": 880
+    },
+    {
+      "epoch": 2.2017434620174345,
+      "grad_norm": 0.9360284805297852,
+      "learning_rate": 5.483106578764136e-06,
+      "loss": 0.0772,
+      "step": 885
+    },
+    {
+      "epoch": 2.2141967621419676,
+      "grad_norm": 1.0328861474990845,
+      "learning_rate": 5.324947304572553e-06,
+      "loss": 0.083,
+      "step": 890
+    },
+    {
+      "epoch": 2.2266500622665006,
+      "grad_norm": 0.7821430563926697,
+      "learning_rate": 5.1686088851521685e-06,
+      "loss": 0.0824,
+      "step": 895
+    },
+    {
+      "epoch": 2.2391033623910337,
+      "grad_norm": 0.7114161252975464,
+      "learning_rate": 5.014120743552749e-06,
+      "loss": 0.0834,
+      "step": 900
+    },
+    {
+      "epoch": 2.2515566625155667,
+      "grad_norm": 0.8093134760856628,
+      "learning_rate": 4.861511954599883e-06,
+      "loss": 0.0754,
+      "step": 905
+    },
+    {
+      "epoch": 2.2640099626401,
+      "grad_norm": 1.0690568685531616,
+      "learning_rate": 4.710811239423083e-06,
+      "loss": 0.0743,
+      "step": 910
+    },
+    {
+      "epoch": 2.276463262764633,
+      "grad_norm": 0.9506732225418091,
+      "learning_rate": 4.5620469600504355e-06,
+      "loss": 0.0751,
+      "step": 915
+    },
+    {
+      "epoch": 2.2889165628891655,
+      "grad_norm": 0.6726131439208984,
+      "learning_rate": 4.415247114070834e-06,
+      "loss": 0.0701,
+      "step": 920
+    },
+    {
+      "epoch": 2.3013698630136985,
+      "grad_norm": 0.6878358721733093,
+      "learning_rate": 4.270439329364799e-06,
+      "loss": 0.0721,
+      "step": 925
+    },
+    {
+      "epoch": 2.3138231631382316,
+      "grad_norm": 0.8972549438476562,
+      "learning_rate": 4.1276508589048986e-06,
+      "loss": 0.0759,
+      "step": 930
+    },
+    {
+      "epoch": 2.3262764632627646,
+      "grad_norm": 0.6929914355278015,
+      "learning_rate": 3.986908575626699e-06,
+      "loss": 0.0799,
+      "step": 935
+    },
+    {
+      "epoch": 2.3387297633872977,
+      "grad_norm": 0.9535344243049622,
+      "learning_rate": 3.848238967371265e-06,
+      "loss": 0.0676,
+      "step": 940
+    },
+    {
+      "epoch": 2.3511830635118307,
+      "grad_norm": 0.6658231616020203,
+      "learning_rate": 3.7116681319001018e-06,
+      "loss": 0.0728,
+      "step": 945
+    },
+    {
+      "epoch": 2.3636363636363638,
+      "grad_norm": 0.7737366557121277,
+      "learning_rate": 3.5772217719835384e-06,
+      "loss": 0.0831,
+      "step": 950
+    },
+    {
+      "epoch": 2.376089663760897,
+      "grad_norm": 0.5599685907363892,
+      "learning_rate": 3.444925190563445e-06,
+      "loss": 0.0648,
+      "step": 955
+    },
+    {
+      "epoch": 2.3885429638854294,
+      "grad_norm": 0.6603100299835205,
+      "learning_rate": 3.3148032859911844e-06,
+      "loss": 0.0734,
+      "step": 960
+    },
+    {
+      "epoch": 2.4009962640099625,
+      "grad_norm": 0.7781059145927429,
+      "learning_rate": 3.186880547341727e-06,
+      "loss": 0.0725,
+      "step": 965
+    },
+    {
+      "epoch": 2.4134495641344955,
+      "grad_norm": 0.7224042415618896,
+      "learning_rate": 3.0611810498047742e-06,
+      "loss": 0.0774,
+      "step": 970
+    },
+    {
+      "epoch": 2.4259028642590286,
+      "grad_norm": 0.631102442741394,
+      "learning_rate": 2.937728450153789e-06,
+      "loss": 0.0713,
+      "step": 975
+    },
+    {
+      "epoch": 2.4383561643835616,
+      "grad_norm": 0.8373669385910034,
+      "learning_rate": 2.816545982293752e-06,
+      "loss": 0.0677,
+      "step": 980
+    },
+    {
+      "epoch": 2.4508094645080947,
+      "grad_norm": 0.7206551432609558,
+      "learning_rate": 2.6976564528885422e-06,
+      "loss": 0.0707,
+      "step": 985
+    },
+    {
+      "epoch": 2.4632627646326277,
+      "grad_norm": 0.8052924275398254,
+      "learning_rate": 2.5810822370686804e-06,
+      "loss": 0.0711,
+      "step": 990
+    },
+    {
+      "epoch": 2.4757160647571608,
+      "grad_norm": 0.6548148989677429,
+      "learning_rate": 2.466845274220316e-06,
+      "loss": 0.073,
+      "step": 995
+    },
+    {
+      "epoch": 2.488169364881694,
+      "grad_norm": 0.7128301858901978,
+      "learning_rate": 2.3549670638562016e-06,
+      "loss": 0.0632,
+      "step": 1000
+    },
+    {
+      "epoch": 2.500622665006227,
+      "grad_norm": 0.6294408440589905,
+      "learning_rate": 2.2454686615694785e-06,
+      "loss": 0.065,
+      "step": 1005
+    },
+    {
+      "epoch": 2.51307596513076,
+      "grad_norm": 0.5605584979057312,
+      "learning_rate": 2.138370675070977e-06,
+      "loss": 0.0722,
+      "step": 1010
+    },
+    {
+      "epoch": 2.5255292652552925,
+      "grad_norm": 0.6864460706710815,
+      "learning_rate": 2.0336932603108355e-06,
+      "loss": 0.0667,
+      "step": 1015
+    },
+    {
+      "epoch": 2.5379825653798256,
+      "grad_norm": 0.6694821119308472,
+      "learning_rate": 1.9314561176851235e-06,
+      "loss": 0.0692,
+      "step": 1020
+    },
+    {
+      "epoch": 2.5504358655043586,
+      "grad_norm": 0.6386405229568481,
+      "learning_rate": 1.8316784883282105e-06,
+      "loss": 0.0703,
+      "step": 1025
+    },
+    {
+      "epoch": 2.5628891656288917,
+      "grad_norm": 0.7209354043006897,
+      "learning_rate": 1.7343791504915684e-06,
+      "loss": 0.0733,
+      "step": 1030
+    },
+    {
+      "epoch": 2.5753424657534247,
+      "grad_norm": 0.8404210209846497,
+      "learning_rate": 1.6395764160096678e-06,
+      "loss": 0.0713,
+      "step": 1035
+    },
+    {
+      "epoch": 2.587795765877958,
+      "grad_norm": 0.7585222721099854,
+      "learning_rate": 1.547288126853697e-06,
+      "loss": 0.0717,
+      "step": 1040
+    },
+    {
+      "epoch": 2.6002490660024904,
+      "grad_norm": 0.5720264315605164,
+      "learning_rate": 1.4575316517736714e-06,
+      "loss": 0.07,
+      "step": 1045
+    },
+    {
+      "epoch": 2.6127023661270234,
+      "grad_norm": 0.7315094470977783,
+      "learning_rate": 1.370323883029615e-06,
+      "loss": 0.0765,
+      "step": 1050
+    },
+    {
+      "epoch": 2.6251556662515565,
+      "grad_norm": 0.6323287487030029,
+      "learning_rate": 1.2856812332124274e-06,
+      "loss": 0.0682,
+      "step": 1055
+    },
+    {
+      "epoch": 2.6376089663760895,
+      "grad_norm": 0.5976901054382324,
+      "learning_rate": 1.2036196321550096e-06,
+      "loss": 0.0607,
+      "step": 1060
+    },
+    {
+      "epoch": 2.6500622665006226,
+      "grad_norm": 0.6853218674659729,
+      "learning_rate": 1.1241545239342609e-06,
+      "loss": 0.0673,
+      "step": 1065
+    },
+    {
+      "epoch": 2.6625155666251556,
+      "grad_norm": 0.5944624543190002,
+      "learning_rate": 1.0473008639644814e-06,
+      "loss": 0.0596,
+      "step": 1070
+    },
+    {
+      "epoch": 2.6749688667496887,
+      "grad_norm": 0.5547900795936584,
+      "learning_rate": 9.730731161827528e-07,
+      "loss": 0.0631,
+      "step": 1075
+    },
+    {
+      "epoch": 2.6874221668742218,
+      "grad_norm": 0.7503756880760193,
+      "learning_rate": 9.014852503268045e-07,
+      "loss": 0.0637,
+      "step": 1080
+    },
+    {
+      "epoch": 2.699875466998755,
+      "grad_norm": 0.6030451655387878,
+      "learning_rate": 8.325507393059101e-07,
+      "loss": 0.0573,
+      "step": 1085
+    },
+    {
+      "epoch": 2.712328767123288,
+      "grad_norm": 0.6738685965538025,
+      "learning_rate": 7.662825566652442e-07,
+      "loss": 0.0662,
+      "step": 1090
+    },
+    {
+      "epoch": 2.724782067247821,
+      "grad_norm": 0.6543871760368347,
+      "learning_rate": 7.026931741442783e-07,
+      "loss": 0.072,
+      "step": 1095
+    },
+    {
+      "epoch": 2.7372353673723535,
+      "grad_norm": 0.5266416072845459,
+      "learning_rate": 6.417945593295638e-07,
+      "loss": 0.0735,
+      "step": 1100
+    },
+    {
+      "epoch": 2.7496886674968866,
+      "grad_norm": 0.7614601850509644,
+      "learning_rate": 5.835981734024348e-07,
+      "loss": 0.0746,
+      "step": 1105
+    },
+    {
+      "epoch": 2.7621419676214196,
+      "grad_norm": 0.7067199945449829,
+      "learning_rate": 5.281149689819981e-07,
+      "loss": 0.0645,
+      "step": 1110
+    },
+    {
+      "epoch": 2.7745952677459527,
+      "grad_norm": 0.7431591153144836,
+      "learning_rate": 4.7535538806383006e-07,
+      "loss": 0.0679,
+      "step": 1115
+    },
+    {
+      "epoch": 2.7870485678704857,
+      "grad_norm": 0.5334625840187073,
+      "learning_rate": 4.2532936005479585e-07,
+      "loss": 0.0655,
+      "step": 1120
+    },
+    {
+      "epoch": 2.7995018679950188,
+      "grad_norm": 0.5616161823272705,
+      "learning_rate": 3.7804629990431884e-07,
+      "loss": 0.0618,
+      "step": 1125
+    },
+    {
+      "epoch": 2.811955168119552,
+      "grad_norm": 0.7120090126991272,
+      "learning_rate": 3.335151063324765e-07,
+      "loss": 0.0665,
+      "step": 1130
+    },
+    {
+      "epoch": 2.8244084682440844,
+      "grad_norm": 0.5271226763725281,
+      "learning_rate": 2.917441601552534e-07,
+      "loss": 0.0631,
+      "step": 1135
+    },
+    {
+      "epoch": 2.8368617683686175,
+      "grad_norm": 0.7041694521903992,
+      "learning_rate": 2.527413227072628e-07,
+      "loss": 0.0778,
+      "step": 1140
+    },
+    {
+      "epoch": 2.8493150684931505,
+      "grad_norm": 0.6397244930267334,
+      "learning_rate": 2.165139343622352e-07,
+      "loss": 0.0796,
+      "step": 1145
+    },
+    {
+      "epoch": 2.8617683686176836,
+      "grad_norm": 0.6322922110557556,
+      "learning_rate": 1.830688131515551e-07,
+      "loss": 0.0741,
+      "step": 1150
+    },
+    {
+      "epoch": 2.8742216687422166,
+      "grad_norm": 0.6011943817138672,
+      "learning_rate": 1.5241225348109898e-07,
+      "loss": 0.0698,
+      "step": 1155
+    },
+    {
+      "epoch": 2.8866749688667497,
+      "grad_norm": 0.7942560315132141,
+      "learning_rate": 1.2455002494661972e-07,
+      "loss": 0.0747,
+      "step": 1160
+    },
+    {
+      "epoch": 2.8991282689912827,
+      "grad_norm": 0.6241059303283691,
+      "learning_rate": 9.948737124790331e-08,
+      "loss": 0.0633,
+      "step": 1165
+    },
+    {
+      "epoch": 2.911581569115816,
+      "grad_norm": 0.6511146426200867,
+      "learning_rate": 7.722900920190179e-08,
+      "loss": 0.0669,
+      "step": 1170
+    },
+    {
+      "epoch": 2.924034869240349,
+      "grad_norm": 0.5726321339607239,
+      "learning_rate": 5.777912785502493e-08,
+      "loss": 0.0674,
+      "step": 1175
+    },
+    {
+      "epoch": 2.936488169364882,
+      "grad_norm": 0.6145641207695007,
+      "learning_rate": 4.114138769474918e-08,
+      "loss": 0.0703,
+      "step": 1180
+    },
+    {
+      "epoch": 2.948941469489415,
+      "grad_norm": 0.6198668479919434,
+      "learning_rate": 2.731891996071878e-08,
+      "loss": 0.0641,
+      "step": 1185
+    },
+    {
+      "epoch": 2.9613947696139475,
+      "grad_norm": 0.6175679564476013,
+      "learning_rate": 1.6314326055440475e-08,
+      "loss": 0.0652,
+      "step": 1190
+    },
+    {
+      "epoch": 2.9738480697384806,
+      "grad_norm": 0.673772394657135,
+      "learning_rate": 8.129677054693474e-09,
+      "loss": 0.0678,
+      "step": 1195
+    },
+    {
+      "epoch": 2.9863013698630136,
+      "grad_norm": 0.668915331363678,
+      "learning_rate": 2.7665133177545708e-09,
+      "loss": 0.0644,
+      "step": 1200
+    },
+    {
+      "epoch": 2.9987546699875467,
+      "grad_norm": 0.7095440030097961,
+      "learning_rate": 2.2584419750504293e-10,
+      "loss": 0.0753,
+      "step": 1205
+    },
+    {
+      "epoch": 3.0,
+      "step": 1206,
+      "total_flos": 1.5419277279871631e+18,
+      "train_loss": 0.36778768146438384,
+      "train_runtime": 743.0643,
+      "train_samples_per_second": 51.852,
+      "train_steps_per_second": 1.623
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1206,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.5419277279871631e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

138_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e49dd4d82a92b5d2764b301e241a56c5b0d9670ccf0c3807bca00aaaf826521c
+size 8273

138_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff