Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

125_128_e3_3e-5/adapter_config.json +39 -0
125_128_e3_3e-5/adapter_model.safetensors +3 -0
125_128_e3_3e-5/added_tokens.json +9 -0
125_128_e3_3e-5/all_results.json +9 -0
125_128_e3_3e-5/chat_template.jinja +62 -0
125_128_e3_3e-5/config.json +32 -0
125_128_e3_3e-5/merges.txt +0 -0
125_128_e3_3e-5/special_tokens_map.json +33 -0
125_128_e3_3e-5/tokenizer.json +0 -0
125_128_e3_3e-5/tokenizer_config.json +234 -0
125_128_e3_3e-5/train_results.json +9 -0
125_128_e3_3e-5/trainer_state.json +1177 -0
125_128_e3_3e-5/training_args.bin +3 -0
125_128_e3_3e-5/vocab.json +0 -0

125_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "q_proj",
+    "down_proj",
+    "gate_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

125_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df90e0c7d6f0469ed801ed4e5e1b8be4b68584519c6dadc0dcf42c3b3c62b96c
+size 791751704

125_128_e3_3e-5/added_tokens.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "<|end_of_cite|>": 49156,
+  "<|end_of_plugin|>": 49158,
+  "<|end_of_role|>": 49153,
+  "<|start_of_cite|>": 49155,
+  "<|start_of_plugin|>": 49157,
+  "<|start_of_role|>": 49152,
+  "<|tool_call|>": 49154
+}

125_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.140044753493033e+18,
+    "train_loss": 0.6133637193677464,
+    "train_runtime": 530.5624,
+    "train_samples": 8646,
+    "train_samples_per_second": 48.888,
+    "train_steps_per_second": 1.532
+}

125_128_e3_3e-5/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,62 @@

+{# Alias tools -> available_tools #}
+{%- if tools and not available_tools -%}
+    {%- set available_tools = tools -%}
+{%- endif -%}
+{%- if messages[0]['role'] == 'system' %}
+     {%- set system_message = messages[0]['content'] %}
+     {%- set loop_messages = messages[1:] %}
+ {%- else %}
+     {%- set system_message = "Knowledge Cutoff Date: April 2024.
+Today's Date: " + strftime_now('%B %d, %Y') + ".
+You are Granite, developed by IBM." %}
+     {%- if available_tools and documents %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
+Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+     {%- elif available_tools %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request." %}
+     {%- elif documents %}
+         {%- set system_message = system_message + " Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+    {%- elif thinking %}
+    {%- set system_message = system_message + " You are a helpful AI assistant.
+Respond to every user query in a comprehensive and detailed way. You can write down your thoughts and reasoning process before responding. In the thought process, engage in a comprehensive cycle of analysis, summarization, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. In the response section, based on various attempts, explorations, and reflections from the thoughts section, systematically present the final solution that you deem correct. The response should summarize the thought process. Write your thoughts between <think></think> and write your response between <response></response> for each user query." %}
+     {%- else %}
+         {%- set system_message = system_message + " You are a helpful AI assistant." %}
+     {%- endif %}
+     {%- if 'citations' in controls and documents %}
+         {%- set system_message = system_message + '
+Use the symbols <|start_of_cite|> and <|end_of_cite|> to indicate when a fact comes from a document in the search result, e.g <|start_of_cite|> {document_id: 1}my fact <|end_of_cite|> for a fact from document 1. Afterwards, list all the citations with their corresponding documents in an ordered list.' %}
+     {%- endif %}
+     {%- if 'hallucinations' in controls and documents %}
+         {%- set system_message = system_message + '
+Finally, after the response is written, include a numbered list of sentences from the response with a corresponding risk value that are hallucinated and not based in the documents.' %}
+     {%- endif %}
+     {%- set loop_messages = messages %}
+ {%- endif %}
+ {{- '<|start_of_role|>system<|end_of_role|>' + system_message + '<|end_of_text|>
+' }}
+ {%- if available_tools %}
+     {{- '<|start_of_role|>available_tools<|end_of_role|>' }}
+     {{- available_tools | tojson(indent=4) }}
+     {{- '<|end_of_text|>
+' }}
+ {%- endif %}
+ {%- if documents %}
+     {%- for document in documents %}
+         {{- '<|start_of_role|>document {"document_id": "' + document['doc_id'] | string + '"}<|end_of_role|>
+' }}
+         {{- document['text'] }}
+         {{- '<|end_of_text|>
+' }}
+              {%- endfor %}
+ {%- endif %}
+ {%- for message in loop_messages %}
+     {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>
+' }}
+     {%- if loop.last and add_generation_prompt %}
+         {{- '<|start_of_role|>assistant' }}
+             {%- if controls %}
+                 {{- ' ' + controls | tojson()}}
+             {%- endif %}
+         {{- '<|end_of_role|>' }}
+     {%- endif %}
+ {%- endfor %}

125_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49159
+}

125_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

125_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|end_of_plugin|>",
+  "unk_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

125_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

125_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,234 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49152": {
+      "content": "<|start_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49153": {
+      "content": "<|end_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49154": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49155": {
+      "content": "<|start_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49156": {
+      "content": "<|end_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49157": {
+      "content": "<|start_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49158": {
+      "content": "<|end_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": "<|end_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|end_of_plugin|>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|end_of_text|>",
+  "vocab_size": 49152
+}

125_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.140044753493033e+18,
+    "train_loss": 0.6133637193677464,
+    "train_runtime": 530.5624,
+    "train_samples": 8646,
+    "train_samples_per_second": 48.888,
+    "train_steps_per_second": 1.532
+}

125_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1177 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 813,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.018484288354898338,
+      "grad_norm": 3.5039896965026855,
+      "learning_rate": 2.9268292682926833e-06,
+      "loss": 1.6459,
+      "step": 5
+    },
+    {
+      "epoch": 0.036968576709796676,
+      "grad_norm": 1.7820426225662231,
+      "learning_rate": 6.585365853658537e-06,
+      "loss": 1.5072,
+      "step": 10
+    },
+    {
+      "epoch": 0.05545286506469501,
+      "grad_norm": 0.6207470893859863,
+      "learning_rate": 1.024390243902439e-05,
+      "loss": 1.3848,
+      "step": 15
+    },
+    {
+      "epoch": 0.07393715341959335,
+      "grad_norm": 0.46800681948661804,
+      "learning_rate": 1.3902439024390245e-05,
+      "loss": 1.368,
+      "step": 20
+    },
+    {
+      "epoch": 0.09242144177449169,
+      "grad_norm": 0.4362355172634125,
+      "learning_rate": 1.7560975609756096e-05,
+      "loss": 1.3688,
+      "step": 25
+    },
+    {
+      "epoch": 0.11090573012939002,
+      "grad_norm": 0.3878526985645294,
+      "learning_rate": 2.121951219512195e-05,
+      "loss": 1.3381,
+      "step": 30
+    },
+    {
+      "epoch": 0.12939001848428835,
+      "grad_norm": 0.3714441955089569,
+      "learning_rate": 2.4878048780487805e-05,
+      "loss": 1.2626,
+      "step": 35
+    },
+    {
+      "epoch": 0.1478743068391867,
+      "grad_norm": 0.4207943081855774,
+      "learning_rate": 2.8536585365853658e-05,
+      "loss": 1.2895,
+      "step": 40
+    },
+    {
+      "epoch": 0.16635859519408502,
+      "grad_norm": 0.34457623958587646,
+      "learning_rate": 2.999888220149729e-05,
+      "loss": 1.2639,
+      "step": 45
+    },
+    {
+      "epoch": 0.18484288354898337,
+      "grad_norm": 0.3642303943634033,
+      "learning_rate": 2.9992051813950364e-05,
+      "loss": 1.2372,
+      "step": 50
+    },
+    {
+      "epoch": 0.2033271719038817,
+      "grad_norm": 0.346287339925766,
+      "learning_rate": 2.9979014862310932e-05,
+      "loss": 1.2541,
+      "step": 55
+    },
+    {
+      "epoch": 0.22181146025878004,
+      "grad_norm": 0.3948257267475128,
+      "learning_rate": 2.9959776743750573e-05,
+      "loss": 1.182,
+      "step": 60
+    },
+    {
+      "epoch": 0.24029574861367836,
+      "grad_norm": 0.3917973041534424,
+      "learning_rate": 2.9934345422663842e-05,
+      "loss": 1.2751,
+      "step": 65
+    },
+    {
+      "epoch": 0.2587800369685767,
+      "grad_norm": 0.3516767919063568,
+      "learning_rate": 2.9902731427371096e-05,
+      "loss": 1.1232,
+      "step": 70
+    },
+    {
+      "epoch": 0.27726432532347506,
+      "grad_norm": 0.4123232066631317,
+      "learning_rate": 2.986494784575986e-05,
+      "loss": 1.2038,
+      "step": 75
+    },
+    {
+      "epoch": 0.2957486136783734,
+      "grad_norm": 0.429155558347702,
+      "learning_rate": 2.9821010319866595e-05,
+      "loss": 1.1977,
+      "step": 80
+    },
+    {
+      "epoch": 0.3142329020332717,
+      "grad_norm": 0.41628238558769226,
+      "learning_rate": 2.9770937039401006e-05,
+      "loss": 1.1586,
+      "step": 85
+    },
+    {
+      "epoch": 0.33271719038817005,
+      "grad_norm": 0.45903369784355164,
+      "learning_rate": 2.9714748734215714e-05,
+      "loss": 1.1056,
+      "step": 90
+    },
+    {
+      "epoch": 0.3512014787430684,
+      "grad_norm": 0.443715900182724,
+      "learning_rate": 2.9652468665724275e-05,
+      "loss": 1.0919,
+      "step": 95
+    },
+    {
+      "epoch": 0.36968576709796674,
+      "grad_norm": 0.4556092619895935,
+      "learning_rate": 2.9584122617271202e-05,
+      "loss": 1.0706,
+      "step": 100
+    },
+    {
+      "epoch": 0.38817005545286504,
+      "grad_norm": 0.4700096547603607,
+      "learning_rate": 2.9509738883457885e-05,
+      "loss": 1.0293,
+      "step": 105
+    },
+    {
+      "epoch": 0.4066543438077634,
+      "grad_norm": 0.5135429501533508,
+      "learning_rate": 2.9429348258428933e-05,
+      "loss": 1.0677,
+      "step": 110
+    },
+    {
+      "epoch": 0.42513863216266173,
+      "grad_norm": 0.48664531111717224,
+      "learning_rate": 2.934298402312365e-05,
+      "loss": 1.0507,
+      "step": 115
+    },
+    {
+      "epoch": 0.4436229205175601,
+      "grad_norm": 0.5072259902954102,
+      "learning_rate": 2.9250681931498096e-05,
+      "loss": 1.0144,
+      "step": 120
+    },
+    {
+      "epoch": 0.46210720887245843,
+      "grad_norm": 0.508030354976654,
+      "learning_rate": 2.915248019572327e-05,
+      "loss": 1.0468,
+      "step": 125
+    },
+    {
+      "epoch": 0.4805914972273567,
+      "grad_norm": 0.5494552254676819,
+      "learning_rate": 2.9048419470365656e-05,
+      "loss": 1.0241,
+      "step": 130
+    },
+    {
+      "epoch": 0.49907578558225507,
+      "grad_norm": 0.5094569325447083,
+      "learning_rate": 2.8938542835556595e-05,
+      "loss": 1.0054,
+      "step": 135
+    },
+    {
+      "epoch": 0.5175600739371534,
+      "grad_norm": 0.5758535861968994,
+      "learning_rate": 2.8822895779157576e-05,
+      "loss": 0.9891,
+      "step": 140
+    },
+    {
+      "epoch": 0.5360443622920518,
+      "grad_norm": 0.5975119471549988,
+      "learning_rate": 2.8701526177928673e-05,
+      "loss": 0.9393,
+      "step": 145
+    },
+    {
+      "epoch": 0.5545286506469501,
+      "grad_norm": 0.6076070070266724,
+      "learning_rate": 2.857448427770802e-05,
+      "loss": 0.9536,
+      "step": 150
+    },
+    {
+      "epoch": 0.5730129390018485,
+      "grad_norm": 0.62619549036026,
+      "learning_rate": 2.8441822672610553e-05,
+      "loss": 0.9498,
+      "step": 155
+    },
+    {
+      "epoch": 0.5914972273567468,
+      "grad_norm": 0.6700627207756042,
+      "learning_rate": 2.830359628325454e-05,
+      "loss": 0.9245,
+      "step": 160
+    },
+    {
+      "epoch": 0.609981515711645,
+      "grad_norm": 0.6142323613166809,
+      "learning_rate": 2.8159862334024963e-05,
+      "loss": 0.9367,
+      "step": 165
+    },
+    {
+      "epoch": 0.6284658040665434,
+      "grad_norm": 0.6446525454521179,
+      "learning_rate": 2.8010680329383213e-05,
+      "loss": 0.906,
+      "step": 170
+    },
+    {
+      "epoch": 0.6469500924214417,
+      "grad_norm": 0.7208899855613708,
+      "learning_rate": 2.7856112029232776e-05,
+      "loss": 0.9289,
+      "step": 175
+    },
+    {
+      "epoch": 0.6654343807763401,
+      "grad_norm": 0.6678638458251953,
+      "learning_rate": 2.7696221423351277e-05,
+      "loss": 0.8781,
+      "step": 180
+    },
+    {
+      "epoch": 0.6839186691312384,
+      "grad_norm": 0.6600868105888367,
+      "learning_rate": 2.7531074704899288e-05,
+      "loss": 0.8787,
+      "step": 185
+    },
+    {
+      "epoch": 0.7024029574861368,
+      "grad_norm": 0.7912498116493225,
+      "learning_rate": 2.7360740243017042e-05,
+      "loss": 0.8147,
+      "step": 190
+    },
+    {
+      "epoch": 0.7208872458410351,
+      "grad_norm": 0.740778923034668,
+      "learning_rate": 2.7185288554520242e-05,
+      "loss": 0.8631,
+      "step": 195
+    },
+    {
+      "epoch": 0.7393715341959335,
+      "grad_norm": 0.7866532206535339,
+      "learning_rate": 2.70047922747068e-05,
+      "loss": 0.8979,
+      "step": 200
+    },
+    {
+      "epoch": 0.7578558225508318,
+      "grad_norm": 0.787307620048523,
+      "learning_rate": 2.681932612728652e-05,
+      "loss": 0.8277,
+      "step": 205
+    },
+    {
+      "epoch": 0.7763401109057301,
+      "grad_norm": 0.8095487952232361,
+      "learning_rate": 2.6628966893446215e-05,
+      "loss": 0.85,
+      "step": 210
+    },
+    {
+      "epoch": 0.7948243992606284,
+      "grad_norm": 0.7215648293495178,
+      "learning_rate": 2.643379338006302e-05,
+      "loss": 0.8056,
+      "step": 215
+    },
+    {
+      "epoch": 0.8133086876155268,
+      "grad_norm": 0.8256942629814148,
+      "learning_rate": 2.6233886387079137e-05,
+      "loss": 0.8355,
+      "step": 220
+    },
+    {
+      "epoch": 0.8317929759704251,
+      "grad_norm": 0.8773136734962463,
+      "learning_rate": 2.602932867405142e-05,
+      "loss": 0.8226,
+      "step": 225
+    },
+    {
+      "epoch": 0.8502772643253235,
+      "grad_norm": 0.8400380611419678,
+      "learning_rate": 2.582020492588973e-05,
+      "loss": 0.8036,
+      "step": 230
+    },
+    {
+      "epoch": 0.8687615526802218,
+      "grad_norm": 0.8145400881767273,
+      "learning_rate": 2.5606601717798212e-05,
+      "loss": 0.8011,
+      "step": 235
+    },
+    {
+      "epoch": 0.8872458410351202,
+      "grad_norm": 0.8678431510925293,
+      "learning_rate": 2.538860747943398e-05,
+      "loss": 0.806,
+      "step": 240
+    },
+    {
+      "epoch": 0.9057301293900185,
+      "grad_norm": 0.9908318519592285,
+      "learning_rate": 2.5166312458298102e-05,
+      "loss": 0.7743,
+      "step": 245
+    },
+    {
+      "epoch": 0.9242144177449169,
+      "grad_norm": 0.8587374687194824,
+      "learning_rate": 2.4939808682374028e-05,
+      "loss": 0.7363,
+      "step": 250
+    },
+    {
+      "epoch": 0.9426987060998152,
+      "grad_norm": 0.9143586158752441,
+      "learning_rate": 2.4709189922028876e-05,
+      "loss": 0.7656,
+      "step": 255
+    },
+    {
+      "epoch": 0.9611829944547134,
+      "grad_norm": 0.9411439299583435,
+      "learning_rate": 2.4474551651193418e-05,
+      "loss": 0.7871,
+      "step": 260
+    },
+    {
+      "epoch": 0.9796672828096118,
+      "grad_norm": 0.8218419551849365,
+      "learning_rate": 2.4235991007836798e-05,
+      "loss": 0.6616,
+      "step": 265
+    },
+    {
+      "epoch": 0.9981515711645101,
+      "grad_norm": 0.9323953986167908,
+      "learning_rate": 2.3993606753752356e-05,
+      "loss": 0.6974,
+      "step": 270
+    },
+    {
+      "epoch": 1.0147874306839186,
+      "grad_norm": 0.9499527812004089,
+      "learning_rate": 2.374749923367119e-05,
+      "loss": 0.7013,
+      "step": 275
+    },
+    {
+      "epoch": 1.033271719038817,
+      "grad_norm": 0.9152259826660156,
+      "learning_rate": 2.3497770333720432e-05,
+      "loss": 0.6304,
+      "step": 280
+    },
+    {
+      "epoch": 1.0517560073937153,
+      "grad_norm": 0.9655373692512512,
+      "learning_rate": 2.324452343924337e-05,
+      "loss": 0.637,
+      "step": 285
+    },
+    {
+      "epoch": 1.0702402957486137,
+      "grad_norm": 0.9426562786102295,
+      "learning_rate": 2.29878633919989e-05,
+      "loss": 0.623,
+      "step": 290
+    },
+    {
+      "epoch": 1.088724584103512,
+      "grad_norm": 1.0198928117752075,
+      "learning_rate": 2.2727896446758087e-05,
+      "loss": 0.6409,
+      "step": 295
+    },
+    {
+      "epoch": 1.1072088724584104,
+      "grad_norm": 1.1868976354599,
+      "learning_rate": 2.24647302273157e-05,
+      "loss": 0.632,
+      "step": 300
+    },
+    {
+      "epoch": 1.1256931608133087,
+      "grad_norm": 1.008347988128662,
+      "learning_rate": 2.219847368193501e-05,
+      "loss": 0.5849,
+      "step": 305
+    },
+    {
+      "epoch": 1.144177449168207,
+      "grad_norm": 0.9435726404190063,
+      "learning_rate": 2.1929237038244254e-05,
+      "loss": 0.6285,
+      "step": 310
+    },
+    {
+      "epoch": 1.1626617375231054,
+      "grad_norm": 1.0835498571395874,
+      "learning_rate": 2.1657131757603488e-05,
+      "loss": 0.6097,
+      "step": 315
+    },
+    {
+      "epoch": 1.1811460258780038,
+      "grad_norm": 0.9613402485847473,
+      "learning_rate": 2.1382270488960633e-05,
+      "loss": 0.5645,
+      "step": 320
+    },
+    {
+      "epoch": 1.1996303142329021,
+      "grad_norm": 1.1990424394607544,
+      "learning_rate": 2.110476702221595e-05,
+      "loss": 0.5833,
+      "step": 325
+    },
+    {
+      "epoch": 1.2181146025878005,
+      "grad_norm": 1.0975488424301147,
+      "learning_rate": 2.082473624111407e-05,
+      "loss": 0.5996,
+      "step": 330
+    },
+    {
+      "epoch": 1.2365988909426986,
+      "grad_norm": 1.119141697883606,
+      "learning_rate": 2.0542294075683273e-05,
+      "loss": 0.545,
+      "step": 335
+    },
+    {
+      "epoch": 1.2550831792975972,
+      "grad_norm": 1.006493330001831,
+      "learning_rate": 2.0257557454241584e-05,
+      "loss": 0.5936,
+      "step": 340
+    },
+    {
+      "epoch": 1.2735674676524953,
+      "grad_norm": 1.16887629032135,
+      "learning_rate": 1.9970644254989547e-05,
+      "loss": 0.5628,
+      "step": 345
+    },
+    {
+      "epoch": 1.2920517560073936,
+      "grad_norm": 0.98349529504776,
+      "learning_rate": 1.968167325720983e-05,
+      "loss": 0.5378,
+      "step": 350
+    },
+    {
+      "epoch": 1.310536044362292,
+      "grad_norm": 1.3183979988098145,
+      "learning_rate": 1.939076409209374e-05,
+      "loss": 0.5473,
+      "step": 355
+    },
+    {
+      "epoch": 1.3290203327171903,
+      "grad_norm": 1.0919528007507324,
+      "learning_rate": 1.9098037193215063e-05,
+      "loss": 0.5329,
+      "step": 360
+    },
+    {
+      "epoch": 1.3475046210720887,
+      "grad_norm": 1.0224053859710693,
+      "learning_rate": 1.8803613746671747e-05,
+      "loss": 0.5028,
+      "step": 365
+    },
+    {
+      "epoch": 1.365988909426987,
+      "grad_norm": 1.1292046308517456,
+      "learning_rate": 1.850761564091601e-05,
+      "loss": 0.5078,
+      "step": 370
+    },
+    {
+      "epoch": 1.3844731977818854,
+      "grad_norm": 0.9598371982574463,
+      "learning_rate": 1.8210165416293737e-05,
+      "loss": 0.5743,
+      "step": 375
+    },
+    {
+      "epoch": 1.4029574861367837,
+      "grad_norm": 1.1307282447814941,
+      "learning_rate": 1.791138621431392e-05,
+      "loss": 0.4947,
+      "step": 380
+    },
+    {
+      "epoch": 1.421441774491682,
+      "grad_norm": 0.9821717143058777,
+      "learning_rate": 1.7611401726669313e-05,
+      "loss": 0.5036,
+      "step": 385
+    },
+    {
+      "epoch": 1.4399260628465804,
+      "grad_norm": 1.2720128297805786,
+      "learning_rate": 1.731033614402924e-05,
+      "loss": 0.5159,
+      "step": 390
+    },
+    {
+      "epoch": 1.4584103512014788,
+      "grad_norm": 1.10440993309021,
+      "learning_rate": 1.7008314104625868e-05,
+      "loss": 0.5246,
+      "step": 395
+    },
+    {
+      "epoch": 1.4768946395563771,
+      "grad_norm": 1.1941485404968262,
+      "learning_rate": 1.6705460642655202e-05,
+      "loss": 0.5066,
+      "step": 400
+    },
+    {
+      "epoch": 1.4953789279112755,
+      "grad_norm": 1.0744893550872803,
+      "learning_rate": 1.6401901136514107e-05,
+      "loss": 0.5011,
+      "step": 405
+    },
+    {
+      "epoch": 1.5138632162661736,
+      "grad_norm": 1.0705432891845703,
+      "learning_rate": 1.609776125689492e-05,
+      "loss": 0.4915,
+      "step": 410
+    },
+    {
+      "epoch": 1.5323475046210722,
+      "grad_norm": 1.1643410921096802,
+      "learning_rate": 1.5793166914758937e-05,
+      "loss": 0.4806,
+      "step": 415
+    },
+    {
+      "epoch": 1.5508317929759703,
+      "grad_norm": 0.9927543997764587,
+      "learning_rate": 1.5488244209210554e-05,
+      "loss": 0.4621,
+      "step": 420
+    },
+    {
+      "epoch": 1.5693160813308689,
+      "grad_norm": 1.1083128452301025,
+      "learning_rate": 1.5183119375293437e-05,
+      "loss": 0.5067,
+      "step": 425
+    },
+    {
+      "epoch": 1.587800369685767,
+      "grad_norm": 1.1395068168640137,
+      "learning_rate": 1.487791873173041e-05,
+      "loss": 0.5153,
+      "step": 430
+    },
+    {
+      "epoch": 1.6062846580406656,
+      "grad_norm": 1.1289736032485962,
+      "learning_rate": 1.4572768628628786e-05,
+      "loss": 0.4848,
+      "step": 435
+    },
+    {
+      "epoch": 1.6247689463955637,
+      "grad_norm": 1.2663849592208862,
+      "learning_rate": 1.42677953951726e-05,
+      "loss": 0.4793,
+      "step": 440
+    },
+    {
+      "epoch": 1.6432532347504623,
+      "grad_norm": 1.1098202466964722,
+      "learning_rate": 1.3963125287323583e-05,
+      "loss": 0.4616,
+      "step": 445
+    },
+    {
+      "epoch": 1.6617375231053604,
+      "grad_norm": 1.1383061408996582,
+      "learning_rate": 1.365888443555238e-05,
+      "loss": 0.4541,
+      "step": 450
+    },
+    {
+      "epoch": 1.6802218114602587,
+      "grad_norm": 1.0592122077941895,
+      "learning_rate": 1.3355198792621797e-05,
+      "loss": 0.4396,
+      "step": 455
+    },
+    {
+      "epoch": 1.698706099815157,
+      "grad_norm": 1.1581696271896362,
+      "learning_rate": 1.3052194081443572e-05,
+      "loss": 0.4608,
+      "step": 460
+    },
+    {
+      "epoch": 1.7171903881700554,
+      "grad_norm": 1.10267174243927,
+      "learning_rate": 1.2749995743030283e-05,
+      "loss": 0.4491,
+      "step": 465
+    },
+    {
+      "epoch": 1.7356746765249538,
+      "grad_norm": 1.0157338380813599,
+      "learning_rate": 1.2448728884564003e-05,
+      "loss": 0.4527,
+      "step": 470
+    },
+    {
+      "epoch": 1.7541589648798521,
+      "grad_norm": 1.4674980640411377,
+      "learning_rate": 1.2148518227603152e-05,
+      "loss": 0.4414,
+      "step": 475
+    },
+    {
+      "epoch": 1.7726432532347505,
+      "grad_norm": 1.2059624195098877,
+      "learning_rate": 1.1849488056448952e-05,
+      "loss": 0.4534,
+      "step": 480
+    },
+    {
+      "epoch": 1.7911275415896488,
+      "grad_norm": 1.1866525411605835,
+      "learning_rate": 1.1551762166692958e-05,
+      "loss": 0.4404,
+      "step": 485
+    },
+    {
+      "epoch": 1.8096118299445472,
+      "grad_norm": 1.2624088525772095,
+      "learning_rate": 1.1255463813966871e-05,
+      "loss": 0.4646,
+      "step": 490
+    },
+    {
+      "epoch": 1.8280961182994455,
+      "grad_norm": 1.128043293952942,
+      "learning_rate": 1.0960715662915888e-05,
+      "loss": 0.42,
+      "step": 495
+    },
+    {
+      "epoch": 1.8465804066543439,
+      "grad_norm": 1.187943696975708,
+      "learning_rate": 1.0667639736416759e-05,
+      "loss": 0.4301,
+      "step": 500
+    },
+    {
+      "epoch": 1.865064695009242,
+      "grad_norm": 1.2349821329116821,
+      "learning_rate": 1.037635736506148e-05,
+      "loss": 0.3983,
+      "step": 505
+    },
+    {
+      "epoch": 1.8835489833641406,
+      "grad_norm": 1.1217410564422607,
+      "learning_rate": 1.0086989136927602e-05,
+      "loss": 0.4225,
+      "step": 510
+    },
+    {
+      "epoch": 1.9020332717190387,
+      "grad_norm": 1.1299536228179932,
+      "learning_rate": 9.799654847655951e-06,
+      "loss": 0.3952,
+      "step": 515
+    },
+    {
+      "epoch": 1.9205175600739373,
+      "grad_norm": 1.186105728149414,
+      "learning_rate": 9.514473450856367e-06,
+      "loss": 0.4042,
+      "step": 520
+    },
+    {
+      "epoch": 1.9390018484288354,
+      "grad_norm": 1.1584864854812622,
+      "learning_rate": 9.231563008862096e-06,
+      "loss": 0.4039,
+      "step": 525
+    },
+    {
+      "epoch": 1.957486136783734,
+      "grad_norm": 1.3031147718429565,
+      "learning_rate": 8.95104064385311e-06,
+      "loss": 0.4166,
+      "step": 530
+    },
+    {
+      "epoch": 1.975970425138632,
+      "grad_norm": 1.181787371635437,
+      "learning_rate": 8.673022489368662e-06,
+      "loss": 0.3907,
+      "step": 535
+    },
+    {
+      "epoch": 1.9944547134935307,
+      "grad_norm": 1.4096273183822632,
+      "learning_rate": 8.397623642229126e-06,
+      "loss": 0.4074,
+      "step": 540
+    },
+    {
+      "epoch": 2.011090573012939,
+      "grad_norm": 1.0593628883361816,
+      "learning_rate": 8.124958114887027e-06,
+      "loss": 0.3666,
+      "step": 545
+    },
+    {
+      "epoch": 2.0295748613678373,
+      "grad_norm": 1.3195940256118774,
+      "learning_rate": 7.855138788227003e-06,
+      "loss": 0.3138,
+      "step": 550
+    },
+    {
+      "epoch": 2.048059149722736,
+      "grad_norm": 1.1379930973052979,
+      "learning_rate": 7.588277364834209e-06,
+      "loss": 0.3354,
+      "step": 555
+    },
+    {
+      "epoch": 2.066543438077634,
+      "grad_norm": 1.1683405637741089,
+      "learning_rate": 7.3244843227505225e-06,
+      "loss": 0.3417,
+      "step": 560
+    },
+    {
+      "epoch": 2.0850277264325325,
+      "grad_norm": 1.1039248704910278,
+      "learning_rate": 7.063868869737703e-06,
+      "loss": 0.3105,
+      "step": 565
+    },
+    {
+      "epoch": 2.1035120147874307,
+      "grad_norm": 1.1492136716842651,
+      "learning_rate": 6.806538898066443e-06,
+      "loss": 0.3276,
+      "step": 570
+    },
+    {
+      "epoch": 2.1219963031423292,
+      "grad_norm": 1.5073720216751099,
+      "learning_rate": 6.552600939850021e-06,
+      "loss": 0.3469,
+      "step": 575
+    },
+    {
+      "epoch": 2.1404805914972274,
+      "grad_norm": 1.2041940689086914,
+      "learning_rate": 6.302160122941039e-06,
+      "loss": 0.3524,
+      "step": 580
+    },
+    {
+      "epoch": 2.1589648798521255,
+      "grad_norm": 1.1772723197937012,
+      "learning_rate": 6.055320127409497e-06,
+      "loss": 0.2891,
+      "step": 585
+    },
+    {
+      "epoch": 2.177449168207024,
+      "grad_norm": 1.0952905416488647,
+      "learning_rate": 5.8121831426202535e-06,
+      "loss": 0.2949,
+      "step": 590
+    },
+    {
+      "epoch": 2.195933456561922,
+      "grad_norm": 1.3353582620620728,
+      "learning_rate": 5.572849824927625e-06,
+      "loss": 0.3531,
+      "step": 595
+    },
+    {
+      "epoch": 2.2144177449168208,
+      "grad_norm": 1.162351369857788,
+      "learning_rate": 5.337419256004635e-06,
+      "loss": 0.3211,
+      "step": 600
+    },
+    {
+      "epoch": 2.232902033271719,
+      "grad_norm": 1.3160362243652344,
+      "learning_rate": 5.105988901824154e-06,
+      "loss": 0.2972,
+      "step": 605
+    },
+    {
+      "epoch": 2.2513863216266174,
+      "grad_norm": 1.189749002456665,
+      "learning_rate": 4.878654572308936e-06,
+      "loss": 0.3115,
+      "step": 610
+    },
+    {
+      "epoch": 2.2698706099815156,
+      "grad_norm": 1.1525466442108154,
+      "learning_rate": 4.65551038166723e-06,
+      "loss": 0.318,
+      "step": 615
+    },
+    {
+      "epoch": 2.288354898336414,
+      "grad_norm": 1.1686450242996216,
+      "learning_rate": 4.436648709430419e-06,
+      "loss": 0.3377,
+      "step": 620
+    },
+    {
+      "epoch": 2.3068391866913123,
+      "grad_norm": 1.4562509059906006,
+      "learning_rate": 4.222160162208794e-06,
+      "loss": 0.3416,
+      "step": 625
+    },
+    {
+      "epoch": 2.325323475046211,
+      "grad_norm": 1.1834423542022705,
+      "learning_rate": 4.0121335361812885e-06,
+      "loss": 0.2834,
+      "step": 630
+    },
+    {
+      "epoch": 2.343807763401109,
+      "grad_norm": 1.3250130414962769,
+      "learning_rate": 3.8066557803347275e-06,
+      "loss": 0.3349,
+      "step": 635
+    },
+    {
+      "epoch": 2.3622920517560075,
+      "grad_norm": 1.1929954290390015,
+      "learning_rate": 3.605811960467774e-06,
+      "loss": 0.3112,
+      "step": 640
+    },
+    {
+      "epoch": 2.3807763401109057,
+      "grad_norm": 1.1679567098617554,
+      "learning_rate": 3.409685223974529e-06,
+      "loss": 0.3134,
+      "step": 645
+    },
+    {
+      "epoch": 2.3992606284658042,
+      "grad_norm": 1.2451035976409912,
+      "learning_rate": 3.218356765422317e-06,
+      "loss": 0.3175,
+      "step": 650
+    },
+    {
+      "epoch": 2.4177449168207024,
+      "grad_norm": 1.301519513130188,
+      "learning_rate": 3.0319057929379145e-06,
+      "loss": 0.3134,
+      "step": 655
+    },
+    {
+      "epoch": 2.436229205175601,
+      "grad_norm": 1.1192182302474976,
+      "learning_rate": 2.850409495416167e-06,
+      "loss": 0.3448,
+      "step": 660
+    },
+    {
+      "epoch": 2.454713493530499,
+      "grad_norm": 1.1649621725082397,
+      "learning_rate": 2.673943010564496e-06,
+      "loss": 0.2956,
+      "step": 665
+    },
+    {
+      "epoch": 2.473197781885397,
+      "grad_norm": 1.1658220291137695,
+      "learning_rate": 2.5025793937966744e-06,
+      "loss": 0.2783,
+      "step": 670
+    },
+    {
+      "epoch": 2.4916820702402958,
+      "grad_norm": 1.1408849954605103,
+      "learning_rate": 2.336389587988552e-06,
+      "loss": 0.2982,
+      "step": 675
+    },
+    {
+      "epoch": 2.5101663585951943,
+      "grad_norm": 1.0377517938613892,
+      "learning_rate": 2.1754423941084086e-06,
+      "loss": 0.2628,
+      "step": 680
+    },
+    {
+      "epoch": 2.5286506469500925,
+      "grad_norm": 1.2771639823913574,
+      "learning_rate": 2.0198044427340116e-06,
+      "loss": 0.3142,
+      "step": 685
+    },
+    {
+      "epoch": 2.5471349353049906,
+      "grad_norm": 1.218306064605713,
+      "learning_rate": 1.8695401664682088e-06,
+      "loss": 0.3023,
+      "step": 690
+    },
+    {
+      "epoch": 2.565619223659889,
+      "grad_norm": 1.2790991067886353,
+      "learning_rate": 1.7247117732644596e-06,
+      "loss": 0.308,
+      "step": 695
+    },
+    {
+      "epoch": 2.5841035120147873,
+      "grad_norm": 1.2583755254745483,
+      "learning_rate": 1.5853792206733369e-06,
+      "loss": 0.3019,
+      "step": 700
+    },
+    {
+      "epoch": 2.602587800369686,
+      "grad_norm": 1.2065449953079224,
+      "learning_rate": 1.45160019102069e-06,
+      "loss": 0.284,
+      "step": 705
+    },
+    {
+      "epoch": 2.621072088724584,
+      "grad_norm": 1.192649483680725,
+      "learning_rate": 1.3234300675277094e-06,
+      "loss": 0.3328,
+      "step": 710
+    },
+    {
+      "epoch": 2.6395563770794825,
+      "grad_norm": 1.080551028251648,
+      "learning_rate": 1.2009219113828184e-06,
+      "loss": 0.2937,
+      "step": 715
+    },
+    {
+      "epoch": 2.6580406654343807,
+      "grad_norm": 1.1941901445388794,
+      "learning_rate": 1.084126439774864e-06,
+      "loss": 0.2723,
+      "step": 720
+    },
+    {
+      "epoch": 2.6765249537892792,
+      "grad_norm": 1.2448822259902954,
+      "learning_rate": 9.730920048966757e-07,
+      "loss": 0.2898,
+      "step": 725
+    },
+    {
+      "epoch": 2.6950092421441774,
+      "grad_norm": 1.1848639249801636,
+      "learning_rate": 8.678645739277396e-07,
+      "loss": 0.2818,
+      "step": 730
+    },
+    {
+      "epoch": 2.713493530499076,
+      "grad_norm": 1.2146798372268677,
+      "learning_rate": 7.684877100042192e-07,
+      "loss": 0.2966,
+      "step": 735
+    },
+    {
+      "epoch": 2.731977818853974,
+      "grad_norm": 1.1813101768493652,
+      "learning_rate": 6.750025541842497e-07,
+      "loss": 0.3164,
+      "step": 740
+    },
+    {
+      "epoch": 2.7504621072088726,
+      "grad_norm": 1.1815142631530762,
+      "learning_rate": 5.874478084159374e-07,
+      "loss": 0.3016,
+      "step": 745
+    },
+    {
+      "epoch": 2.7689463955637708,
+      "grad_norm": 1.1811829805374146,
+      "learning_rate": 5.058597195151276e-07,
+      "loss": 0.308,
+      "step": 750
+    },
+    {
+      "epoch": 2.787430683918669,
+      "grad_norm": 1.2127840518951416,
+      "learning_rate": 4.3027206415957034e-07,
+      "loss": 0.3465,
+      "step": 755
+    },
+    {
+      "epoch": 2.8059149722735675,
+      "grad_norm": 1.2649399042129517,
+      "learning_rate": 3.607161349057064e-07,
+      "loss": 0.2803,
+      "step": 760
+    },
+    {
+      "epoch": 2.824399260628466,
+      "grad_norm": 1.1885586977005005,
+      "learning_rate": 2.9722072723385283e-07,
+      "loss": 0.2765,
+      "step": 765
+    },
+    {
+      "epoch": 2.842883548983364,
+      "grad_norm": 1.1760481595993042,
+      "learning_rate": 2.3981212762715475e-07,
+      "loss": 0.3255,
+      "step": 770
+    },
+    {
+      "epoch": 2.8613678373382623,
+      "grad_norm": 1.0989104509353638,
+      "learning_rate": 1.885141026892323e-07,
+      "loss": 0.3044,
+      "step": 775
+    },
+    {
+      "epoch": 2.879852125693161,
+      "grad_norm": 1.2079111337661743,
+      "learning_rate": 1.4334788930504273e-07,
+      "loss": 0.2889,
+      "step": 780
+    },
+    {
+      "epoch": 2.8983364140480594,
+      "grad_norm": 1.239363431930542,
+      "learning_rate": 1.043321858490015e-07,
+      "loss": 0.3203,
+      "step": 785
+    },
+    {
+      "epoch": 2.9168207024029575,
+      "grad_norm": 1.2558146715164185,
+      "learning_rate": 7.148314444405946e-08,
+      "loss": 0.3306,
+      "step": 790
+    },
+    {
+      "epoch": 2.9353049907578557,
+      "grad_norm": 1.153817057609558,
+      "learning_rate": 4.4814364274863294e-08,
+      "loss": 0.2655,
+      "step": 795
+    },
+    {
+      "epoch": 2.9537892791127542,
+      "grad_norm": 1.2156027555465698,
+      "learning_rate": 2.433688595783079e-08,
+      "loss": 0.2967,
+      "step": 800
+    },
+    {
+      "epoch": 2.9722735674676524,
+      "grad_norm": 1.206207275390625,
+      "learning_rate": 1.0059186970440282e-08,
+      "loss": 0.3296,
+      "step": 805
+    },
+    {
+      "epoch": 2.990757855822551,
+      "grad_norm": 1.1297657489776611,
+      "learning_rate": 1.987178141641399e-09,
+      "loss": 0.281,
+      "step": 810
+    },
+    {
+      "epoch": 3.0,
+      "step": 813,
+      "total_flos": 1.140044753493033e+18,
+      "train_loss": 0.6133637193677464,
+      "train_runtime": 530.5624,
+      "train_samples_per_second": 48.888,
+      "train_steps_per_second": 1.532
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 813,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.140044753493033e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

125_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cba3dd3c7d6a2cc510eed74004672d2e3e62df6eb9f68c3ed60a8ba630fd14e6
+size 8273

125_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff