Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

10_128_e3_3e-5/adapter_config.json +39 -0
10_128_e3_3e-5/adapter_model.safetensors +3 -0
10_128_e3_3e-5/added_tokens.json +9 -0
10_128_e3_3e-5/all_results.json +9 -0
10_128_e3_3e-5/chat_template.jinja +62 -0
10_128_e3_3e-5/config.json +32 -0
10_128_e3_3e-5/merges.txt +0 -0
10_128_e3_3e-5/special_tokens_map.json +33 -0
10_128_e3_3e-5/tokenizer.json +0 -0
10_128_e3_3e-5/tokenizer_config.json +234 -0
10_128_e3_3e-5/train_results.json +9 -0
10_128_e3_3e-5/trainer_state.json +1163 -0
10_128_e3_3e-5/training_args.bin +3 -0
10_128_e3_3e-5/vocab.json +0 -0

10_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "up_proj",
+    "o_proj",
+    "q_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

10_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c933338db760ec6afc4b4eafde69c23f7c2ad47a9c681367fc4d3e8ba4038e1
+size 791751704

10_128_e3_3e-5/added_tokens.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "<|end_of_cite|>": 49156,
+  "<|end_of_plugin|>": 49158,
+  "<|end_of_role|>": 49153,
+  "<|start_of_cite|>": 49155,
+  "<|start_of_plugin|>": 49157,
+  "<|start_of_role|>": 49152,
+  "<|tool_call|>": 49154
+}

10_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.127223997277143e+18,
+    "train_loss": 0.6140232465500838,
+    "train_runtime": 553.9891,
+    "train_samples": 8523,
+    "train_samples_per_second": 46.154,
+    "train_steps_per_second": 1.446
+}

10_128_e3_3e-5/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,62 @@

+{# Alias tools -> available_tools #}
+{%- if tools and not available_tools -%}
+    {%- set available_tools = tools -%}
+{%- endif -%}
+{%- if messages[0]['role'] == 'system' %}
+     {%- set system_message = messages[0]['content'] %}
+     {%- set loop_messages = messages[1:] %}
+ {%- else %}
+     {%- set system_message = "Knowledge Cutoff Date: April 2024.
+Today's Date: " + strftime_now('%B %d, %Y') + ".
+You are Granite, developed by IBM." %}
+     {%- if available_tools and documents %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
+Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+     {%- elif available_tools %}
+         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request." %}
+     {%- elif documents %}
+         {%- set system_message = system_message + " Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+    {%- elif thinking %}
+    {%- set system_message = system_message + " You are a helpful AI assistant.
+Respond to every user query in a comprehensive and detailed way. You can write down your thoughts and reasoning process before responding. In the thought process, engage in a comprehensive cycle of analysis, summarization, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. In the response section, based on various attempts, explorations, and reflections from the thoughts section, systematically present the final solution that you deem correct. The response should summarize the thought process. Write your thoughts between <think></think> and write your response between <response></response> for each user query." %}
+     {%- else %}
+         {%- set system_message = system_message + " You are a helpful AI assistant." %}
+     {%- endif %}
+     {%- if 'citations' in controls and documents %}
+         {%- set system_message = system_message + '
+Use the symbols <|start_of_cite|> and <|end_of_cite|> to indicate when a fact comes from a document in the search result, e.g <|start_of_cite|> {document_id: 1}my fact <|end_of_cite|> for a fact from document 1. Afterwards, list all the citations with their corresponding documents in an ordered list.' %}
+     {%- endif %}
+     {%- if 'hallucinations' in controls and documents %}
+         {%- set system_message = system_message + '
+Finally, after the response is written, include a numbered list of sentences from the response with a corresponding risk value that are hallucinated and not based in the documents.' %}
+     {%- endif %}
+     {%- set loop_messages = messages %}
+ {%- endif %}
+ {{- '<|start_of_role|>system<|end_of_role|>' + system_message + '<|end_of_text|>
+' }}
+ {%- if available_tools %}
+     {{- '<|start_of_role|>available_tools<|end_of_role|>' }}
+     {{- available_tools | tojson(indent=4) }}
+     {{- '<|end_of_text|>
+' }}
+ {%- endif %}
+ {%- if documents %}
+     {%- for document in documents %}
+         {{- '<|start_of_role|>document {"document_id": "' + document['doc_id'] | string + '"}<|end_of_role|>
+' }}
+         {{- document['text'] }}
+         {{- '<|end_of_text|>
+' }}
+              {%- endfor %}
+ {%- endif %}
+ {%- for message in loop_messages %}
+     {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>
+' }}
+     {%- if loop.last and add_generation_prompt %}
+         {{- '<|start_of_role|>assistant' }}
+             {%- if controls %}
+                 {{- ' ' + controls | tojson()}}
+             {%- endif %}
+         {{- '<|end_of_role|>' }}
+     {%- endif %}
+ {%- endfor %}

10_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49159
+}

10_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

10_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|end_of_plugin|>",
+  "unk_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

10_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

10_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,234 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49152": {
+      "content": "<|start_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49153": {
+      "content": "<|end_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49154": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49155": {
+      "content": "<|start_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49156": {
+      "content": "<|end_of_cite|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49157": {
+      "content": "<|start_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49158": {
+      "content": "<|end_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>",
+    "<|start_of_cite|>",
+    "<|end_of_cite|>",
+    "<|start_of_plugin|>",
+    "<|end_of_plugin|>"
+  ],
+  "bos_token": "<|end_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|end_of_plugin|>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|end_of_text|>",
+  "vocab_size": 49152
+}

10_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.127223997277143e+18,
+    "train_loss": 0.6140232465500838,
+    "train_runtime": 553.9891,
+    "train_samples": 8523,
+    "train_samples_per_second": 46.154,
+    "train_steps_per_second": 1.446
+}

10_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1163 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 801,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01876172607879925,
+      "grad_norm": 3.022392988204956,
+      "learning_rate": 2.9268292682926833e-06,
+      "loss": 1.5432,
+      "step": 5
+    },
+    {
+      "epoch": 0.0375234521575985,
+      "grad_norm": 1.6108911037445068,
+      "learning_rate": 6.585365853658537e-06,
+      "loss": 1.5286,
+      "step": 10
+    },
+    {
+      "epoch": 0.05628517823639775,
+      "grad_norm": 0.632531464099884,
+      "learning_rate": 1.024390243902439e-05,
+      "loss": 1.4185,
+      "step": 15
+    },
+    {
+      "epoch": 0.075046904315197,
+      "grad_norm": 0.41699159145355225,
+      "learning_rate": 1.3902439024390245e-05,
+      "loss": 1.3364,
+      "step": 20
+    },
+    {
+      "epoch": 0.09380863039399624,
+      "grad_norm": 0.46378299593925476,
+      "learning_rate": 1.7560975609756096e-05,
+      "loss": 1.3394,
+      "step": 25
+    },
+    {
+      "epoch": 0.1125703564727955,
+      "grad_norm": 0.4041644334793091,
+      "learning_rate": 2.121951219512195e-05,
+      "loss": 1.3112,
+      "step": 30
+    },
+    {
+      "epoch": 0.13133208255159476,
+      "grad_norm": 0.4791252315044403,
+      "learning_rate": 2.4878048780487805e-05,
+      "loss": 1.3009,
+      "step": 35
+    },
+    {
+      "epoch": 0.150093808630394,
+      "grad_norm": 0.3885604441165924,
+      "learning_rate": 2.8536585365853658e-05,
+      "loss": 1.2593,
+      "step": 40
+    },
+    {
+      "epoch": 0.16885553470919323,
+      "grad_norm": 0.36132970452308655,
+      "learning_rate": 2.999884662437762e-05,
+      "loss": 1.2866,
+      "step": 45
+    },
+    {
+      "epoch": 0.18761726078799248,
+      "grad_norm": 0.3579188883304596,
+      "learning_rate": 2.999179886011389e-05,
+      "loss": 1.2833,
+      "step": 50
+    },
+    {
+      "epoch": 0.20637898686679174,
+      "grad_norm": 0.4507288634777069,
+      "learning_rate": 2.9978347102739695e-05,
+      "loss": 1.2055,
+      "step": 55
+    },
+    {
+      "epoch": 0.225140712945591,
+      "grad_norm": 0.3547620475292206,
+      "learning_rate": 2.9958497098394115e-05,
+      "loss": 1.2339,
+      "step": 60
+    },
+    {
+      "epoch": 0.24390243902439024,
+      "grad_norm": 0.40093785524368286,
+      "learning_rate": 2.9932257326332545e-05,
+      "loss": 1.1664,
+      "step": 65
+    },
+    {
+      "epoch": 0.2626641651031895,
+      "grad_norm": 0.4169408977031708,
+      "learning_rate": 2.9899638995304575e-05,
+      "loss": 1.1707,
+      "step": 70
+    },
+    {
+      "epoch": 0.28142589118198874,
+      "grad_norm": 0.44372236728668213,
+      "learning_rate": 2.9860656038766035e-05,
+      "loss": 1.1497,
+      "step": 75
+    },
+    {
+      "epoch": 0.300187617260788,
+      "grad_norm": 0.43783459067344666,
+      "learning_rate": 2.981532510892707e-05,
+      "loss": 1.1128,
+      "step": 80
+    },
+    {
+      "epoch": 0.31894934333958724,
+      "grad_norm": 0.49107587337493896,
+      "learning_rate": 2.9763665569638878e-05,
+      "loss": 1.1998,
+      "step": 85
+    },
+    {
+      "epoch": 0.33771106941838647,
+      "grad_norm": 0.4160652160644531,
+      "learning_rate": 2.970569948812214e-05,
+      "loss": 1.1335,
+      "step": 90
+    },
+    {
+      "epoch": 0.35647279549718575,
+      "grad_norm": 0.5091213583946228,
+      "learning_rate": 2.964145162554061e-05,
+      "loss": 1.1229,
+      "step": 95
+    },
+    {
+      "epoch": 0.37523452157598497,
+      "grad_norm": 0.5193919539451599,
+      "learning_rate": 2.9570949426424015e-05,
+      "loss": 1.0148,
+      "step": 100
+    },
+    {
+      "epoch": 0.39399624765478425,
+      "grad_norm": 0.5774475932121277,
+      "learning_rate": 2.9494223006944636e-05,
+      "loss": 1.0851,
+      "step": 105
+    },
+    {
+      "epoch": 0.41275797373358347,
+      "grad_norm": 0.559084415435791,
+      "learning_rate": 2.9411305142052725e-05,
+      "loss": 1.0668,
+      "step": 110
+    },
+    {
+      "epoch": 0.43151969981238275,
+      "grad_norm": 0.5823184847831726,
+      "learning_rate": 2.9322231251476117e-05,
+      "loss": 1.0628,
+      "step": 115
+    },
+    {
+      "epoch": 0.450281425891182,
+      "grad_norm": 0.456752747297287,
+      "learning_rate": 2.9227039384590143e-05,
+      "loss": 1.0284,
+      "step": 120
+    },
+    {
+      "epoch": 0.46904315196998125,
+      "grad_norm": 0.5233507752418518,
+      "learning_rate": 2.9125770204164183e-05,
+      "loss": 0.9821,
+      "step": 125
+    },
+    {
+      "epoch": 0.4878048780487805,
+      "grad_norm": 0.5157922506332397,
+      "learning_rate": 2.9018466968991913e-05,
+      "loss": 1.0172,
+      "step": 130
+    },
+    {
+      "epoch": 0.5065666041275797,
+      "grad_norm": 0.5927984714508057,
+      "learning_rate": 2.8905175515412607e-05,
+      "loss": 1.0262,
+      "step": 135
+    },
+    {
+      "epoch": 0.525328330206379,
+      "grad_norm": 0.577369749546051,
+      "learning_rate": 2.8785944237731366e-05,
+      "loss": 0.9844,
+      "step": 140
+    },
+    {
+      "epoch": 0.5440900562851783,
+      "grad_norm": 0.6549458503723145,
+      "learning_rate": 2.8660824067546728e-05,
+      "loss": 0.9926,
+      "step": 145
+    },
+    {
+      "epoch": 0.5628517823639775,
+      "grad_norm": 0.5884875655174255,
+      "learning_rate": 2.8529868451994387e-05,
+      "loss": 0.9397,
+      "step": 150
+    },
+    {
+      "epoch": 0.5816135084427767,
+      "grad_norm": 0.6765407919883728,
+      "learning_rate": 2.8393133330916343e-05,
+      "loss": 1.0031,
+      "step": 155
+    },
+    {
+      "epoch": 0.600375234521576,
+      "grad_norm": 0.6589024066925049,
+      "learning_rate": 2.825067711296533e-05,
+      "loss": 1.005,
+      "step": 160
+    },
+    {
+      "epoch": 0.6191369606003753,
+      "grad_norm": 0.7365466952323914,
+      "learning_rate": 2.8102560650654564e-05,
+      "loss": 0.92,
+      "step": 165
+    },
+    {
+      "epoch": 0.6378986866791745,
+      "grad_norm": 0.7390749454498291,
+      "learning_rate": 2.794884721436361e-05,
+      "loss": 0.8568,
+      "step": 170
+    },
+    {
+      "epoch": 0.6566604127579737,
+      "grad_norm": 0.7310432195663452,
+      "learning_rate": 2.7789602465311384e-05,
+      "loss": 0.882,
+      "step": 175
+    },
+    {
+      "epoch": 0.6754221388367729,
+      "grad_norm": 0.7154702544212341,
+      "learning_rate": 2.7624894427507907e-05,
+      "loss": 0.8418,
+      "step": 180
+    },
+    {
+      "epoch": 0.6941838649155723,
+      "grad_norm": 0.7102565765380859,
+      "learning_rate": 2.745479345869671e-05,
+      "loss": 0.8737,
+      "step": 185
+    },
+    {
+      "epoch": 0.7129455909943715,
+      "grad_norm": 0.79137122631073,
+      "learning_rate": 2.727937222030039e-05,
+      "loss": 0.8999,
+      "step": 190
+    },
+    {
+      "epoch": 0.7317073170731707,
+      "grad_norm": 0.8127190470695496,
+      "learning_rate": 2.709870564638206e-05,
+      "loss": 0.8011,
+      "step": 195
+    },
+    {
+      "epoch": 0.7504690431519699,
+      "grad_norm": 0.7601691484451294,
+      "learning_rate": 2.6912870911636064e-05,
+      "loss": 0.858,
+      "step": 200
+    },
+    {
+      "epoch": 0.7692307692307693,
+      "grad_norm": 0.778498113155365,
+      "learning_rate": 2.6721947398421535e-05,
+      "loss": 0.792,
+      "step": 205
+    },
+    {
+      "epoch": 0.7879924953095685,
+      "grad_norm": 0.7714143991470337,
+      "learning_rate": 2.6526016662852887e-05,
+      "loss": 0.8191,
+      "step": 210
+    },
+    {
+      "epoch": 0.8067542213883677,
+      "grad_norm": 0.8038144707679749,
+      "learning_rate": 2.632516239996181e-05,
+      "loss": 0.7958,
+      "step": 215
+    },
+    {
+      "epoch": 0.8255159474671669,
+      "grad_norm": 0.9297365546226501,
+      "learning_rate": 2.6119470407945566e-05,
+      "loss": 0.8287,
+      "step": 220
+    },
+    {
+      "epoch": 0.8442776735459663,
+      "grad_norm": 0.7835946083068848,
+      "learning_rate": 2.5909028551516845e-05,
+      "loss": 0.7888,
+      "step": 225
+    },
+    {
+      "epoch": 0.8630393996247655,
+      "grad_norm": 0.8794981837272644,
+      "learning_rate": 2.5693926724370958e-05,
+      "loss": 0.8048,
+      "step": 230
+    },
+    {
+      "epoch": 0.8818011257035647,
+      "grad_norm": 0.8462761640548706,
+      "learning_rate": 2.5474256810786226e-05,
+      "loss": 0.7762,
+      "step": 235
+    },
+    {
+      "epoch": 0.900562851782364,
+      "grad_norm": 0.7827888131141663,
+      "learning_rate": 2.5250112646374125e-05,
+      "loss": 0.7393,
+      "step": 240
+    },
+    {
+      "epoch": 0.9193245778611632,
+      "grad_norm": 0.912244439125061,
+      "learning_rate": 2.5021589977995867e-05,
+      "loss": 0.7827,
+      "step": 245
+    },
+    {
+      "epoch": 0.9380863039399625,
+      "grad_norm": 0.8597365617752075,
+      "learning_rate": 2.478878642286253e-05,
+      "loss": 0.8052,
+      "step": 250
+    },
+    {
+      "epoch": 0.9568480300187617,
+      "grad_norm": 0.9068575501441956,
+      "learning_rate": 2.4551801426836288e-05,
+      "loss": 0.7137,
+      "step": 255
+    },
+    {
+      "epoch": 0.975609756097561,
+      "grad_norm": 0.8579255938529968,
+      "learning_rate": 2.431073622195047e-05,
+      "loss": 0.7523,
+      "step": 260
+    },
+    {
+      "epoch": 0.9943714821763602,
+      "grad_norm": 1.0096858739852905,
+      "learning_rate": 2.4065693783166626e-05,
+      "loss": 0.7339,
+      "step": 265
+    },
+    {
+      "epoch": 1.0112570356472796,
+      "grad_norm": 0.9384812116622925,
+      "learning_rate": 2.3816778784387097e-05,
+      "loss": 0.6596,
+      "step": 270
+    },
+    {
+      "epoch": 1.0300187617260788,
+      "grad_norm": 0.9697140455245972,
+      "learning_rate": 2.3564097553741828e-05,
+      "loss": 0.6048,
+      "step": 275
+    },
+    {
+      "epoch": 1.048780487804878,
+      "grad_norm": 0.9159317016601562,
+      "learning_rate": 2.330775802816856e-05,
+      "loss": 0.626,
+      "step": 280
+    },
+    {
+      "epoch": 1.0675422138836772,
+      "grad_norm": 1.122734785079956,
+      "learning_rate": 2.3047869707305794e-05,
+      "loss": 0.6578,
+      "step": 285
+    },
+    {
+      "epoch": 1.0863039399624765,
+      "grad_norm": 1.0670586824417114,
+      "learning_rate": 2.2784543606718227e-05,
+      "loss": 0.591,
+      "step": 290
+    },
+    {
+      "epoch": 1.1050656660412759,
+      "grad_norm": 1.135185718536377,
+      "learning_rate": 2.251789221047464e-05,
+      "loss": 0.5657,
+      "step": 295
+    },
+    {
+      "epoch": 1.123827392120075,
+      "grad_norm": 0.9424425363540649,
+      "learning_rate": 2.2248029423098443e-05,
+      "loss": 0.5869,
+      "step": 300
+    },
+    {
+      "epoch": 1.1425891181988743,
+      "grad_norm": 0.9191554188728333,
+      "learning_rate": 2.1975070520911534e-05,
+      "loss": 0.5947,
+      "step": 305
+    },
+    {
+      "epoch": 1.1613508442776737,
+      "grad_norm": 1.005942940711975,
+      "learning_rate": 2.16991321027921e-05,
+      "loss": 0.592,
+      "step": 310
+    },
+    {
+      "epoch": 1.1801125703564728,
+      "grad_norm": 1.048733115196228,
+      "learning_rate": 2.1420332040367486e-05,
+      "loss": 0.6034,
+      "step": 315
+    },
+    {
+      "epoch": 1.198874296435272,
+      "grad_norm": 1.034255862236023,
+      "learning_rate": 2.113878942766341e-05,
+      "loss": 0.5824,
+      "step": 320
+    },
+    {
+      "epoch": 1.2176360225140712,
+      "grad_norm": 1.010331392288208,
+      "learning_rate": 2.0854624530230984e-05,
+      "loss": 0.5624,
+      "step": 325
+    },
+    {
+      "epoch": 1.2363977485928705,
+      "grad_norm": 1.2689473628997803,
+      "learning_rate": 2.056795873377331e-05,
+      "loss": 0.543,
+      "step": 330
+    },
+    {
+      "epoch": 1.2551594746716699,
+      "grad_norm": 1.0222091674804688,
+      "learning_rate": 2.0278914492293585e-05,
+      "loss": 0.5726,
+      "step": 335
+    },
+    {
+      "epoch": 1.273921200750469,
+      "grad_norm": 1.091130256652832,
+      "learning_rate": 1.9987615275786852e-05,
+      "loss": 0.5075,
+      "step": 340
+    },
+    {
+      "epoch": 1.2926829268292683,
+      "grad_norm": 1.0246487855911255,
+      "learning_rate": 1.9694185517497785e-05,
+      "loss": 0.5674,
+      "step": 345
+    },
+    {
+      "epoch": 1.3114446529080674,
+      "grad_norm": 1.078443169593811,
+      "learning_rate": 1.9398750560766973e-05,
+      "loss": 0.5574,
+      "step": 350
+    },
+    {
+      "epoch": 1.3302063789868668,
+      "grad_norm": 1.0648638010025024,
+      "learning_rate": 1.910143660548844e-05,
+      "loss": 0.5921,
+      "step": 355
+    },
+    {
+      "epoch": 1.3489681050656661,
+      "grad_norm": 1.1092274188995361,
+      "learning_rate": 1.880237065420131e-05,
+      "loss": 0.5899,
+      "step": 360
+    },
+    {
+      "epoch": 1.3677298311444652,
+      "grad_norm": 1.1355493068695068,
+      "learning_rate": 1.8501680457838582e-05,
+      "loss": 0.5903,
+      "step": 365
+    },
+    {
+      "epoch": 1.3864915572232646,
+      "grad_norm": 1.1126688718795776,
+      "learning_rate": 1.8199494461156203e-05,
+      "loss": 0.5194,
+      "step": 370
+    },
+    {
+      "epoch": 1.4052532833020637,
+      "grad_norm": 1.0432655811309814,
+      "learning_rate": 1.7895941747865804e-05,
+      "loss": 0.4941,
+      "step": 375
+    },
+    {
+      "epoch": 1.424015009380863,
+      "grad_norm": 1.0098241567611694,
+      "learning_rate": 1.7591151985494456e-05,
+      "loss": 0.4818,
+      "step": 380
+    },
+    {
+      "epoch": 1.4427767354596623,
+      "grad_norm": 1.1722196340560913,
+      "learning_rate": 1.7285255369995066e-05,
+      "loss": 0.528,
+      "step": 385
+    },
+    {
+      "epoch": 1.4615384615384617,
+      "grad_norm": 1.1716938018798828,
+      "learning_rate": 1.6978382570131037e-05,
+      "loss": 0.515,
+      "step": 390
+    },
+    {
+      "epoch": 1.4803001876172608,
+      "grad_norm": 1.16000235080719,
+      "learning_rate": 1.6670664671658944e-05,
+      "loss": 0.5084,
+      "step": 395
+    },
+    {
+      "epoch": 1.49906191369606,
+      "grad_norm": 1.0876880884170532,
+      "learning_rate": 1.6362233121333124e-05,
+      "loss": 0.4756,
+      "step": 400
+    },
+    {
+      "epoch": 1.5178236397748592,
+      "grad_norm": 1.2415553331375122,
+      "learning_rate": 1.6053219670756022e-05,
+      "loss": 0.5233,
+      "step": 405
+    },
+    {
+      "epoch": 1.5365853658536586,
+      "grad_norm": 0.9916921257972717,
+      "learning_rate": 1.5743756320098334e-05,
+      "loss": 0.5095,
+      "step": 410
+    },
+    {
+      "epoch": 1.555347091932458,
+      "grad_norm": 1.1832914352416992,
+      "learning_rate": 1.5433975261712957e-05,
+      "loss": 0.5309,
+      "step": 415
+    },
+    {
+      "epoch": 1.574108818011257,
+      "grad_norm": 1.048359990119934,
+      "learning_rate": 1.5124008823666874e-05,
+      "loss": 0.4976,
+      "step": 420
+    },
+    {
+      "epoch": 1.5928705440900561,
+      "grad_norm": 1.1872073411941528,
+      "learning_rate": 1.4813989413215026e-05,
+      "loss": 0.5141,
+      "step": 425
+    },
+    {
+      "epoch": 1.6116322701688555,
+      "grad_norm": 1.178973913192749,
+      "learning_rate": 1.4504049460240376e-05,
+      "loss": 0.4242,
+      "step": 430
+    },
+    {
+      "epoch": 1.6303939962476548,
+      "grad_norm": 1.1775275468826294,
+      "learning_rate": 1.4194321360684354e-05,
+      "loss": 0.4882,
+      "step": 435
+    },
+    {
+      "epoch": 1.6491557223264541,
+      "grad_norm": 1.173196792602539,
+      "learning_rate": 1.3884937419991688e-05,
+      "loss": 0.4658,
+      "step": 440
+    },
+    {
+      "epoch": 1.6679174484052532,
+      "grad_norm": 1.0661191940307617,
+      "learning_rate": 1.3576029796594015e-05,
+      "loss": 0.4688,
+      "step": 445
+    },
+    {
+      "epoch": 1.6866791744840526,
+      "grad_norm": 1.1285613775253296,
+      "learning_rate": 1.3267730445456208e-05,
+      "loss": 0.4249,
+      "step": 450
+    },
+    {
+      "epoch": 1.7054409005628517,
+      "grad_norm": 1.1846303939819336,
+      "learning_rate": 1.2960171061709665e-05,
+      "loss": 0.4478,
+      "step": 455
+    },
+    {
+      "epoch": 1.724202626641651,
+      "grad_norm": 1.170326590538025,
+      "learning_rate": 1.2653483024396535e-05,
+      "loss": 0.4808,
+      "step": 460
+    },
+    {
+      "epoch": 1.7429643527204504,
+      "grad_norm": 1.1539356708526611,
+      "learning_rate": 1.234779734034906e-05,
+      "loss": 0.4667,
+      "step": 465
+    },
+    {
+      "epoch": 1.7617260787992497,
+      "grad_norm": 1.110994577407837,
+      "learning_rate": 1.2043244588227797e-05,
+      "loss": 0.4277,
+      "step": 470
+    },
+    {
+      "epoch": 1.7804878048780488,
+      "grad_norm": 1.0699677467346191,
+      "learning_rate": 1.1739954862742825e-05,
+      "loss": 0.4346,
+      "step": 475
+    },
+    {
+      "epoch": 1.799249530956848,
+      "grad_norm": 1.1005806922912598,
+      "learning_rate": 1.1438057719081672e-05,
+      "loss": 0.445,
+      "step": 480
+    },
+    {
+      "epoch": 1.8180112570356473,
+      "grad_norm": 1.1484843492507935,
+      "learning_rate": 1.1137682117567642e-05,
+      "loss": 0.4478,
+      "step": 485
+    },
+    {
+      "epoch": 1.8367729831144466,
+      "grad_norm": 1.14404296875,
+      "learning_rate": 1.0838956368572335e-05,
+      "loss": 0.405,
+      "step": 490
+    },
+    {
+      "epoch": 1.855534709193246,
+      "grad_norm": 1.087383508682251,
+      "learning_rate": 1.0542008077705742e-05,
+      "loss": 0.4957,
+      "step": 495
+    },
+    {
+      "epoch": 1.874296435272045,
+      "grad_norm": 1.2697890996932983,
+      "learning_rate": 1.0246964091307435e-05,
+      "loss": 0.3896,
+      "step": 500
+    },
+    {
+      "epoch": 1.8930581613508441,
+      "grad_norm": 1.3436408042907715,
+      "learning_rate": 9.953950442262046e-06,
+      "loss": 0.4224,
+      "step": 505
+    },
+    {
+      "epoch": 1.9118198874296435,
+      "grad_norm": 1.221205711364746,
+      "learning_rate": 9.663092296162252e-06,
+      "loss": 0.4055,
+      "step": 510
+    },
+    {
+      "epoch": 1.9305816135084428,
+      "grad_norm": 1.1539238691329956,
+      "learning_rate": 9.374513897842268e-06,
+      "loss": 0.4304,
+      "step": 515
+    },
+    {
+      "epoch": 1.9493433395872422,
+      "grad_norm": 1.243632197380066,
+      "learning_rate": 9.08833851830458e-06,
+      "loss": 0.4558,
+      "step": 520
+    },
+    {
+      "epoch": 1.9681050656660413,
+      "grad_norm": 1.2150799036026,
+      "learning_rate": 8.804688402062736e-06,
+      "loss": 0.4217,
+      "step": 525
+    },
+    {
+      "epoch": 1.9868667917448404,
+      "grad_norm": 1.3297826051712036,
+      "learning_rate": 8.523684714922608e-06,
+      "loss": 0.3718,
+      "step": 530
+    },
+    {
+      "epoch": 2.00375234521576,
+      "grad_norm": 1.0935436487197876,
+      "learning_rate": 8.245447492224397e-06,
+      "loss": 0.3678,
+      "step": 535
+    },
+    {
+      "epoch": 2.0225140712945593,
+      "grad_norm": 1.2081427574157715,
+      "learning_rate": 7.97009558756758e-06,
+      "loss": 0.3355,
+      "step": 540
+    },
+    {
+      "epoch": 2.041275797373358,
+      "grad_norm": 1.1939301490783691,
+      "learning_rate": 7.697746622040658e-06,
+      "loss": 0.3499,
+      "step": 545
+    },
+    {
+      "epoch": 2.0600375234521575,
+      "grad_norm": 1.1749287843704224,
+      "learning_rate": 7.4285169339773486e-06,
+      "loss": 0.3358,
+      "step": 550
+    },
+    {
+      "epoch": 2.078799249530957,
+      "grad_norm": 1.1871461868286133,
+      "learning_rate": 7.1625215292607685e-06,
+      "loss": 0.3343,
+      "step": 555
+    },
+    {
+      "epoch": 2.097560975609756,
+      "grad_norm": 1.2397356033325195,
+      "learning_rate": 6.899874032196796e-06,
+      "loss": 0.2902,
+      "step": 560
+    },
+    {
+      "epoch": 2.1163227016885555,
+      "grad_norm": 1.1503924131393433,
+      "learning_rate": 6.640686636977554e-06,
+      "loss": 0.314,
+      "step": 565
+    },
+    {
+      "epoch": 2.1350844277673544,
+      "grad_norm": 1.2462199926376343,
+      "learning_rate": 6.3850700597558465e-06,
+      "loss": 0.2878,
+      "step": 570
+    },
+    {
+      "epoch": 2.1538461538461537,
+      "grad_norm": 1.2414541244506836,
+      "learning_rate": 6.133133491350907e-06,
+      "loss": 0.3592,
+      "step": 575
+    },
+    {
+      "epoch": 2.172607879924953,
+      "grad_norm": 1.2128349542617798,
+      "learning_rate": 5.884984550605782e-06,
+      "loss": 0.3152,
+      "step": 580
+    },
+    {
+      "epoch": 2.1913696060037524,
+      "grad_norm": 1.3208764791488647,
+      "learning_rate": 5.640729238416137e-06,
+      "loss": 0.3456,
+      "step": 585
+    },
+    {
+      "epoch": 2.2101313320825517,
+      "grad_norm": 1.2864664793014526,
+      "learning_rate": 5.400471892450251e-06,
+      "loss": 0.3468,
+      "step": 590
+    },
+    {
+      "epoch": 2.2288930581613506,
+      "grad_norm": 1.1478925943374634,
+      "learning_rate": 5.164315142579485e-06,
+      "loss": 0.3226,
+      "step": 595
+    },
+    {
+      "epoch": 2.24765478424015,
+      "grad_norm": 1.2024633884429932,
+      "learning_rate": 4.93235986703821e-06,
+      "loss": 0.3672,
+      "step": 600
+    },
+    {
+      "epoch": 2.2664165103189493,
+      "grad_norm": 1.2632442712783813,
+      "learning_rate": 4.704705149332035e-06,
+      "loss": 0.3291,
+      "step": 605
+    },
+    {
+      "epoch": 2.2851782363977486,
+      "grad_norm": 1.3344372510910034,
+      "learning_rate": 4.481448235912671e-06,
+      "loss": 0.3132,
+      "step": 610
+    },
+    {
+      "epoch": 2.303939962476548,
+      "grad_norm": 1.2751656770706177,
+      "learning_rate": 4.262684494637483e-06,
+      "loss": 0.3121,
+      "step": 615
+    },
+    {
+      "epoch": 2.3227016885553473,
+      "grad_norm": 1.2522976398468018,
+      "learning_rate": 4.048507374031557e-06,
+      "loss": 0.3351,
+      "step": 620
+    },
+    {
+      "epoch": 2.341463414634146,
+      "grad_norm": 1.4454065561294556,
+      "learning_rate": 3.8390083633696434e-06,
+      "loss": 0.3227,
+      "step": 625
+    },
+    {
+      "epoch": 2.3602251407129455,
+      "grad_norm": 1.1708213090896606,
+      "learning_rate": 3.634276953594982e-06,
+      "loss": 0.3265,
+      "step": 630
+    },
+    {
+      "epoch": 2.378986866791745,
+      "grad_norm": 1.2387081384658813,
+      "learning_rate": 3.4344005990917936e-06,
+      "loss": 0.2965,
+      "step": 635
+    },
+    {
+      "epoch": 2.397748592870544,
+      "grad_norm": 1.227336049079895,
+      "learning_rate": 3.2394646803277063e-06,
+      "loss": 0.308,
+      "step": 640
+    },
+    {
+      "epoch": 2.416510318949343,
+      "grad_norm": 1.293278455734253,
+      "learning_rate": 3.049552467382071e-06,
+      "loss": 0.321,
+      "step": 645
+    },
+    {
+      "epoch": 2.4352720450281424,
+      "grad_norm": 1.2272025346755981,
+      "learning_rate": 2.86474508437579e-06,
+      "loss": 0.3423,
+      "step": 650
+    },
+    {
+      "epoch": 2.4540337711069418,
+      "grad_norm": 1.4468872547149658,
+      "learning_rate": 2.6851214748178223e-06,
+      "loss": 0.325,
+      "step": 655
+    },
+    {
+      "epoch": 2.472795497185741,
+      "grad_norm": 1.217806339263916,
+      "learning_rate": 2.5107583678831445e-06,
+      "loss": 0.3177,
+      "step": 660
+    },
+    {
+      "epoch": 2.4915572232645404,
+      "grad_norm": 1.196983814239502,
+      "learning_rate": 2.3417302456366586e-06,
+      "loss": 0.2996,
+      "step": 665
+    },
+    {
+      "epoch": 2.5103189493433398,
+      "grad_norm": 1.333940863609314,
+      "learning_rate": 2.1781093112169132e-06,
+      "loss": 0.3035,
+      "step": 670
+    },
+    {
+      "epoch": 2.529080675422139,
+      "grad_norm": 1.3397897481918335,
+      "learning_rate": 2.019965457993387e-06,
+      "loss": 0.3353,
+      "step": 675
+    },
+    {
+      "epoch": 2.547842401500938,
+      "grad_norm": 1.208940029144287,
+      "learning_rate": 1.867366239710358e-06,
+      "loss": 0.3079,
+      "step": 680
+    },
+    {
+      "epoch": 2.5666041275797373,
+      "grad_norm": 1.1906332969665527,
+      "learning_rate": 1.7203768416302213e-06,
+      "loss": 0.2834,
+      "step": 685
+    },
+    {
+      "epoch": 2.5853658536585367,
+      "grad_norm": 1.1831692457199097,
+      "learning_rate": 1.579060052688548e-06,
+      "loss": 0.2926,
+      "step": 690
+    },
+    {
+      "epoch": 2.604127579737336,
+      "grad_norm": 1.2483961582183838,
+      "learning_rate": 1.4434762386727386e-06,
+      "loss": 0.2704,
+      "step": 695
+    },
+    {
+      "epoch": 2.622889305816135,
+      "grad_norm": 1.1004767417907715,
+      "learning_rate": 1.313683316435793e-06,
+      "loss": 0.3065,
+      "step": 700
+    },
+    {
+      "epoch": 2.641651031894934,
+      "grad_norm": 1.183822751045227,
+      "learning_rate": 1.18973672915619e-06,
+      "loss": 0.2721,
+      "step": 705
+    },
+    {
+      "epoch": 2.6604127579737336,
+      "grad_norm": 1.156959891319275,
+      "learning_rate": 1.0716894226543954e-06,
+      "loss": 0.3419,
+      "step": 710
+    },
+    {
+      "epoch": 2.679174484052533,
+      "grad_norm": 1.2261924743652344,
+      "learning_rate": 9.595918227762052e-07,
+      "loss": 0.2696,
+      "step": 715
+    },
+    {
+      "epoch": 2.6979362101313322,
+      "grad_norm": 1.2564549446105957,
+      "learning_rate": 8.534918138525211e-07,
+      "loss": 0.2769,
+      "step": 720
+    },
+    {
+      "epoch": 2.7166979362101316,
+      "grad_norm": 1.3584355115890503,
+      "learning_rate": 7.534347182447521e-07,
+      "loss": 0.3273,
+      "step": 725
+    },
+    {
+      "epoch": 2.7354596622889304,
+      "grad_norm": 1.2448846101760864,
+      "learning_rate": 6.594632769846353e-07,
+      "loss": 0.2575,
+      "step": 730
+    },
+    {
+      "epoch": 2.75422138836773,
+      "grad_norm": 1.1208593845367432,
+      "learning_rate": 5.71617631516711e-07,
+      "loss": 0.2832,
+      "step": 735
+    },
+    {
+      "epoch": 2.772983114446529,
+      "grad_norm": 1.328932285308838,
+      "learning_rate": 4.899353065512263e-07,
+      "loss": 0.3448,
+      "step": 740
+    },
+    {
+      "epoch": 2.7917448405253285,
+      "grad_norm": 1.177563190460205,
+      "learning_rate": 4.1445119403485165e-07,
+      "loss": 0.2946,
+      "step": 745
+    },
+    {
+      "epoch": 2.8105065666041273,
+      "grad_norm": 1.1846716403961182,
+      "learning_rate": 3.45197538246011e-07,
+      "loss": 0.3084,
+      "step": 750
+    },
+    {
+      "epoch": 2.8292682926829267,
+      "grad_norm": 1.1097540855407715,
+      "learning_rate": 2.822039220212064e-07,
+      "loss": 0.3196,
+      "step": 755
+    },
+    {
+      "epoch": 2.848030018761726,
+      "grad_norm": 1.2466336488723755,
+      "learning_rate": 2.2549725411822485e-07,
+      "loss": 0.287,
+      "step": 760
+    },
+    {
+      "epoch": 2.8667917448405253,
+      "grad_norm": 1.3161360025405884,
+      "learning_rate": 1.751017577216163e-07,
+      "loss": 0.3168,
+      "step": 765
+    },
+    {
+      "epoch": 2.8855534709193247,
+      "grad_norm": 1.4072085618972778,
+      "learning_rate": 1.3103896009537207e-07,
+      "loss": 0.2697,
+      "step": 770
+    },
+    {
+      "epoch": 2.904315196998124,
+      "grad_norm": 1.4452869892120361,
+      "learning_rate": 9.3327683387191e-08,
+      "loss": 0.2703,
+      "step": 775
+    },
+    {
+      "epoch": 2.9230769230769234,
+      "grad_norm": 1.1264808177947998,
+      "learning_rate": 6.198403658829233e-08,
+      "loss": 0.3136,
+      "step": 780
+    },
+    {
+      "epoch": 2.9418386491557222,
+      "grad_norm": 1.240157127380371,
+      "learning_rate": 3.702140865220027e-08,
+      "loss": 0.2867,
+      "step": 785
+    },
+    {
+      "epoch": 2.9606003752345216,
+      "grad_norm": 1.138003945350647,
+      "learning_rate": 1.8450462775428946e-08,
+      "loss": 0.3222,
+      "step": 790
+    },
+    {
+      "epoch": 2.979362101313321,
+      "grad_norm": 1.1305983066558838,
+      "learning_rate": 6.279131842517605e-09,
+      "loss": 0.3197,
+      "step": 795
+    },
+    {
+      "epoch": 2.99812382739212,
+      "grad_norm": 1.1317766904830933,
+      "learning_rate": 5.126150373813144e-10,
+      "loss": 0.2892,
+      "step": 800
+    },
+    {
+      "epoch": 3.0,
+      "step": 801,
+      "total_flos": 1.127223997277143e+18,
+      "train_loss": 0.6140232465500838,
+      "train_runtime": 553.9891,
+      "train_samples_per_second": 46.154,
+      "train_steps_per_second": 1.446
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 801,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.127223997277143e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

10_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91ee83d5c9cc0a7206a6d62b36469ec495e2010690bcecb059d3876e88728040
+size 8209

10_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff