End of training
- README.md +71 -0
- adapter_config.json +31 -0
- adapter_model.safetensors +3 -0
- added_tokens.json +13 -0
- all_results.json +13 -0
- eval_results.json +8 -0
- runs/May16_13-51-04_imran-Precision-Tower-7910/events.out.tfevents.1715849498.imran-Precision-Tower-7910.3233262.0 +3 -0
- runs/May16_13-54-13_imran-Precision-Tower-7910/events.out.tfevents.1715849706.imran-Precision-Tower-7910.3233683.0 +3 -0
- runs/May16_14-14-10_imran-Precision-Tower-7910/events.out.tfevents.1715850899.imran-Precision-Tower-7910.3234996.0 +3 -0
- runs/May16_14-14-10_imran-Precision-Tower-7910/events.out.tfevents.1715860793.imran-Precision-Tower-7910.3234996.1 +3 -0
- special_tokens_map.json +24 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +129 -0
- train_results.json +8 -0
- trainer_state.json +1660 -0
- training_args.bin +3 -0
README.md
ADDED
@@ -0,0 +1,71 @@
+---
+license: mit
+library_name: peft
+tags:
+- trl
+- sft
+- generated_from_trainer
+base_model: microsoft/Phi-3-mini-4k-instruct
+datasets:
+- generator
+model-index:
+- name: checkpoint_update
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# checkpoint_update
+
+This model is a fine-tuned version of [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) on the generator dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.9356
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 0
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.2
+- num_epochs: 5
+
+### Training results
+
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.1904        | 0.5618 | 500  | 1.0617          |
+| 0.765         | 1.1236 | 1000 | 0.9442          |
+| 0.782         | 1.6854 | 1500 | 0.8690          |
+| 0.5591        | 2.2472 | 2000 | 0.8647          |
+| 0.5669        | 2.8090 | 2500 | 0.8296          |
+| 0.4205        | 3.3708 | 3000 | 0.8820          |
+| 0.3812        | 3.9326 | 3500 | 0.8859          |
+| 0.3323        | 4.4944 | 4000 | 0.9360          |
+
+
+### Framework versions
+
+- PEFT 0.10.1.dev0
+- Transformers 4.41.0.dev0
+- Pytorch 2.2.1+cu121
+- Datasets 2.19.0
+- Tokenizers 0.19.1
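As a usage note for the card above: this commit contains a LoRA adapter, not a full model, so the base checkpoint must be loaded first. A minimal inference sketch, assuming the commit's files have been downloaded to a hypothetical local directory `./checkpoint_update` (the prompt text is illustrative):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the frozen base model named in the model card.
base = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained("./checkpoint_update")  # hypothetical path

# Attach the trained LoRA adapter from this commit.
model = PeftModel.from_pretrained(base, "./checkpoint_update")
model.eval()

messages = [{"role": "user", "content": "Summarize what a LoRA adapter is."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
with torch.no_grad():
    output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```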
adapter_config.json
ADDED
@@ -0,0 +1,31 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "down_proj",
+    "qkv_proj",
+    "gate_up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
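This config describes a rank-16 LoRA over every attention and MLP projection of Phi-3, which fuses Q/K/V into `qkv_proj` and the gate/up MLP matrices into `gate_up_proj`. A sketch of the equivalent `peft.LoraConfig` at training time (values copied from the file above; variable names are my own):

```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

lora_config = LoraConfig(
    r=16,                # LoRA rank
    lora_alpha=32,       # effective scaling = lora_alpha / r = 2.0
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["o_proj", "down_proj", "qkv_proj", "gate_up_proj"],
)

base = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()  # only the LoRA matrices are trainable
```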
adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:845b0545b68b4d44ef779d00ef1c452c0d2a81c3d9c81962460104323fad4cbd
+size 50366280
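The three lines above are only a Git LFS pointer; the actual ~50 MB safetensors blob is fetched on checkout or via the Hub client. A sketch for inspecting the downloaded tensors, assuming the same hypothetical local directory as above:

```python
from safetensors.torch import load_file

# The local path is a hypothetical assumption.
state = load_file("./checkpoint_update/adapter_model.safetensors")
for name, tensor in sorted(state.items())[:4]:
    print(name, tuple(tensor.shape), tensor.dtype)
# Expect paired lora_A / lora_B matrices for each targeted projection.
```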
added_tokens.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "<|assistant|>": 32001,
+  "<|endoftext|>": 32000,
+  "<|end|>": 32007,
+  "<|placeholder1|>": 32002,
+  "<|placeholder2|>": 32003,
+  "<|placeholder3|>": 32004,
+  "<|placeholder4|>": 32005,
+  "<|placeholder5|>": 32008,
+  "<|placeholder6|>": 32009,
+  "<|system|>": 32006,
+  "<|user|>": 32010
+}
all_results.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "epoch": 5.0,
+  "eval_loss": 0.9356008172035217,
+  "eval_runtime": 207.2602,
+  "eval_samples": 506,
+  "eval_samples_per_second": 1.848,
+  "eval_steps_per_second": 1.848,
+  "total_flos": 1.024663401529344e+17,
+  "train_loss": 0.6764815047617708,
+  "train_runtime": 9686.8536,
+  "train_samples_per_second": 0.459,
+  "train_steps_per_second": 0.459
+}
eval_results.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "epoch": 5.0,
+  "eval_loss": 0.9356008172035217,
+  "eval_runtime": 207.2602,
+  "eval_samples": 506,
+  "eval_samples_per_second": 1.848,
+  "eval_steps_per_second": 1.848
+}
runs/May16_13-51-04_imran-Precision-Tower-7910/events.out.tfevents.1715849498.imran-Precision-Tower-7910.3233262.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c38f5ee0cc01916c4adab2c740cffc492855f3a9cb1c9b5355e3a95836daa633
+size 6400

runs/May16_13-54-13_imran-Precision-Tower-7910/events.out.tfevents.1715849706.imran-Precision-Tower-7910.3233683.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4e07ee582fc3c3adc049ededfc5123be9cf9037f1b99667a6f0b3dc88c3396b
+size 9565

runs/May16_14-14-10_imran-Precision-Tower-7910/events.out.tfevents.1715850899.imran-Precision-Tower-7910.3234996.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab38a24e98ccf5ff44589b747bd06023f692af602f1111d8b7fe3b35fed2f345
+size 54498

runs/May16_14-14-10_imran-Precision-Tower-7910/events.out.tfevents.1715860793.imran-Precision-Tower-7910.3234996.1
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:277c4063efc10d9e723846f5becdddecca74a4705bd93f3adf159c8c2e71b8e3
+size 359
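These four event files (also LFS pointers) hold the TensorBoard scalars logged during the runs. A sketch for reading them once the blobs are fetched; the scalar tag names are an assumption (the HF Trainer usually logs `train/loss` and `eval/loss`):

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator("runs/May16_14-14-10_imran-Precision-Tower-7910")
acc.Reload()
print(acc.Tags()["scalars"])  # list the tags actually present
for event in acc.Scalars("train/loss"):  # assumed tag name
    print(event.step, event.value)
```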
special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
tokenizer_config.json
ADDED
@@ -0,0 +1,129 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "32000": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<|placeholder1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|placeholder2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|placeholder3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32005": {
+      "content": "<|placeholder4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32006": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1024,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}
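A sketch of what the `chat_template` above renders, assuming the tokenizer is loaded from this commit's files at a hypothetical local path; the expected output is derived by hand from the template:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./checkpoint_update")  # hypothetical path
messages = [
    {"role": "user", "content": "What is the capital of France?"},
    {"role": "assistant", "content": "Paris."},
]
print(tokenizer.apply_chat_template(messages, tokenize=False))
# Expected rendering per the template above:
# <s><|user|>
# What is the capital of France?<|end|>
# <|assistant|>
# Paris.<|end|>
```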
train_results.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "epoch": 5.0,
+  "total_flos": 1.024663401529344e+17,
+  "train_loss": 0.6764815047617708,
+  "train_runtime": 9686.8536,
+  "train_samples_per_second": 0.459,
+  "train_steps_per_second": 0.459
+}
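Since the losses above are mean token cross-entropies, perplexity is simply their exponential; a quick derived check using the values from the JSON files above:

```python
import math

print(math.exp(0.9356008172035217))  # eval perplexity  ≈ 2.55
print(math.exp(0.6764815047617708))  # train perplexity ≈ 1.97
```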
trainer_state.json
ADDED
|
@@ -0,0 +1,1660 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": null,
|
| 3 |
+
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 5.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 4450,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 0.02247191011235955,
|
| 13 |
+
"grad_norm": 3.921875,
|
| 14 |
+
"learning_rate": 4.49438202247191e-06,
|
| 15 |
+
"loss": 2.2098,
|
| 16 |
+
"step": 20
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"epoch": 0.0449438202247191,
|
| 20 |
+
"grad_norm": 2.03125,
|
| 21 |
+
"learning_rate": 8.98876404494382e-06,
|
| 22 |
+
"loss": 2.1052,
|
| 23 |
+
"step": 40
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"epoch": 0.06741573033707865,
|
| 27 |
+
"grad_norm": 1.21875,
|
| 28 |
+
"learning_rate": 1.348314606741573e-05,
|
| 29 |
+
"loss": 2.1605,
|
| 30 |
+
"step": 60
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"epoch": 0.0898876404494382,
|
| 34 |
+
"grad_norm": 1.0234375,
|
| 35 |
+
"learning_rate": 1.797752808988764e-05,
|
| 36 |
+
"loss": 1.8331,
|
| 37 |
+
"step": 80
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"epoch": 0.11235955056179775,
|
| 41 |
+
"grad_norm": 0.61328125,
|
| 42 |
+
"learning_rate": 2.2471910112359552e-05,
|
| 43 |
+
"loss": 1.8186,
|
| 44 |
+
"step": 100
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"epoch": 0.1348314606741573,
|
| 48 |
+
"grad_norm": 0.703125,
|
| 49 |
+
"learning_rate": 2.696629213483146e-05,
|
| 50 |
+
"loss": 1.5533,
|
| 51 |
+
"step": 120
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"epoch": 0.15730337078651685,
|
| 55 |
+
"grad_norm": 0.4921875,
|
| 56 |
+
"learning_rate": 3.1460674157303374e-05,
|
| 57 |
+
"loss": 1.5419,
|
| 58 |
+
"step": 140
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 0.1797752808988764,
|
| 62 |
+
"grad_norm": 0.470703125,
|
| 63 |
+
"learning_rate": 3.595505617977528e-05,
|
| 64 |
+
"loss": 1.3374,
|
| 65 |
+
"step": 160
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"epoch": 0.20224719101123595,
|
| 69 |
+
"grad_norm": 0.52734375,
|
| 70 |
+
"learning_rate": 4.044943820224719e-05,
|
| 71 |
+
"loss": 1.3418,
|
| 72 |
+
"step": 180
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"epoch": 0.2247191011235955,
|
| 76 |
+
"grad_norm": 0.59375,
|
| 77 |
+
"learning_rate": 4.4943820224719104e-05,
|
| 78 |
+
"loss": 1.3921,
|
| 79 |
+
"step": 200
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"epoch": 0.24719101123595505,
|
| 83 |
+
"grad_norm": 0.67578125,
|
| 84 |
+
"learning_rate": 4.943820224719101e-05,
|
| 85 |
+
"loss": 1.1401,
|
| 86 |
+
"step": 220
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"epoch": 0.2696629213483146,
|
| 90 |
+
"grad_norm": 0.72265625,
|
| 91 |
+
"learning_rate": 5.393258426966292e-05,
|
| 92 |
+
"loss": 1.2139,
|
| 93 |
+
"step": 240
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"epoch": 0.29213483146067415,
|
| 97 |
+
"grad_norm": 0.49609375,
|
| 98 |
+
"learning_rate": 5.8426966292134835e-05,
|
| 99 |
+
"loss": 1.171,
|
| 100 |
+
"step": 260
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"epoch": 0.3146067415730337,
|
| 104 |
+
"grad_norm": 0.94921875,
|
| 105 |
+
"learning_rate": 6.292134831460675e-05,
|
| 106 |
+
"loss": 1.1424,
|
| 107 |
+
"step": 280
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 0.33707865168539325,
|
| 111 |
+
"grad_norm": 0.8046875,
|
| 112 |
+
"learning_rate": 6.741573033707866e-05,
|
| 113 |
+
"loss": 1.2171,
|
| 114 |
+
"step": 300
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 0.3595505617977528,
|
| 118 |
+
"grad_norm": 1.09375,
|
| 119 |
+
"learning_rate": 7.191011235955056e-05,
|
| 120 |
+
"loss": 1.1575,
|
| 121 |
+
"step": 320
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"epoch": 0.38202247191011235,
|
| 125 |
+
"grad_norm": 1.4453125,
|
| 126 |
+
"learning_rate": 7.640449438202247e-05,
|
| 127 |
+
"loss": 1.2041,
|
| 128 |
+
"step": 340
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"epoch": 0.4044943820224719,
|
| 132 |
+
"grad_norm": 1.0,
|
| 133 |
+
"learning_rate": 8.089887640449438e-05,
|
| 134 |
+
"loss": 1.106,
|
| 135 |
+
"step": 360
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"epoch": 0.42696629213483145,
|
| 139 |
+
"grad_norm": 1.0390625,
|
| 140 |
+
"learning_rate": 8.53932584269663e-05,
|
| 141 |
+
"loss": 1.0728,
|
| 142 |
+
"step": 380
|
| 143 |
+
},
|
| 144 |
+
{
|
| 145 |
+
"epoch": 0.449438202247191,
|
| 146 |
+
"grad_norm": 1.0859375,
|
| 147 |
+
"learning_rate": 8.988764044943821e-05,
|
| 148 |
+
"loss": 0.9622,
|
| 149 |
+
"step": 400
|
| 150 |
+
},
|
| 151 |
+
{
|
| 152 |
+
"epoch": 0.47191011235955055,
|
| 153 |
+
"grad_norm": 0.99609375,
|
| 154 |
+
"learning_rate": 9.438202247191012e-05,
|
| 155 |
+
"loss": 1.0835,
|
| 156 |
+
"step": 420
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"epoch": 0.4943820224719101,
|
| 160 |
+
"grad_norm": 0.8828125,
|
| 161 |
+
"learning_rate": 9.887640449438202e-05,
|
| 162 |
+
"loss": 1.0557,
|
| 163 |
+
"step": 440
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"epoch": 0.5168539325842697,
|
| 167 |
+
"grad_norm": 1.0859375,
|
| 168 |
+
"learning_rate": 0.00010337078651685395,
|
| 169 |
+
"loss": 1.1037,
|
| 170 |
+
"step": 460
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"epoch": 0.5393258426966292,
|
| 174 |
+
"grad_norm": 1.3203125,
|
| 175 |
+
"learning_rate": 0.00010786516853932584,
|
| 176 |
+
"loss": 1.058,
|
| 177 |
+
"step": 480
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"epoch": 0.5617977528089888,
|
| 181 |
+
"grad_norm": 1.03125,
|
| 182 |
+
"learning_rate": 0.00011235955056179777,
|
| 183 |
+
"loss": 1.1904,
|
| 184 |
+
"step": 500
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"epoch": 0.5617977528089888,
|
| 188 |
+
"eval_loss": 1.0617437362670898,
|
| 189 |
+
"eval_runtime": 206.3616,
|
| 190 |
+
"eval_samples_per_second": 1.856,
|
| 191 |
+
"eval_steps_per_second": 1.856,
|
| 192 |
+
"step": 500
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 0.5842696629213483,
|
| 196 |
+
"grad_norm": 0.60546875,
|
| 197 |
+
"learning_rate": 0.00011685393258426967,
|
| 198 |
+
"loss": 1.1192,
|
| 199 |
+
"step": 520
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 0.6067415730337079,
|
| 203 |
+
"grad_norm": 0.96484375,
|
| 204 |
+
"learning_rate": 0.00012134831460674158,
|
| 205 |
+
"loss": 1.0356,
|
| 206 |
+
"step": 540
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 0.6292134831460674,
|
| 210 |
+
"grad_norm": 1.53125,
|
| 211 |
+
"learning_rate": 0.0001258426966292135,
|
| 212 |
+
"loss": 1.0108,
|
| 213 |
+
"step": 560
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 0.651685393258427,
|
| 217 |
+
"grad_norm": 1.453125,
|
| 218 |
+
"learning_rate": 0.0001303370786516854,
|
| 219 |
+
"loss": 0.8835,
|
| 220 |
+
"step": 580
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 0.6741573033707865,
|
| 224 |
+
"grad_norm": 0.84765625,
|
| 225 |
+
"learning_rate": 0.00013483146067415732,
|
| 226 |
+
"loss": 0.9925,
|
| 227 |
+
"step": 600
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 0.6966292134831461,
|
| 231 |
+
"grad_norm": 1.0625,
|
| 232 |
+
"learning_rate": 0.00013932584269662923,
|
| 233 |
+
"loss": 0.9548,
|
| 234 |
+
"step": 620
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.7191011235955056,
|
| 238 |
+
"grad_norm": 0.9375,
|
| 239 |
+
"learning_rate": 0.00014382022471910112,
|
| 240 |
+
"loss": 1.113,
|
| 241 |
+
"step": 640
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"epoch": 0.7415730337078652,
|
| 245 |
+
"grad_norm": 0.8125,
|
| 246 |
+
"learning_rate": 0.00014831460674157306,
|
| 247 |
+
"loss": 0.9504,
|
| 248 |
+
"step": 660
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"epoch": 0.7640449438202247,
|
| 252 |
+
"grad_norm": 0.6953125,
|
| 253 |
+
"learning_rate": 0.00015280898876404494,
|
| 254 |
+
"loss": 1.1266,
|
| 255 |
+
"step": 680
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"epoch": 0.7865168539325843,
|
| 259 |
+
"grad_norm": 1.2890625,
|
| 260 |
+
"learning_rate": 0.00015730337078651685,
|
| 261 |
+
"loss": 1.0707,
|
| 262 |
+
"step": 700
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 0.8089887640449438,
|
| 266 |
+
"grad_norm": 0.88671875,
|
| 267 |
+
"learning_rate": 0.00016179775280898877,
|
| 268 |
+
"loss": 0.9222,
|
| 269 |
+
"step": 720
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"epoch": 0.8314606741573034,
|
| 273 |
+
"grad_norm": 0.79296875,
|
| 274 |
+
"learning_rate": 0.00016629213483146068,
|
| 275 |
+
"loss": 1.1053,
|
| 276 |
+
"step": 740
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"epoch": 0.8539325842696629,
|
| 280 |
+
"grad_norm": 0.83984375,
|
| 281 |
+
"learning_rate": 0.0001707865168539326,
|
| 282 |
+
"loss": 1.25,
|
| 283 |
+
"step": 760
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"epoch": 0.8764044943820225,
|
| 287 |
+
"grad_norm": 0.46875,
|
| 288 |
+
"learning_rate": 0.0001752808988764045,
|
| 289 |
+
"loss": 0.933,
|
| 290 |
+
"step": 780
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 0.898876404494382,
|
| 294 |
+
"grad_norm": 0.66015625,
|
| 295 |
+
"learning_rate": 0.00017977528089887642,
|
| 296 |
+
"loss": 0.885,
|
| 297 |
+
"step": 800
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"epoch": 0.9213483146067416,
|
| 301 |
+
"grad_norm": 1.4140625,
|
| 302 |
+
"learning_rate": 0.00018426966292134833,
|
| 303 |
+
"loss": 0.9786,
|
| 304 |
+
"step": 820
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"epoch": 0.9438202247191011,
|
| 308 |
+
"grad_norm": 0.96875,
|
| 309 |
+
"learning_rate": 0.00018876404494382024,
|
| 310 |
+
"loss": 0.9101,
|
| 311 |
+
"step": 840
|
| 312 |
+
},
|
| 313 |
+
{
|
| 314 |
+
"epoch": 0.9662921348314607,
|
| 315 |
+
"grad_norm": 0.80859375,
|
| 316 |
+
"learning_rate": 0.00019325842696629215,
|
| 317 |
+
"loss": 0.9005,
|
| 318 |
+
"step": 860
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"epoch": 0.9887640449438202,
|
| 322 |
+
"grad_norm": 0.99609375,
|
| 323 |
+
"learning_rate": 0.00019775280898876404,
|
| 324 |
+
"loss": 0.9485,
|
| 325 |
+
"step": 880
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"epoch": 1.0112359550561798,
|
| 329 |
+
"grad_norm": 0.9921875,
|
| 330 |
+
"learning_rate": 0.00019999610626011892,
|
| 331 |
+
"loss": 1.0071,
|
| 332 |
+
"step": 900
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"epoch": 1.0337078651685394,
|
| 336 |
+
"grad_norm": 0.84375,
|
| 337 |
+
"learning_rate": 0.00019996495816039186,
|
| 338 |
+
"loss": 0.9138,
|
| 339 |
+
"step": 920
|
| 340 |
+
},
|
| 341 |
+
{
|
| 342 |
+
"epoch": 1.0561797752808988,
|
| 343 |
+
"grad_norm": 0.69140625,
|
| 344 |
+
"learning_rate": 0.00019990267166335664,
|
| 345 |
+
"loss": 0.7752,
|
| 346 |
+
"step": 940
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 1.0786516853932584,
|
| 350 |
+
"grad_norm": 0.92578125,
|
| 351 |
+
"learning_rate": 0.00019980926617082901,
|
| 352 |
+
"loss": 0.9331,
|
| 353 |
+
"step": 960
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"epoch": 1.101123595505618,
|
| 357 |
+
"grad_norm": 1.046875,
|
| 358 |
+
"learning_rate": 0.00019968477077797781,
|
| 359 |
+
"loss": 1.0037,
|
| 360 |
+
"step": 980
|
| 361 |
+
},
|
| 362 |
+
{
|
| 363 |
+
"epoch": 1.1235955056179776,
|
| 364 |
+
"grad_norm": 0.56640625,
|
| 365 |
+
"learning_rate": 0.00019952922426426207,
|
| 366 |
+
"loss": 0.765,
|
| 367 |
+
"step": 1000
|
| 368 |
+
},
|
| 369 |
+
{
|
| 370 |
+
"epoch": 1.1235955056179776,
|
| 371 |
+
"eval_loss": 0.944207489490509,
|
| 372 |
+
"eval_runtime": 206.3524,
|
| 373 |
+
"eval_samples_per_second": 1.856,
|
| 374 |
+
"eval_steps_per_second": 1.856,
|
| 375 |
+
"step": 1000
|
| 376 |
+
},
|
| 377 |
+
{
|
| 378 |
+
"epoch": 1.146067415730337,
|
| 379 |
+
"grad_norm": 1.09375,
|
| 380 |
+
"learning_rate": 0.00019934267508135164,
|
| 381 |
+
"loss": 0.861,
|
| 382 |
+
"step": 1020
|
| 383 |
+
},
|
| 384 |
+
{
|
| 385 |
+
"epoch": 1.1685393258426966,
|
| 386 |
+
"grad_norm": 1.25,
|
| 387 |
+
"learning_rate": 0.00019912518133803465,
|
| 388 |
+
"loss": 0.8251,
|
| 389 |
+
"step": 1040
|
| 390 |
+
},
|
| 391 |
+
{
|
| 392 |
+
"epoch": 1.1910112359550562,
|
| 393 |
+
"grad_norm": 0.609375,
|
| 394 |
+
"learning_rate": 0.00019887681078211707,
|
| 395 |
+
"loss": 0.9779,
|
| 396 |
+
"step": 1060
|
| 397 |
+
},
|
| 398 |
+
{
|
| 399 |
+
"epoch": 1.2134831460674158,
|
| 400 |
+
"grad_norm": 0.765625,
|
| 401 |
+
"learning_rate": 0.00019859764077931978,
|
| 402 |
+
"loss": 0.8112,
|
| 403 |
+
"step": 1080
|
| 404 |
+
},
|
| 405 |
+
{
|
| 406 |
+
"epoch": 1.2359550561797752,
|
| 407 |
+
"grad_norm": 0.890625,
|
| 408 |
+
"learning_rate": 0.00019828775828917964,
|
| 409 |
+
"loss": 0.9084,
|
| 410 |
+
"step": 1100
|
| 411 |
+
},
|
| 412 |
+
{
|
| 413 |
+
"epoch": 1.2584269662921348,
|
| 414 |
+
"grad_norm": 1.15625,
|
| 415 |
+
"learning_rate": 0.00019794725983796218,
|
| 416 |
+
"loss": 0.8429,
|
| 417 |
+
"step": 1120
|
| 418 |
+
},
|
| 419 |
+
{
|
| 420 |
+
"epoch": 1.2808988764044944,
|
| 421 |
+
"grad_norm": 0.7890625,
|
| 422 |
+
"learning_rate": 0.00019757625148859441,
|
| 423 |
+
"loss": 0.8029,
|
| 424 |
+
"step": 1140
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"epoch": 1.303370786516854,
|
| 428 |
+
"grad_norm": 1.0078125,
|
| 429 |
+
"learning_rate": 0.00019717484880762685,
|
| 430 |
+
"loss": 0.9478,
|
| 431 |
+
"step": 1160
|
| 432 |
+
},
|
| 433 |
+
{
|
| 434 |
+
"epoch": 1.3258426966292136,
|
| 435 |
+
"grad_norm": 0.9765625,
|
| 436 |
+
"learning_rate": 0.00019674317682923532,
|
| 437 |
+
"loss": 0.6985,
|
| 438 |
+
"step": 1180
|
| 439 |
+
},
|
| 440 |
+
{
|
| 441 |
+
"epoch": 1.348314606741573,
|
| 442 |
+
"grad_norm": 0.9609375,
|
| 443 |
+
"learning_rate": 0.00019628137001627383,
|
| 444 |
+
"loss": 0.8653,
|
| 445 |
+
"step": 1200
|
| 446 |
+
},
|
| 447 |
+
{
|
| 448 |
+
"epoch": 1.3707865168539326,
|
| 449 |
+
"grad_norm": 0.82421875,
|
| 450 |
+
"learning_rate": 0.00019578957221839014,
|
| 451 |
+
"loss": 0.891,
|
| 452 |
+
"step": 1220
|
| 453 |
+
},
|
| 454 |
+
{
|
| 455 |
+
"epoch": 1.3932584269662922,
|
| 456 |
+
"grad_norm": 1.0390625,
|
| 457 |
+
"learning_rate": 0.00019526793662721768,
|
| 458 |
+
"loss": 0.861,
|
| 459 |
+
"step": 1240
|
| 460 |
+
},
|
| 461 |
+
{
|
| 462 |
+
"epoch": 1.4157303370786516,
|
| 463 |
+
"grad_norm": 0.62890625,
|
| 464 |
+
"learning_rate": 0.00019471662572865736,
|
| 465 |
+
"loss": 0.7591,
|
| 466 |
+
"step": 1260
|
| 467 |
+
},
|
| 468 |
+
{
|
| 469 |
+
"epoch": 1.4382022471910112,
|
| 470 |
+
"grad_norm": 0.8828125,
|
| 471 |
+
"learning_rate": 0.00019413581125226438,
|
| 472 |
+
"loss": 0.7109,
|
| 473 |
+
"step": 1280
|
| 474 |
+
},
|
| 475 |
+
{
|
| 476 |
+
"epoch": 1.4606741573033708,
|
| 477 |
+
"grad_norm": 0.6875,
|
| 478 |
+
"learning_rate": 0.00019352567411775565,
|
| 479 |
+
"loss": 0.8947,
|
| 480 |
+
"step": 1300
|
| 481 |
+
},
|
| 482 |
+
{
|
| 483 |
+
"epoch": 1.4831460674157304,
|
| 484 |
+
"grad_norm": 0.88671875,
|
| 485 |
+
"learning_rate": 0.00019288640437865445,
|
| 486 |
+
"loss": 0.8514,
|
| 487 |
+
"step": 1320
|
| 488 |
+
},
|
| 489 |
+
{
|
| 490 |
+
"epoch": 1.50561797752809,
|
| 491 |
+
"grad_norm": 0.69140625,
|
| 492 |
+
"learning_rate": 0.0001922182011630902,
|
| 493 |
+
"loss": 0.7379,
|
| 494 |
+
"step": 1340
|
| 495 |
+
},
|
| 496 |
+
{
|
| 497 |
+
"epoch": 1.5280898876404494,
|
| 498 |
+
"grad_norm": 0.63671875,
|
| 499 |
+
"learning_rate": 0.00019152127261177126,
|
| 500 |
+
"loss": 0.6778,
|
| 501 |
+
"step": 1360
|
| 502 |
+
},
|
| 503 |
+
{
|
| 504 |
+
"epoch": 1.550561797752809,
|
| 505 |
+
"grad_norm": 0.65234375,
|
| 506 |
+
"learning_rate": 0.00019079583581315076,
|
| 507 |
+
"loss": 0.6591,
|
| 508 |
+
"step": 1380
|
| 509 |
+
},
|
| 510 |
+
{
|
| 511 |
+
"epoch": 1.5730337078651684,
|
| 512 |
+
"grad_norm": 0.96484375,
|
| 513 |
+
"learning_rate": 0.0001900421167358048,
|
| 514 |
+
"loss": 0.8635,
|
| 515 |
+
"step": 1400
|
| 516 |
+
},
|
| 517 |
+
{
|
| 518 |
+
"epoch": 1.595505617977528,
|
| 519 |
+
"grad_norm": 0.8125,
|
| 520 |
+
"learning_rate": 0.00018926035015804488,
|
| 521 |
+
"loss": 0.924,
|
| 522 |
+
"step": 1420
|
| 523 |
+
},
|
| 524 |
+
{
|
| 525 |
+
"epoch": 1.6179775280898876,
|
| 526 |
+
"grad_norm": 1.1953125,
|
| 527 |
+
"learning_rate": 0.00018845077959478613,
|
| 528 |
+
"loss": 0.8554,
|
| 529 |
+
"step": 1440
|
| 530 |
+
},
|
| 531 |
+
{
|
| 532 |
+
"epoch": 1.6404494382022472,
|
| 533 |
+
"grad_norm": 0.6171875,
|
| 534 |
+
"learning_rate": 0.00018761365722169403,
|
| 535 |
+
"loss": 0.9471,
|
| 536 |
+
"step": 1460
|
| 537 |
+
},
|
| 538 |
+
{
|
| 539 |
+
"epoch": 1.6629213483146068,
|
| 540 |
+
"grad_norm": 0.64453125,
|
| 541 |
+
"learning_rate": 0.00018674924379663338,
|
| 542 |
+
"loss": 0.9187,
|
| 543 |
+
"step": 1480
|
| 544 |
+
},
|
| 545 |
+
{
|
| 546 |
+
"epoch": 1.6853932584269664,
|
| 547 |
+
"grad_norm": 0.56640625,
|
| 548 |
+
"learning_rate": 0.00018585780857844418,
|
| 549 |
+
"loss": 0.782,
|
| 550 |
+
"step": 1500
|
| 551 |
+
},
|
| 552 |
+
{
|
| 553 |
+
"epoch": 1.6853932584269664,
|
| 554 |
+
"eval_loss": 0.8689672350883484,
|
| 555 |
+
"eval_runtime": 206.6921,
|
| 556 |
+
"eval_samples_per_second": 1.853,
|
| 557 |
+
"eval_steps_per_second": 1.853,
|
| 558 |
+
"step": 1500
|
| 559 |
+
},
|
| 560 |
+
{
|
| 561 |
+
"epoch": 1.7078651685393258,
|
| 562 |
+
"grad_norm": 0.890625,
|
| 563 |
+
"learning_rate": 0.00018493962924306912,
|
| 564 |
+
"loss": 0.8983,
|
| 565 |
+
"step": 1520
|
| 566 |
+
},
|
| 567 |
+
{
|
| 568 |
+
"epoch": 1.7303370786516854,
|
| 569 |
+
"grad_norm": 0.83984375,
|
| 570 |
+
"learning_rate": 0.0001839949917970596,
|
| 571 |
+
"loss": 0.5218,
|
| 572 |
+
"step": 1540
|
| 573 |
+
},
|
| 574 |
+
{
|
| 575 |
+
"epoch": 1.7528089887640448,
|
| 576 |
+
"grad_norm": 0.921875,
|
| 577 |
+
"learning_rate": 0.00018302419048848667,
|
| 578 |
+
"loss": 0.6711,
|
| 579 |
+
"step": 1560
|
| 580 |
+
},
|
| 581 |
+
{
|
| 582 |
+
"epoch": 1.7752808988764044,
|
| 583 |
+
"grad_norm": 0.7578125,
|
| 584 |
+
"learning_rate": 0.0001820275277152846,
|
| 585 |
+
"loss": 0.7932,
|
| 586 |
+
"step": 1580
|
| 587 |
+
},
|
| 588 |
+
{
|
| 589 |
+
"epoch": 1.797752808988764,
|
| 590 |
+
"grad_norm": 0.97265625,
|
| 591 |
+
"learning_rate": 0.00018100531393105623,
|
| 592 |
+
"loss": 0.7181,
|
| 593 |
+
"step": 1600
|
| 594 |
+
},
|
| 595 |
+
{
|
| 596 |
+
"epoch": 1.8202247191011236,
|
| 597 |
+
"grad_norm": 0.95703125,
|
| 598 |
+
"learning_rate": 0.00017995786754836863,
|
| 599 |
+
"loss": 0.8525,
|
| 600 |
+
"step": 1620
|
| 601 |
+
},
|
| 602 |
+
{
|
| 603 |
+
"epoch": 1.8426966292134832,
|
| 604 |
+
"grad_norm": 0.6484375,
|
| 605 |
+
"learning_rate": 0.00017888551483956987,
|
| 606 |
+
"loss": 0.6968,
|
| 607 |
+
"step": 1640
|
| 608 |
+
},
|
| 609 |
+
{
|
| 610 |
+
"epoch": 1.8651685393258428,
|
| 611 |
+
"grad_norm": 0.78515625,
|
| 612 |
+
"learning_rate": 0.00017778858983515743,
|
| 613 |
+
"loss": 0.902,
|
| 614 |
+
"step": 1660
|
| 615 |
+
},
|
| 616 |
+
{
|
| 617 |
+
"epoch": 1.8876404494382022,
|
| 618 |
+
"grad_norm": 0.875,
|
| 619 |
+
"learning_rate": 0.00017666743421972987,
|
| 620 |
+
"loss": 0.954,
|
| 621 |
+
"step": 1680
|
| 622 |
+
},
|
| 623 |
+
{
|
| 624 |
+
"epoch": 1.9101123595505618,
|
| 625 |
+
"grad_norm": 1.0546875,
|
| 626 |
+
"learning_rate": 0.0001755223972255546,
|
| 627 |
+
"loss": 0.791,
|
| 628 |
+
"step": 1700
|
| 629 |
+
},
|
| 630 |
+
{
|
| 631 |
+
"epoch": 1.9325842696629212,
|
| 632 |
+
"grad_norm": 0.765625,
|
| 633 |
+
"learning_rate": 0.00017435383552378428,
|
| 634 |
+
"loss": 0.77,
|
| 635 |
+
"step": 1720
|
| 636 |
+
},
|
| 637 |
+
{
|
| 638 |
+
"epoch": 1.9550561797752808,
|
| 639 |
+
"grad_norm": 1.25,
|
| 640 |
+
"learning_rate": 0.0001731621131133564,
|
| 641 |
+
"loss": 0.6294,
|
| 642 |
+
"step": 1740
|
| 643 |
+
},
|
| 644 |
+
{
|
| 645 |
+
"epoch": 1.9775280898876404,
|
| 646 |
+
"grad_norm": 0.70703125,
|
| 647 |
+
"learning_rate": 0.00017194760120760986,
|
| 648 |
+
"loss": 0.7982,
|
| 649 |
+
"step": 1760
|
| 650 |
+
},
|
| 651 |
+
{
|
| 652 |
+
"epoch": 2.0,
|
| 653 |
+
"grad_norm": 0.67578125,
|
| 654 |
+
"learning_rate": 0.00017071067811865476,
|
| 655 |
+
"loss": 0.7643,
|
| 656 |
+
"step": 1780
|
| 657 |
+
},
|
| 658 |
+
{
|
| 659 |
+
"epoch": 2.0224719101123596,
|
| 660 |
+
"grad_norm": 1.0234375,
|
| 661 |
+
"learning_rate": 0.0001694517291395307,
|
| 662 |
+
"loss": 0.5279,
|
| 663 |
+
"step": 1800
|
| 664 |
+
},
|
| 665 |
+
{
|
| 666 |
+
"epoch": 2.044943820224719,
|
| 667 |
+
"grad_norm": 1.078125,
|
| 668 |
+
"learning_rate": 0.00016817114642419067,
|
| 669 |
+
"loss": 0.6667,
|
| 670 |
+
"step": 1820
|
| 671 |
+
},
|
| 672 |
+
{
|
| 673 |
+
"epoch": 2.067415730337079,
|
| 674 |
+
"grad_norm": 0.9375,
|
| 675 |
+
"learning_rate": 0.00016686932886534781,
|
| 676 |
+
"loss": 0.6427,
|
| 677 |
+
"step": 1840
|
| 678 |
+
},
|
| 679 |
+
{
|
| 680 |
+
"epoch": 2.0898876404494384,
|
| 681 |
+
"grad_norm": 0.7890625,
|
| 682 |
+
"learning_rate": 0.00016554668197022295,
|
| 683 |
+
"loss": 0.633,
|
| 684 |
+
"step": 1860
|
| 685 |
+
},
|
| 686 |
+
{
|
| 687 |
+
"epoch": 2.1123595505617976,
|
| 688 |
+
"grad_norm": 0.7421875,
|
| 689 |
+
"learning_rate": 0.00016420361773423204,
|
| 690 |
+
"loss": 0.5623,
|
| 691 |
+
"step": 1880
|
| 692 |
+
},
|
| 693 |
+
{
|
| 694 |
+
"epoch": 2.134831460674157,
|
| 695 |
+
"grad_norm": 1.078125,
|
| 696 |
+
"learning_rate": 0.00016284055451265246,
|
| 697 |
+
"loss": 0.6311,
|
| 698 |
+
"step": 1900
|
| 699 |
+
},
|
| 700 |
+
{
|
| 701 |
+
"epoch": 2.157303370786517,
|
| 702 |
+
"grad_norm": 1.25,
|
| 703 |
+
"learning_rate": 0.00016145791689030795,
|
| 704 |
+
"loss": 0.7469,
|
| 705 |
+
"step": 1920
|
| 706 |
+
},
|
| 707 |
+
{
|
| 708 |
+
"epoch": 2.1797752808988764,
|
| 709 |
+
"grad_norm": 0.68359375,
|
| 710 |
+
"learning_rate": 0.0001600561355493137,
|
| 711 |
+
"loss": 0.7196,
|
| 712 |
+
"step": 1940
|
| 713 |
+
},
|
| 714 |
+
{
|
| 715 |
+
"epoch": 2.202247191011236,
|
| 716 |
+
"grad_norm": 0.95703125,
|
| 717 |
+
"learning_rate": 0.0001586356471349215,
|
| 718 |
+
"loss": 0.6328,
|
| 719 |
+
"step": 1960
|
| 720 |
+
},
|
| 721 |
+
{
|
| 722 |
+
"epoch": 2.2247191011235956,
|
| 723 |
+
"grad_norm": 0.9765625,
|
| 724 |
+
"learning_rate": 0.00015719689411950808,
|
| 725 |
+
"loss": 0.6349,
|
| 726 |
+
"step": 1980
|
| 727 |
+
},
|
| 728 |
+
{
|
| 729 |
+
"epoch": 2.247191011235955,
|
| 730 |
+
"grad_norm": 1.140625,
|
| 731 |
+
"learning_rate": 0.00015574032466474775,
|
| 732 |
+
"loss": 0.5591,
|
| 733 |
+
"step": 2000
|
| 734 |
+
},
|
| 735 |
+
{
|
| 736 |
+
"epoch": 2.247191011235955,
|
| 737 |
+
"eval_loss": 0.8647096753120422,
|
| 738 |
+
"eval_runtime": 206.6878,
|
| 739 |
+
"eval_samples_per_second": 1.853,
|
| 740 |
+
"eval_steps_per_second": 1.853,
|
| 741 |
+
"step": 2000
|
| 742 |
+
},
|
| 743 |
+
{
|
| 744 |
+
"epoch": 2.2696629213483144,
|
| 745 |
+
"grad_norm": 1.1328125,
|
| 746 |
+
"learning_rate": 0.00015426639248201313,
|
| 747 |
+
"loss": 0.5206,
|
| 748 |
+
"step": 2020
|
| 749 |
+
},
|
| 750 |
+
{
|
| 751 |
+
"epoch": 2.292134831460674,
|
| 752 |
+
"grad_norm": 0.8125,
|
| 753 |
+
"learning_rate": 0.0001527755566910474,
|
| 754 |
+
"loss": 0.7186,
|
| 755 |
+
"step": 2040
|
| 756 |
+
},
|
| 757 |
+
{
|
| 758 |
+
"epoch": 2.3146067415730336,
|
| 759 |
+
"grad_norm": 0.97265625,
|
| 760 |
+
"learning_rate": 0.00015126828167695146,
|
| 761 |
+
"loss": 0.6533,
|
| 762 |
+
"step": 2060
|
| 763 |
+
},
|
| 764 |
+
{
|
| 765 |
+
"epoch": 2.337078651685393,
|
| 766 |
+
"grad_norm": 1.4609375,
|
| 767 |
+
"learning_rate": 0.0001497450369455312,
|
| 768 |
+
"loss": 0.6324,
|
| 769 |
+
"step": 2080
|
| 770 |
+
},
|
| 771 |
+
{
|
| 772 |
+
"epoch": 2.359550561797753,
|
| 773 |
+
"grad_norm": 0.875,
|
| 774 |
+
"learning_rate": 0.00014820629697704965,
|
| 775 |
+
"loss": 0.5276,
|
| 776 |
+
"step": 2100
|
| 777 |
+
},
|
| 778 |
+
{
|
| 779 |
+
"epoch": 2.3820224719101124,
|
| 780 |
+
"grad_norm": 0.74609375,
|
| 781 |
+
"learning_rate": 0.00014665254107842964,
|
| 782 |
+
"loss": 0.612,
|
| 783 |
+
"step": 2120
|
| 784 |
+
},
|
| 785 |
+
{
|
| 786 |
+
"epoch": 2.404494382022472,
|
| 787 |
+
"grad_norm": 0.92578125,
|
| 788 |
+
"learning_rate": 0.00014508425323395317,
|
| 789 |
+
"loss": 0.614,
|
| 790 |
+
"step": 2140
|
| 791 |
+
},
|
| 792 |
+
{
|
| 793 |
+
"epoch": 2.4269662921348316,
|
| 794 |
+
"grad_norm": 0.9921875,
|
| 795 |
+
"learning_rate": 0.0001435019219545034,
|
| 796 |
+
"loss": 0.4988,
|
| 797 |
+
"step": 2160
|
| 798 |
+
},
|
| 799 |
+
{
|
| 800 |
+
"epoch": 2.449438202247191,
|
| 801 |
+
"grad_norm": 0.8515625,
|
| 802 |
+
"learning_rate": 0.00014190604012539684,
|
| 803 |
+
"loss": 0.6777,
|
| 804 |
+
"step": 2180
|
| 805 |
+
},
|
| 806 |
+
{
|
| 807 |
+
"epoch": 2.4719101123595504,
|
| 808 |
+
"grad_norm": 0.96875,
|
| 809 |
+
"learning_rate": 0.00014029710485285324,
|
| 810 |
+
"loss": 0.662,
|
| 811 |
+
"step": 2200
|
| 812 |
+
},
|
| 813 |
+
{
|
| 814 |
+
"epoch": 2.49438202247191,
|
| 815 |
+
"grad_norm": 0.74609375,
|
| 816 |
+
"learning_rate": 0.00013867561730915016,
|
| 817 |
+
"loss": 0.6087,
|
| 818 |
+
"step": 2220
|
| 819 |
+
},
|
| 820 |
+
{
|
| 821 |
+
"epoch": 2.5168539325842696,
|
| 822 |
+
"grad_norm": 0.875,
|
| 823 |
+
"learning_rate": 0.0001370420825765114,
|
| 824 |
+
"loss": 0.56,
|
| 825 |
+
"step": 2240
|
| 826 |
+
},
|
| 827 |
+
{
|
| 828 |
+
"epoch": 2.539325842696629,
|
| 829 |
+
"grad_norm": 1.2421875,
|
| 830 |
+
"learning_rate": 0.00013539700948977717,
|
| 831 |
+
"loss": 0.572,
|
| 832 |
+
"step": 2260
|
| 833 |
+
},
|
| 834 |
+
{
|
| 835 |
+
"epoch": 2.561797752808989,
|
| 836 |
+
"grad_norm": 1.421875,
|
| 837 |
+
"learning_rate": 0.00013374091047790585,
|
| 838 |
+
"loss": 0.7334,
|
| 839 |
+
"step": 2280
|
| 840 |
+
},
|
| 841 |
+
{
|
| 842 |
+
"epoch": 2.5842696629213484,
|
| 843 |
+
"grad_norm": 1.3125,
|
| 844 |
+
"learning_rate": 0.00013207430140435556,
|
| 845 |
+
"loss": 0.5377,
|
| 846 |
+
"step": 2300
|
| 847 |
+
},
|
| 848 |
+
{
|
| 849 |
+
"epoch": 2.606741573033708,
|
| 850 |
+
"grad_norm": 0.86328125,
|
| 851 |
+
"learning_rate": 0.00013039770140639654,
|
| 852 |
+
"loss": 0.6306,
|
| 853 |
+
"step": 2320
|
| 854 |
+
},
|
| 855 |
+
{
|
| 856 |
+
"epoch": 2.629213483146067,
|
| 857 |
+
"grad_norm": 1.3046875,
|
| 858 |
+
"learning_rate": 0.00012871163273340307,
|
| 859 |
+
"loss": 0.582,
|
| 860 |
+
"step": 2340
|
| 861 |
+
},
|
| 862 |
+
{
|
| 863 |
+
"epoch": 2.6516853932584272,
|
| 864 |
+
"grad_norm": 0.83203125,
|
| 865 |
+
"learning_rate": 0.00012701662058417688,
|
| 866 |
+
"loss": 0.6326,
|
| 867 |
+
"step": 2360
|
| 868 |
+
},
|
| 869 |
+
{
|
| 870 |
+
"epoch": 2.6741573033707864,
|
| 871 |
+
"grad_norm": 1.1640625,
|
| 872 |
+
"learning_rate": 0.00012531319294335086,
|
| 873 |
+
"loss": 0.6907,
|
| 874 |
+
"step": 2380
|
| 875 |
+
},
|
| 876 |
+
{
|
| 877 |
+
"epoch": 2.696629213483146,
|
| 878 |
+
"grad_norm": 1.2734375,
|
| 879 |
+
"learning_rate": 0.00012360188041692582,
|
| 880 |
+
"loss": 0.656,
|
| 881 |
+
"step": 2400
|
| 882 |
+
},
|
| 883 |
+
{
|
| 884 |
+
"epoch": 2.7191011235955056,
|
| 885 |
+
"grad_norm": 1.234375,
|
| 886 |
+
"learning_rate": 0.00012188321606699016,
|
| 887 |
+
"loss": 0.5817,
|
| 888 |
+
"step": 2420
|
| 889 |
+
},
|
| 890 |
+
{
|
| 891 |
+
"epoch": 2.741573033707865,
|
| 892 |
+
"grad_norm": 0.97265625,
|
| 893 |
+
"learning_rate": 0.00012015773524567479,
|
| 894 |
+
"loss": 0.5046,
|
| 895 |
+
"step": 2440
|
| 896 |
+
},
|
| 897 |
+
{
|
| 898 |
+
"epoch": 2.764044943820225,
|
| 899 |
+
"grad_norm": 1.3515625,
|
| 900 |
+
"learning_rate": 0.00011842597542839462,
|
| 901 |
+
"loss": 0.6293,
|
| 902 |
+
"step": 2460
|
| 903 |
+
},
|
| 904 |
+
{
|
| 905 |
+
"epoch": 2.7865168539325844,
|
| 906 |
+
"grad_norm": 1.1796875,
|
| 907 |
+
"learning_rate": 0.00011668847604642861,
|
| 908 |
+
"loss": 0.6067,
|
| 909 |
+
"step": 2480
|
| 910 |
+
},
|
| 911 |
+
{
|
| 912 |
+
"epoch": 2.808988764044944,
|
| 913 |
+
"grad_norm": 1.1484375,
|
| 914 |
+
"learning_rate": 0.00011494577831889067,
|
| 915 |
+
"loss": 0.5669,
|
| 916 |
+
"step": 2500
|
| 917 |
+
},
|
| 918 |
+
{
|
| 919 |
+
"epoch": 2.808988764044944,
|
| 920 |
+
"eval_loss": 0.8295581340789795,
|
| 921 |
+
"eval_runtime": 206.704,
|
| 922 |
+
"eval_samples_per_second": 1.853,
|
| 923 |
+
"eval_steps_per_second": 1.853,
|
| 924 |
+
"step": 2500
|
| 925 |
+
},
|
| 926 |
+
{
|
| 927 |
+
"epoch": 2.831460674157303,
|
| 928 |
+
"grad_norm": 1.1484375,
|
| 929 |
+
"learning_rate": 0.00011319842508414365,
|
| 930 |
+
"loss": 0.5429,
|
| 931 |
+
"step": 2520
|
| 932 |
+
},
|
| 933 |
+
{
|
| 934 |
+
"epoch": 2.853932584269663,
|
| 935 |
+
"grad_norm": 1.03125,
|
| 936 |
+
"learning_rate": 0.00011144696063070883,
|
| 937 |
+
"loss": 0.5481,
|
| 938 |
+
"step": 2540
|
| 939 |
+
},
|
| 940 |
+
{
|
| 941 |
+
"epoch": 2.8764044943820224,
|
| 942 |
+
"grad_norm": 0.78515625,
|
| 943 |
+
"learning_rate": 0.00010969193052772396,
|
| 944 |
+
"loss": 0.5308,
|
| 945 |
+
"step": 2560
|
| 946 |
+
},
|
| 947 |
+
{
|
| 948 |
+
"epoch": 2.898876404494382,
|
| 949 |
+
"grad_norm": 0.703125,
|
| 950 |
+
"learning_rate": 0.00010793388145500198,
|
| 951 |
+
"loss": 0.4527,
|
| 952 |
+
"step": 2580
|
| 953 |
+
},
|
| 954 |
+
{
|
| 955 |
+
"epoch": 2.9213483146067416,
|
| 956 |
+
"grad_norm": 1.390625,
|
| 957 |
+
"learning_rate": 0.00010617336103274424,
|
| 958 |
+
"loss": 0.5333,
|
| 959 |
+
"step": 2600
|
| 960 |
+
},
|
| 961 |
+
{
|
| 962 |
+
"epoch": 2.943820224719101,
|
| 963 |
+
"grad_norm": 0.9296875,
|
| 964 |
+
"learning_rate": 0.00010441091765096047,
|
| 965 |
+
"loss": 0.5886,
|
| 966 |
+
"step": 2620
|
| 967 |
+
},
|
| 968 |
+
{
|
| 969 |
+
"epoch": 2.966292134831461,
|
| 970 |
+
"grad_norm": 1.296875,
|
| 971 |
+
"learning_rate": 0.0001026471002986491,
|
| 972 |
+
"loss": 0.626,
|
| 973 |
+
"step": 2640
|
| 974 |
+
},
|
| 975 |
+
{
|
| 976 |
+
"epoch": 2.98876404494382,
|
| 977 |
+
"grad_norm": 0.953125,
|
| 978 |
+
"learning_rate": 0.00010088245839279082,
|
| 979 |
+
"loss": 0.6703,
|
| 980 |
+
"step": 2660
|
| 981 |
+
},
|
| 982 |
+
{
|
| 983 |
+
"epoch": 3.0112359550561796,
|
| 984 |
+
"grad_norm": 0.64453125,
|
| 985 |
+
"learning_rate": 9.911754160720923e-05,
|
| 986 |
+
"loss": 0.4819,
|
| 987 |
+
"step": 2680
|
| 988 |
+
},
|
| 989 |
+
{
|
| 990 |
+
"epoch": 3.033707865168539,
|
| 991 |
+
"grad_norm": 0.71875,
|
| 992 |
+
"learning_rate": 9.735289970135095e-05,
|
| 993 |
+
"loss": 0.4379,
|
| 994 |
+
"step": 2700
|
| 995 |
+
},
|
| 996 |
+
{
|
| 997 |
+
"epoch": 3.056179775280899,
|
| 998 |
+
"grad_norm": 1.7734375,
|
| 999 |
+
"learning_rate": 9.558908234903954e-05,
|
| 1000 |
+
"loss": 0.3811,
|
| 1001 |
+
"step": 2720
|
| 1002 |
+
},
|
| 1003 |
+
{
|
| 1004 |
+
"epoch": 3.0786516853932584,
|
| 1005 |
+
"grad_norm": 0.6953125,
|
| 1006 |
+
"learning_rate": 9.382663896725578e-05,
|
| 1007 |
+
"loss": 0.3855,
|
| 1008 |
+
"step": 2740
|
| 1009 |
+
},
|
| 1010 |
+
{
|
| 1011 |
+
"epoch": 3.101123595505618,
|
| 1012 |
+
"grad_norm": 1.4140625,
|
| 1013 |
+
"learning_rate": 9.206611854499805e-05,
|
| 1014 |
+
"loss": 0.4749,
|
| 1015 |
+
"step": 2760
|
| 1016 |
+
},
|
| 1017 |
+
{
|
| 1018 |
+
"epoch": 3.1235955056179776,
|
| 1019 |
+
"grad_norm": 0.609375,
|
| 1020 |
+
"learning_rate": 9.030806947227607e-05,
|
| 1021 |
+
"loss": 0.501,
|
| 1022 |
+
"step": 2780
|
| 1023 |
+
},
|
| 1024 |
+
{
|
| 1025 |
+
"epoch": 3.146067415730337,
|
| 1026 |
+
"grad_norm": 1.1015625,
|
| 1027 |
+
"learning_rate": 8.855303936929117e-05,
|
| 1028 |
+
"loss": 0.4239,
|
| 1029 |
+
"step": 2800
|
| 1030 |
+
},
|
| 1031 |
+
{
|
| 1032 |
+
"epoch": 3.168539325842697,
|
| 1033 |
+
"grad_norm": 0.84375,
|
| 1034 |
+
"learning_rate": 8.680157491585636e-05,
|
| 1035 |
+
"loss": 0.5388,
|
| 1036 |
+
"step": 2820
|
| 1037 |
+
},
|
| 1038 |
+
{
|
| 1039 |
+
"epoch": 3.191011235955056,
|
| 1040 |
+
"grad_norm": 1.2578125,
|
| 1041 |
+
"learning_rate": 8.505422168110934e-05,
|
| 1042 |
+
"loss": 0.3715,
|
| 1043 |
+
"step": 2840
|
| 1044 |
+
},
|
| 1045 |
+
{
|
| 1046 |
+
"epoch": 3.2134831460674156,
|
| 1047 |
+
"grad_norm": 0.94921875,
|
| 1048 |
+
"learning_rate": 8.331152395357141e-05,
|
| 1049 |
+
"loss": 0.4274,
|
| 1050 |
+
"step": 2860
|
| 1051 |
+
},
|
| 1052 |
+
{
|
| 1053 |
+
"epoch": 3.235955056179775,
|
| 1054 |
+
"grad_norm": 1.046875,
|
| 1055 |
+
"learning_rate": 8.157402457160539e-05,
|
| 1056 |
+
"loss": 0.4368,
|
| 1057 |
+
"step": 2880
|
| 1058 |
+
},
|
| 1059 |
+
{
|
| 1060 |
+
"epoch": 3.258426966292135,
|
| 1061 |
+
"grad_norm": 1.2421875,
|
| 1062 |
+
"learning_rate": 7.984226475432522e-05,
|
| 1063 |
+
"loss": 0.4026,
|
| 1064 |
+
"step": 2900
|
| 1065 |
+
},
|
| 1066 |
+
{
|
| 1067 |
+
"epoch": 3.2808988764044944,
|
| 1068 |
+
"grad_norm": 0.8359375,
|
| 1069 |
+
"learning_rate": 7.811678393300987e-05,
|
| 1070 |
+
"loss": 0.3971,
|
| 1071 |
+
"step": 2920
|
| 1072 |
+
},
|
| 1073 |
+
{
|
| 1074 |
+
"epoch": 3.303370786516854,
|
| 1075 |
+
"grad_norm": 0.94921875,
|
| 1076 |
+
"learning_rate": 7.63981195830742e-05,
|
| 1077 |
+
"loss": 0.395,
|
| 1078 |
+
"step": 2940
|
| 1079 |
+
},
|
| 1080 |
+
{
|
| 1081 |
+
"epoch": 3.3258426966292136,
|
| 1082 |
+
"grad_norm": 1.328125,
|
| 1083 |
+
"learning_rate": 7.468680705664914e-05,
|
| 1084 |
+
"loss": 0.4165,
|
| 1085 |
+
"step": 2960
|
| 1086 |
+
},
|
| 1087 |
+
{
|
| 1088 |
+
"epoch": 3.348314606741573,
|
| 1089 |
+
"grad_norm": 0.88671875,
|
| 1090 |
+
"learning_rate": 7.298337941582314e-05,
|
| 1091 |
+
"loss": 0.4071,
|
| 1092 |
+
"step": 2980
|
| 1093 |
+
},
|
| 1094 |
+
{
|
| 1095 |
+
"epoch": 3.370786516853933,
|
| 1096 |
+
"grad_norm": 1.4296875,
|
| 1097 |
+
"learning_rate": 7.128836726659696e-05,
|
| 1098 |
+
"loss": 0.4205,
|
| 1099 |
+
"step": 3000
|
| 1100 |
+
},
|
| 1101 |
+
{
|
| 1102 |
+
"epoch": 3.370786516853933,
|
| 1103 |
+
"eval_loss": 0.8820343613624573,
|
| 1104 |
+
"eval_runtime": 206.8369,
|
| 1105 |
+
"eval_samples_per_second": 1.852,
|
| 1106 |
+
"eval_steps_per_second": 1.852,
|
| 1107 |
+
"step": 3000
|
| 1108 |
+
},
|
| 1109 |
+
{
|
| 1110 |
+
"epoch": 3.393258426966292,
|
| 1111 |
+
"grad_norm": 1.46875,
|
| 1112 |
+
"learning_rate": 6.960229859360353e-05,
|
| 1113 |
+
"loss": 0.3759,
|
| 1114 |
+
"step": 3020
|
| 1115 |
+
},
|
| 1116 |
+
{
|
| 1117 |
+
"epoch": 3.4157303370786516,
|
| 1118 |
+
"grad_norm": 1.2265625,
|
| 1119 |
+
"learning_rate": 6.792569859564445e-05,
|
| 1120 |
+
"loss": 0.4457,
|
| 1121 |
+
"step": 3040
|
| 1122 |
+
},
|
| 1123 |
+
{
|
| 1124 |
+
"epoch": 3.438202247191011,
|
| 1125 |
+
"grad_norm": 1.5390625,
|
| 1126 |
+
"learning_rate": 6.625908952209418e-05,
|
| 1127 |
+
"loss": 0.4088,
|
| 1128 |
+
"step": 3060
|
| 1129 |
+
},
|
| 1130 |
+
{
|
| 1131 |
+
"epoch": 3.460674157303371,
|
| 1132 |
+
"grad_norm": 0.92578125,
|
| 1133 |
+
"learning_rate": 6.460299051022285e-05,
|
| 1134 |
+
"loss": 0.4221,
|
| 1135 |
+
"step": 3080
|
| 1136 |
+
},
|
| 1137 |
+
{
|
| 1138 |
+
"epoch": 3.4831460674157304,
|
| 1139 |
+
"grad_norm": 1.34375,
|
| 1140 |
+
"learning_rate": 6.295791742348865e-05,
|
| 1141 |
+
"loss": 0.4304,
|
| 1142 |
+
"step": 3100
|
| 1143 |
+
},
|
| 1144 |
+
{
|
| 1145 |
+
"epoch": 3.50561797752809,
|
| 1146 |
+
"grad_norm": 1.0,
|
| 1147 |
+
"learning_rate": 6.132438269084985e-05,
|
| 1148 |
+
"loss": 0.3612,
|
| 1149 |
+
"step": 3120
|
| 1150 |
+
},
|
| 1151 |
+
{
|
| 1152 |
+
"epoch": 3.5280898876404496,
|
| 1153 |
+
"grad_norm": 1.4609375,
|
| 1154 |
+
"learning_rate": 5.970289514714677e-05,
|
| 1155 |
+
"loss": 0.4692,
|
| 1156 |
+
"step": 3140
|
| 1157 |
+
},
|
| 1158 |
+
{
|
| 1159 |
+
"epoch": 3.550561797752809,
|
| 1160 |
+
"grad_norm": 1.578125,
|
| 1161 |
+
"learning_rate": 5.8093959874603176e-05,
|
| 1162 |
+
"loss": 0.4579,
|
| 1163 |
+
"step": 3160
|
| 1164 |
+
},
|
| 1165 |
+
{
|
| 1166 |
+
"epoch": 3.5730337078651684,
|
| 1167 |
+
"grad_norm": 0.66796875,
|
| 1168 |
+
"learning_rate": 5.649807804549663e-05,
|
| 1169 |
+
"loss": 0.3754,
|
| 1170 |
+
"step": 3180
|
| 1171 |
+
},
|
| 1172 |
+
{
|
| 1173 |
+
"epoch": 3.595505617977528,
|
| 1174 |
+
"grad_norm": 1.1328125,
|
| 1175 |
+
"learning_rate": 5.491574676604682e-05,
|
| 1176 |
+
"loss": 0.3685,
|
| 1177 |
+
"step": 3200
|
| 1178 |
+
},
|
| 1179 |
+
{
|
| 1180 |
+
"epoch": 3.6179775280898876,
|
| 1181 |
+
"grad_norm": 1.09375,
|
| 1182 |
+
"learning_rate": 5.334745892157035e-05,
|
| 1183 |
+
"loss": 0.3809,
|
| 1184 |
+
"step": 3220
|
| 1185 |
+
},
|
| 1186 |
+
{
|
| 1187 |
+
"epoch": 3.640449438202247,
|
| 1188 |
+
"grad_norm": 1.4453125,
|
| 1189 |
+
"learning_rate": 5.179370302295037e-05,
|
| 1190 |
+
"loss": 0.4809,
|
| 1191 |
+
"step": 3240
|
| 1192 |
+
},
|
| 1193 |
+
{
|
| 1194 |
+
"epoch": 3.662921348314607,
|
| 1195 |
+
"grad_norm": 1.015625,
|
| 1196 |
+
"learning_rate": 5.02549630544688e-05,
|
| 1197 |
+
"loss": 0.3798,
|
| 1198 |
+
"step": 3260
|
| 1199 |
+
},
|
| 1200 |
+
{
|
| 1201 |
+
"epoch": 3.6853932584269664,
|
| 1202 |
+
"grad_norm": 1.1796875,
|
| 1203 |
+
"learning_rate": 4.8731718323048516e-05,
|
| 1204 |
+
"loss": 0.4153,
|
| 1205 |
+
"step": 3280
|
| 1206 |
+
},
|
| 1207 |
+
{
|
| 1208 |
+
"epoch": 3.7078651685393256,
|
| 1209 |
+
"grad_norm": 0.98046875,
|
| 1210 |
+
"learning_rate": 4.722444330895256e-05,
|
| 1211 |
+
"loss": 0.4612,
|
| 1212 |
+
"step": 3300
|
| 1213 |
+
},
|
| 1214 |
+
{
|
| 1215 |
+
"epoch": 3.7303370786516856,
|
| 1216 |
+
"grad_norm": 1.4921875,
|
| 1217 |
+
"learning_rate": 4.573360751798689e-05,
|
| 1218 |
+
"loss": 0.469,
|
| 1219 |
+
"step": 3320
|
| 1220 |
+
},
|
| 1221 |
+
{
|
| 1222 |
+
"epoch": 3.752808988764045,
|
| 1223 |
+
"grad_norm": 1.8203125,
|
| 1224 |
+
"learning_rate": 4.425967533525229e-05,
|
| 1225 |
+
"loss": 0.4523,
|
| 1226 |
+
"step": 3340
|
| 1227 |
+
},
|
| 1228 |
+
{
|
| 1229 |
+
"epoch": 3.7752808988764044,
|
| 1230 |
+
"grad_norm": 1.359375,
|
| 1231 |
+
"learning_rate": 4.2803105880491925e-05,
|
| 1232 |
+
"loss": 0.4214,
|
| 1233 |
+
"step": 3360
|
| 1234 |
+
},
|
| 1235 |
+
{
|
| 1236 |
+
"epoch": 3.797752808988764,
|
| 1237 |
+
"grad_norm": 1.734375,
|
| 1238 |
+
"learning_rate": 4.136435286507849e-05,
|
| 1239 |
+
"loss": 0.4981,
|
| 1240 |
+
"step": 3380
|
| 1241 |
+
},
|
| 1242 |
+
{
|
| 1243 |
+
"epoch": 3.8202247191011236,
|
| 1244 |
+
"grad_norm": 1.3515625,
|
| 1245 |
+
"learning_rate": 3.994386445068632e-05,
|
| 1246 |
+
"loss": 0.4029,
|
| 1247 |
+
"step": 3400
|
| 1248 |
+
},
|
| 1249 |
+
{
|
| 1250 |
+
"epoch": 3.842696629213483,
|
| 1251 |
+
"grad_norm": 1.4609375,
|
| 1252 |
+
"learning_rate": 3.854208310969204e-05,
|
| 1253 |
+
"loss": 0.3747,
|
| 1254 |
+
"step": 3420
|
| 1255 |
+
},
|
| 1256 |
+
{
|
| 1257 |
+
"epoch": 3.865168539325843,
|
| 1258 |
+
"grad_norm": 1.1015625,
|
| 1259 |
+
"learning_rate": 3.715944548734755e-05,
|
| 1260 |
+
"loss": 0.4113,
|
| 1261 |
+
"step": 3440
|
| 1262 |
+
},
|
| 1263 |
+
{
|
| 1264 |
+
"epoch": 3.8876404494382024,
|
| 1265 |
+
"grad_norm": 0.52734375,
|
| 1266 |
+
"learning_rate": 3.5796382265767937e-05,
|
| 1267 |
+
"loss": 0.3896,
|
| 1268 |
+
"step": 3460
|
| 1269 |
+
},
|
| 1270 |
+
{
|
| 1271 |
+
"epoch": 3.9101123595505616,
|
| 1272 |
+
"grad_norm": 1.0625,
|
| 1273 |
+
"learning_rate": 3.445331802977709e-05,
|
| 1274 |
+
"loss": 0.4709,
|
| 1275 |
+
"step": 3480
|
| 1276 |
+
},
|
| 1277 |
+
{
|
| 1278 |
+
"epoch": 3.932584269662921,
|
| 1279 |
+
"grad_norm": 1.4296875,
|
| 1280 |
+
"learning_rate": 3.313067113465222e-05,
|
| 1281 |
+
"loss": 0.3812,
|
| 1282 |
+
"step": 3500
|
| 1283 |
+
},
|
| 1284 |
+
{
|
| 1285 |
+
"epoch": 3.932584269662921,
|
| 1286 |
+
"eval_loss": 0.8859002590179443,
|
| 1287 |
+
"eval_runtime": 206.8365,
|
| 1288 |
+
"eval_samples_per_second": 1.852,
|
| 1289 |
+
"eval_steps_per_second": 1.852,
|
| 1290 |
+
"step": 3500
|
| 1291 |
+
},
|
| 1292 |
+
{
|
| 1293 |
+
"epoch": 3.955056179775281,
|
| 1294 |
+
"grad_norm": 1.09375,
|
| 1295 |
+
"learning_rate": 3.182885357580934e-05,
|
| 1296 |
+
"loss": 0.3906,
|
| 1297 |
+
"step": 3520
|
| 1298 |
+
},
|
| 1299 |
+
{
|
| 1300 |
+
"epoch": 3.9775280898876404,
|
| 1301 |
+
"grad_norm": 1.0390625,
|
| 1302 |
+
"learning_rate": 3.054827086046931e-05,
|
| 1303 |
+
"loss": 0.3987,
|
| 1304 |
+
"step": 3540
|
| 1305 |
+
},
|
| 1306 |
+
{
|
| 1307 |
+
"epoch": 4.0,
|
| 1308 |
+
"grad_norm": 0.671875,
|
| 1309 |
+
"learning_rate": 2.9289321881345254e-05,
|
| 1310 |
+
"loss": 0.3499,
|
| 1311 |
+
"step": 3560
|
| 1312 |
+
},
|
| 1313 |
+
{
|
| 1314 |
+
"epoch": 4.022471910112359,
|
| 1315 |
+
"grad_norm": 0.79296875,
|
| 1316 |
+
"learning_rate": 2.8052398792390154e-05,
|
| 1317 |
+
"loss": 0.3292,
|
| 1318 |
+
"step": 3580
|
| 1319 |
+
},
|
| 1320 |
+
{
|
| 1321 |
+
"epoch": 4.044943820224719,
|
| 1322 |
+
"grad_norm": 0.8125,
|
| 1323 |
+
"learning_rate": 2.6837886886643614e-05,
|
| 1324 |
+
"loss": 0.3343,
|
| 1325 |
+
"step": 3600
|
| 1326 |
+
},
|
| 1327 |
+
{
|
| 1328 |
+
"epoch": 4.067415730337078,
|
| 1329 |
+
"grad_norm": 0.66015625,
|
| 1330 |
+
"learning_rate": 2.5646164476215716e-05,
|
| 1331 |
+
"loss": 0.3236,
|
| 1332 |
+
"step": 3620
|
| 1333 |
+
},
|
| 1334 |
+
{
|
| 1335 |
+
"epoch": 4.089887640449438,
|
| 1336 |
+
"grad_norm": 1.25,
|
| 1337 |
+
"learning_rate": 2.447760277444543e-05,
|
| 1338 |
+
"loss": 0.2892,
|
| 1339 |
+
"step": 3640
|
| 1340 |
+
},
|
| 1341 |
+
{
|
| 1342 |
+
"epoch": 4.112359550561798,
|
| 1343 |
+
"grad_norm": 1.3515625,
|
| 1344 |
+
"learning_rate": 2.3332565780270165e-05,
|
| 1345 |
+
"loss": 0.2801,
|
| 1346 |
+
"step": 3660
|
| 1347 |
+
},
|
| 1348 |
+
{
|
| 1349 |
+
"epoch": 4.134831460674158,
|
| 1350 |
+
"grad_norm": 1.3671875,
|
| 1351 |
+
"learning_rate": 2.2211410164842606e-05,
|
| 1352 |
+
"loss": 0.3082,
|
| 1353 |
+
"step": 3680
|
| 1354 |
+
},
|
| 1355 |
+
{
|
| 1356 |
+
"epoch": 4.157303370786517,
|
| 1357 |
+
"grad_norm": 1.46875,
|
| 1358 |
+
"learning_rate": 2.1114485160430132e-05,
|
| 1359 |
+
"loss": 0.3128,
|
| 1360 |
+
"step": 3700
|
| 1361 |
+
},
|
| 1362 |
+
{
|
| 1363 |
+
"epoch": 4.179775280898877,
|
| 1364 |
+
"grad_norm": 0.8515625,
|
| 1365 |
+
"learning_rate": 2.0042132451631378e-05,
|
| 1366 |
+
"loss": 0.3846,
|
| 1367 |
+
"step": 3720
|
| 1368 |
+
},
|
| 1369 |
+
{
|
| 1370 |
+
"epoch": 4.202247191011236,
|
| 1371 |
+
"grad_norm": 1.1328125,
|
| 1372 |
+
"learning_rate": 1.899468606894379e-05,
|
| 1373 |
+
"loss": 0.2718,
|
| 1374 |
+
"step": 3740
|
| 1375 |
+
},
|
| 1376 |
+
{
|
| 1377 |
+
"epoch": 4.224719101123595,
|
| 1378 |
+
"grad_norm": 0.80859375,
|
| 1379 |
+
"learning_rate": 1.7972472284715415e-05,
|
| 1380 |
+
"loss": 0.302,
|
| 1381 |
+
"step": 3760
|
| 1382 |
+
},
|
| 1383 |
+
{
|
| 1384 |
+
"epoch": 4.247191011235955,
|
| 1385 |
+
"grad_norm": 1.6953125,
|
| 1386 |
+
"learning_rate": 1.6975809511513353e-05,
|
| 1387 |
+
"loss": 0.3785,
|
| 1388 |
+
"step": 3780
|
| 1389 |
+
},
|
| 1390 |
+
{
|
| 1391 |
+
"epoch": 4.269662921348314,
|
| 1392 |
+
"grad_norm": 1.0078125,
|
| 1393 |
+
"learning_rate": 1.600500820294041e-05,
|
| 1394 |
+
"loss": 0.3845,
|
| 1395 |
+
"step": 3800
|
| 1396 |
+
},
|
| 1397 |
+
{
|
| 1398 |
+
"epoch": 4.292134831460674,
|
| 1399 |
+
"grad_norm": 1.6796875,
|
| 1400 |
+
"learning_rate": 1.5060370756930919e-05,
|
| 1401 |
+
"loss": 0.327,
|
| 1402 |
+
"step": 3820
|
| 1403 |
+
},
|
| 1404 |
+
{
|
| 1405 |
+
"epoch": 4.314606741573034,
|
| 1406 |
+
"grad_norm": 1.359375,
|
| 1407 |
+
"learning_rate": 1.414219142155585e-05,
|
| 1408 |
+
"loss": 0.3589,
|
| 1409 |
+
"step": 3840
|
| 1410 |
+
},
|
| 1411 |
+
{
|
| 1412 |
+
"epoch": 4.337078651685394,
|
| 1413 |
+
"grad_norm": 0.75,
|
| 1414 |
+
"learning_rate": 1.3250756203366632e-05,
|
| 1415 |
+
"loss": 0.4057,
|
| 1416 |
+
"step": 3860
|
| 1417 |
+
},
|
| 1418 |
+
{
|
| 1419 |
+
"epoch": 4.359550561797753,
|
| 1420 |
+
"grad_norm": 0.96484375,
|
| 1421 |
+
"learning_rate": 1.2386342778305993e-05,
|
| 1422 |
+
"loss": 0.3862,
|
| 1423 |
+
"step": 3880
|
| 1424 |
+
},
|
| 1425 |
+
{
|
| 1426 |
+
"epoch": 4.382022471910112,
|
| 1427 |
+
"grad_norm": 1.4921875,
|
| 1428 |
+
"learning_rate": 1.1549220405213878e-05,
|
| 1429 |
+
"loss": 0.3319,
|
| 1430 |
+
"step": 3900
|
| 1431 |
+
},
|
| 1432 |
+
{
|
| 1433 |
+
"epoch": 4.404494382022472,
|
| 1434 |
+
"grad_norm": 1.2890625,
|
| 1435 |
+
"learning_rate": 1.0739649841955136e-05,
|
| 1436 |
+
"loss": 0.2832,
|
| 1437 |
+
"step": 3920
|
| 1438 |
+
},
|
| 1439 |
+
{
|
| 1440 |
+
"epoch": 4.426966292134831,
|
| 1441 |
+
"grad_norm": 1.234375,
|
| 1442 |
+
"learning_rate": 9.957883264195223e-06,
|
| 1443 |
+
"loss": 0.2732,
|
| 1444 |
+
"step": 3940
|
| 1445 |
+
},
|
| 1446 |
+
{
|
| 1447 |
+
"epoch": 4.449438202247191,
|
| 1448 |
+
"grad_norm": 0.828125,
|
| 1449 |
+
"learning_rate": 9.20416418684924e-06,
|
| 1450 |
+
"loss": 0.2587,
|
| 1451 |
+
"step": 3960
|
| 1452 |
+
},
|
| 1453 |
+
{
|
| 1454 |
+
"epoch": 4.47191011235955,
|
| 1455 |
+
"grad_norm": 1.5234375,
|
| 1456 |
+
"learning_rate": 8.478727388228735e-06,
|
| 1457 |
+
"loss": 0.3469,
|
| 1458 |
+
"step": 3980
|
| 1459 |
+
},
|
| 1460 |
+
{
|
| 1461 |
+
"epoch": 4.49438202247191,
|
| 1462 |
+
"grad_norm": 0.90234375,
|
| 1463 |
+
"learning_rate": 7.781798836909826e-06,
|
| 1464 |
+
"loss": 0.3323,
|
| 1465 |
+
"step": 4000
|
| 1466 |
+
},
|
| 1467 |
+
{
|
| 1468 |
+
"epoch": 4.49438202247191,
|
| 1469 |
+
"eval_loss": 0.9360187649726868,
|
| 1470 |
+
"eval_runtime": 206.9644,
|
| 1471 |
+
"eval_samples_per_second": 1.851,
|
| 1472 |
+
"eval_steps_per_second": 1.851,
|
| 1473 |
+
"step": 4000
|
| 1474 |
+
},
|
| 1475 |
+
{
|
| 1476 |
+
"epoch": 4.51685393258427,
|
| 1477 |
+
"grad_norm": 2.046875,
|
| 1478 |
+
"learning_rate": 7.11359562134557e-06,
|
| 1479 |
+
"loss": 0.3441,
|
| 1480 |
+
"step": 4020
|
| 1481 |
+
},
|
| 1482 |
+
{
|
| 1483 |
+
"epoch": 4.539325842696629,
|
| 1484 |
+
"grad_norm": 1.296875,
|
| 1485 |
+
"learning_rate": 6.4743258822443695e-06,
|
| 1486 |
+
"loss": 0.3196,
|
| 1487 |
+
"step": 4040
|
| 1488 |
+
},
|
| 1489 |
+
{
|
| 1490 |
+
"epoch": 4.561797752808989,
|
| 1491 |
+
"grad_norm": 1.515625,
|
| 1492 |
+
"learning_rate": 5.8641887477356215e-06,
|
| 1493 |
+
"loss": 0.3226,
|
| 1494 |
+
"step": 4060
|
| 1495 |
+
},
|
| 1496 |
+
{
|
| 1497 |
+
"epoch": 4.584269662921348,
|
| 1498 |
+
"grad_norm": 1.6875,
|
| 1499 |
+
"learning_rate": 5.283374271342645e-06,
|
| 1500 |
+
"loss": 0.2859,
|
| 1501 |
+
"step": 4080
|
| 1502 |
+
},
|
| 1503 |
+
{
|
| 1504 |
+
"epoch": 4.606741573033708,
|
| 1505 |
+
"grad_norm": 0.87109375,
|
| 1506 |
+
"learning_rate": 4.732063372782336e-06,
|
| 1507 |
+
"loss": 0.3164,
|
| 1508 |
+
"step": 4100
|
| 1509 |
+
},
|
| 1510 |
+
{
|
| 1511 |
+
"epoch": 4.629213483146067,
|
| 1512 |
+
"grad_norm": 1.2734375,
|
| 1513 |
+
"learning_rate": 4.210427781609861e-06,
|
| 1514 |
+
"loss": 0.3275,
|
| 1515 |
+
"step": 4120
|
| 1516 |
+
},
|
| 1517 |
+
{
|
| 1518 |
+
"epoch": 4.651685393258427,
|
| 1519 |
+
"grad_norm": 1.25,
|
| 1520 |
+
"learning_rate": 3.718629983726185e-06,
|
| 1521 |
+
"loss": 0.367,
|
| 1522 |
+
"step": 4140
|
| 1523 |
+
},
|
| 1524 |
+
{
|
| 1525 |
+
"epoch": 4.674157303370786,
|
| 1526 |
+
"grad_norm": 1.6796875,
|
| 1527 |
+
"learning_rate": 3.256823170764689e-06,
|
| 1528 |
+
"loss": 0.3445,
|
| 1529 |
+
"step": 4160
|
| 1530 |
+
},
|
| 1531 |
+
{
|
| 1532 |
+
"epoch": 4.696629213483146,
|
| 1533 |
+
"grad_norm": 0.9453125,
|
| 1534 |
+
"learning_rate": 2.8251511923731655e-06,
|
| 1535 |
+
"loss": 0.4628,
|
| 1536 |
+
"step": 4180
|
| 1537 |
+
},
|
| 1538 |
+
{
|
| 1539 |
+
"epoch": 4.719101123595506,
|
| 1540 |
+
"grad_norm": 0.8515625,
|
| 1541 |
+
"learning_rate": 2.423748511405577e-06,
|
| 1542 |
+
"loss": 0.3252,
|
| 1543 |
+
"step": 4200
|
| 1544 |
+
},
|
| 1545 |
+
{
|
| 1546 |
+
"epoch": 4.741573033707866,
|
| 1547 |
+
"grad_norm": 0.90234375,
|
| 1548 |
+
"learning_rate": 2.052740162037814e-06,
|
| 1549 |
+
"loss": 0.3783,
|
| 1550 |
+
"step": 4220
|
| 1551 |
+
},
|
| 1552 |
+
{
|
| 1553 |
+
"epoch": 4.764044943820225,
|
| 1554 |
+
"grad_norm": 0.7421875,
|
| 1555 |
+
"learning_rate": 1.7122417108203726e-06,
|
| 1556 |
+
"loss": 0.294,
|
| 1557 |
+
"step": 4240
|
| 1558 |
+
},
|
| 1559 |
+
{
|
| 1560 |
+
"epoch": 4.786516853932584,
|
| 1561 |
+
"grad_norm": 0.75390625,
|
| 1562 |
+
"learning_rate": 1.4023592206802382e-06,
|
| 1563 |
+
"loss": 0.3194,
|
| 1564 |
+
"step": 4260
|
| 1565 |
+
},
|
| 1566 |
+
{
|
| 1567 |
+
"epoch": 4.808988764044944,
|
| 1568 |
+
"grad_norm": 1.375,
|
| 1569 |
+
"learning_rate": 1.1231892178829472e-06,
|
| 1570 |
+
"loss": 0.3145,
|
| 1571 |
+
"step": 4280
|
| 1572 |
+
},
|
| 1573 |
+
{
|
| 1574 |
+
"epoch": 4.831460674157303,
|
| 1575 |
+
"grad_norm": 1.4140625,
|
| 1576 |
+
"learning_rate": 8.74818661965382e-07,
|
| 1577 |
+
"loss": 0.3372,
|
| 1578 |
+
"step": 4300
|
| 1579 |
+
},
|
| 1580 |
+
{
|
| 1581 |
+
"epoch": 4.853932584269663,
|
| 1582 |
+
"grad_norm": 1.6640625,
|
| 1583 |
+
"learning_rate": 6.573249186483721e-07,
|
| 1584 |
+
"loss": 0.2791,
|
| 1585 |
+
"step": 4320
|
| 1586 |
+
},
|
| 1587 |
+
{
|
| 1588 |
+
"epoch": 4.876404494382022,
|
| 1589 |
+
"grad_norm": 0.8359375,
|
| 1590 |
+
"learning_rate": 4.707757357379383e-07,
|
| 1591 |
+
"loss": 0.2428,
|
| 1592 |
+
"step": 4340
|
| 1593 |
+
},
|
| 1594 |
+
{
|
| 1595 |
+
"epoch": 4.898876404494382,
|
| 1596 |
+
"grad_norm": 1.3671875,
|
| 1597 |
+
"learning_rate": 3.152292220222064e-07,
|
| 1598 |
+
"loss": 0.3225,
|
| 1599 |
+
"step": 4360
|
| 1600 |
+
},
|
| 1601 |
+
{
|
| 1602 |
+
"epoch": 4.921348314606742,
|
| 1603 |
+
"grad_norm": 1.0625,
|
| 1604 |
+
"learning_rate": 1.9073382917097483e-07,
|
| 1605 |
+
"loss": 0.3164,
|
| 1606 |
+
"step": 4380
|
| 1607 |
+
},
|
| 1608 |
+
{
|
| 1609 |
+
"epoch": 4.943820224719101,
|
| 1610 |
+
"grad_norm": 1.0546875,
|
| 1611 |
+
"learning_rate": 9.732833664334307e-08,
|
| 1612 |
+
"loss": 0.3571,
|
| 1613 |
+
"step": 4400
|
| 1614 |
+
},
|
| 1615 |
+
{
|
| 1616 |
+
"epoch": 4.966292134831461,
|
| 1617 |
+
"grad_norm": 0.87890625,
|
| 1618 |
+
"learning_rate": 3.5041839608151996e-08,
|
| 1619 |
+
"loss": 0.3002,
|
| 1620 |
+
"step": 4420
|
| 1621 |
+
},
|
| 1622 |
+
{
|
| 1623 |
+
"epoch": 4.98876404494382,
|
| 1624 |
+
"grad_norm": 0.6640625,
|
| 1625 |
+
"learning_rate": 3.893739881088987e-09,
|
| 1626 |
+
"loss": 0.366,
|
| 1627 |
+
"step": 4440
|
| 1628 |
+
},
|
| 1629 |
+
{
|
| 1630 |
+
"epoch": 5.0,
|
| 1631 |
+
"step": 4450,
|
| 1632 |
+
"total_flos": 1.024663401529344e+17,
|
| 1633 |
+
"train_loss": 0.6764815047617708,
|
| 1634 |
+
"train_runtime": 9686.8536,
|
| 1635 |
+
"train_samples_per_second": 0.459,
|
| 1636 |
+
"train_steps_per_second": 0.459
|
| 1637 |
+
}
|
| 1638 |
+
],
|
| 1639 |
+
"logging_steps": 20,
|
| 1640 |
+
"max_steps": 4450,
|
| 1641 |
+
"num_input_tokens_seen": 0,
|
| 1642 |
+
"num_train_epochs": 5,
|
| 1643 |
+
"save_steps": 100,
|
| 1644 |
+
"stateful_callbacks": {
|
| 1645 |
+
"TrainerControl": {
|
| 1646 |
+
"args": {
|
| 1647 |
+
"should_epoch_stop": false,
|
| 1648 |
+
"should_evaluate": false,
|
| 1649 |
+
"should_log": false,
|
| 1650 |
+
"should_save": true,
|
| 1651 |
+
"should_training_stop": false
|
| 1652 |
+
},
|
| 1653 |
+
"attributes": {}
|
| 1654 |
+
}
|
| 1655 |
+
},
|
| 1656 |
+
"total_flos": 1.024663401529344e+17,
|
| 1657 |
+
"train_batch_size": 1,
|
| 1658 |
+
"trial_name": null,
|
| 1659 |
+
"trial_params": null
|
| 1660 |
+
}
|
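The three `eval_loss` checkpoints recorded above rise from 0.8820 (step 3000) through 0.8859 (step 3500) to 0.9360 (step 4000) while the training loss keeps falling, which suggests the adapter overfits during the final epochs. A minimal sketch for pulling both curves out of `trainer_state.json` to check this, assuming the file has been downloaded from this repo (standard library only):

```python
import json

# Load the state file the Trainer writes at the end of training.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history holds one dict per event: training entries carry "loss",
# evaluation entries carry "eval_loss".
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("final train point:", train_curve[-1])
print("eval checkpoints:", eval_curve)
```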
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7797c8c3afe93974491cdc40e75b1366f5c59ec6a806b3f23f3877c5601245bf
+size 5112
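`training_args.bin` is tracked with Git LFS, so the diff above adds only the three-line pointer; the 5112-byte payload lives in LFS storage. The file is the `transformers.TrainingArguments` object serialized with `torch.save`, so it can be inspected as sketched below, assuming a compatible `transformers` version is installed (unpickling needs the class importable); note that recent PyTorch versions require `weights_only=False` here because the file is a pickled object, not a tensor checkpoint:

```python
import torch

# training_args.bin is a pickled TrainingArguments instance; full unpickling
# is required, so only load files from sources you trust.
args = torch.load("training_args.bin", weights_only=False)

print(args.learning_rate, args.lr_scheduler_type, args.num_train_epochs)
```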