lemingshen committed on Mar 15, 2025

Commit

1ea676e

1 Parent(s): 39e15c2

update

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +6 -0
adapter_config.json +29 -0
adapter_model.safetensors +3 -0
checkpoint-1000/README.md +202 -0
checkpoint-1000/adapter_config.json +29 -0
checkpoint-1000/adapter_model.safetensors +3 -0
checkpoint-1000/optimizer.pt +3 -0
checkpoint-1000/rng_state.pth +3 -0
checkpoint-1000/scheduler.pt +3 -0
checkpoint-1000/special_tokens_map.json +24 -0
checkpoint-1000/tokenizer.json +0 -0
checkpoint-1000/tokenizer.model +3 -0
checkpoint-1000/tokenizer_config.json +43 -0
checkpoint-1000/trainer_state.json +313 -0
checkpoint-1000/training_args.bin +3 -0
checkpoint-10000/README.md +202 -0
checkpoint-10000/adapter_config.json +29 -0
checkpoint-10000/adapter_model.safetensors +3 -0
checkpoint-10000/optimizer.pt +3 -0
checkpoint-10000/rng_state.pth +3 -0
checkpoint-10000/scheduler.pt +3 -0
checkpoint-10000/special_tokens_map.json +24 -0
checkpoint-10000/tokenizer.json +0 -0
checkpoint-10000/tokenizer.model +3 -0
checkpoint-10000/tokenizer_config.json +43 -0
checkpoint-10000/trainer_state.json +2833 -0
checkpoint-10000/training_args.bin +3 -0
checkpoint-13000/README.md +202 -0
checkpoint-13000/adapter_config.json +29 -0
checkpoint-13000/adapter_model.safetensors +3 -0
checkpoint-13000/optimizer.pt +3 -0
checkpoint-13000/rng_state.pth +3 -0
checkpoint-13000/scheduler.pt +3 -0
checkpoint-13000/special_tokens_map.json +24 -0
checkpoint-13000/tokenizer.json +0 -0
checkpoint-13000/tokenizer.model +3 -0
checkpoint-13000/tokenizer_config.json +43 -0
checkpoint-13000/trainer_state.json +3673 -0
checkpoint-13000/training_args.bin +3 -0
checkpoint-5000/README.md +202 -0
checkpoint-5000/adapter_config.json +29 -0
checkpoint-5000/adapter_model.safetensors +3 -0
checkpoint-5000/optimizer.pt +3 -0
checkpoint-5000/rng_state.pth +3 -0
checkpoint-5000/scheduler.pt +3 -0
checkpoint-5000/special_tokens_map.json +24 -0
checkpoint-5000/tokenizer.json +0 -0
checkpoint-5000/tokenizer.model +3 -0
checkpoint-5000/tokenizer_config.json +43 -0
checkpoint-5000/trainer_state.json +1433 -0

README.md CHANGED Viewed

@@ -1,3 +1,9 @@
 ---
 license: mit
 ---

 ---
 license: mit
 ---
+# GPIoT: Tailoring Small Language Models for IoT Program Synthesis and Development
+- Foundation model: Llama-2-13b-chat-hf
+- LoRA fine-tuned with INT8 quantization
+### Feel free to choose any checkpoint

adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-2-13b-chat-hf",
+  "bias": "lora_only",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.001,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:86d0ff6d67840efa0f3bb43f925e5e44fdf67a41a26a053e97062f777917fb67
+size 209736952

checkpoint-1000/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: meta-llama/Llama-2-13b-chat-hf
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.12.0

checkpoint-1000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-2-13b-chat-hf",
+  "bias": "lora_only",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.001,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-1000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8edefefbf568dc991704857cffabbca95b925eb0f62f3af863a93162d0a11bc6
+size 209736952

checkpoint-1000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b9a0ac956de7513e2a08d3f405bb39bc8b68bc7d1e4f700bde5671dbbd28187
+size 419529285

checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:574d0133fbcb68f97b92c9639c0846141a3ee9ff99d17b5b92c2644afac8737d
+size 14575

checkpoint-1000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:caa069f9a4a007a38bfc435840b395335c193bb1bf78f30a831ff387adb6205a
+size 627

checkpoint-1000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-1000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1000/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-1000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,313 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.22168033695411218,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.005542008423852805,
+      "grad_norm": 0.07243233174085617,
+      "learning_rate": 1.2315270935960592e-05,
+      "loss": 1.4594,
+      "step": 25
+    },
+    {
+      "epoch": 0.01108401684770561,
+      "grad_norm": 0.40484485030174255,
+      "learning_rate": 2.4630541871921184e-05,
+      "loss": 2.2032,
+      "step": 50
+    },
+    {
+      "epoch": 0.016626025271558414,
+      "grad_norm": 0.06850667297840118,
+      "learning_rate": 3.694581280788178e-05,
+      "loss": 1.2931,
+      "step": 75
+    },
+    {
+      "epoch": 0.02216803369541122,
+      "grad_norm": 0.4395073354244232,
+      "learning_rate": 4.926108374384237e-05,
+      "loss": 1.5698,
+      "step": 100
+    },
+    {
+      "epoch": 0.027710042119264023,
+      "grad_norm": 0.077068030834198,
+      "learning_rate": 6.157635467980296e-05,
+      "loss": 1.0537,
+      "step": 125
+    },
+    {
+      "epoch": 0.03325205054311683,
+      "grad_norm": 0.3282291293144226,
+      "learning_rate": 7.389162561576355e-05,
+      "loss": 0.9749,
+      "step": 150
+    },
+    {
+      "epoch": 0.03879405896696963,
+      "grad_norm": 0.0593000203371048,
+      "learning_rate": 8.620689655172413e-05,
+      "loss": 0.9349,
+      "step": 175
+    },
+    {
+      "epoch": 0.04433606739082244,
+      "grad_norm": 0.25612473487854004,
+      "learning_rate": 9.852216748768474e-05,
+      "loss": 0.8974,
+      "step": 200
+    },
+    {
+      "epoch": 0.04987807581467524,
+      "grad_norm": 0.0757347121834755,
+      "learning_rate": 0.00011083743842364534,
+      "loss": 0.9081,
+      "step": 225
+    },
+    {
+      "epoch": 0.055420084238528046,
+      "grad_norm": 0.14145499467849731,
+      "learning_rate": 0.00012315270935960593,
+      "loss": 0.8607,
+      "step": 250
+    },
+    {
+      "epoch": 0.06096209266238085,
+      "grad_norm": 0.07710155844688416,
+      "learning_rate": 0.00013546798029556652,
+      "loss": 0.8973,
+      "step": 275
+    },
+    {
+      "epoch": 0.06650410108623366,
+      "grad_norm": 0.14791467785835266,
+      "learning_rate": 0.0001477832512315271,
+      "loss": 0.7924,
+      "step": 300
+    },
+    {
+      "epoch": 0.07204610951008646,
+      "grad_norm": 0.07742594182491302,
+      "learning_rate": 0.00016009852216748767,
+      "loss": 0.8698,
+      "step": 325
+    },
+    {
+      "epoch": 0.07758811793393926,
+      "grad_norm": 0.14303487539291382,
+      "learning_rate": 0.00017241379310344826,
+      "loss": 0.786,
+      "step": 350
+    },
+    {
+      "epoch": 0.08313012635779206,
+      "grad_norm": 0.0865108072757721,
+      "learning_rate": 0.00018472906403940888,
+      "loss": 0.8606,
+      "step": 375
+    },
+    {
+      "epoch": 0.08867213478164487,
+      "grad_norm": 0.7533164024353027,
+      "learning_rate": 0.00019704433497536947,
+      "loss": 0.807,
+      "step": 400
+    },
+    {
+      "epoch": 0.09421414320549767,
+      "grad_norm": 0.08325570821762085,
+      "learning_rate": 0.00019999896617927833,
+      "loss": 0.8635,
+      "step": 425
+    },
+    {
+      "epoch": 0.09975615162935048,
+      "grad_norm": 0.1043543666601181,
+      "learning_rate": 0.0001999944557842899,
+      "loss": 0.7825,
+      "step": 450
+    },
+    {
+      "epoch": 0.10529816005320328,
+      "grad_norm": 0.07949995994567871,
+      "learning_rate": 0.0001999863658806385,
+      "loss": 0.8379,
+      "step": 475
+    },
+    {
+      "epoch": 0.11084016847705609,
+      "grad_norm": 0.12020070850849152,
+      "learning_rate": 0.00019997469675791905,
+      "loss": 0.768,
+      "step": 500
+    },
+    {
+      "epoch": 0.11638217690090889,
+      "grad_norm": 0.0803595781326294,
+      "learning_rate": 0.00019995944883385196,
+      "loss": 0.8487,
+      "step": 525
+    },
+    {
+      "epoch": 0.1219241853247617,
+      "grad_norm": 0.11509452760219574,
+      "learning_rate": 0.0001999406226542682,
+      "loss": 0.7787,
+      "step": 550
+    },
+    {
+      "epoch": 0.1274661937486145,
+      "grad_norm": 0.07928384840488434,
+      "learning_rate": 0.00019991821889308987,
+      "loss": 0.8357,
+      "step": 575
+    },
+    {
+      "epoch": 0.1330082021724673,
+      "grad_norm": 0.09423446655273438,
+      "learning_rate": 0.00019989223835230606,
+      "loss": 0.7564,
+      "step": 600
+    },
+    {
+      "epoch": 0.1385502105963201,
+      "grad_norm": 0.0835939422249794,
+      "learning_rate": 0.000199862681961944,
+      "loss": 0.8568,
+      "step": 625
+    },
+    {
+      "epoch": 0.1440922190201729,
+      "grad_norm": 0.09292898327112198,
+      "learning_rate": 0.0001998295507800359,
+      "loss": 0.7612,
+      "step": 650
+    },
+    {
+      "epoch": 0.1496342274440257,
+      "grad_norm": 0.07704215496778488,
+      "learning_rate": 0.00019979284599258107,
+      "loss": 0.8263,
+      "step": 675
+    },
+    {
+      "epoch": 0.15517623586787851,
+      "grad_norm": 0.10980474948883057,
+      "learning_rate": 0.0001997525689135034,
+      "loss": 0.7677,
+      "step": 700
+    },
+    {
+      "epoch": 0.16071824429173132,
+      "grad_norm": 0.08016064018011093,
+      "learning_rate": 0.0001997087209846043,
+      "loss": 0.8344,
+      "step": 725
+    },
+    {
+      "epoch": 0.16626025271558412,
+      "grad_norm": 0.0950881615281105,
+      "learning_rate": 0.0001996613037755113,
+      "loss": 0.769,
+      "step": 750
+    },
+    {
+      "epoch": 0.17180226113943692,
+      "grad_norm": 0.07932984828948975,
+      "learning_rate": 0.00019961031898362152,
+      "loss": 0.8156,
+      "step": 775
+    },
+    {
+      "epoch": 0.17734426956328975,
+      "grad_norm": 0.09336528927087784,
+      "learning_rate": 0.00019955576843404128,
+      "loss": 0.7767,
+      "step": 800
+    },
+    {
+      "epoch": 0.18288627798714255,
+      "grad_norm": 0.08560346812009811,
+      "learning_rate": 0.00019949765407952042,
+      "loss": 0.8228,
+      "step": 825
+    },
+    {
+      "epoch": 0.18842828641099535,
+      "grad_norm": 0.08475169539451599,
+      "learning_rate": 0.00019943597800038267,
+      "loss": 0.7669,
+      "step": 850
+    },
+    {
+      "epoch": 0.19397029483484815,
+      "grad_norm": 0.09038034081459045,
+      "learning_rate": 0.00019937074240445105,
+      "loss": 0.8182,
+      "step": 875
+    },
+    {
+      "epoch": 0.19951230325870095,
+      "grad_norm": 0.09195873886346817,
+      "learning_rate": 0.0001993019496269688,
+      "loss": 0.7598,
+      "step": 900
+    },
+    {
+      "epoch": 0.20505431168255375,
+      "grad_norm": 0.08655796200037003,
+      "learning_rate": 0.0001992296021305159,
+      "loss": 0.8167,
+      "step": 925
+    },
+    {
+      "epoch": 0.21059632010640655,
+      "grad_norm": 0.08353498578071594,
+      "learning_rate": 0.00019915370250492084,
+      "loss": 0.7486,
+      "step": 950
+    },
+    {
+      "epoch": 0.21613832853025935,
+      "grad_norm": 0.09225723147392273,
+      "learning_rate": 0.0001990742534671679,
+      "loss": 0.8138,
+      "step": 975
+    },
+    {
+      "epoch": 0.22168033695411218,
+      "grad_norm": 0.12104763090610504,
+      "learning_rate": 0.00019899125786129997,
+      "loss": 0.7153,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 13533,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.78733273181696e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a13672a0401c8d0a53efc112f53ebd65f36fa9003e87b1710879c58d881a1e1
+size 5051

checkpoint-10000/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: meta-llama/Llama-2-13b-chat-hf
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.12.0

checkpoint-10000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-2-13b-chat-hf",
+  "bias": "lora_only",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.001,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-10000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:65f8840cbbaa700a0942555614b55f1801df615f7179aad0ab8fb0118ea4e486
+size 209736952

checkpoint-10000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8278c49ac2607b3f2bc56bf4fb612b22386249dc19b8a5af3f0b77c326945004
+size 419529285

checkpoint-10000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fff43d80abc63a3828242eaac15ac1d7e14d669caebfbf67ac745dab3c866d81
+size 14575

checkpoint-10000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5baba50c9446306722759cf46e5fbc1991fffa1d9047464636a8aa263559559
+size 627

checkpoint-10000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-10000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-10000/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-10000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-10000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2833 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.216803369541122,
+  "eval_steps": 500,
+  "global_step": 10000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.005542008423852805,
+      "grad_norm": 0.07243233174085617,
+      "learning_rate": 1.2315270935960592e-05,
+      "loss": 1.4594,
+      "step": 25
+    },
+    {
+      "epoch": 0.01108401684770561,
+      "grad_norm": 0.40484485030174255,
+      "learning_rate": 2.4630541871921184e-05,
+      "loss": 2.2032,
+      "step": 50
+    },
+    {
+      "epoch": 0.016626025271558414,
+      "grad_norm": 0.06850667297840118,
+      "learning_rate": 3.694581280788178e-05,
+      "loss": 1.2931,
+      "step": 75
+    },
+    {
+      "epoch": 0.02216803369541122,
+      "grad_norm": 0.4395073354244232,
+      "learning_rate": 4.926108374384237e-05,
+      "loss": 1.5698,
+      "step": 100
+    },
+    {
+      "epoch": 0.027710042119264023,
+      "grad_norm": 0.077068030834198,
+      "learning_rate": 6.157635467980296e-05,
+      "loss": 1.0537,
+      "step": 125
+    },
+    {
+      "epoch": 0.03325205054311683,
+      "grad_norm": 0.3282291293144226,
+      "learning_rate": 7.389162561576355e-05,
+      "loss": 0.9749,
+      "step": 150
+    },
+    {
+      "epoch": 0.03879405896696963,
+      "grad_norm": 0.0593000203371048,
+      "learning_rate": 8.620689655172413e-05,
+      "loss": 0.9349,
+      "step": 175
+    },
+    {
+      "epoch": 0.04433606739082244,
+      "grad_norm": 0.25612473487854004,
+      "learning_rate": 9.852216748768474e-05,
+      "loss": 0.8974,
+      "step": 200
+    },
+    {
+      "epoch": 0.04987807581467524,
+      "grad_norm": 0.0757347121834755,
+      "learning_rate": 0.00011083743842364534,
+      "loss": 0.9081,
+      "step": 225
+    },
+    {
+      "epoch": 0.055420084238528046,
+      "grad_norm": 0.14145499467849731,
+      "learning_rate": 0.00012315270935960593,
+      "loss": 0.8607,
+      "step": 250
+    },
+    {
+      "epoch": 0.06096209266238085,
+      "grad_norm": 0.07710155844688416,
+      "learning_rate": 0.00013546798029556652,
+      "loss": 0.8973,
+      "step": 275
+    },
+    {
+      "epoch": 0.06650410108623366,
+      "grad_norm": 0.14791467785835266,
+      "learning_rate": 0.0001477832512315271,
+      "loss": 0.7924,
+      "step": 300
+    },
+    {
+      "epoch": 0.07204610951008646,
+      "grad_norm": 0.07742594182491302,
+      "learning_rate": 0.00016009852216748767,
+      "loss": 0.8698,
+      "step": 325
+    },
+    {
+      "epoch": 0.07758811793393926,
+      "grad_norm": 0.14303487539291382,
+      "learning_rate": 0.00017241379310344826,
+      "loss": 0.786,
+      "step": 350
+    },
+    {
+      "epoch": 0.08313012635779206,
+      "grad_norm": 0.0865108072757721,
+      "learning_rate": 0.00018472906403940888,
+      "loss": 0.8606,
+      "step": 375
+    },
+    {
+      "epoch": 0.08867213478164487,
+      "grad_norm": 0.7533164024353027,
+      "learning_rate": 0.00019704433497536947,
+      "loss": 0.807,
+      "step": 400
+    },
+    {
+      "epoch": 0.09421414320549767,
+      "grad_norm": 0.08325570821762085,
+      "learning_rate": 0.00019999896617927833,
+      "loss": 0.8635,
+      "step": 425
+    },
+    {
+      "epoch": 0.09975615162935048,
+      "grad_norm": 0.1043543666601181,
+      "learning_rate": 0.0001999944557842899,
+      "loss": 0.7825,
+      "step": 450
+    },
+    {
+      "epoch": 0.10529816005320328,
+      "grad_norm": 0.07949995994567871,
+      "learning_rate": 0.0001999863658806385,
+      "loss": 0.8379,
+      "step": 475
+    },
+    {
+      "epoch": 0.11084016847705609,
+      "grad_norm": 0.12020070850849152,
+      "learning_rate": 0.00019997469675791905,
+      "loss": 0.768,
+      "step": 500
+    },
+    {
+      "epoch": 0.11638217690090889,
+      "grad_norm": 0.0803595781326294,
+      "learning_rate": 0.00019995944883385196,
+      "loss": 0.8487,
+      "step": 525
+    },
+    {
+      "epoch": 0.1219241853247617,
+      "grad_norm": 0.11509452760219574,
+      "learning_rate": 0.0001999406226542682,
+      "loss": 0.7787,
+      "step": 550
+    },
+    {
+      "epoch": 0.1274661937486145,
+      "grad_norm": 0.07928384840488434,
+      "learning_rate": 0.00019991821889308987,
+      "loss": 0.8357,
+      "step": 575
+    },
+    {
+      "epoch": 0.1330082021724673,
+      "grad_norm": 0.09423446655273438,
+      "learning_rate": 0.00019989223835230606,
+      "loss": 0.7564,
+      "step": 600
+    },
+    {
+      "epoch": 0.1385502105963201,
+      "grad_norm": 0.0835939422249794,
+      "learning_rate": 0.000199862681961944,
+      "loss": 0.8568,
+      "step": 625
+    },
+    {
+      "epoch": 0.1440922190201729,
+      "grad_norm": 0.09292898327112198,
+      "learning_rate": 0.0001998295507800359,
+      "loss": 0.7612,
+      "step": 650
+    },
+    {
+      "epoch": 0.1496342274440257,
+      "grad_norm": 0.07704215496778488,
+      "learning_rate": 0.00019979284599258107,
+      "loss": 0.8263,
+      "step": 675
+    },
+    {
+      "epoch": 0.15517623586787851,
+      "grad_norm": 0.10980474948883057,
+      "learning_rate": 0.0001997525689135034,
+      "loss": 0.7677,
+      "step": 700
+    },
+    {
+      "epoch": 0.16071824429173132,
+      "grad_norm": 0.08016064018011093,
+      "learning_rate": 0.0001997087209846043,
+      "loss": 0.8344,
+      "step": 725
+    },
+    {
+      "epoch": 0.16626025271558412,
+      "grad_norm": 0.0950881615281105,
+      "learning_rate": 0.0001996613037755113,
+      "loss": 0.769,
+      "step": 750
+    },
+    {
+      "epoch": 0.17180226113943692,
+      "grad_norm": 0.07932984828948975,
+      "learning_rate": 0.00019961031898362152,
+      "loss": 0.8156,
+      "step": 775
+    },
+    {
+      "epoch": 0.17734426956328975,
+      "grad_norm": 0.09336528927087784,
+      "learning_rate": 0.00019955576843404128,
+      "loss": 0.7767,
+      "step": 800
+    },
+    {
+      "epoch": 0.18288627798714255,
+      "grad_norm": 0.08560346812009811,
+      "learning_rate": 0.00019949765407952042,
+      "loss": 0.8228,
+      "step": 825
+    },
+    {
+      "epoch": 0.18842828641099535,
+      "grad_norm": 0.08475169539451599,
+      "learning_rate": 0.00019943597800038267,
+      "loss": 0.7669,
+      "step": 850
+    },
+    {
+      "epoch": 0.19397029483484815,
+      "grad_norm": 0.09038034081459045,
+      "learning_rate": 0.00019937074240445105,
+      "loss": 0.8182,
+      "step": 875
+    },
+    {
+      "epoch": 0.19951230325870095,
+      "grad_norm": 0.09195873886346817,
+      "learning_rate": 0.0001993019496269688,
+      "loss": 0.7598,
+      "step": 900
+    },
+    {
+      "epoch": 0.20505431168255375,
+      "grad_norm": 0.08655796200037003,
+      "learning_rate": 0.0001992296021305159,
+      "loss": 0.8167,
+      "step": 925
+    },
+    {
+      "epoch": 0.21059632010640655,
+      "grad_norm": 0.08353498578071594,
+      "learning_rate": 0.00019915370250492084,
+      "loss": 0.7486,
+      "step": 950
+    },
+    {
+      "epoch": 0.21613832853025935,
+      "grad_norm": 0.09225723147392273,
+      "learning_rate": 0.0001990742534671679,
+      "loss": 0.8138,
+      "step": 975
+    },
+    {
+      "epoch": 0.22168033695411218,
+      "grad_norm": 0.12104763090610504,
+      "learning_rate": 0.00019899125786129997,
+      "loss": 0.7153,
+      "step": 1000
+    },
+    {
+      "epoch": 0.22722234537796498,
+      "grad_norm": 0.0815986767411232,
+      "learning_rate": 0.00019890471865831669,
+      "loss": 0.7983,
+      "step": 1025
+    },
+    {
+      "epoch": 0.23276435380181779,
+      "grad_norm": 0.08845670521259308,
+      "learning_rate": 0.00019881463895606805,
+      "loss": 0.7187,
+      "step": 1050
+    },
+    {
+      "epoch": 0.2383063622256706,
+      "grad_norm": 0.0821809321641922,
+      "learning_rate": 0.00019872102197914359,
+      "loss": 0.804,
+      "step": 1075
+    },
+    {
+      "epoch": 0.2438483706495234,
+      "grad_norm": 0.08711609989404678,
+      "learning_rate": 0.00019862387107875688,
+      "loss": 0.7795,
+      "step": 1100
+    },
+    {
+      "epoch": 0.2493903790733762,
+      "grad_norm": 0.08517508953809738,
+      "learning_rate": 0.00019852318973262567,
+      "loss": 0.7937,
+      "step": 1125
+    },
+    {
+      "epoch": 0.254932387497229,
+      "grad_norm": 0.10830071568489075,
+      "learning_rate": 0.00019841898154484726,
+      "loss": 0.7458,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2604743959210818,
+      "grad_norm": 0.08541836589574814,
+      "learning_rate": 0.0001983112502457696,
+      "loss": 0.8131,
+      "step": 1175
+    },
+    {
+      "epoch": 0.2660164043449346,
+      "grad_norm": 0.08794037252664566,
+      "learning_rate": 0.00019819999969185762,
+      "loss": 0.7577,
+      "step": 1200
+    },
+    {
+      "epoch": 0.2715584127687874,
+      "grad_norm": 0.08078176528215408,
+      "learning_rate": 0.00019808523386555542,
+      "loss": 0.812,
+      "step": 1225
+    },
+    {
+      "epoch": 0.2771004211926402,
+      "grad_norm": 0.09263130277395248,
+      "learning_rate": 0.0001979669568751434,
+      "loss": 0.7582,
+      "step": 1250
+    },
+    {
+      "epoch": 0.282642429616493,
+      "grad_norm": 0.08198932558298111,
+      "learning_rate": 0.00019784517295459147,
+      "loss": 0.7958,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2881844380403458,
+      "grad_norm": 0.07858102023601532,
+      "learning_rate": 0.00019771988646340725,
+      "loss": 0.7744,
+      "step": 1300
+    },
+    {
+      "epoch": 0.2937264464641986,
+      "grad_norm": 0.0851408839225769,
+      "learning_rate": 0.00019759110188648026,
+      "loss": 0.7913,
+      "step": 1325
+    },
+    {
+      "epoch": 0.2992684548880514,
+      "grad_norm": 0.09252189099788666,
+      "learning_rate": 0.00019745882383392116,
+      "loss": 0.7675,
+      "step": 1350
+    },
+    {
+      "epoch": 0.30481046331190426,
+      "grad_norm": 0.08306555449962616,
+      "learning_rate": 0.0001973230570408968,
+      "loss": 0.8059,
+      "step": 1375
+    },
+    {
+      "epoch": 0.31035247173575703,
+      "grad_norm": 0.0797729641199112,
+      "learning_rate": 0.0001971838063674608,
+      "loss": 0.7424,
+      "step": 1400
+    },
+    {
+      "epoch": 0.31589448015960986,
+      "grad_norm": 0.08266165107488632,
+      "learning_rate": 0.0001970410767983794,
+      "loss": 0.7847,
+      "step": 1425
+    },
+    {
+      "epoch": 0.32143648858346263,
+      "grad_norm": 0.09364205598831177,
+      "learning_rate": 0.00019689487344295322,
+      "loss": 0.6924,
+      "step": 1450
+    },
+    {
+      "epoch": 0.32697849700731546,
+      "grad_norm": 0.08461842685937881,
+      "learning_rate": 0.00019674520153483414,
+      "loss": 0.8007,
+      "step": 1475
+    },
+    {
+      "epoch": 0.33252050543116823,
+      "grad_norm": 0.0840207040309906,
+      "learning_rate": 0.00019659206643183813,
+      "loss": 0.7139,
+      "step": 1500
+    },
+    {
+      "epoch": 0.33806251385502106,
+      "grad_norm": 0.08344192802906036,
+      "learning_rate": 0.00019643547361575343,
+      "loss": 0.7982,
+      "step": 1525
+    },
+    {
+      "epoch": 0.34360452227887384,
+      "grad_norm": 0.07934779673814774,
+      "learning_rate": 0.0001962754286921442,
+      "loss": 0.7164,
+      "step": 1550
+    },
+    {
+      "epoch": 0.34914653070272667,
+      "grad_norm": 0.08716201782226562,
+      "learning_rate": 0.00019611193739015,
+      "loss": 0.7846,
+      "step": 1575
+    },
+    {
+      "epoch": 0.3546885391265795,
+      "grad_norm": 0.08384064584970474,
+      "learning_rate": 0.0001959450055622806,
+      "loss": 0.7416,
+      "step": 1600
+    },
+    {
+      "epoch": 0.36023054755043227,
+      "grad_norm": 0.08661937713623047,
+      "learning_rate": 0.0001957746391842066,
+      "loss": 0.8075,
+      "step": 1625
+    },
+    {
+      "epoch": 0.3657725559742851,
+      "grad_norm": 0.09327207505702972,
+      "learning_rate": 0.00019560084435454536,
+      "loss": 0.7596,
+      "step": 1650
+    },
+    {
+      "epoch": 0.37131456439813787,
+      "grad_norm": 0.08391096442937851,
+      "learning_rate": 0.00019542362729464273,
+      "loss": 0.7794,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3768565728219907,
+      "grad_norm": 0.07694080471992493,
+      "learning_rate": 0.00019524299434835052,
+      "loss": 0.7424,
+      "step": 1700
+    },
+    {
+      "epoch": 0.38239858124584347,
+      "grad_norm": 0.08567491173744202,
+      "learning_rate": 0.00019505895198179912,
+      "loss": 0.7996,
+      "step": 1725
+    },
+    {
+      "epoch": 0.3879405896696963,
+      "grad_norm": 0.08828684687614441,
+      "learning_rate": 0.0001948715067831663,
+      "loss": 0.7394,
+      "step": 1750
+    },
+    {
+      "epoch": 0.39348259809354913,
+      "grad_norm": 0.08347714692354202,
+      "learning_rate": 0.00019468066546244117,
+      "loss": 0.7734,
+      "step": 1775
+    },
+    {
+      "epoch": 0.3990246065174019,
+      "grad_norm": 0.07736373692750931,
+      "learning_rate": 0.00019448643485118412,
+      "loss": 0.7134,
+      "step": 1800
+    },
+    {
+      "epoch": 0.40456661494125473,
+      "grad_norm": 0.0840897262096405,
+      "learning_rate": 0.00019428882190228216,
+      "loss": 0.787,
+      "step": 1825
+    },
+    {
+      "epoch": 0.4101086233651075,
+      "grad_norm": 0.08665871620178223,
+      "learning_rate": 0.0001940878336897001,
+      "loss": 0.7151,
+      "step": 1850
+    },
+    {
+      "epoch": 0.41565063178896033,
+      "grad_norm": 0.08358912914991379,
+      "learning_rate": 0.0001938834774082274,
+      "loss": 0.7982,
+      "step": 1875
+    },
+    {
+      "epoch": 0.4211926402128131,
+      "grad_norm": 0.07928963005542755,
+      "learning_rate": 0.0001936757603732203,
+      "loss": 0.7195,
+      "step": 1900
+    },
+    {
+      "epoch": 0.42673464863666594,
+      "grad_norm": 0.08886470645666122,
+      "learning_rate": 0.00019346469002034042,
+      "loss": 0.7762,
+      "step": 1925
+    },
+    {
+      "epoch": 0.4322766570605187,
+      "grad_norm": 0.1071886494755745,
+      "learning_rate": 0.00019325027390528822,
+      "loss": 0.7453,
+      "step": 1950
+    },
+    {
+      "epoch": 0.43781866548437154,
+      "grad_norm": 0.08474262803792953,
+      "learning_rate": 0.00019303251970353261,
+      "loss": 0.7839,
+      "step": 1975
+    },
+    {
+      "epoch": 0.44336067390822437,
+      "grad_norm": 0.08803894370794296,
+      "learning_rate": 0.0001928114352100363,
+      "loss": 0.7171,
+      "step": 2000
+    },
+    {
+      "epoch": 0.44890268233207714,
+      "grad_norm": 0.08429575711488724,
+      "learning_rate": 0.00019258702833897665,
+      "loss": 0.7781,
+      "step": 2025
+    },
+    {
+      "epoch": 0.45444469075592997,
+      "grad_norm": 0.08510231226682663,
+      "learning_rate": 0.00019235930712346248,
+      "loss": 0.6949,
+      "step": 2050
+    },
+    {
+      "epoch": 0.45998669917978274,
+      "grad_norm": 0.08167176693677902,
+      "learning_rate": 0.00019212827971524634,
+      "loss": 0.7722,
+      "step": 2075
+    },
+    {
+      "epoch": 0.46552870760363557,
+      "grad_norm": 0.06542418897151947,
+      "learning_rate": 0.00019189395438443278,
+      "loss": 0.7203,
+      "step": 2100
+    },
+    {
+      "epoch": 0.47107071602748835,
+      "grad_norm": 0.08293402194976807,
+      "learning_rate": 0.00019165633951918247,
+      "loss": 0.7735,
+      "step": 2125
+    },
+    {
+      "epoch": 0.4766127244513412,
+      "grad_norm": 0.0809284895658493,
+      "learning_rate": 0.00019141544362541162,
+      "loss": 0.7412,
+      "step": 2150
+    },
+    {
+      "epoch": 0.48215473287519395,
+      "grad_norm": 0.08212891221046448,
+      "learning_rate": 0.00019117127532648773,
+      "loss": 0.7629,
+      "step": 2175
+    },
+    {
+      "epoch": 0.4876967412990468,
+      "grad_norm": 0.08602219074964523,
+      "learning_rate": 0.0001909238433629208,
+      "loss": 0.6935,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4932387497228996,
+      "grad_norm": 0.08529417216777802,
+      "learning_rate": 0.0001906731565920505,
+      "loss": 0.7915,
+      "step": 2225
+    },
+    {
+      "epoch": 0.4987807581467524,
+      "grad_norm": 0.08774964511394501,
+      "learning_rate": 0.00019041922398772897,
+      "loss": 0.7359,
+      "step": 2250
+    },
+    {
+      "epoch": 0.5043227665706052,
+      "grad_norm": 0.08649475872516632,
+      "learning_rate": 0.00019016205463999984,
+      "loss": 0.7696,
+      "step": 2275
+    },
+    {
+      "epoch": 0.509864774994458,
+      "grad_norm": 0.0878506749868393,
+      "learning_rate": 0.00018990165775477252,
+      "loss": 0.7365,
+      "step": 2300
+    },
+    {
+      "epoch": 0.5154067834183108,
+      "grad_norm": 0.09131711721420288,
+      "learning_rate": 0.0001896380426534929,
+      "loss": 0.7809,
+      "step": 2325
+    },
+    {
+      "epoch": 0.5209487918421636,
+      "grad_norm": 0.07379825413227081,
+      "learning_rate": 0.00018937121877280957,
+      "loss": 0.7029,
+      "step": 2350
+    },
+    {
+      "epoch": 0.5264908002660164,
+      "grad_norm": 0.08535836637020111,
+      "learning_rate": 0.00018910119566423598,
+      "loss": 0.7679,
+      "step": 2375
+    },
+    {
+      "epoch": 0.5320328086898692,
+      "grad_norm": 0.06719771772623062,
+      "learning_rate": 0.00018882798299380864,
+      "loss": 0.7121,
+      "step": 2400
+    },
+    {
+      "epoch": 0.537574817113722,
+      "grad_norm": 0.09019796550273895,
+      "learning_rate": 0.00018855159054174093,
+      "loss": 0.7754,
+      "step": 2425
+    },
+    {
+      "epoch": 0.5431168255375748,
+      "grad_norm": 0.08144286274909973,
+      "learning_rate": 0.0001882720282020732,
+      "loss": 0.7255,
+      "step": 2450
+    },
+    {
+      "epoch": 0.5486588339614277,
+      "grad_norm": 0.08412271738052368,
+      "learning_rate": 0.0001879893059823185,
+      "loss": 0.7722,
+      "step": 2475
+    },
+    {
+      "epoch": 0.5542008423852804,
+      "grad_norm": 0.09016039222478867,
+      "learning_rate": 0.0001877034340031042,
+      "loss": 0.7275,
+      "step": 2500
+    },
+    {
+      "epoch": 0.5597428508091332,
+      "grad_norm": 0.08850298821926117,
+      "learning_rate": 0.00018741442249781,
+      "loss": 0.7828,
+      "step": 2525
+    },
+    {
+      "epoch": 0.565284859232986,
+      "grad_norm": 0.06989564746618271,
+      "learning_rate": 0.00018712228181220128,
+      "loss": 0.7111,
+      "step": 2550
+    },
+    {
+      "epoch": 0.5708268676568389,
+      "grad_norm": 0.09214618802070618,
+      "learning_rate": 0.00018682702240405906,
+      "loss": 0.7752,
+      "step": 2575
+    },
+    {
+      "epoch": 0.5763688760806917,
+      "grad_norm": 0.07766986638307571,
+      "learning_rate": 0.0001865286548428054,
+      "loss": 0.7108,
+      "step": 2600
+    },
+    {
+      "epoch": 0.5819108845045444,
+      "grad_norm": 0.07919591665267944,
+      "learning_rate": 0.00018622718980912514,
+      "loss": 0.775,
+      "step": 2625
+    },
+    {
+      "epoch": 0.5874528929283972,
+      "grad_norm": 0.07524783164262772,
+      "learning_rate": 0.00018592263809458361,
+      "loss": 0.6941,
+      "step": 2650
+    },
+    {
+      "epoch": 0.5929949013522501,
+      "grad_norm": 0.08549198508262634,
+      "learning_rate": 0.00018561501060124024,
+      "loss": 0.7718,
+      "step": 2675
+    },
+    {
+      "epoch": 0.5985369097761029,
+      "grad_norm": 0.08182788640260696,
+      "learning_rate": 0.0001853043183412584,
+      "loss": 0.7072,
+      "step": 2700
+    },
+    {
+      "epoch": 0.6040789181999556,
+      "grad_norm": 0.084741972386837,
+      "learning_rate": 0.00018499057243651096,
+      "loss": 0.7478,
+      "step": 2725
+    },
+    {
+      "epoch": 0.6096209266238085,
+      "grad_norm": 0.06824459880590439,
+      "learning_rate": 0.0001846737841181825,
+      "loss": 0.7238,
+      "step": 2750
+    },
+    {
+      "epoch": 0.6151629350476613,
+      "grad_norm": 0.08315033465623856,
+      "learning_rate": 0.00018435396472636704,
+      "loss": 0.7597,
+      "step": 2775
+    },
+    {
+      "epoch": 0.6207049434715141,
+      "grad_norm": 0.07116558402776718,
+      "learning_rate": 0.00018403112570966216,
+      "loss": 0.7096,
+      "step": 2800
+    },
+    {
+      "epoch": 0.6262469518953668,
+      "grad_norm": 0.08500215411186218,
+      "learning_rate": 0.00018370527862475916,
+      "loss": 0.756,
+      "step": 2825
+    },
+    {
+      "epoch": 0.6317889603192197,
+      "grad_norm": 0.07979004830121994,
+      "learning_rate": 0.00018337643513602933,
+      "loss": 0.6886,
+      "step": 2850
+    },
+    {
+      "epoch": 0.6373309687430725,
+      "grad_norm": 0.08140358328819275,
+      "learning_rate": 0.00018304460701510652,
+      "loss": 0.7648,
+      "step": 2875
+    },
+    {
+      "epoch": 0.6428729771669253,
+      "grad_norm": 0.07779423147439957,
+      "learning_rate": 0.0001827098061404656,
+      "loss": 0.7222,
+      "step": 2900
+    },
+    {
+      "epoch": 0.6484149855907781,
+      "grad_norm": 0.08853591978549957,
+      "learning_rate": 0.0001823720444969974,
+      "loss": 0.7736,
+      "step": 2925
+    },
+    {
+      "epoch": 0.6539569940146309,
+      "grad_norm": 0.07350102066993713,
+      "learning_rate": 0.0001820313341755795,
+      "loss": 0.7256,
+      "step": 2950
+    },
+    {
+      "epoch": 0.6594990024384837,
+      "grad_norm": 0.08152145147323608,
+      "learning_rate": 0.0001816876873726436,
+      "loss": 0.7598,
+      "step": 2975
+    },
+    {
+      "epoch": 0.6650410108623365,
+      "grad_norm": 0.08045897632837296,
+      "learning_rate": 0.00018134111638973876,
+      "loss": 0.7275,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6705830192861894,
+      "grad_norm": 0.08514434099197388,
+      "learning_rate": 0.00018099163363309123,
+      "loss": 0.7688,
+      "step": 3025
+    },
+    {
+      "epoch": 0.6761250277100421,
+      "grad_norm": 0.060850344598293304,
+      "learning_rate": 0.00018063925161316012,
+      "loss": 0.7019,
+      "step": 3050
+    },
+    {
+      "epoch": 0.6816670361338949,
+      "grad_norm": 0.08471492677927017,
+      "learning_rate": 0.00018028398294418977,
+      "loss": 0.7573,
+      "step": 3075
+    },
+    {
+      "epoch": 0.6872090445577477,
+      "grad_norm": 0.0642291009426117,
+      "learning_rate": 0.00017992584034375798,
+      "loss": 0.7108,
+      "step": 3100
+    },
+    {
+      "epoch": 0.6927510529816006,
+      "grad_norm": 0.09357668459415436,
+      "learning_rate": 0.000179564836632321,
+      "loss": 0.7478,
+      "step": 3125
+    },
+    {
+      "epoch": 0.6982930614054533,
+      "grad_norm": 0.07198700308799744,
+      "learning_rate": 0.00017920098473275445,
+      "loss": 0.6973,
+      "step": 3150
+    },
+    {
+      "epoch": 0.7038350698293061,
+      "grad_norm": 0.08420095592737198,
+      "learning_rate": 0.00017883429766989064,
+      "loss": 0.7487,
+      "step": 3175
+    },
+    {
+      "epoch": 0.709377078253159,
+      "grad_norm": 0.06639819592237473,
+      "learning_rate": 0.00017846478857005255,
+      "loss": 0.6741,
+      "step": 3200
+    },
+    {
+      "epoch": 0.7149190866770118,
+      "grad_norm": 0.08200914412736893,
+      "learning_rate": 0.00017809247066058378,
+      "loss": 0.7526,
+      "step": 3225
+    },
+    {
+      "epoch": 0.7204610951008645,
+      "grad_norm": 0.07311141490936279,
+      "learning_rate": 0.0001777173572693751,
+      "loss": 0.677,
+      "step": 3250
+    },
+    {
+      "epoch": 0.7260031035247173,
+      "grad_norm": 0.08722089231014252,
+      "learning_rate": 0.00017733946182438726,
+      "loss": 0.7585,
+      "step": 3275
+    },
+    {
+      "epoch": 0.7315451119485702,
+      "grad_norm": 0.06589449942111969,
+      "learning_rate": 0.00017695879785317048,
+      "loss": 0.708,
+      "step": 3300
+    },
+    {
+      "epoch": 0.737087120372423,
+      "grad_norm": 0.08262074738740921,
+      "learning_rate": 0.0001765753789823801,
+      "loss": 0.749,
+      "step": 3325
+    },
+    {
+      "epoch": 0.7426291287962757,
+      "grad_norm": 0.07514823973178864,
+      "learning_rate": 0.00017618921893728867,
+      "loss": 0.6918,
+      "step": 3350
+    },
+    {
+      "epoch": 0.7481711372201286,
+      "grad_norm": 0.08757175505161285,
+      "learning_rate": 0.00017580033154129503,
+      "loss": 0.7445,
+      "step": 3375
+    },
+    {
+      "epoch": 0.7537131456439814,
+      "grad_norm": 0.0716458335518837,
+      "learning_rate": 0.0001754087307154289,
+      "loss": 0.7122,
+      "step": 3400
+    },
+    {
+      "epoch": 0.7592551540678342,
+      "grad_norm": 0.08453212678432465,
+      "learning_rate": 0.00017501443047785296,
+      "loss": 0.7656,
+      "step": 3425
+    },
+    {
+      "epoch": 0.7647971624916869,
+      "grad_norm": 0.06761575490236282,
+      "learning_rate": 0.00017461744494336098,
+      "loss": 0.6673,
+      "step": 3450
+    },
+    {
+      "epoch": 0.7703391709155398,
+      "grad_norm": 0.08577297627925873,
+      "learning_rate": 0.0001742177883228724,
+      "loss": 0.7494,
+      "step": 3475
+    },
+    {
+      "epoch": 0.7758811793393926,
+      "grad_norm": 0.05691730976104736,
+      "learning_rate": 0.00017381547492292376,
+      "loss": 0.6972,
+      "step": 3500
+    },
+    {
+      "epoch": 0.7814231877632454,
+      "grad_norm": 0.09115194529294968,
+      "learning_rate": 0.00017341051914515656,
+      "loss": 0.7706,
+      "step": 3525
+    },
+    {
+      "epoch": 0.7869651961870983,
+      "grad_norm": 0.07214304804801941,
+      "learning_rate": 0.00017300293548580162,
+      "loss": 0.6807,
+      "step": 3550
+    },
+    {
+      "epoch": 0.792507204610951,
+      "grad_norm": 0.08448139578104019,
+      "learning_rate": 0.00017259273853516028,
+      "loss": 0.7661,
+      "step": 3575
+    },
+    {
+      "epoch": 0.7980492130348038,
+      "grad_norm": 0.08282499015331268,
+      "learning_rate": 0.00017217994297708195,
+      "loss": 0.7391,
+      "step": 3600
+    },
+    {
+      "epoch": 0.8035912214586566,
+      "grad_norm": 0.0804004818201065,
+      "learning_rate": 0.00017176456358843875,
+      "loss": 0.7402,
+      "step": 3625
+    },
+    {
+      "epoch": 0.8091332298825095,
+      "grad_norm": 0.07265755534172058,
+      "learning_rate": 0.00017134661523859622,
+      "loss": 0.7019,
+      "step": 3650
+    },
+    {
+      "epoch": 0.8146752383063622,
+      "grad_norm": 0.08803457766771317,
+      "learning_rate": 0.00017092611288888125,
+      "loss": 0.7572,
+      "step": 3675
+    },
+    {
+      "epoch": 0.820217246730215,
+      "grad_norm": 0.0652441680431366,
+      "learning_rate": 0.0001705030715920464,
+      "loss": 0.706,
+      "step": 3700
+    },
+    {
+      "epoch": 0.8257592551540678,
+      "grad_norm": 0.08185753971338272,
+      "learning_rate": 0.0001700775064917312,
+      "loss": 0.764,
+      "step": 3725
+    },
+    {
+      "epoch": 0.8313012635779207,
+      "grad_norm": 0.0859500914812088,
+      "learning_rate": 0.00016964943282191984,
+      "loss": 0.6927,
+      "step": 3750
+    },
+    {
+      "epoch": 0.8368432720017734,
+      "grad_norm": 0.09176376461982727,
+      "learning_rate": 0.00016921886590639602,
+      "loss": 0.7567,
+      "step": 3775
+    },
+    {
+      "epoch": 0.8423852804256262,
+      "grad_norm": 0.0646485984325409,
+      "learning_rate": 0.0001687858211581943,
+      "loss": 0.6848,
+      "step": 3800
+    },
+    {
+      "epoch": 0.8479272888494791,
+      "grad_norm": 0.08545655012130737,
+      "learning_rate": 0.00016835031407904839,
+      "loss": 0.7546,
+      "step": 3825
+    },
+    {
+      "epoch": 0.8534692972733319,
+      "grad_norm": 0.06338818371295929,
+      "learning_rate": 0.00016791236025883626,
+      "loss": 0.6655,
+      "step": 3850
+    },
+    {
+      "epoch": 0.8590113056971846,
+      "grad_norm": 0.08781229704618454,
+      "learning_rate": 0.00016747197537502205,
+      "loss": 0.7441,
+      "step": 3875
+    },
+    {
+      "epoch": 0.8645533141210374,
+      "grad_norm": 0.06220358610153198,
+      "learning_rate": 0.00016702917519209487,
+      "loss": 0.6795,
+      "step": 3900
+    },
+    {
+      "epoch": 0.8700953225448903,
+      "grad_norm": 0.08917712420225143,
+      "learning_rate": 0.0001665839755610044,
+      "loss": 0.7552,
+      "step": 3925
+    },
+    {
+      "epoch": 0.8756373309687431,
+      "grad_norm": 0.06624036282300949,
+      "learning_rate": 0.00016613639241859355,
+      "loss": 0.6632,
+      "step": 3950
+    },
+    {
+      "epoch": 0.8811793393925959,
+      "grad_norm": 0.08898719400167465,
+      "learning_rate": 0.00016568644178702803,
+      "loss": 0.757,
+      "step": 3975
+    },
+    {
+      "epoch": 0.8867213478164487,
+      "grad_norm": 0.05095354840159416,
+      "learning_rate": 0.0001652341397732227,
+      "loss": 0.6992,
+      "step": 4000
+    },
+    {
+      "epoch": 0.8922633562403015,
+      "grad_norm": 0.08842916786670685,
+      "learning_rate": 0.0001647795025682649,
+      "loss": 0.7504,
+      "step": 4025
+    },
+    {
+      "epoch": 0.8978053646641543,
+      "grad_norm": 0.0758206844329834,
+      "learning_rate": 0.00016432254644683516,
+      "loss": 0.7081,
+      "step": 4050
+    },
+    {
+      "epoch": 0.903347373088007,
+      "grad_norm": 0.0940496176481247,
+      "learning_rate": 0.0001638632877666243,
+      "loss": 0.746,
+      "step": 4075
+    },
+    {
+      "epoch": 0.9088893815118599,
+      "grad_norm": 0.06626766920089722,
+      "learning_rate": 0.00016340174296774804,
+      "loss": 0.6647,
+      "step": 4100
+    },
+    {
+      "epoch": 0.9144313899357127,
+      "grad_norm": 0.08919317275285721,
+      "learning_rate": 0.00016293792857215844,
+      "loss": 0.7516,
+      "step": 4125
+    },
+    {
+      "epoch": 0.9199733983595655,
+      "grad_norm": 0.06990760564804077,
+      "learning_rate": 0.00016247186118305252,
+      "loss": 0.7011,
+      "step": 4150
+    },
+    {
+      "epoch": 0.9255154067834183,
+      "grad_norm": 0.0870794802904129,
+      "learning_rate": 0.00016200355748427782,
+      "loss": 0.7529,
+      "step": 4175
+    },
+    {
+      "epoch": 0.9310574152072711,
+      "grad_norm": 0.06882854551076889,
+      "learning_rate": 0.00016153303423973526,
+      "loss": 0.7005,
+      "step": 4200
+    },
+    {
+      "epoch": 0.9365994236311239,
+      "grad_norm": 0.084992416203022,
+      "learning_rate": 0.0001610603082927789,
+      "loss": 0.7519,
+      "step": 4225
+    },
+    {
+      "epoch": 0.9421414320549767,
+      "grad_norm": 0.0638299211859703,
+      "learning_rate": 0.00016058539656561323,
+      "loss": 0.716,
+      "step": 4250
+    },
+    {
+      "epoch": 0.9476834404788296,
+      "grad_norm": 0.08899606764316559,
+      "learning_rate": 0.00016010831605868715,
+      "loss": 0.7257,
+      "step": 4275
+    },
+    {
+      "epoch": 0.9532254489026823,
+      "grad_norm": 0.06550378352403641,
+      "learning_rate": 0.00015962908385008565,
+      "loss": 0.7174,
+      "step": 4300
+    },
+    {
+      "epoch": 0.9587674573265351,
+      "grad_norm": 0.09001540392637253,
+      "learning_rate": 0.00015914771709491828,
+      "loss": 0.7271,
+      "step": 4325
+    },
+    {
+      "epoch": 0.9643094657503879,
+      "grad_norm": 0.06641615182161331,
+      "learning_rate": 0.000158664233024705,
+      "loss": 0.69,
+      "step": 4350
+    },
+    {
+      "epoch": 0.9698514741742408,
+      "grad_norm": 0.08917039632797241,
+      "learning_rate": 0.0001581786489467596,
+      "loss": 0.7483,
+      "step": 4375
+    },
+    {
+      "epoch": 0.9753934825980936,
+      "grad_norm": 0.05995697155594826,
+      "learning_rate": 0.00015769098224356992,
+      "loss": 0.7033,
+      "step": 4400
+    },
+    {
+      "epoch": 0.9809354910219463,
+      "grad_norm": 0.08998765051364899,
+      "learning_rate": 0.00015720125037217572,
+      "loss": 0.7462,
+      "step": 4425
+    },
+    {
+      "epoch": 0.9864774994457992,
+      "grad_norm": 0.05868702754378319,
+      "learning_rate": 0.00015670947086354376,
+      "loss": 0.6654,
+      "step": 4450
+    },
+    {
+      "epoch": 0.992019507869652,
+      "grad_norm": 0.0880926102399826,
+      "learning_rate": 0.00015621566132194005,
+      "loss": 0.752,
+      "step": 4475
+    },
+    {
+      "epoch": 0.9975615162935048,
+      "grad_norm": 0.08538970351219177,
+      "learning_rate": 0.00015571983942430005,
+      "loss": 0.7338,
+      "step": 4500
+    },
+    {
+      "epoch": 1.0031035247173576,
+      "grad_norm": 0.0827050730586052,
+      "learning_rate": 0.0001552220229195956,
+      "loss": 0.7174,
+      "step": 4525
+    },
+    {
+      "epoch": 1.0086455331412103,
+      "grad_norm": 0.10867294669151306,
+      "learning_rate": 0.00015472222962819955,
+      "loss": 0.7637,
+      "step": 4550
+    },
+    {
+      "epoch": 1.0141875415650632,
+      "grad_norm": 0.08738269656896591,
+      "learning_rate": 0.00015422047744124802,
+      "loss": 0.6247,
+      "step": 4575
+    },
+    {
+      "epoch": 1.019729549988916,
+      "grad_norm": 0.12865987420082092,
+      "learning_rate": 0.0001537167843199998,
+      "loss": 0.7424,
+      "step": 4600
+    },
+    {
+      "epoch": 1.0252715584127687,
+      "grad_norm": 0.08619695156812668,
+      "learning_rate": 0.00015321116829519345,
+      "loss": 0.6461,
+      "step": 4625
+    },
+    {
+      "epoch": 1.0308135668366216,
+      "grad_norm": 0.11726492643356323,
+      "learning_rate": 0.0001527036474664019,
+      "loss": 0.7433,
+      "step": 4650
+    },
+    {
+      "epoch": 1.0363555752604743,
+      "grad_norm": 0.08198727667331696,
+      "learning_rate": 0.0001521942400013844,
+      "loss": 0.6086,
+      "step": 4675
+    },
+    {
+      "epoch": 1.0418975836843272,
+      "grad_norm": 0.11951526254415512,
+      "learning_rate": 0.00015168296413543635,
+      "loss": 0.7521,
+      "step": 4700
+    },
+    {
+      "epoch": 1.04743959210818,
+      "grad_norm": 0.08714735507965088,
+      "learning_rate": 0.0001511698381707363,
+      "loss": 0.631,
+      "step": 4725
+    },
+    {
+      "epoch": 1.0529816005320327,
+      "grad_norm": 0.13869455456733704,
+      "learning_rate": 0.00015065488047569107,
+      "loss": 0.7524,
+      "step": 4750
+    },
+    {
+      "epoch": 1.0585236089558856,
+      "grad_norm": 0.08524268865585327,
+      "learning_rate": 0.00015013810948427794,
+      "loss": 0.6617,
+      "step": 4775
+    },
+    {
+      "epoch": 1.0640656173797385,
+      "grad_norm": 0.11017199605703354,
+      "learning_rate": 0.00014961954369538494,
+      "loss": 0.7598,
+      "step": 4800
+    },
+    {
+      "epoch": 1.0696076258035911,
+      "grad_norm": 0.0834374874830246,
+      "learning_rate": 0.00014909920167214858,
+      "loss": 0.627,
+      "step": 4825
+    },
+    {
+      "epoch": 1.075149634227444,
+      "grad_norm": 0.1357167363166809,
+      "learning_rate": 0.0001485771020412894,
+      "loss": 0.7466,
+      "step": 4850
+    },
+    {
+      "epoch": 1.080691642651297,
+      "grad_norm": 0.08910629153251648,
+      "learning_rate": 0.00014805326349244503,
+      "loss": 0.6238,
+      "step": 4875
+    },
+    {
+      "epoch": 1.0862336510751496,
+      "grad_norm": 0.10706546157598495,
+      "learning_rate": 0.00014752770477750144,
+      "loss": 0.7533,
+      "step": 4900
+    },
+    {
+      "epoch": 1.0917756594990025,
+      "grad_norm": 0.09201759845018387,
+      "learning_rate": 0.00014700044470992136,
+      "loss": 0.6521,
+      "step": 4925
+    },
+    {
+      "epoch": 1.0973176679228553,
+      "grad_norm": 0.14048361778259277,
+      "learning_rate": 0.00014647150216407106,
+      "loss": 0.7412,
+      "step": 4950
+    },
+    {
+      "epoch": 1.102859676346708,
+      "grad_norm": 0.08308299630880356,
+      "learning_rate": 0.00014594089607454454,
+      "loss": 0.6333,
+      "step": 4975
+    },
+    {
+      "epoch": 1.108401684770561,
+      "grad_norm": 0.12057497352361679,
+      "learning_rate": 0.00014540864543548582,
+      "loss": 0.7538,
+      "step": 5000
+    },
+    {
+      "epoch": 1.1139436931944136,
+      "grad_norm": 0.089565709233284,
+      "learning_rate": 0.00014487476929990898,
+      "loss": 0.6662,
+      "step": 5025
+    },
+    {
+      "epoch": 1.1194857016182664,
+      "grad_norm": 0.12125346809625626,
+      "learning_rate": 0.00014433928677901612,
+      "loss": 0.7653,
+      "step": 5050
+    },
+    {
+      "epoch": 1.1250277100421193,
+      "grad_norm": 0.08421044796705246,
+      "learning_rate": 0.00014380221704151318,
+      "loss": 0.615,
+      "step": 5075
+    },
+    {
+      "epoch": 1.130569718465972,
+      "grad_norm": 0.12215881794691086,
+      "learning_rate": 0.0001432635793129239,
+      "loss": 0.7482,
+      "step": 5100
+    },
+    {
+      "epoch": 1.1361117268898249,
+      "grad_norm": 0.08646813780069351,
+      "learning_rate": 0.0001427233928749014,
+      "loss": 0.6292,
+      "step": 5125
+    },
+    {
+      "epoch": 1.1416537353136778,
+      "grad_norm": 0.11372750997543335,
+      "learning_rate": 0.00014218167706453816,
+      "loss": 0.7487,
+      "step": 5150
+    },
+    {
+      "epoch": 1.1471957437375304,
+      "grad_norm": 0.08925063908100128,
+      "learning_rate": 0.00014163845127367362,
+      "loss": 0.6336,
+      "step": 5175
+    },
+    {
+      "epoch": 1.1527377521613833,
+      "grad_norm": 0.12316026538610458,
+      "learning_rate": 0.00014109373494820018,
+      "loss": 0.7566,
+      "step": 5200
+    },
+    {
+      "epoch": 1.1582797605852362,
+      "grad_norm": 0.08784055709838867,
+      "learning_rate": 0.00014054754758736698,
+      "loss": 0.6124,
+      "step": 5225
+    },
+    {
+      "epoch": 1.1638217690090888,
+      "grad_norm": 0.11267346143722534,
+      "learning_rate": 0.0001399999087430819,
+      "loss": 0.7611,
+      "step": 5250
+    },
+    {
+      "epoch": 1.1693637774329417,
+      "grad_norm": 0.08636374026536942,
+      "learning_rate": 0.00013945083801921167,
+      "loss": 0.6561,
+      "step": 5275
+    },
+    {
+      "epoch": 1.1749057858567946,
+      "grad_norm": 0.13902850449085236,
+      "learning_rate": 0.0001389003550708802,
+      "loss": 0.7546,
+      "step": 5300
+    },
+    {
+      "epoch": 1.1804477942806473,
+      "grad_norm": 0.09029743075370789,
+      "learning_rate": 0.0001383484796037648,
+      "loss": 0.6226,
+      "step": 5325
+    },
+    {
+      "epoch": 1.1859898027045002,
+      "grad_norm": 0.10377778112888336,
+      "learning_rate": 0.00013779523137339095,
+      "loss": 0.7422,
+      "step": 5350
+    },
+    {
+      "epoch": 1.1915318111283528,
+      "grad_norm": 0.08857985585927963,
+      "learning_rate": 0.00013724063018442494,
+      "loss": 0.6346,
+      "step": 5375
+    },
+    {
+      "epoch": 1.1970738195522057,
+      "grad_norm": 0.15107670426368713,
+      "learning_rate": 0.0001366846958899651,
+      "loss": 0.7266,
+      "step": 5400
+    },
+    {
+      "epoch": 1.2026158279760586,
+      "grad_norm": 0.09053236246109009,
+      "learning_rate": 0.000136127448390831,
+      "loss": 0.617,
+      "step": 5425
+    },
+    {
+      "epoch": 1.2081578363999113,
+      "grad_norm": 0.11061497032642365,
+      "learning_rate": 0.00013556890763485112,
+      "loss": 0.7631,
+      "step": 5450
+    },
+    {
+      "epoch": 1.2136998448237641,
+      "grad_norm": 0.08849512785673141,
+      "learning_rate": 0.0001350090936161487,
+      "loss": 0.5992,
+      "step": 5475
+    },
+    {
+      "epoch": 1.219241853247617,
+      "grad_norm": 0.12802088260650635,
+      "learning_rate": 0.00013444802637442606,
+      "loss": 0.7389,
+      "step": 5500
+    },
+    {
+      "epoch": 1.2247838616714697,
+      "grad_norm": 0.08612997084856033,
+      "learning_rate": 0.0001338857259942473,
+      "loss": 0.6173,
+      "step": 5525
+    },
+    {
+      "epoch": 1.2303258700953226,
+      "grad_norm": 0.1339733898639679,
+      "learning_rate": 0.0001333222126043192,
+      "loss": 0.7473,
+      "step": 5550
+    },
+    {
+      "epoch": 1.2358678785191755,
+      "grad_norm": 0.08974476903676987,
+      "learning_rate": 0.00013275750637677073,
+      "loss": 0.6224,
+      "step": 5575
+    },
+    {
+      "epoch": 1.2414098869430281,
+      "grad_norm": 0.1431790143251419,
+      "learning_rate": 0.00013219162752643103,
+      "loss": 0.748,
+      "step": 5600
+    },
+    {
+      "epoch": 1.246951895366881,
+      "grad_norm": 0.08545912057161331,
+      "learning_rate": 0.0001316245963101056,
+      "loss": 0.5969,
+      "step": 5625
+    },
+    {
+      "epoch": 1.2524939037907337,
+      "grad_norm": 0.13927248120307922,
+      "learning_rate": 0.00013105643302585137,
+      "loss": 0.752,
+      "step": 5650
+    },
+    {
+      "epoch": 1.2580359122145865,
+      "grad_norm": 0.08908078819513321,
+      "learning_rate": 0.0001304871580122499,
+      "loss": 0.6422,
+      "step": 5675
+    },
+    {
+      "epoch": 1.2635779206384394,
+      "grad_norm": 0.13006670773029327,
+      "learning_rate": 0.00012991679164767942,
+      "loss": 0.7594,
+      "step": 5700
+    },
+    {
+      "epoch": 1.269119929062292,
+      "grad_norm": 0.08754415810108185,
+      "learning_rate": 0.00012934535434958538,
+      "loss": 0.6121,
+      "step": 5725
+    },
+    {
+      "epoch": 1.274661937486145,
+      "grad_norm": 0.15051035583019257,
+      "learning_rate": 0.0001287728665737495,
+      "loss": 0.7414,
+      "step": 5750
+    },
+    {
+      "epoch": 1.2802039459099979,
+      "grad_norm": 0.08734755963087082,
+      "learning_rate": 0.00012819934881355745,
+      "loss": 0.6368,
+      "step": 5775
+    },
+    {
+      "epoch": 1.2857459543338505,
+      "grad_norm": 0.17122209072113037,
+      "learning_rate": 0.0001276248215992654,
+      "loss": 0.7535,
+      "step": 5800
+    },
+    {
+      "epoch": 1.2912879627577034,
+      "grad_norm": 0.08958180993795395,
+      "learning_rate": 0.00012704930549726503,
+      "loss": 0.5941,
+      "step": 5825
+    },
+    {
+      "epoch": 1.2968299711815563,
+      "grad_norm": 0.10746924579143524,
+      "learning_rate": 0.0001264728211093473,
+      "loss": 0.7498,
+      "step": 5850
+    },
+    {
+      "epoch": 1.302371979605409,
+      "grad_norm": 0.08696026355028152,
+      "learning_rate": 0.00012589538907196486,
+      "loss": 0.6451,
+      "step": 5875
+    },
+    {
+      "epoch": 1.3079139880292618,
+      "grad_norm": 0.14526303112506866,
+      "learning_rate": 0.0001253170300554936,
+      "loss": 0.7427,
+      "step": 5900
+    },
+    {
+      "epoch": 1.3134559964531145,
+      "grad_norm": 0.08736992627382278,
+      "learning_rate": 0.0001247377647634924,
+      "loss": 0.632,
+      "step": 5925
+    },
+    {
+      "epoch": 1.3189980048769674,
+      "grad_norm": 0.13362443447113037,
+      "learning_rate": 0.00012415761393196227,
+      "loss": 0.7394,
+      "step": 5950
+    },
+    {
+      "epoch": 1.3245400133008203,
+      "grad_norm": 0.09453903138637543,
+      "learning_rate": 0.00012357659832860386,
+      "loss": 0.6394,
+      "step": 5975
+    },
+    {
+      "epoch": 1.3300820217246732,
+      "grad_norm": 0.1354217529296875,
+      "learning_rate": 0.00012299473875207416,
+      "loss": 0.727,
+      "step": 6000
+    },
+    {
+      "epoch": 1.3356240301485258,
+      "grad_norm": 0.09107760339975357,
+      "learning_rate": 0.0001224120560312419,
+      "loss": 0.6197,
+      "step": 6025
+    },
+    {
+      "epoch": 1.3411660385723787,
+      "grad_norm": 0.12501803040504456,
+      "learning_rate": 0.00012182857102444203,
+      "loss": 0.7475,
+      "step": 6050
+    },
+    {
+      "epoch": 1.3467080469962314,
+      "grad_norm": 0.08888363093137741,
+      "learning_rate": 0.00012124430461872886,
+      "loss": 0.6108,
+      "step": 6075
+    },
+    {
+      "epoch": 1.3522500554200843,
+      "grad_norm": 0.16767624020576477,
+      "learning_rate": 0.00012065927772912863,
+      "loss": 0.7408,
+      "step": 6100
+    },
+    {
+      "epoch": 1.3577920638439371,
+      "grad_norm": 0.09112541377544403,
+      "learning_rate": 0.00012007351129789062,
+      "loss": 0.5868,
+      "step": 6125
+    },
+    {
+      "epoch": 1.3633340722677898,
+      "grad_norm": 0.13539327681064606,
+      "learning_rate": 0.0001194870262937375,
+      "loss": 0.7505,
+      "step": 6150
+    },
+    {
+      "epoch": 1.3688760806916427,
+      "grad_norm": 0.08977732807397842,
+      "learning_rate": 0.00011889984371111475,
+      "loss": 0.5985,
+      "step": 6175
+    },
+    {
+      "epoch": 1.3744180891154953,
+      "grad_norm": 0.17703984677791595,
+      "learning_rate": 0.00011831198456943924,
+      "loss": 0.7334,
+      "step": 6200
+    },
+    {
+      "epoch": 1.3799600975393482,
+      "grad_norm": 0.09067991375923157,
+      "learning_rate": 0.00011772346991234651,
+      "loss": 0.5874,
+      "step": 6225
+    },
+    {
+      "epoch": 1.3855021059632011,
+      "grad_norm": 0.10922655463218689,
+      "learning_rate": 0.00011713432080693772,
+      "loss": 0.746,
+      "step": 6250
+    },
+    {
+      "epoch": 1.391044114387054,
+      "grad_norm": 0.08802726864814758,
+      "learning_rate": 0.00011654455834302535,
+      "loss": 0.6084,
+      "step": 6275
+    },
+    {
+      "epoch": 1.3965861228109067,
+      "grad_norm": 0.11013362556695938,
+      "learning_rate": 0.00011595420363237844,
+      "loss": 0.7431,
+      "step": 6300
+    },
+    {
+      "epoch": 1.4021281312347595,
+      "grad_norm": 0.09353320300579071,
+      "learning_rate": 0.00011536327780796661,
+      "loss": 0.6504,
+      "step": 6325
+    },
+    {
+      "epoch": 1.4076701396586122,
+      "grad_norm": 0.160513773560524,
+      "learning_rate": 0.00011477180202320377,
+      "loss": 0.7451,
+      "step": 6350
+    },
+    {
+      "epoch": 1.413212148082465,
+      "grad_norm": 0.09337064623832703,
+      "learning_rate": 0.0001141797974511907,
+      "loss": 0.6435,
+      "step": 6375
+    },
+    {
+      "epoch": 1.418754156506318,
+      "grad_norm": 0.12163395434617996,
+      "learning_rate": 0.00011358728528395733,
+      "loss": 0.7313,
+      "step": 6400
+    },
+    {
+      "epoch": 1.4242961649301706,
+      "grad_norm": 0.08646170049905777,
+      "learning_rate": 0.00011299428673170389,
+      "loss": 0.6327,
+      "step": 6425
+    },
+    {
+      "epoch": 1.4298381733540235,
+      "grad_norm": 0.13511331379413605,
+      "learning_rate": 0.00011240082302204194,
+      "loss": 0.7324,
+      "step": 6450
+    },
+    {
+      "epoch": 1.4353801817778762,
+      "grad_norm": 0.08811099082231522,
+      "learning_rate": 0.00011180691539923407,
+      "loss": 0.6322,
+      "step": 6475
+    },
+    {
+      "epoch": 1.440922190201729,
+      "grad_norm": 0.12366902828216553,
+      "learning_rate": 0.00011121258512343391,
+      "loss": 0.7303,
+      "step": 6500
+    },
+    {
+      "epoch": 1.446464198625582,
+      "grad_norm": 0.09673753380775452,
+      "learning_rate": 0.00011061785346992463,
+      "loss": 0.6368,
+      "step": 6525
+    },
+    {
+      "epoch": 1.4520062070494348,
+      "grad_norm": 0.12419258803129196,
+      "learning_rate": 0.00011002274172835771,
+      "loss": 0.7431,
+      "step": 6550
+    },
+    {
+      "epoch": 1.4575482154732875,
+      "grad_norm": 0.09401362389326096,
+      "learning_rate": 0.00010942727120199052,
+      "loss": 0.6079,
+      "step": 6575
+    },
+    {
+      "epoch": 1.4630902238971404,
+      "grad_norm": 0.10955937206745148,
+      "learning_rate": 0.000108831463206924,
+      "loss": 0.7296,
+      "step": 6600
+    },
+    {
+      "epoch": 1.468632232320993,
+      "grad_norm": 0.09884931892156601,
+      "learning_rate": 0.00010823533907133943,
+      "loss": 0.6373,
+      "step": 6625
+    },
+    {
+      "epoch": 1.474174240744846,
+      "grad_norm": 0.14859217405319214,
+      "learning_rate": 0.00010763892013473495,
+      "loss": 0.7199,
+      "step": 6650
+    },
+    {
+      "epoch": 1.4797162491686988,
+      "grad_norm": 0.0963672623038292,
+      "learning_rate": 0.00010704222774716177,
+      "loss": 0.6156,
+      "step": 6675
+    },
+    {
+      "epoch": 1.4852582575925515,
+      "grad_norm": 0.14681296050548553,
+      "learning_rate": 0.00010644528326845988,
+      "loss": 0.74,
+      "step": 6700
+    },
+    {
+      "epoch": 1.4908002660164044,
+      "grad_norm": 0.09270428866147995,
+      "learning_rate": 0.00010584810806749327,
+      "loss": 0.6185,
+      "step": 6725
+    },
+    {
+      "epoch": 1.496342274440257,
+      "grad_norm": 0.11598405987024307,
+      "learning_rate": 0.00010525072352138526,
+      "loss": 0.7463,
+      "step": 6750
+    },
+    {
+      "epoch": 1.50188428286411,
+      "grad_norm": 0.09355468302965164,
+      "learning_rate": 0.00010465315101475295,
+      "loss": 0.5996,
+      "step": 6775
+    },
+    {
+      "epoch": 1.5074262912879628,
+      "grad_norm": 0.15108546614646912,
+      "learning_rate": 0.00010405541193894204,
+      "loss": 0.7512,
+      "step": 6800
+    },
+    {
+      "epoch": 1.5129682997118157,
+      "grad_norm": 0.08825406432151794,
+      "learning_rate": 0.00010345752769126079,
+      "loss": 0.6367,
+      "step": 6825
+    },
+    {
+      "epoch": 1.5185103081356683,
+      "grad_norm": 0.11757966130971909,
+      "learning_rate": 0.0001028595196742143,
+      "loss": 0.7556,
+      "step": 6850
+    },
+    {
+      "epoch": 1.5240523165595212,
+      "grad_norm": 0.08973203599452972,
+      "learning_rate": 0.00010226140929473813,
+      "loss": 0.6038,
+      "step": 6875
+    },
+    {
+      "epoch": 1.5295943249833739,
+      "grad_norm": 0.1337171196937561,
+      "learning_rate": 0.00010166321796343223,
+      "loss": 0.7388,
+      "step": 6900
+    },
+    {
+      "epoch": 1.5351363334072268,
+      "grad_norm": 0.08748678117990494,
+      "learning_rate": 0.0001010649670937943,
+      "loss": 0.6093,
+      "step": 6925
+    },
+    {
+      "epoch": 1.5406783418310797,
+      "grad_norm": 0.1228007897734642,
+      "learning_rate": 0.00010046667810145338,
+      "loss": 0.7494,
+      "step": 6950
+    },
+    {
+      "epoch": 1.5462203502549325,
+      "grad_norm": 0.09511099755764008,
+      "learning_rate": 9.986837240340319e-05,
+      "loss": 0.5998,
+      "step": 6975
+    },
+    {
+      "epoch": 1.5517623586787852,
+      "grad_norm": 0.12583385407924652,
+      "learning_rate": 9.927007141723548e-05,
+      "loss": 0.7266,
+      "step": 7000
+    },
+    {
+      "epoch": 1.5573043671026379,
+      "grad_norm": 0.08915423601865768,
+      "learning_rate": 9.867179656037326e-05,
+      "loss": 0.638,
+      "step": 7025
+    },
+    {
+      "epoch": 1.5628463755264907,
+      "grad_norm": 0.1239473968744278,
+      "learning_rate": 9.80735692493043e-05,
+      "loss": 0.7473,
+      "step": 7050
+    },
+    {
+      "epoch": 1.5683883839503436,
+      "grad_norm": 0.09568199515342712,
+      "learning_rate": 9.747541089881424e-05,
+      "loss": 0.6174,
+      "step": 7075
+    },
+    {
+      "epoch": 1.5739303923741965,
+      "grad_norm": 0.13295117020606995,
+      "learning_rate": 9.687734292122024e-05,
+      "loss": 0.7278,
+      "step": 7100
+    },
+    {
+      "epoch": 1.5794724007980492,
+      "grad_norm": 0.09703335911035538,
+      "learning_rate": 9.627938672560432e-05,
+      "loss": 0.6051,
+      "step": 7125
+    },
+    {
+      "epoch": 1.585014409221902,
+      "grad_norm": 0.12026989459991455,
+      "learning_rate": 9.568156371704705e-05,
+      "loss": 0.7341,
+      "step": 7150
+    },
+    {
+      "epoch": 1.5905564176457547,
+      "grad_norm": 0.09185943752527237,
+      "learning_rate": 9.508389529586128e-05,
+      "loss": 0.607,
+      "step": 7175
+    },
+    {
+      "epoch": 1.5960984260696076,
+      "grad_norm": 0.16115328669548035,
+      "learning_rate": 9.448640285682613e-05,
+      "loss": 0.7321,
+      "step": 7200
+    },
+    {
+      "epoch": 1.6016404344934605,
+      "grad_norm": 0.09262697398662567,
+      "learning_rate": 9.388910778842103e-05,
+      "loss": 0.6027,
+      "step": 7225
+    },
+    {
+      "epoch": 1.6071824429173134,
+      "grad_norm": 0.17859314382076263,
+      "learning_rate": 9.329203147206007e-05,
+      "loss": 0.7498,
+      "step": 7250
+    },
+    {
+      "epoch": 1.612724451341166,
+      "grad_norm": 0.09392908960580826,
+      "learning_rate": 9.269519528132677e-05,
+      "loss": 0.6035,
+      "step": 7275
+    },
+    {
+      "epoch": 1.6182664597650187,
+      "grad_norm": 0.1226978749036789,
+      "learning_rate": 9.209862058120879e-05,
+      "loss": 0.753,
+      "step": 7300
+    },
+    {
+      "epoch": 1.6238084681888716,
+      "grad_norm": 0.09812294691801071,
+      "learning_rate": 9.15023287273332e-05,
+      "loss": 0.631,
+      "step": 7325
+    },
+    {
+      "epoch": 1.6293504766127245,
+      "grad_norm": 0.14241814613342285,
+      "learning_rate": 9.0906341065202e-05,
+      "loss": 0.7291,
+      "step": 7350
+    },
+    {
+      "epoch": 1.6348924850365774,
+      "grad_norm": 0.09261428564786911,
+      "learning_rate": 9.031067892942805e-05,
+      "loss": 0.6204,
+      "step": 7375
+    },
+    {
+      "epoch": 1.64043449346043,
+      "grad_norm": 0.1319173127412796,
+      "learning_rate": 8.971536364297126e-05,
+      "loss": 0.7422,
+      "step": 7400
+    },
+    {
+      "epoch": 1.645976501884283,
+      "grad_norm": 0.09849465638399124,
+      "learning_rate": 8.912041651637541e-05,
+      "loss": 0.6212,
+      "step": 7425
+    },
+    {
+      "epoch": 1.6515185103081356,
+      "grad_norm": 0.1365344524383545,
+      "learning_rate": 8.852585884700519e-05,
+      "loss": 0.735,
+      "step": 7450
+    },
+    {
+      "epoch": 1.6570605187319885,
+      "grad_norm": 0.0948396623134613,
+      "learning_rate": 8.79317119182839e-05,
+      "loss": 0.6113,
+      "step": 7475
+    },
+    {
+      "epoch": 1.6626025271558413,
+      "grad_norm": 0.14469577372074127,
+      "learning_rate": 8.73379969989315e-05,
+      "loss": 0.7577,
+      "step": 7500
+    },
+    {
+      "epoch": 1.6681445355796942,
+      "grad_norm": 0.08975006639957428,
+      "learning_rate": 8.674473534220326e-05,
+      "loss": 0.6123,
+      "step": 7525
+    },
+    {
+      "epoch": 1.6736865440035469,
+      "grad_norm": 0.18550659716129303,
+      "learning_rate": 8.615194818512905e-05,
+      "loss": 0.7173,
+      "step": 7550
+    },
+    {
+      "epoch": 1.6792285524273995,
+      "grad_norm": 0.09581893682479858,
+      "learning_rate": 8.555965674775295e-05,
+      "loss": 0.6052,
+      "step": 7575
+    },
+    {
+      "epoch": 1.6847705608512524,
+      "grad_norm": 0.15174435079097748,
+      "learning_rate": 8.496788223237381e-05,
+      "loss": 0.741,
+      "step": 7600
+    },
+    {
+      "epoch": 1.6903125692751053,
+      "grad_norm": 0.09482391923666,
+      "learning_rate": 8.43766458227861e-05,
+      "loss": 0.6281,
+      "step": 7625
+    },
+    {
+      "epoch": 1.6958545776989582,
+      "grad_norm": 0.12190216034650803,
+      "learning_rate": 8.37859686835218e-05,
+      "loss": 0.7489,
+      "step": 7650
+    },
+    {
+      "epoch": 1.7013965861228109,
+      "grad_norm": 0.09474999457597733,
+      "learning_rate": 8.319587195909251e-05,
+      "loss": 0.6238,
+      "step": 7675
+    },
+    {
+      "epoch": 1.7069385945466637,
+      "grad_norm": 0.13137421011924744,
+      "learning_rate": 8.260637677323279e-05,
+      "loss": 0.7256,
+      "step": 7700
+    },
+    {
+      "epoch": 1.7124806029705164,
+      "grad_norm": 0.09658095240592957,
+      "learning_rate": 8.201750422814379e-05,
+      "loss": 0.6038,
+      "step": 7725
+    },
+    {
+      "epoch": 1.7180226113943693,
+      "grad_norm": 0.15730302035808563,
+      "learning_rate": 8.142927540373805e-05,
+      "loss": 0.7255,
+      "step": 7750
+    },
+    {
+      "epoch": 1.7235646198182222,
+      "grad_norm": 0.09805120527744293,
+      "learning_rate": 8.084171135688467e-05,
+      "loss": 0.6136,
+      "step": 7775
+    },
+    {
+      "epoch": 1.729106628242075,
+      "grad_norm": 0.14167381823062897,
+      "learning_rate": 8.02548331206558e-05,
+      "loss": 0.7246,
+      "step": 7800
+    },
+    {
+      "epoch": 1.7346486366659277,
+      "grad_norm": 0.09563518315553665,
+      "learning_rate": 7.966866170357346e-05,
+      "loss": 0.6199,
+      "step": 7825
+    },
+    {
+      "epoch": 1.7401906450897804,
+      "grad_norm": 0.11947919428348541,
+      "learning_rate": 7.908321808885766e-05,
+      "loss": 0.7284,
+      "step": 7850
+    },
+    {
+      "epoch": 1.7457326535136333,
+      "grad_norm": 0.09702473878860474,
+      "learning_rate": 7.849852323367521e-05,
+      "loss": 0.6343,
+      "step": 7875
+    },
+    {
+      "epoch": 1.7512746619374862,
+      "grad_norm": 0.16644233465194702,
+      "learning_rate": 7.791459806838957e-05,
+      "loss": 0.7295,
+      "step": 7900
+    },
+    {
+      "epoch": 1.756816670361339,
+      "grad_norm": 0.09687721729278564,
+      "learning_rate": 7.733146349581144e-05,
+      "loss": 0.6232,
+      "step": 7925
+    },
+    {
+      "epoch": 1.7623586787851917,
+      "grad_norm": 0.1432383805513382,
+      "learning_rate": 7.674914039045076e-05,
+      "loss": 0.7351,
+      "step": 7950
+    },
+    {
+      "epoch": 1.7679006872090446,
+      "grad_norm": 0.09745761752128601,
+      "learning_rate": 7.61676495977692e-05,
+      "loss": 0.5918,
+      "step": 7975
+    },
+    {
+      "epoch": 1.7734426956328972,
+      "grad_norm": 0.1507551074028015,
+      "learning_rate": 7.558701193343419e-05,
+      "loss": 0.7384,
+      "step": 8000
+    },
+    {
+      "epoch": 1.7789847040567501,
+      "grad_norm": 0.09434136003255844,
+      "learning_rate": 7.500724818257351e-05,
+      "loss": 0.5987,
+      "step": 8025
+    },
+    {
+      "epoch": 1.784526712480603,
+      "grad_norm": 0.1321529746055603,
+      "learning_rate": 7.442837909903156e-05,
+      "loss": 0.7409,
+      "step": 8050
+    },
+    {
+      "epoch": 1.790068720904456,
+      "grad_norm": 0.09421923011541367,
+      "learning_rate": 7.385042540462615e-05,
+      "loss": 0.6042,
+      "step": 8075
+    },
+    {
+      "epoch": 1.7956107293283086,
+      "grad_norm": 0.1154310330748558,
+      "learning_rate": 7.32734077884069e-05,
+      "loss": 0.7319,
+      "step": 8100
+    },
+    {
+      "epoch": 1.8011527377521612,
+      "grad_norm": 0.09810709953308105,
+      "learning_rate": 7.272037071314008e-05,
+      "loss": 0.6196,
+      "step": 8125
+    },
+    {
+      "epoch": 1.806694746176014,
+      "grad_norm": 0.12382727861404419,
+      "learning_rate": 7.2145247695974e-05,
+      "loss": 0.7366,
+      "step": 8150
+    },
+    {
+      "epoch": 1.812236754599867,
+      "grad_norm": 0.10054343193769455,
+      "learning_rate": 7.157112179736207e-05,
+      "loss": 0.638,
+      "step": 8175
+    },
+    {
+      "epoch": 1.8177787630237199,
+      "grad_norm": 0.12496750056743622,
+      "learning_rate": 7.099801356933004e-05,
+      "loss": 0.7435,
+      "step": 8200
+    },
+    {
+      "epoch": 1.8233207714475728,
+      "grad_norm": 0.09787677973508835,
+      "learning_rate": 7.0425943527474e-05,
+      "loss": 0.5983,
+      "step": 8225
+    },
+    {
+      "epoch": 1.8288627798714254,
+      "grad_norm": 0.16083738207817078,
+      "learning_rate": 6.985493215022605e-05,
+      "loss": 0.719,
+      "step": 8250
+    },
+    {
+      "epoch": 1.834404788295278,
+      "grad_norm": 0.09314879029989243,
+      "learning_rate": 6.928499987812112e-05,
+      "loss": 0.6156,
+      "step": 8275
+    },
+    {
+      "epoch": 1.839946796719131,
+      "grad_norm": 0.1240190863609314,
+      "learning_rate": 6.871616711306545e-05,
+      "loss": 0.7312,
+      "step": 8300
+    },
+    {
+      "epoch": 1.8454888051429839,
+      "grad_norm": 0.10087582468986511,
+      "learning_rate": 6.814845421760602e-05,
+      "loss": 0.5953,
+      "step": 8325
+    },
+    {
+      "epoch": 1.8510308135668367,
+      "grad_norm": 0.15391846001148224,
+      "learning_rate": 6.758188151420189e-05,
+      "loss": 0.7372,
+      "step": 8350
+    },
+    {
+      "epoch": 1.8565728219906894,
+      "grad_norm": 0.09913575649261475,
+      "learning_rate": 6.701646928449646e-05,
+      "loss": 0.5859,
+      "step": 8375
+    },
+    {
+      "epoch": 1.862114830414542,
+      "grad_norm": 0.1197889968752861,
+      "learning_rate": 6.645223776859166e-05,
+      "loss": 0.7416,
+      "step": 8400
+    },
+    {
+      "epoch": 1.867656838838395,
+      "grad_norm": 0.09789609163999557,
+      "learning_rate": 6.588920716432329e-05,
+      "loss": 0.6422,
+      "step": 8425
+    },
+    {
+      "epoch": 1.8731988472622478,
+      "grad_norm": 0.134750634431839,
+      "learning_rate": 6.532739762653804e-05,
+      "loss": 0.7462,
+      "step": 8450
+    },
+    {
+      "epoch": 1.8787408556861007,
+      "grad_norm": 0.09457212686538696,
+      "learning_rate": 6.476682926637197e-05,
+      "loss": 0.6161,
+      "step": 8475
+    },
+    {
+      "epoch": 1.8842828641099536,
+      "grad_norm": 0.18783515691757202,
+      "learning_rate": 6.420752215053065e-05,
+      "loss": 0.7236,
+      "step": 8500
+    },
+    {
+      "epoch": 1.8898248725338063,
+      "grad_norm": 0.09566831588745117,
+      "learning_rate": 6.364949630057078e-05,
+      "loss": 0.5882,
+      "step": 8525
+    },
+    {
+      "epoch": 1.895366880957659,
+      "grad_norm": 0.12137682735919952,
+      "learning_rate": 6.30927716921835e-05,
+      "loss": 0.7284,
+      "step": 8550
+    },
+    {
+      "epoch": 1.9009088893815118,
+      "grad_norm": 0.09804505854845047,
+      "learning_rate": 6.25373682544793e-05,
+      "loss": 0.6165,
+      "step": 8575
+    },
+    {
+      "epoch": 1.9064508978053647,
+      "grad_norm": 0.20505575835704803,
+      "learning_rate": 6.198330586927463e-05,
+      "loss": 0.7224,
+      "step": 8600
+    },
+    {
+      "epoch": 1.9119929062292176,
+      "grad_norm": 0.10258720070123672,
+      "learning_rate": 6.14306043703802e-05,
+      "loss": 0.5683,
+      "step": 8625
+    },
+    {
+      "epoch": 1.9175349146530702,
+      "grad_norm": 0.14654862880706787,
+      "learning_rate": 6.087928354289103e-05,
+      "loss": 0.7336,
+      "step": 8650
+    },
+    {
+      "epoch": 1.9230769230769231,
+      "grad_norm": 0.09862499684095383,
+      "learning_rate": 6.0329363122478e-05,
+      "loss": 0.5824,
+      "step": 8675
+    },
+    {
+      "epoch": 1.9286189315007758,
+      "grad_norm": 0.135267972946167,
+      "learning_rate": 5.978086279468163e-05,
+      "loss": 0.744,
+      "step": 8700
+    },
+    {
+      "epoch": 1.9341609399246287,
+      "grad_norm": 0.1033608540892601,
+      "learning_rate": 5.923380219420729e-05,
+      "loss": 0.6134,
+      "step": 8725
+    },
+    {
+      "epoch": 1.9397029483484816,
+      "grad_norm": 0.1315843164920807,
+      "learning_rate": 5.8688200904222266e-05,
+      "loss": 0.7151,
+      "step": 8750
+    },
+    {
+      "epoch": 1.9452449567723344,
+      "grad_norm": 0.09951479732990265,
+      "learning_rate": 5.8144078455654846e-05,
+      "loss": 0.622,
+      "step": 8775
+    },
+    {
+      "epoch": 1.950786965196187,
+      "grad_norm": 0.2037511169910431,
+      "learning_rate": 5.760145432649515e-05,
+      "loss": 0.7239,
+      "step": 8800
+    },
+    {
+      "epoch": 1.9563289736200398,
+      "grad_norm": 0.10025127977132797,
+      "learning_rate": 5.706034794109778e-05,
+      "loss": 0.5922,
+      "step": 8825
+    },
+    {
+      "epoch": 1.9618709820438927,
+      "grad_norm": 0.18914011120796204,
+      "learning_rate": 5.65207786694866e-05,
+      "loss": 0.7205,
+      "step": 8850
+    },
+    {
+      "epoch": 1.9674129904677455,
+      "grad_norm": 0.10025196522474289,
+      "learning_rate": 5.5982765826661256e-05,
+      "loss": 0.5814,
+      "step": 8875
+    },
+    {
+      "epoch": 1.9729549988915984,
+      "grad_norm": 0.12357232719659805,
+      "learning_rate": 5.544632867190591e-05,
+      "loss": 0.7217,
+      "step": 8900
+    },
+    {
+      "epoch": 1.978497007315451,
+      "grad_norm": 0.09909965842962265,
+      "learning_rate": 5.491148640809962e-05,
+      "loss": 0.6102,
+      "step": 8925
+    },
+    {
+      "epoch": 1.984039015739304,
+      "grad_norm": 0.16131410002708435,
+      "learning_rate": 5.437825818102902e-05,
+      "loss": 0.7193,
+      "step": 8950
+    },
+    {
+      "epoch": 1.9895810241631566,
+      "grad_norm": 0.10195192694664001,
+      "learning_rate": 5.384666307870293e-05,
+      "loss": 0.5989,
+      "step": 8975
+    },
+    {
+      "epoch": 1.9951230325870095,
+      "grad_norm": 0.1269746869802475,
+      "learning_rate": 5.331672013066922e-05,
+      "loss": 0.7287,
+      "step": 9000
+    },
+    {
+      "epoch": 2.0006650410108624,
+      "grad_norm": 0.09312586486339569,
+      "learning_rate": 5.278844830733332e-05,
+      "loss": 0.6024,
+      "step": 9025
+    },
+    {
+      "epoch": 2.0062070494347153,
+      "grad_norm": 0.10967884957790375,
+      "learning_rate": 5.226186651927938e-05,
+      "loss": 0.7053,
+      "step": 9050
+    },
+    {
+      "epoch": 2.0117490578585677,
+      "grad_norm": 0.09102078527212143,
+      "learning_rate": 5.1736993616593165e-05,
+      "loss": 0.5861,
+      "step": 9075
+    },
+    {
+      "epoch": 2.0172910662824206,
+      "grad_norm": 0.10821503400802612,
+      "learning_rate": 5.121384838818746e-05,
+      "loss": 0.6865,
+      "step": 9100
+    },
+    {
+      "epoch": 2.0228330747062735,
+      "grad_norm": 0.09317923337221146,
+      "learning_rate": 5.0692449561129285e-05,
+      "loss": 0.5912,
+      "step": 9125
+    },
+    {
+      "epoch": 2.0283750831301264,
+      "grad_norm": 0.11409013718366623,
+      "learning_rate": 5.017281579996961e-05,
+      "loss": 0.6979,
+      "step": 9150
+    },
+    {
+      "epoch": 2.0339170915539793,
+      "grad_norm": 0.09311998635530472,
+      "learning_rate": 4.965496570607523e-05,
+      "loss": 0.6235,
+      "step": 9175
+    },
+    {
+      "epoch": 2.039459099977832,
+      "grad_norm": 0.12173454463481903,
+      "learning_rate": 4.913891781696285e-05,
+      "loss": 0.7002,
+      "step": 9200
+    },
+    {
+      "epoch": 2.0450011084016846,
+      "grad_norm": 0.10086531937122345,
+      "learning_rate": 4.8624690605635626e-05,
+      "loss": 0.5569,
+      "step": 9225
+    },
+    {
+      "epoch": 2.0505431168255375,
+      "grad_norm": 0.10775309801101685,
+      "learning_rate": 4.811230247992165e-05,
+      "loss": 0.6925,
+      "step": 9250
+    },
+    {
+      "epoch": 2.0560851252493904,
+      "grad_norm": 0.096441850066185,
+      "learning_rate": 4.760177178181521e-05,
+      "loss": 0.5906,
+      "step": 9275
+    },
+    {
+      "epoch": 2.0616271336732432,
+      "grad_norm": 0.11258859932422638,
+      "learning_rate": 4.709311678682005e-05,
+      "loss": 0.6883,
+      "step": 9300
+    },
+    {
+      "epoch": 2.067169142097096,
+      "grad_norm": 0.0907958596944809,
+      "learning_rate": 4.658635570329537e-05,
+      "loss": 0.6069,
+      "step": 9325
+    },
+    {
+      "epoch": 2.0727111505209486,
+      "grad_norm": 0.11246030032634735,
+      "learning_rate": 4.608150667180378e-05,
+      "loss": 0.6951,
+      "step": 9350
+    },
+    {
+      "epoch": 2.0782531589448014,
+      "grad_norm": 0.09199715405702591,
+      "learning_rate": 4.557858776446203e-05,
+      "loss": 0.5949,
+      "step": 9375
+    },
+    {
+      "epoch": 2.0837951673686543,
+      "grad_norm": 0.11862944066524506,
+      "learning_rate": 4.50776169842941e-05,
+      "loss": 0.6701,
+      "step": 9400
+    },
+    {
+      "epoch": 2.089337175792507,
+      "grad_norm": 0.09631045907735825,
+      "learning_rate": 4.457861226458678e-05,
+      "loss": 0.6158,
+      "step": 9425
+    },
+    {
+      "epoch": 2.09487918421636,
+      "grad_norm": 0.107430100440979,
+      "learning_rate": 4.408159146824756e-05,
+      "loss": 0.6998,
+      "step": 9450
+    },
+    {
+      "epoch": 2.100421192640213,
+      "grad_norm": 0.0933394506573677,
+      "learning_rate": 4.358657238716533e-05,
+      "loss": 0.5998,
+      "step": 9475
+    },
+    {
+      "epoch": 2.1059632010640654,
+      "grad_norm": 0.11312496662139893,
+      "learning_rate": 4.309357274157338e-05,
+      "loss": 0.6909,
+      "step": 9500
+    },
+    {
+      "epoch": 2.1115052094879183,
+      "grad_norm": 0.09570565819740295,
+      "learning_rate": 4.260261017941526e-05,
+      "loss": 0.5805,
+      "step": 9525
+    },
+    {
+      "epoch": 2.117047217911771,
+      "grad_norm": 0.11327219754457474,
+      "learning_rate": 4.211370227571276e-05,
+      "loss": 0.6968,
+      "step": 9550
+    },
+    {
+      "epoch": 2.122589226335624,
+      "grad_norm": 0.09823332726955414,
+      "learning_rate": 4.162686653193698e-05,
+      "loss": 0.5965,
+      "step": 9575
+    },
+    {
+      "epoch": 2.128131234759477,
+      "grad_norm": 0.11287786811590195,
+      "learning_rate": 4.11421203753817e-05,
+      "loss": 0.6936,
+      "step": 9600
+    },
+    {
+      "epoch": 2.1336732431833294,
+      "grad_norm": 0.09058432281017303,
+      "learning_rate": 4.065948115853973e-05,
+      "loss": 0.61,
+      "step": 9625
+    },
+    {
+      "epoch": 2.1392152516071823,
+      "grad_norm": 0.11232877522706985,
+      "learning_rate": 4.017896615848149e-05,
+      "loss": 0.6852,
+      "step": 9650
+    },
+    {
+      "epoch": 2.144757260031035,
+      "grad_norm": 0.09588344395160675,
+      "learning_rate": 3.9700592576236686e-05,
+      "loss": 0.6083,
+      "step": 9675
+    },
+    {
+      "epoch": 2.150299268454888,
+      "grad_norm": 0.11739671975374222,
+      "learning_rate": 3.922437753617856e-05,
+      "loss": 0.6997,
+      "step": 9700
+    },
+    {
+      "epoch": 2.155841276878741,
+      "grad_norm": 0.09523261338472366,
+      "learning_rate": 3.875033808541083e-05,
+      "loss": 0.5693,
+      "step": 9725
+    },
+    {
+      "epoch": 2.161383285302594,
+      "grad_norm": 0.11812377721071243,
+      "learning_rate": 3.827849119315755e-05,
+      "loss": 0.6907,
+      "step": 9750
+    },
+    {
+      "epoch": 2.1669252937264463,
+      "grad_norm": 0.09816546738147736,
+      "learning_rate": 3.780885375015549e-05,
+      "loss": 0.5891,
+      "step": 9775
+    },
+    {
+      "epoch": 2.172467302150299,
+      "grad_norm": 0.11008067429065704,
+      "learning_rate": 3.734144256804978e-05,
+      "loss": 0.691,
+      "step": 9800
+    },
+    {
+      "epoch": 2.178009310574152,
+      "grad_norm": 0.09762485325336456,
+      "learning_rate": 3.687627437879177e-05,
+      "loss": 0.5914,
+      "step": 9825
+    },
+    {
+      "epoch": 2.183551318998005,
+      "grad_norm": 0.11886761337518692,
+      "learning_rate": 3.6413365834040326e-05,
+      "loss": 0.6896,
+      "step": 9850
+    },
+    {
+      "epoch": 2.189093327421858,
+      "grad_norm": 0.09332608431577682,
+      "learning_rate": 3.595273350456557e-05,
+      "loss": 0.5844,
+      "step": 9875
+    },
+    {
+      "epoch": 2.1946353358457107,
+      "grad_norm": 0.1171206533908844,
+      "learning_rate": 3.549439387965592e-05,
+      "loss": 0.7006,
+      "step": 9900
+    },
+    {
+      "epoch": 2.200177344269563,
+      "grad_norm": 0.09177059680223465,
+      "learning_rate": 3.503836336652756e-05,
+      "loss": 0.589,
+      "step": 9925
+    },
+    {
+      "epoch": 2.205719352693416,
+      "grad_norm": 0.11849093437194824,
+      "learning_rate": 3.4584658289737296e-05,
+      "loss": 0.687,
+      "step": 9950
+    },
+    {
+      "epoch": 2.211261361117269,
+      "grad_norm": 0.09985481947660446,
+      "learning_rate": 3.4133294890598065e-05,
+      "loss": 0.6102,
+      "step": 9975
+    },
+    {
+      "epoch": 2.216803369541122,
+      "grad_norm": 0.11642364412546158,
+      "learning_rate": 3.3684289326597726e-05,
+      "loss": 0.6963,
+      "step": 10000
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 13533,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.800783998233467e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-10000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a13672a0401c8d0a53efc112f53ebd65f36fa9003e87b1710879c58d881a1e1
+size 5051

checkpoint-13000/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: meta-llama/Llama-2-13b-chat-hf
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.12.0

checkpoint-13000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-2-13b-chat-hf",
+  "bias": "lora_only",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.001,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-13000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:803cd263d4f529894fe732ea0e358436e92ff26b5a5500d5944ab9c8959d23e7
+size 209736952

checkpoint-13000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8d9c518731f7b2b57e6c39a76bdbbe149cd125abccaedddfdc7256f1b2adf6a6
+size 419529285

checkpoint-13000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:970432e3c65f4daee20d7d893efba2303b069229276460cf9b79a13b5fea3e68
+size 14575

checkpoint-13000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:73ed378f484d9b731afa6442e520363c7c3802bc4262465a019c1ad46861f790
+size 627

checkpoint-13000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-13000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-13000/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-13000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-13000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3673 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.881844380403458,
+  "eval_steps": 500,
+  "global_step": 13000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.005542008423852805,
+      "grad_norm": 0.07243233174085617,
+      "learning_rate": 1.2315270935960592e-05,
+      "loss": 1.4594,
+      "step": 25
+    },
+    {
+      "epoch": 0.01108401684770561,
+      "grad_norm": 0.40484485030174255,
+      "learning_rate": 2.4630541871921184e-05,
+      "loss": 2.2032,
+      "step": 50
+    },
+    {
+      "epoch": 0.016626025271558414,
+      "grad_norm": 0.06850667297840118,
+      "learning_rate": 3.694581280788178e-05,
+      "loss": 1.2931,
+      "step": 75
+    },
+    {
+      "epoch": 0.02216803369541122,
+      "grad_norm": 0.4395073354244232,
+      "learning_rate": 4.926108374384237e-05,
+      "loss": 1.5698,
+      "step": 100
+    },
+    {
+      "epoch": 0.027710042119264023,
+      "grad_norm": 0.077068030834198,
+      "learning_rate": 6.157635467980296e-05,
+      "loss": 1.0537,
+      "step": 125
+    },
+    {
+      "epoch": 0.03325205054311683,
+      "grad_norm": 0.3282291293144226,
+      "learning_rate": 7.389162561576355e-05,
+      "loss": 0.9749,
+      "step": 150
+    },
+    {
+      "epoch": 0.03879405896696963,
+      "grad_norm": 0.0593000203371048,
+      "learning_rate": 8.620689655172413e-05,
+      "loss": 0.9349,
+      "step": 175
+    },
+    {
+      "epoch": 0.04433606739082244,
+      "grad_norm": 0.25612473487854004,
+      "learning_rate": 9.852216748768474e-05,
+      "loss": 0.8974,
+      "step": 200
+    },
+    {
+      "epoch": 0.04987807581467524,
+      "grad_norm": 0.0757347121834755,
+      "learning_rate": 0.00011083743842364534,
+      "loss": 0.9081,
+      "step": 225
+    },
+    {
+      "epoch": 0.055420084238528046,
+      "grad_norm": 0.14145499467849731,
+      "learning_rate": 0.00012315270935960593,
+      "loss": 0.8607,
+      "step": 250
+    },
+    {
+      "epoch": 0.06096209266238085,
+      "grad_norm": 0.07710155844688416,
+      "learning_rate": 0.00013546798029556652,
+      "loss": 0.8973,
+      "step": 275
+    },
+    {
+      "epoch": 0.06650410108623366,
+      "grad_norm": 0.14791467785835266,
+      "learning_rate": 0.0001477832512315271,
+      "loss": 0.7924,
+      "step": 300
+    },
+    {
+      "epoch": 0.07204610951008646,
+      "grad_norm": 0.07742594182491302,
+      "learning_rate": 0.00016009852216748767,
+      "loss": 0.8698,
+      "step": 325
+    },
+    {
+      "epoch": 0.07758811793393926,
+      "grad_norm": 0.14303487539291382,
+      "learning_rate": 0.00017241379310344826,
+      "loss": 0.786,
+      "step": 350
+    },
+    {
+      "epoch": 0.08313012635779206,
+      "grad_norm": 0.0865108072757721,
+      "learning_rate": 0.00018472906403940888,
+      "loss": 0.8606,
+      "step": 375
+    },
+    {
+      "epoch": 0.08867213478164487,
+      "grad_norm": 0.7533164024353027,
+      "learning_rate": 0.00019704433497536947,
+      "loss": 0.807,
+      "step": 400
+    },
+    {
+      "epoch": 0.09421414320549767,
+      "grad_norm": 0.08325570821762085,
+      "learning_rate": 0.00019999896617927833,
+      "loss": 0.8635,
+      "step": 425
+    },
+    {
+      "epoch": 0.09975615162935048,
+      "grad_norm": 0.1043543666601181,
+      "learning_rate": 0.0001999944557842899,
+      "loss": 0.7825,
+      "step": 450
+    },
+    {
+      "epoch": 0.10529816005320328,
+      "grad_norm": 0.07949995994567871,
+      "learning_rate": 0.0001999863658806385,
+      "loss": 0.8379,
+      "step": 475
+    },
+    {
+      "epoch": 0.11084016847705609,
+      "grad_norm": 0.12020070850849152,
+      "learning_rate": 0.00019997469675791905,
+      "loss": 0.768,
+      "step": 500
+    },
+    {
+      "epoch": 0.11638217690090889,
+      "grad_norm": 0.0803595781326294,
+      "learning_rate": 0.00019995944883385196,
+      "loss": 0.8487,
+      "step": 525
+    },
+    {
+      "epoch": 0.1219241853247617,
+      "grad_norm": 0.11509452760219574,
+      "learning_rate": 0.0001999406226542682,
+      "loss": 0.7787,
+      "step": 550
+    },
+    {
+      "epoch": 0.1274661937486145,
+      "grad_norm": 0.07928384840488434,
+      "learning_rate": 0.00019991821889308987,
+      "loss": 0.8357,
+      "step": 575
+    },
+    {
+      "epoch": 0.1330082021724673,
+      "grad_norm": 0.09423446655273438,
+      "learning_rate": 0.00019989223835230606,
+      "loss": 0.7564,
+      "step": 600
+    },
+    {
+      "epoch": 0.1385502105963201,
+      "grad_norm": 0.0835939422249794,
+      "learning_rate": 0.000199862681961944,
+      "loss": 0.8568,
+      "step": 625
+    },
+    {
+      "epoch": 0.1440922190201729,
+      "grad_norm": 0.09292898327112198,
+      "learning_rate": 0.0001998295507800359,
+      "loss": 0.7612,
+      "step": 650
+    },
+    {
+      "epoch": 0.1496342274440257,
+      "grad_norm": 0.07704215496778488,
+      "learning_rate": 0.00019979284599258107,
+      "loss": 0.8263,
+      "step": 675
+    },
+    {
+      "epoch": 0.15517623586787851,
+      "grad_norm": 0.10980474948883057,
+      "learning_rate": 0.0001997525689135034,
+      "loss": 0.7677,
+      "step": 700
+    },
+    {
+      "epoch": 0.16071824429173132,
+      "grad_norm": 0.08016064018011093,
+      "learning_rate": 0.0001997087209846043,
+      "loss": 0.8344,
+      "step": 725
+    },
+    {
+      "epoch": 0.16626025271558412,
+      "grad_norm": 0.0950881615281105,
+      "learning_rate": 0.0001996613037755113,
+      "loss": 0.769,
+      "step": 750
+    },
+    {
+      "epoch": 0.17180226113943692,
+      "grad_norm": 0.07932984828948975,
+      "learning_rate": 0.00019961031898362152,
+      "loss": 0.8156,
+      "step": 775
+    },
+    {
+      "epoch": 0.17734426956328975,
+      "grad_norm": 0.09336528927087784,
+      "learning_rate": 0.00019955576843404128,
+      "loss": 0.7767,
+      "step": 800
+    },
+    {
+      "epoch": 0.18288627798714255,
+      "grad_norm": 0.08560346812009811,
+      "learning_rate": 0.00019949765407952042,
+      "loss": 0.8228,
+      "step": 825
+    },
+    {
+      "epoch": 0.18842828641099535,
+      "grad_norm": 0.08475169539451599,
+      "learning_rate": 0.00019943597800038267,
+      "loss": 0.7669,
+      "step": 850
+    },
+    {
+      "epoch": 0.19397029483484815,
+      "grad_norm": 0.09038034081459045,
+      "learning_rate": 0.00019937074240445105,
+      "loss": 0.8182,
+      "step": 875
+    },
+    {
+      "epoch": 0.19951230325870095,
+      "grad_norm": 0.09195873886346817,
+      "learning_rate": 0.0001993019496269688,
+      "loss": 0.7598,
+      "step": 900
+    },
+    {
+      "epoch": 0.20505431168255375,
+      "grad_norm": 0.08655796200037003,
+      "learning_rate": 0.0001992296021305159,
+      "loss": 0.8167,
+      "step": 925
+    },
+    {
+      "epoch": 0.21059632010640655,
+      "grad_norm": 0.08353498578071594,
+      "learning_rate": 0.00019915370250492084,
+      "loss": 0.7486,
+      "step": 950
+    },
+    {
+      "epoch": 0.21613832853025935,
+      "grad_norm": 0.09225723147392273,
+      "learning_rate": 0.0001990742534671679,
+      "loss": 0.8138,
+      "step": 975
+    },
+    {
+      "epoch": 0.22168033695411218,
+      "grad_norm": 0.12104763090610504,
+      "learning_rate": 0.00019899125786129997,
+      "loss": 0.7153,
+      "step": 1000
+    },
+    {
+      "epoch": 0.22722234537796498,
+      "grad_norm": 0.0815986767411232,
+      "learning_rate": 0.00019890471865831669,
+      "loss": 0.7983,
+      "step": 1025
+    },
+    {
+      "epoch": 0.23276435380181779,
+      "grad_norm": 0.08845670521259308,
+      "learning_rate": 0.00019881463895606805,
+      "loss": 0.7187,
+      "step": 1050
+    },
+    {
+      "epoch": 0.2383063622256706,
+      "grad_norm": 0.0821809321641922,
+      "learning_rate": 0.00019872102197914359,
+      "loss": 0.804,
+      "step": 1075
+    },
+    {
+      "epoch": 0.2438483706495234,
+      "grad_norm": 0.08711609989404678,
+      "learning_rate": 0.00019862387107875688,
+      "loss": 0.7795,
+      "step": 1100
+    },
+    {
+      "epoch": 0.2493903790733762,
+      "grad_norm": 0.08517508953809738,
+      "learning_rate": 0.00019852318973262567,
+      "loss": 0.7937,
+      "step": 1125
+    },
+    {
+      "epoch": 0.254932387497229,
+      "grad_norm": 0.10830071568489075,
+      "learning_rate": 0.00019841898154484726,
+      "loss": 0.7458,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2604743959210818,
+      "grad_norm": 0.08541836589574814,
+      "learning_rate": 0.0001983112502457696,
+      "loss": 0.8131,
+      "step": 1175
+    },
+    {
+      "epoch": 0.2660164043449346,
+      "grad_norm": 0.08794037252664566,
+      "learning_rate": 0.00019819999969185762,
+      "loss": 0.7577,
+      "step": 1200
+    },
+    {
+      "epoch": 0.2715584127687874,
+      "grad_norm": 0.08078176528215408,
+      "learning_rate": 0.00019808523386555542,
+      "loss": 0.812,
+      "step": 1225
+    },
+    {
+      "epoch": 0.2771004211926402,
+      "grad_norm": 0.09263130277395248,
+      "learning_rate": 0.0001979669568751434,
+      "loss": 0.7582,
+      "step": 1250
+    },
+    {
+      "epoch": 0.282642429616493,
+      "grad_norm": 0.08198932558298111,
+      "learning_rate": 0.00019784517295459147,
+      "loss": 0.7958,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2881844380403458,
+      "grad_norm": 0.07858102023601532,
+      "learning_rate": 0.00019771988646340725,
+      "loss": 0.7744,
+      "step": 1300
+    },
+    {
+      "epoch": 0.2937264464641986,
+      "grad_norm": 0.0851408839225769,
+      "learning_rate": 0.00019759110188648026,
+      "loss": 0.7913,
+      "step": 1325
+    },
+    {
+      "epoch": 0.2992684548880514,
+      "grad_norm": 0.09252189099788666,
+      "learning_rate": 0.00019745882383392116,
+      "loss": 0.7675,
+      "step": 1350
+    },
+    {
+      "epoch": 0.30481046331190426,
+      "grad_norm": 0.08306555449962616,
+      "learning_rate": 0.0001973230570408968,
+      "loss": 0.8059,
+      "step": 1375
+    },
+    {
+      "epoch": 0.31035247173575703,
+      "grad_norm": 0.0797729641199112,
+      "learning_rate": 0.0001971838063674608,
+      "loss": 0.7424,
+      "step": 1400
+    },
+    {
+      "epoch": 0.31589448015960986,
+      "grad_norm": 0.08266165107488632,
+      "learning_rate": 0.0001970410767983794,
+      "loss": 0.7847,
+      "step": 1425
+    },
+    {
+      "epoch": 0.32143648858346263,
+      "grad_norm": 0.09364205598831177,
+      "learning_rate": 0.00019689487344295322,
+      "loss": 0.6924,
+      "step": 1450
+    },
+    {
+      "epoch": 0.32697849700731546,
+      "grad_norm": 0.08461842685937881,
+      "learning_rate": 0.00019674520153483414,
+      "loss": 0.8007,
+      "step": 1475
+    },
+    {
+      "epoch": 0.33252050543116823,
+      "grad_norm": 0.0840207040309906,
+      "learning_rate": 0.00019659206643183813,
+      "loss": 0.7139,
+      "step": 1500
+    },
+    {
+      "epoch": 0.33806251385502106,
+      "grad_norm": 0.08344192802906036,
+      "learning_rate": 0.00019643547361575343,
+      "loss": 0.7982,
+      "step": 1525
+    },
+    {
+      "epoch": 0.34360452227887384,
+      "grad_norm": 0.07934779673814774,
+      "learning_rate": 0.0001962754286921442,
+      "loss": 0.7164,
+      "step": 1550
+    },
+    {
+      "epoch": 0.34914653070272667,
+      "grad_norm": 0.08716201782226562,
+      "learning_rate": 0.00019611193739015,
+      "loss": 0.7846,
+      "step": 1575
+    },
+    {
+      "epoch": 0.3546885391265795,
+      "grad_norm": 0.08384064584970474,
+      "learning_rate": 0.0001959450055622806,
+      "loss": 0.7416,
+      "step": 1600
+    },
+    {
+      "epoch": 0.36023054755043227,
+      "grad_norm": 0.08661937713623047,
+      "learning_rate": 0.0001957746391842066,
+      "loss": 0.8075,
+      "step": 1625
+    },
+    {
+      "epoch": 0.3657725559742851,
+      "grad_norm": 0.09327207505702972,
+      "learning_rate": 0.00019560084435454536,
+      "loss": 0.7596,
+      "step": 1650
+    },
+    {
+      "epoch": 0.37131456439813787,
+      "grad_norm": 0.08391096442937851,
+      "learning_rate": 0.00019542362729464273,
+      "loss": 0.7794,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3768565728219907,
+      "grad_norm": 0.07694080471992493,
+      "learning_rate": 0.00019524299434835052,
+      "loss": 0.7424,
+      "step": 1700
+    },
+    {
+      "epoch": 0.38239858124584347,
+      "grad_norm": 0.08567491173744202,
+      "learning_rate": 0.00019505895198179912,
+      "loss": 0.7996,
+      "step": 1725
+    },
+    {
+      "epoch": 0.3879405896696963,
+      "grad_norm": 0.08828684687614441,
+      "learning_rate": 0.0001948715067831663,
+      "loss": 0.7394,
+      "step": 1750
+    },
+    {
+      "epoch": 0.39348259809354913,
+      "grad_norm": 0.08347714692354202,
+      "learning_rate": 0.00019468066546244117,
+      "loss": 0.7734,
+      "step": 1775
+    },
+    {
+      "epoch": 0.3990246065174019,
+      "grad_norm": 0.07736373692750931,
+      "learning_rate": 0.00019448643485118412,
+      "loss": 0.7134,
+      "step": 1800
+    },
+    {
+      "epoch": 0.40456661494125473,
+      "grad_norm": 0.0840897262096405,
+      "learning_rate": 0.00019428882190228216,
+      "loss": 0.787,
+      "step": 1825
+    },
+    {
+      "epoch": 0.4101086233651075,
+      "grad_norm": 0.08665871620178223,
+      "learning_rate": 0.0001940878336897001,
+      "loss": 0.7151,
+      "step": 1850
+    },
+    {
+      "epoch": 0.41565063178896033,
+      "grad_norm": 0.08358912914991379,
+      "learning_rate": 0.0001938834774082274,
+      "loss": 0.7982,
+      "step": 1875
+    },
+    {
+      "epoch": 0.4211926402128131,
+      "grad_norm": 0.07928963005542755,
+      "learning_rate": 0.0001936757603732203,
+      "loss": 0.7195,
+      "step": 1900
+    },
+    {
+      "epoch": 0.42673464863666594,
+      "grad_norm": 0.08886470645666122,
+      "learning_rate": 0.00019346469002034042,
+      "loss": 0.7762,
+      "step": 1925
+    },
+    {
+      "epoch": 0.4322766570605187,
+      "grad_norm": 0.1071886494755745,
+      "learning_rate": 0.00019325027390528822,
+      "loss": 0.7453,
+      "step": 1950
+    },
+    {
+      "epoch": 0.43781866548437154,
+      "grad_norm": 0.08474262803792953,
+      "learning_rate": 0.00019303251970353261,
+      "loss": 0.7839,
+      "step": 1975
+    },
+    {
+      "epoch": 0.44336067390822437,
+      "grad_norm": 0.08803894370794296,
+      "learning_rate": 0.0001928114352100363,
+      "loss": 0.7171,
+      "step": 2000
+    },
+    {
+      "epoch": 0.44890268233207714,
+      "grad_norm": 0.08429575711488724,
+      "learning_rate": 0.00019258702833897665,
+      "loss": 0.7781,
+      "step": 2025
+    },
+    {
+      "epoch": 0.45444469075592997,
+      "grad_norm": 0.08510231226682663,
+      "learning_rate": 0.00019235930712346248,
+      "loss": 0.6949,
+      "step": 2050
+    },
+    {
+      "epoch": 0.45998669917978274,
+      "grad_norm": 0.08167176693677902,
+      "learning_rate": 0.00019212827971524634,
+      "loss": 0.7722,
+      "step": 2075
+    },
+    {
+      "epoch": 0.46552870760363557,
+      "grad_norm": 0.06542418897151947,
+      "learning_rate": 0.00019189395438443278,
+      "loss": 0.7203,
+      "step": 2100
+    },
+    {
+      "epoch": 0.47107071602748835,
+      "grad_norm": 0.08293402194976807,
+      "learning_rate": 0.00019165633951918247,
+      "loss": 0.7735,
+      "step": 2125
+    },
+    {
+      "epoch": 0.4766127244513412,
+      "grad_norm": 0.0809284895658493,
+      "learning_rate": 0.00019141544362541162,
+      "loss": 0.7412,
+      "step": 2150
+    },
+    {
+      "epoch": 0.48215473287519395,
+      "grad_norm": 0.08212891221046448,
+      "learning_rate": 0.00019117127532648773,
+      "loss": 0.7629,
+      "step": 2175
+    },
+    {
+      "epoch": 0.4876967412990468,
+      "grad_norm": 0.08602219074964523,
+      "learning_rate": 0.0001909238433629208,
+      "loss": 0.6935,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4932387497228996,
+      "grad_norm": 0.08529417216777802,
+      "learning_rate": 0.0001906731565920505,
+      "loss": 0.7915,
+      "step": 2225
+    },
+    {
+      "epoch": 0.4987807581467524,
+      "grad_norm": 0.08774964511394501,
+      "learning_rate": 0.00019041922398772897,
+      "loss": 0.7359,
+      "step": 2250
+    },
+    {
+      "epoch": 0.5043227665706052,
+      "grad_norm": 0.08649475872516632,
+      "learning_rate": 0.00019016205463999984,
+      "loss": 0.7696,
+      "step": 2275
+    },
+    {
+      "epoch": 0.509864774994458,
+      "grad_norm": 0.0878506749868393,
+      "learning_rate": 0.00018990165775477252,
+      "loss": 0.7365,
+      "step": 2300
+    },
+    {
+      "epoch": 0.5154067834183108,
+      "grad_norm": 0.09131711721420288,
+      "learning_rate": 0.0001896380426534929,
+      "loss": 0.7809,
+      "step": 2325
+    },
+    {
+      "epoch": 0.5209487918421636,
+      "grad_norm": 0.07379825413227081,
+      "learning_rate": 0.00018937121877280957,
+      "loss": 0.7029,
+      "step": 2350
+    },
+    {
+      "epoch": 0.5264908002660164,
+      "grad_norm": 0.08535836637020111,
+      "learning_rate": 0.00018910119566423598,
+      "loss": 0.7679,
+      "step": 2375
+    },
+    {
+      "epoch": 0.5320328086898692,
+      "grad_norm": 0.06719771772623062,
+      "learning_rate": 0.00018882798299380864,
+      "loss": 0.7121,
+      "step": 2400
+    },
+    {
+      "epoch": 0.537574817113722,
+      "grad_norm": 0.09019796550273895,
+      "learning_rate": 0.00018855159054174093,
+      "loss": 0.7754,
+      "step": 2425
+    },
+    {
+      "epoch": 0.5431168255375748,
+      "grad_norm": 0.08144286274909973,
+      "learning_rate": 0.0001882720282020732,
+      "loss": 0.7255,
+      "step": 2450
+    },
+    {
+      "epoch": 0.5486588339614277,
+      "grad_norm": 0.08412271738052368,
+      "learning_rate": 0.0001879893059823185,
+      "loss": 0.7722,
+      "step": 2475
+    },
+    {
+      "epoch": 0.5542008423852804,
+      "grad_norm": 0.09016039222478867,
+      "learning_rate": 0.0001877034340031042,
+      "loss": 0.7275,
+      "step": 2500
+    },
+    {
+      "epoch": 0.5597428508091332,
+      "grad_norm": 0.08850298821926117,
+      "learning_rate": 0.00018741442249781,
+      "loss": 0.7828,
+      "step": 2525
+    },
+    {
+      "epoch": 0.565284859232986,
+      "grad_norm": 0.06989564746618271,
+      "learning_rate": 0.00018712228181220128,
+      "loss": 0.7111,
+      "step": 2550
+    },
+    {
+      "epoch": 0.5708268676568389,
+      "grad_norm": 0.09214618802070618,
+      "learning_rate": 0.00018682702240405906,
+      "loss": 0.7752,
+      "step": 2575
+    },
+    {
+      "epoch": 0.5763688760806917,
+      "grad_norm": 0.07766986638307571,
+      "learning_rate": 0.0001865286548428054,
+      "loss": 0.7108,
+      "step": 2600
+    },
+    {
+      "epoch": 0.5819108845045444,
+      "grad_norm": 0.07919591665267944,
+      "learning_rate": 0.00018622718980912514,
+      "loss": 0.775,
+      "step": 2625
+    },
+    {
+      "epoch": 0.5874528929283972,
+      "grad_norm": 0.07524783164262772,
+      "learning_rate": 0.00018592263809458361,
+      "loss": 0.6941,
+      "step": 2650
+    },
+    {
+      "epoch": 0.5929949013522501,
+      "grad_norm": 0.08549198508262634,
+      "learning_rate": 0.00018561501060124024,
+      "loss": 0.7718,
+      "step": 2675
+    },
+    {
+      "epoch": 0.5985369097761029,
+      "grad_norm": 0.08182788640260696,
+      "learning_rate": 0.0001853043183412584,
+      "loss": 0.7072,
+      "step": 2700
+    },
+    {
+      "epoch": 0.6040789181999556,
+      "grad_norm": 0.084741972386837,
+      "learning_rate": 0.00018499057243651096,
+      "loss": 0.7478,
+      "step": 2725
+    },
+    {
+      "epoch": 0.6096209266238085,
+      "grad_norm": 0.06824459880590439,
+      "learning_rate": 0.0001846737841181825,
+      "loss": 0.7238,
+      "step": 2750
+    },
+    {
+      "epoch": 0.6151629350476613,
+      "grad_norm": 0.08315033465623856,
+      "learning_rate": 0.00018435396472636704,
+      "loss": 0.7597,
+      "step": 2775
+    },
+    {
+      "epoch": 0.6207049434715141,
+      "grad_norm": 0.07116558402776718,
+      "learning_rate": 0.00018403112570966216,
+      "loss": 0.7096,
+      "step": 2800
+    },
+    {
+      "epoch": 0.6262469518953668,
+      "grad_norm": 0.08500215411186218,
+      "learning_rate": 0.00018370527862475916,
+      "loss": 0.756,
+      "step": 2825
+    },
+    {
+      "epoch": 0.6317889603192197,
+      "grad_norm": 0.07979004830121994,
+      "learning_rate": 0.00018337643513602933,
+      "loss": 0.6886,
+      "step": 2850
+    },
+    {
+      "epoch": 0.6373309687430725,
+      "grad_norm": 0.08140358328819275,
+      "learning_rate": 0.00018304460701510652,
+      "loss": 0.7648,
+      "step": 2875
+    },
+    {
+      "epoch": 0.6428729771669253,
+      "grad_norm": 0.07779423147439957,
+      "learning_rate": 0.0001827098061404656,
+      "loss": 0.7222,
+      "step": 2900
+    },
+    {
+      "epoch": 0.6484149855907781,
+      "grad_norm": 0.08853591978549957,
+      "learning_rate": 0.0001823720444969974,
+      "loss": 0.7736,
+      "step": 2925
+    },
+    {
+      "epoch": 0.6539569940146309,
+      "grad_norm": 0.07350102066993713,
+      "learning_rate": 0.0001820313341755795,
+      "loss": 0.7256,
+      "step": 2950
+    },
+    {
+      "epoch": 0.6594990024384837,
+      "grad_norm": 0.08152145147323608,
+      "learning_rate": 0.0001816876873726436,
+      "loss": 0.7598,
+      "step": 2975
+    },
+    {
+      "epoch": 0.6650410108623365,
+      "grad_norm": 0.08045897632837296,
+      "learning_rate": 0.00018134111638973876,
+      "loss": 0.7275,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6705830192861894,
+      "grad_norm": 0.08514434099197388,
+      "learning_rate": 0.00018099163363309123,
+      "loss": 0.7688,
+      "step": 3025
+    },
+    {
+      "epoch": 0.6761250277100421,
+      "grad_norm": 0.060850344598293304,
+      "learning_rate": 0.00018063925161316012,
+      "loss": 0.7019,
+      "step": 3050
+    },
+    {
+      "epoch": 0.6816670361338949,
+      "grad_norm": 0.08471492677927017,
+      "learning_rate": 0.00018028398294418977,
+      "loss": 0.7573,
+      "step": 3075
+    },
+    {
+      "epoch": 0.6872090445577477,
+      "grad_norm": 0.0642291009426117,
+      "learning_rate": 0.00017992584034375798,
+      "loss": 0.7108,
+      "step": 3100
+    },
+    {
+      "epoch": 0.6927510529816006,
+      "grad_norm": 0.09357668459415436,
+      "learning_rate": 0.000179564836632321,
+      "loss": 0.7478,
+      "step": 3125
+    },
+    {
+      "epoch": 0.6982930614054533,
+      "grad_norm": 0.07198700308799744,
+      "learning_rate": 0.00017920098473275445,
+      "loss": 0.6973,
+      "step": 3150
+    },
+    {
+      "epoch": 0.7038350698293061,
+      "grad_norm": 0.08420095592737198,
+      "learning_rate": 0.00017883429766989064,
+      "loss": 0.7487,
+      "step": 3175
+    },
+    {
+      "epoch": 0.709377078253159,
+      "grad_norm": 0.06639819592237473,
+      "learning_rate": 0.00017846478857005255,
+      "loss": 0.6741,
+      "step": 3200
+    },
+    {
+      "epoch": 0.7149190866770118,
+      "grad_norm": 0.08200914412736893,
+      "learning_rate": 0.00017809247066058378,
+      "loss": 0.7526,
+      "step": 3225
+    },
+    {
+      "epoch": 0.7204610951008645,
+      "grad_norm": 0.07311141490936279,
+      "learning_rate": 0.0001777173572693751,
+      "loss": 0.677,
+      "step": 3250
+    },
+    {
+      "epoch": 0.7260031035247173,
+      "grad_norm": 0.08722089231014252,
+      "learning_rate": 0.00017733946182438726,
+      "loss": 0.7585,
+      "step": 3275
+    },
+    {
+      "epoch": 0.7315451119485702,
+      "grad_norm": 0.06589449942111969,
+      "learning_rate": 0.00017695879785317048,
+      "loss": 0.708,
+      "step": 3300
+    },
+    {
+      "epoch": 0.737087120372423,
+      "grad_norm": 0.08262074738740921,
+      "learning_rate": 0.0001765753789823801,
+      "loss": 0.749,
+      "step": 3325
+    },
+    {
+      "epoch": 0.7426291287962757,
+      "grad_norm": 0.07514823973178864,
+      "learning_rate": 0.00017618921893728867,
+      "loss": 0.6918,
+      "step": 3350
+    },
+    {
+      "epoch": 0.7481711372201286,
+      "grad_norm": 0.08757175505161285,
+      "learning_rate": 0.00017580033154129503,
+      "loss": 0.7445,
+      "step": 3375
+    },
+    {
+      "epoch": 0.7537131456439814,
+      "grad_norm": 0.0716458335518837,
+      "learning_rate": 0.0001754087307154289,
+      "loss": 0.7122,
+      "step": 3400
+    },
+    {
+      "epoch": 0.7592551540678342,
+      "grad_norm": 0.08453212678432465,
+      "learning_rate": 0.00017501443047785296,
+      "loss": 0.7656,
+      "step": 3425
+    },
+    {
+      "epoch": 0.7647971624916869,
+      "grad_norm": 0.06761575490236282,
+      "learning_rate": 0.00017461744494336098,
+      "loss": 0.6673,
+      "step": 3450
+    },
+    {
+      "epoch": 0.7703391709155398,
+      "grad_norm": 0.08577297627925873,
+      "learning_rate": 0.0001742177883228724,
+      "loss": 0.7494,
+      "step": 3475
+    },
+    {
+      "epoch": 0.7758811793393926,
+      "grad_norm": 0.05691730976104736,
+      "learning_rate": 0.00017381547492292376,
+      "loss": 0.6972,
+      "step": 3500
+    },
+    {
+      "epoch": 0.7814231877632454,
+      "grad_norm": 0.09115194529294968,
+      "learning_rate": 0.00017341051914515656,
+      "loss": 0.7706,
+      "step": 3525
+    },
+    {
+      "epoch": 0.7869651961870983,
+      "grad_norm": 0.07214304804801941,
+      "learning_rate": 0.00017300293548580162,
+      "loss": 0.6807,
+      "step": 3550
+    },
+    {
+      "epoch": 0.792507204610951,
+      "grad_norm": 0.08448139578104019,
+      "learning_rate": 0.00017259273853516028,
+      "loss": 0.7661,
+      "step": 3575
+    },
+    {
+      "epoch": 0.7980492130348038,
+      "grad_norm": 0.08282499015331268,
+      "learning_rate": 0.00017217994297708195,
+      "loss": 0.7391,
+      "step": 3600
+    },
+    {
+      "epoch": 0.8035912214586566,
+      "grad_norm": 0.0804004818201065,
+      "learning_rate": 0.00017176456358843875,
+      "loss": 0.7402,
+      "step": 3625
+    },
+    {
+      "epoch": 0.8091332298825095,
+      "grad_norm": 0.07265755534172058,
+      "learning_rate": 0.00017134661523859622,
+      "loss": 0.7019,
+      "step": 3650
+    },
+    {
+      "epoch": 0.8146752383063622,
+      "grad_norm": 0.08803457766771317,
+      "learning_rate": 0.00017092611288888125,
+      "loss": 0.7572,
+      "step": 3675
+    },
+    {
+      "epoch": 0.820217246730215,
+      "grad_norm": 0.0652441680431366,
+      "learning_rate": 0.0001705030715920464,
+      "loss": 0.706,
+      "step": 3700
+    },
+    {
+      "epoch": 0.8257592551540678,
+      "grad_norm": 0.08185753971338272,
+      "learning_rate": 0.0001700775064917312,
+      "loss": 0.764,
+      "step": 3725
+    },
+    {
+      "epoch": 0.8313012635779207,
+      "grad_norm": 0.0859500914812088,
+      "learning_rate": 0.00016964943282191984,
+      "loss": 0.6927,
+      "step": 3750
+    },
+    {
+      "epoch": 0.8368432720017734,
+      "grad_norm": 0.09176376461982727,
+      "learning_rate": 0.00016921886590639602,
+      "loss": 0.7567,
+      "step": 3775
+    },
+    {
+      "epoch": 0.8423852804256262,
+      "grad_norm": 0.0646485984325409,
+      "learning_rate": 0.0001687858211581943,
+      "loss": 0.6848,
+      "step": 3800
+    },
+    {
+      "epoch": 0.8479272888494791,
+      "grad_norm": 0.08545655012130737,
+      "learning_rate": 0.00016835031407904839,
+      "loss": 0.7546,
+      "step": 3825
+    },
+    {
+      "epoch": 0.8534692972733319,
+      "grad_norm": 0.06338818371295929,
+      "learning_rate": 0.00016791236025883626,
+      "loss": 0.6655,
+      "step": 3850
+    },
+    {
+      "epoch": 0.8590113056971846,
+      "grad_norm": 0.08781229704618454,
+      "learning_rate": 0.00016747197537502205,
+      "loss": 0.7441,
+      "step": 3875
+    },
+    {
+      "epoch": 0.8645533141210374,
+      "grad_norm": 0.06220358610153198,
+      "learning_rate": 0.00016702917519209487,
+      "loss": 0.6795,
+      "step": 3900
+    },
+    {
+      "epoch": 0.8700953225448903,
+      "grad_norm": 0.08917712420225143,
+      "learning_rate": 0.0001665839755610044,
+      "loss": 0.7552,
+      "step": 3925
+    },
+    {
+      "epoch": 0.8756373309687431,
+      "grad_norm": 0.06624036282300949,
+      "learning_rate": 0.00016613639241859355,
+      "loss": 0.6632,
+      "step": 3950
+    },
+    {
+      "epoch": 0.8811793393925959,
+      "grad_norm": 0.08898719400167465,
+      "learning_rate": 0.00016568644178702803,
+      "loss": 0.757,
+      "step": 3975
+    },
+    {
+      "epoch": 0.8867213478164487,
+      "grad_norm": 0.05095354840159416,
+      "learning_rate": 0.0001652341397732227,
+      "loss": 0.6992,
+      "step": 4000
+    },
+    {
+      "epoch": 0.8922633562403015,
+      "grad_norm": 0.08842916786670685,
+      "learning_rate": 0.0001647795025682649,
+      "loss": 0.7504,
+      "step": 4025
+    },
+    {
+      "epoch": 0.8978053646641543,
+      "grad_norm": 0.0758206844329834,
+      "learning_rate": 0.00016432254644683516,
+      "loss": 0.7081,
+      "step": 4050
+    },
+    {
+      "epoch": 0.903347373088007,
+      "grad_norm": 0.0940496176481247,
+      "learning_rate": 0.0001638632877666243,
+      "loss": 0.746,
+      "step": 4075
+    },
+    {
+      "epoch": 0.9088893815118599,
+      "grad_norm": 0.06626766920089722,
+      "learning_rate": 0.00016340174296774804,
+      "loss": 0.6647,
+      "step": 4100
+    },
+    {
+      "epoch": 0.9144313899357127,
+      "grad_norm": 0.08919317275285721,
+      "learning_rate": 0.00016293792857215844,
+      "loss": 0.7516,
+      "step": 4125
+    },
+    {
+      "epoch": 0.9199733983595655,
+      "grad_norm": 0.06990760564804077,
+      "learning_rate": 0.00016247186118305252,
+      "loss": 0.7011,
+      "step": 4150
+    },
+    {
+      "epoch": 0.9255154067834183,
+      "grad_norm": 0.0870794802904129,
+      "learning_rate": 0.00016200355748427782,
+      "loss": 0.7529,
+      "step": 4175
+    },
+    {
+      "epoch": 0.9310574152072711,
+      "grad_norm": 0.06882854551076889,
+      "learning_rate": 0.00016153303423973526,
+      "loss": 0.7005,
+      "step": 4200
+    },
+    {
+      "epoch": 0.9365994236311239,
+      "grad_norm": 0.084992416203022,
+      "learning_rate": 0.0001610603082927789,
+      "loss": 0.7519,
+      "step": 4225
+    },
+    {
+      "epoch": 0.9421414320549767,
+      "grad_norm": 0.0638299211859703,
+      "learning_rate": 0.00016058539656561323,
+      "loss": 0.716,
+      "step": 4250
+    },
+    {
+      "epoch": 0.9476834404788296,
+      "grad_norm": 0.08899606764316559,
+      "learning_rate": 0.00016010831605868715,
+      "loss": 0.7257,
+      "step": 4275
+    },
+    {
+      "epoch": 0.9532254489026823,
+      "grad_norm": 0.06550378352403641,
+      "learning_rate": 0.00015962908385008565,
+      "loss": 0.7174,
+      "step": 4300
+    },
+    {
+      "epoch": 0.9587674573265351,
+      "grad_norm": 0.09001540392637253,
+      "learning_rate": 0.00015914771709491828,
+      "loss": 0.7271,
+      "step": 4325
+    },
+    {
+      "epoch": 0.9643094657503879,
+      "grad_norm": 0.06641615182161331,
+      "learning_rate": 0.000158664233024705,
+      "loss": 0.69,
+      "step": 4350
+    },
+    {
+      "epoch": 0.9698514741742408,
+      "grad_norm": 0.08917039632797241,
+      "learning_rate": 0.0001581786489467596,
+      "loss": 0.7483,
+      "step": 4375
+    },
+    {
+      "epoch": 0.9753934825980936,
+      "grad_norm": 0.05995697155594826,
+      "learning_rate": 0.00015769098224356992,
+      "loss": 0.7033,
+      "step": 4400
+    },
+    {
+      "epoch": 0.9809354910219463,
+      "grad_norm": 0.08998765051364899,
+      "learning_rate": 0.00015720125037217572,
+      "loss": 0.7462,
+      "step": 4425
+    },
+    {
+      "epoch": 0.9864774994457992,
+      "grad_norm": 0.05868702754378319,
+      "learning_rate": 0.00015670947086354376,
+      "loss": 0.6654,
+      "step": 4450
+    },
+    {
+      "epoch": 0.992019507869652,
+      "grad_norm": 0.0880926102399826,
+      "learning_rate": 0.00015621566132194005,
+      "loss": 0.752,
+      "step": 4475
+    },
+    {
+      "epoch": 0.9975615162935048,
+      "grad_norm": 0.08538970351219177,
+      "learning_rate": 0.00015571983942430005,
+      "loss": 0.7338,
+      "step": 4500
+    },
+    {
+      "epoch": 1.0031035247173576,
+      "grad_norm": 0.0827050730586052,
+      "learning_rate": 0.0001552220229195956,
+      "loss": 0.7174,
+      "step": 4525
+    },
+    {
+      "epoch": 1.0086455331412103,
+      "grad_norm": 0.10867294669151306,
+      "learning_rate": 0.00015472222962819955,
+      "loss": 0.7637,
+      "step": 4550
+    },
+    {
+      "epoch": 1.0141875415650632,
+      "grad_norm": 0.08738269656896591,
+      "learning_rate": 0.00015422047744124802,
+      "loss": 0.6247,
+      "step": 4575
+    },
+    {
+      "epoch": 1.019729549988916,
+      "grad_norm": 0.12865987420082092,
+      "learning_rate": 0.0001537167843199998,
+      "loss": 0.7424,
+      "step": 4600
+    },
+    {
+      "epoch": 1.0252715584127687,
+      "grad_norm": 0.08619695156812668,
+      "learning_rate": 0.00015321116829519345,
+      "loss": 0.6461,
+      "step": 4625
+    },
+    {
+      "epoch": 1.0308135668366216,
+      "grad_norm": 0.11726492643356323,
+      "learning_rate": 0.0001527036474664019,
+      "loss": 0.7433,
+      "step": 4650
+    },
+    {
+      "epoch": 1.0363555752604743,
+      "grad_norm": 0.08198727667331696,
+      "learning_rate": 0.0001521942400013844,
+      "loss": 0.6086,
+      "step": 4675
+    },
+    {
+      "epoch": 1.0418975836843272,
+      "grad_norm": 0.11951526254415512,
+      "learning_rate": 0.00015168296413543635,
+      "loss": 0.7521,
+      "step": 4700
+    },
+    {
+      "epoch": 1.04743959210818,
+      "grad_norm": 0.08714735507965088,
+      "learning_rate": 0.0001511698381707363,
+      "loss": 0.631,
+      "step": 4725
+    },
+    {
+      "epoch": 1.0529816005320327,
+      "grad_norm": 0.13869455456733704,
+      "learning_rate": 0.00015065488047569107,
+      "loss": 0.7524,
+      "step": 4750
+    },
+    {
+      "epoch": 1.0585236089558856,
+      "grad_norm": 0.08524268865585327,
+      "learning_rate": 0.00015013810948427794,
+      "loss": 0.6617,
+      "step": 4775
+    },
+    {
+      "epoch": 1.0640656173797385,
+      "grad_norm": 0.11017199605703354,
+      "learning_rate": 0.00014961954369538494,
+      "loss": 0.7598,
+      "step": 4800
+    },
+    {
+      "epoch": 1.0696076258035911,
+      "grad_norm": 0.0834374874830246,
+      "learning_rate": 0.00014909920167214858,
+      "loss": 0.627,
+      "step": 4825
+    },
+    {
+      "epoch": 1.075149634227444,
+      "grad_norm": 0.1357167363166809,
+      "learning_rate": 0.0001485771020412894,
+      "loss": 0.7466,
+      "step": 4850
+    },
+    {
+      "epoch": 1.080691642651297,
+      "grad_norm": 0.08910629153251648,
+      "learning_rate": 0.00014805326349244503,
+      "loss": 0.6238,
+      "step": 4875
+    },
+    {
+      "epoch": 1.0862336510751496,
+      "grad_norm": 0.10706546157598495,
+      "learning_rate": 0.00014752770477750144,
+      "loss": 0.7533,
+      "step": 4900
+    },
+    {
+      "epoch": 1.0917756594990025,
+      "grad_norm": 0.09201759845018387,
+      "learning_rate": 0.00014700044470992136,
+      "loss": 0.6521,
+      "step": 4925
+    },
+    {
+      "epoch": 1.0973176679228553,
+      "grad_norm": 0.14048361778259277,
+      "learning_rate": 0.00014647150216407106,
+      "loss": 0.7412,
+      "step": 4950
+    },
+    {
+      "epoch": 1.102859676346708,
+      "grad_norm": 0.08308299630880356,
+      "learning_rate": 0.00014594089607454454,
+      "loss": 0.6333,
+      "step": 4975
+    },
+    {
+      "epoch": 1.108401684770561,
+      "grad_norm": 0.12057497352361679,
+      "learning_rate": 0.00014540864543548582,
+      "loss": 0.7538,
+      "step": 5000
+    },
+    {
+      "epoch": 1.1139436931944136,
+      "grad_norm": 0.089565709233284,
+      "learning_rate": 0.00014487476929990898,
+      "loss": 0.6662,
+      "step": 5025
+    },
+    {
+      "epoch": 1.1194857016182664,
+      "grad_norm": 0.12125346809625626,
+      "learning_rate": 0.00014433928677901612,
+      "loss": 0.7653,
+      "step": 5050
+    },
+    {
+      "epoch": 1.1250277100421193,
+      "grad_norm": 0.08421044796705246,
+      "learning_rate": 0.00014380221704151318,
+      "loss": 0.615,
+      "step": 5075
+    },
+    {
+      "epoch": 1.130569718465972,
+      "grad_norm": 0.12215881794691086,
+      "learning_rate": 0.0001432635793129239,
+      "loss": 0.7482,
+      "step": 5100
+    },
+    {
+      "epoch": 1.1361117268898249,
+      "grad_norm": 0.08646813780069351,
+      "learning_rate": 0.0001427233928749014,
+      "loss": 0.6292,
+      "step": 5125
+    },
+    {
+      "epoch": 1.1416537353136778,
+      "grad_norm": 0.11372750997543335,
+      "learning_rate": 0.00014218167706453816,
+      "loss": 0.7487,
+      "step": 5150
+    },
+    {
+      "epoch": 1.1471957437375304,
+      "grad_norm": 0.08925063908100128,
+      "learning_rate": 0.00014163845127367362,
+      "loss": 0.6336,
+      "step": 5175
+    },
+    {
+      "epoch": 1.1527377521613833,
+      "grad_norm": 0.12316026538610458,
+      "learning_rate": 0.00014109373494820018,
+      "loss": 0.7566,
+      "step": 5200
+    },
+    {
+      "epoch": 1.1582797605852362,
+      "grad_norm": 0.08784055709838867,
+      "learning_rate": 0.00014054754758736698,
+      "loss": 0.6124,
+      "step": 5225
+    },
+    {
+      "epoch": 1.1638217690090888,
+      "grad_norm": 0.11267346143722534,
+      "learning_rate": 0.0001399999087430819,
+      "loss": 0.7611,
+      "step": 5250
+    },
+    {
+      "epoch": 1.1693637774329417,
+      "grad_norm": 0.08636374026536942,
+      "learning_rate": 0.00013945083801921167,
+      "loss": 0.6561,
+      "step": 5275
+    },
+    {
+      "epoch": 1.1749057858567946,
+      "grad_norm": 0.13902850449085236,
+      "learning_rate": 0.0001389003550708802,
+      "loss": 0.7546,
+      "step": 5300
+    },
+    {
+      "epoch": 1.1804477942806473,
+      "grad_norm": 0.09029743075370789,
+      "learning_rate": 0.0001383484796037648,
+      "loss": 0.6226,
+      "step": 5325
+    },
+    {
+      "epoch": 1.1859898027045002,
+      "grad_norm": 0.10377778112888336,
+      "learning_rate": 0.00013779523137339095,
+      "loss": 0.7422,
+      "step": 5350
+    },
+    {
+      "epoch": 1.1915318111283528,
+      "grad_norm": 0.08857985585927963,
+      "learning_rate": 0.00013724063018442494,
+      "loss": 0.6346,
+      "step": 5375
+    },
+    {
+      "epoch": 1.1970738195522057,
+      "grad_norm": 0.15107670426368713,
+      "learning_rate": 0.0001366846958899651,
+      "loss": 0.7266,
+      "step": 5400
+    },
+    {
+      "epoch": 1.2026158279760586,
+      "grad_norm": 0.09053236246109009,
+      "learning_rate": 0.000136127448390831,
+      "loss": 0.617,
+      "step": 5425
+    },
+    {
+      "epoch": 1.2081578363999113,
+      "grad_norm": 0.11061497032642365,
+      "learning_rate": 0.00013556890763485112,
+      "loss": 0.7631,
+      "step": 5450
+    },
+    {
+      "epoch": 1.2136998448237641,
+      "grad_norm": 0.08849512785673141,
+      "learning_rate": 0.0001350090936161487,
+      "loss": 0.5992,
+      "step": 5475
+    },
+    {
+      "epoch": 1.219241853247617,
+      "grad_norm": 0.12802088260650635,
+      "learning_rate": 0.00013444802637442606,
+      "loss": 0.7389,
+      "step": 5500
+    },
+    {
+      "epoch": 1.2247838616714697,
+      "grad_norm": 0.08612997084856033,
+      "learning_rate": 0.0001338857259942473,
+      "loss": 0.6173,
+      "step": 5525
+    },
+    {
+      "epoch": 1.2303258700953226,
+      "grad_norm": 0.1339733898639679,
+      "learning_rate": 0.0001333222126043192,
+      "loss": 0.7473,
+      "step": 5550
+    },
+    {
+      "epoch": 1.2358678785191755,
+      "grad_norm": 0.08974476903676987,
+      "learning_rate": 0.00013275750637677073,
+      "loss": 0.6224,
+      "step": 5575
+    },
+    {
+      "epoch": 1.2414098869430281,
+      "grad_norm": 0.1431790143251419,
+      "learning_rate": 0.00013219162752643103,
+      "loss": 0.748,
+      "step": 5600
+    },
+    {
+      "epoch": 1.246951895366881,
+      "grad_norm": 0.08545912057161331,
+      "learning_rate": 0.0001316245963101056,
+      "loss": 0.5969,
+      "step": 5625
+    },
+    {
+      "epoch": 1.2524939037907337,
+      "grad_norm": 0.13927248120307922,
+      "learning_rate": 0.00013105643302585137,
+      "loss": 0.752,
+      "step": 5650
+    },
+    {
+      "epoch": 1.2580359122145865,
+      "grad_norm": 0.08908078819513321,
+      "learning_rate": 0.0001304871580122499,
+      "loss": 0.6422,
+      "step": 5675
+    },
+    {
+      "epoch": 1.2635779206384394,
+      "grad_norm": 0.13006670773029327,
+      "learning_rate": 0.00012991679164767942,
+      "loss": 0.7594,
+      "step": 5700
+    },
+    {
+      "epoch": 1.269119929062292,
+      "grad_norm": 0.08754415810108185,
+      "learning_rate": 0.00012934535434958538,
+      "loss": 0.6121,
+      "step": 5725
+    },
+    {
+      "epoch": 1.274661937486145,
+      "grad_norm": 0.15051035583019257,
+      "learning_rate": 0.0001287728665737495,
+      "loss": 0.7414,
+      "step": 5750
+    },
+    {
+      "epoch": 1.2802039459099979,
+      "grad_norm": 0.08734755963087082,
+      "learning_rate": 0.00012819934881355745,
+      "loss": 0.6368,
+      "step": 5775
+    },
+    {
+      "epoch": 1.2857459543338505,
+      "grad_norm": 0.17122209072113037,
+      "learning_rate": 0.0001276248215992654,
+      "loss": 0.7535,
+      "step": 5800
+    },
+    {
+      "epoch": 1.2912879627577034,
+      "grad_norm": 0.08958180993795395,
+      "learning_rate": 0.00012704930549726503,
+      "loss": 0.5941,
+      "step": 5825
+    },
+    {
+      "epoch": 1.2968299711815563,
+      "grad_norm": 0.10746924579143524,
+      "learning_rate": 0.0001264728211093473,
+      "loss": 0.7498,
+      "step": 5850
+    },
+    {
+      "epoch": 1.302371979605409,
+      "grad_norm": 0.08696026355028152,
+      "learning_rate": 0.00012589538907196486,
+      "loss": 0.6451,
+      "step": 5875
+    },
+    {
+      "epoch": 1.3079139880292618,
+      "grad_norm": 0.14526303112506866,
+      "learning_rate": 0.0001253170300554936,
+      "loss": 0.7427,
+      "step": 5900
+    },
+    {
+      "epoch": 1.3134559964531145,
+      "grad_norm": 0.08736992627382278,
+      "learning_rate": 0.0001247377647634924,
+      "loss": 0.632,
+      "step": 5925
+    },
+    {
+      "epoch": 1.3189980048769674,
+      "grad_norm": 0.13362443447113037,
+      "learning_rate": 0.00012415761393196227,
+      "loss": 0.7394,
+      "step": 5950
+    },
+    {
+      "epoch": 1.3245400133008203,
+      "grad_norm": 0.09453903138637543,
+      "learning_rate": 0.00012357659832860386,
+      "loss": 0.6394,
+      "step": 5975
+    },
+    {
+      "epoch": 1.3300820217246732,
+      "grad_norm": 0.1354217529296875,
+      "learning_rate": 0.00012299473875207416,
+      "loss": 0.727,
+      "step": 6000
+    },
+    {
+      "epoch": 1.3356240301485258,
+      "grad_norm": 0.09107760339975357,
+      "learning_rate": 0.0001224120560312419,
+      "loss": 0.6197,
+      "step": 6025
+    },
+    {
+      "epoch": 1.3411660385723787,
+      "grad_norm": 0.12501803040504456,
+      "learning_rate": 0.00012182857102444203,
+      "loss": 0.7475,
+      "step": 6050
+    },
+    {
+      "epoch": 1.3467080469962314,
+      "grad_norm": 0.08888363093137741,
+      "learning_rate": 0.00012124430461872886,
+      "loss": 0.6108,
+      "step": 6075
+    },
+    {
+      "epoch": 1.3522500554200843,
+      "grad_norm": 0.16767624020576477,
+      "learning_rate": 0.00012065927772912863,
+      "loss": 0.7408,
+      "step": 6100
+    },
+    {
+      "epoch": 1.3577920638439371,
+      "grad_norm": 0.09112541377544403,
+      "learning_rate": 0.00012007351129789062,
+      "loss": 0.5868,
+      "step": 6125
+    },
+    {
+      "epoch": 1.3633340722677898,
+      "grad_norm": 0.13539327681064606,
+      "learning_rate": 0.0001194870262937375,
+      "loss": 0.7505,
+      "step": 6150
+    },
+    {
+      "epoch": 1.3688760806916427,
+      "grad_norm": 0.08977732807397842,
+      "learning_rate": 0.00011889984371111475,
+      "loss": 0.5985,
+      "step": 6175
+    },
+    {
+      "epoch": 1.3744180891154953,
+      "grad_norm": 0.17703984677791595,
+      "learning_rate": 0.00011831198456943924,
+      "loss": 0.7334,
+      "step": 6200
+    },
+    {
+      "epoch": 1.3799600975393482,
+      "grad_norm": 0.09067991375923157,
+      "learning_rate": 0.00011772346991234651,
+      "loss": 0.5874,
+      "step": 6225
+    },
+    {
+      "epoch": 1.3855021059632011,
+      "grad_norm": 0.10922655463218689,
+      "learning_rate": 0.00011713432080693772,
+      "loss": 0.746,
+      "step": 6250
+    },
+    {
+      "epoch": 1.391044114387054,
+      "grad_norm": 0.08802726864814758,
+      "learning_rate": 0.00011654455834302535,
+      "loss": 0.6084,
+      "step": 6275
+    },
+    {
+      "epoch": 1.3965861228109067,
+      "grad_norm": 0.11013362556695938,
+      "learning_rate": 0.00011595420363237844,
+      "loss": 0.7431,
+      "step": 6300
+    },
+    {
+      "epoch": 1.4021281312347595,
+      "grad_norm": 0.09353320300579071,
+      "learning_rate": 0.00011536327780796661,
+      "loss": 0.6504,
+      "step": 6325
+    },
+    {
+      "epoch": 1.4076701396586122,
+      "grad_norm": 0.160513773560524,
+      "learning_rate": 0.00011477180202320377,
+      "loss": 0.7451,
+      "step": 6350
+    },
+    {
+      "epoch": 1.413212148082465,
+      "grad_norm": 0.09337064623832703,
+      "learning_rate": 0.0001141797974511907,
+      "loss": 0.6435,
+      "step": 6375
+    },
+    {
+      "epoch": 1.418754156506318,
+      "grad_norm": 0.12163395434617996,
+      "learning_rate": 0.00011358728528395733,
+      "loss": 0.7313,
+      "step": 6400
+    },
+    {
+      "epoch": 1.4242961649301706,
+      "grad_norm": 0.08646170049905777,
+      "learning_rate": 0.00011299428673170389,
+      "loss": 0.6327,
+      "step": 6425
+    },
+    {
+      "epoch": 1.4298381733540235,
+      "grad_norm": 0.13511331379413605,
+      "learning_rate": 0.00011240082302204194,
+      "loss": 0.7324,
+      "step": 6450
+    },
+    {
+      "epoch": 1.4353801817778762,
+      "grad_norm": 0.08811099082231522,
+      "learning_rate": 0.00011180691539923407,
+      "loss": 0.6322,
+      "step": 6475
+    },
+    {
+      "epoch": 1.440922190201729,
+      "grad_norm": 0.12366902828216553,
+      "learning_rate": 0.00011121258512343391,
+      "loss": 0.7303,
+      "step": 6500
+    },
+    {
+      "epoch": 1.446464198625582,
+      "grad_norm": 0.09673753380775452,
+      "learning_rate": 0.00011061785346992463,
+      "loss": 0.6368,
+      "step": 6525
+    },
+    {
+      "epoch": 1.4520062070494348,
+      "grad_norm": 0.12419258803129196,
+      "learning_rate": 0.00011002274172835771,
+      "loss": 0.7431,
+      "step": 6550
+    },
+    {
+      "epoch": 1.4575482154732875,
+      "grad_norm": 0.09401362389326096,
+      "learning_rate": 0.00010942727120199052,
+      "loss": 0.6079,
+      "step": 6575
+    },
+    {
+      "epoch": 1.4630902238971404,
+      "grad_norm": 0.10955937206745148,
+      "learning_rate": 0.000108831463206924,
+      "loss": 0.7296,
+      "step": 6600
+    },
+    {
+      "epoch": 1.468632232320993,
+      "grad_norm": 0.09884931892156601,
+      "learning_rate": 0.00010823533907133943,
+      "loss": 0.6373,
+      "step": 6625
+    },
+    {
+      "epoch": 1.474174240744846,
+      "grad_norm": 0.14859217405319214,
+      "learning_rate": 0.00010763892013473495,
+      "loss": 0.7199,
+      "step": 6650
+    },
+    {
+      "epoch": 1.4797162491686988,
+      "grad_norm": 0.0963672623038292,
+      "learning_rate": 0.00010704222774716177,
+      "loss": 0.6156,
+      "step": 6675
+    },
+    {
+      "epoch": 1.4852582575925515,
+      "grad_norm": 0.14681296050548553,
+      "learning_rate": 0.00010644528326845988,
+      "loss": 0.74,
+      "step": 6700
+    },
+    {
+      "epoch": 1.4908002660164044,
+      "grad_norm": 0.09270428866147995,
+      "learning_rate": 0.00010584810806749327,
+      "loss": 0.6185,
+      "step": 6725
+    },
+    {
+      "epoch": 1.496342274440257,
+      "grad_norm": 0.11598405987024307,
+      "learning_rate": 0.00010525072352138526,
+      "loss": 0.7463,
+      "step": 6750
+    },
+    {
+      "epoch": 1.50188428286411,
+      "grad_norm": 0.09355468302965164,
+      "learning_rate": 0.00010465315101475295,
+      "loss": 0.5996,
+      "step": 6775
+    },
+    {
+      "epoch": 1.5074262912879628,
+      "grad_norm": 0.15108546614646912,
+      "learning_rate": 0.00010405541193894204,
+      "loss": 0.7512,
+      "step": 6800
+    },
+    {
+      "epoch": 1.5129682997118157,
+      "grad_norm": 0.08825406432151794,
+      "learning_rate": 0.00010345752769126079,
+      "loss": 0.6367,
+      "step": 6825
+    },
+    {
+      "epoch": 1.5185103081356683,
+      "grad_norm": 0.11757966130971909,
+      "learning_rate": 0.0001028595196742143,
+      "loss": 0.7556,
+      "step": 6850
+    },
+    {
+      "epoch": 1.5240523165595212,
+      "grad_norm": 0.08973203599452972,
+      "learning_rate": 0.00010226140929473813,
+      "loss": 0.6038,
+      "step": 6875
+    },
+    {
+      "epoch": 1.5295943249833739,
+      "grad_norm": 0.1337171196937561,
+      "learning_rate": 0.00010166321796343223,
+      "loss": 0.7388,
+      "step": 6900
+    },
+    {
+      "epoch": 1.5351363334072268,
+      "grad_norm": 0.08748678117990494,
+      "learning_rate": 0.0001010649670937943,
+      "loss": 0.6093,
+      "step": 6925
+    },
+    {
+      "epoch": 1.5406783418310797,
+      "grad_norm": 0.1228007897734642,
+      "learning_rate": 0.00010046667810145338,
+      "loss": 0.7494,
+      "step": 6950
+    },
+    {
+      "epoch": 1.5462203502549325,
+      "grad_norm": 0.09511099755764008,
+      "learning_rate": 9.986837240340319e-05,
+      "loss": 0.5998,
+      "step": 6975
+    },
+    {
+      "epoch": 1.5517623586787852,
+      "grad_norm": 0.12583385407924652,
+      "learning_rate": 9.927007141723548e-05,
+      "loss": 0.7266,
+      "step": 7000
+    },
+    {
+      "epoch": 1.5573043671026379,
+      "grad_norm": 0.08915423601865768,
+      "learning_rate": 9.867179656037326e-05,
+      "loss": 0.638,
+      "step": 7025
+    },
+    {
+      "epoch": 1.5628463755264907,
+      "grad_norm": 0.1239473968744278,
+      "learning_rate": 9.80735692493043e-05,
+      "loss": 0.7473,
+      "step": 7050
+    },
+    {
+      "epoch": 1.5683883839503436,
+      "grad_norm": 0.09568199515342712,
+      "learning_rate": 9.747541089881424e-05,
+      "loss": 0.6174,
+      "step": 7075
+    },
+    {
+      "epoch": 1.5739303923741965,
+      "grad_norm": 0.13295117020606995,
+      "learning_rate": 9.687734292122024e-05,
+      "loss": 0.7278,
+      "step": 7100
+    },
+    {
+      "epoch": 1.5794724007980492,
+      "grad_norm": 0.09703335911035538,
+      "learning_rate": 9.627938672560432e-05,
+      "loss": 0.6051,
+      "step": 7125
+    },
+    {
+      "epoch": 1.585014409221902,
+      "grad_norm": 0.12026989459991455,
+      "learning_rate": 9.568156371704705e-05,
+      "loss": 0.7341,
+      "step": 7150
+    },
+    {
+      "epoch": 1.5905564176457547,
+      "grad_norm": 0.09185943752527237,
+      "learning_rate": 9.508389529586128e-05,
+      "loss": 0.607,
+      "step": 7175
+    },
+    {
+      "epoch": 1.5960984260696076,
+      "grad_norm": 0.16115328669548035,
+      "learning_rate": 9.448640285682613e-05,
+      "loss": 0.7321,
+      "step": 7200
+    },
+    {
+      "epoch": 1.6016404344934605,
+      "grad_norm": 0.09262697398662567,
+      "learning_rate": 9.388910778842103e-05,
+      "loss": 0.6027,
+      "step": 7225
+    },
+    {
+      "epoch": 1.6071824429173134,
+      "grad_norm": 0.17859314382076263,
+      "learning_rate": 9.329203147206007e-05,
+      "loss": 0.7498,
+      "step": 7250
+    },
+    {
+      "epoch": 1.612724451341166,
+      "grad_norm": 0.09392908960580826,
+      "learning_rate": 9.269519528132677e-05,
+      "loss": 0.6035,
+      "step": 7275
+    },
+    {
+      "epoch": 1.6182664597650187,
+      "grad_norm": 0.1226978749036789,
+      "learning_rate": 9.209862058120879e-05,
+      "loss": 0.753,
+      "step": 7300
+    },
+    {
+      "epoch": 1.6238084681888716,
+      "grad_norm": 0.09812294691801071,
+      "learning_rate": 9.15023287273332e-05,
+      "loss": 0.631,
+      "step": 7325
+    },
+    {
+      "epoch": 1.6293504766127245,
+      "grad_norm": 0.14241814613342285,
+      "learning_rate": 9.0906341065202e-05,
+      "loss": 0.7291,
+      "step": 7350
+    },
+    {
+      "epoch": 1.6348924850365774,
+      "grad_norm": 0.09261428564786911,
+      "learning_rate": 9.031067892942805e-05,
+      "loss": 0.6204,
+      "step": 7375
+    },
+    {
+      "epoch": 1.64043449346043,
+      "grad_norm": 0.1319173127412796,
+      "learning_rate": 8.971536364297126e-05,
+      "loss": 0.7422,
+      "step": 7400
+    },
+    {
+      "epoch": 1.645976501884283,
+      "grad_norm": 0.09849465638399124,
+      "learning_rate": 8.912041651637541e-05,
+      "loss": 0.6212,
+      "step": 7425
+    },
+    {
+      "epoch": 1.6515185103081356,
+      "grad_norm": 0.1365344524383545,
+      "learning_rate": 8.852585884700519e-05,
+      "loss": 0.735,
+      "step": 7450
+    },
+    {
+      "epoch": 1.6570605187319885,
+      "grad_norm": 0.0948396623134613,
+      "learning_rate": 8.79317119182839e-05,
+      "loss": 0.6113,
+      "step": 7475
+    },
+    {
+      "epoch": 1.6626025271558413,
+      "grad_norm": 0.14469577372074127,
+      "learning_rate": 8.73379969989315e-05,
+      "loss": 0.7577,
+      "step": 7500
+    },
+    {
+      "epoch": 1.6681445355796942,
+      "grad_norm": 0.08975006639957428,
+      "learning_rate": 8.674473534220326e-05,
+      "loss": 0.6123,
+      "step": 7525
+    },
+    {
+      "epoch": 1.6736865440035469,
+      "grad_norm": 0.18550659716129303,
+      "learning_rate": 8.615194818512905e-05,
+      "loss": 0.7173,
+      "step": 7550
+    },
+    {
+      "epoch": 1.6792285524273995,
+      "grad_norm": 0.09581893682479858,
+      "learning_rate": 8.555965674775295e-05,
+      "loss": 0.6052,
+      "step": 7575
+    },
+    {
+      "epoch": 1.6847705608512524,
+      "grad_norm": 0.15174435079097748,
+      "learning_rate": 8.496788223237381e-05,
+      "loss": 0.741,
+      "step": 7600
+    },
+    {
+      "epoch": 1.6903125692751053,
+      "grad_norm": 0.09482391923666,
+      "learning_rate": 8.43766458227861e-05,
+      "loss": 0.6281,
+      "step": 7625
+    },
+    {
+      "epoch": 1.6958545776989582,
+      "grad_norm": 0.12190216034650803,
+      "learning_rate": 8.37859686835218e-05,
+      "loss": 0.7489,
+      "step": 7650
+    },
+    {
+      "epoch": 1.7013965861228109,
+      "grad_norm": 0.09474999457597733,
+      "learning_rate": 8.319587195909251e-05,
+      "loss": 0.6238,
+      "step": 7675
+    },
+    {
+      "epoch": 1.7069385945466637,
+      "grad_norm": 0.13137421011924744,
+      "learning_rate": 8.260637677323279e-05,
+      "loss": 0.7256,
+      "step": 7700
+    },
+    {
+      "epoch": 1.7124806029705164,
+      "grad_norm": 0.09658095240592957,
+      "learning_rate": 8.201750422814379e-05,
+      "loss": 0.6038,
+      "step": 7725
+    },
+    {
+      "epoch": 1.7180226113943693,
+      "grad_norm": 0.15730302035808563,
+      "learning_rate": 8.142927540373805e-05,
+      "loss": 0.7255,
+      "step": 7750
+    },
+    {
+      "epoch": 1.7235646198182222,
+      "grad_norm": 0.09805120527744293,
+      "learning_rate": 8.084171135688467e-05,
+      "loss": 0.6136,
+      "step": 7775
+    },
+    {
+      "epoch": 1.729106628242075,
+      "grad_norm": 0.14167381823062897,
+      "learning_rate": 8.02548331206558e-05,
+      "loss": 0.7246,
+      "step": 7800
+    },
+    {
+      "epoch": 1.7346486366659277,
+      "grad_norm": 0.09563518315553665,
+      "learning_rate": 7.966866170357346e-05,
+      "loss": 0.6199,
+      "step": 7825
+    },
+    {
+      "epoch": 1.7401906450897804,
+      "grad_norm": 0.11947919428348541,
+      "learning_rate": 7.908321808885766e-05,
+      "loss": 0.7284,
+      "step": 7850
+    },
+    {
+      "epoch": 1.7457326535136333,
+      "grad_norm": 0.09702473878860474,
+      "learning_rate": 7.849852323367521e-05,
+      "loss": 0.6343,
+      "step": 7875
+    },
+    {
+      "epoch": 1.7512746619374862,
+      "grad_norm": 0.16644233465194702,
+      "learning_rate": 7.791459806838957e-05,
+      "loss": 0.7295,
+      "step": 7900
+    },
+    {
+      "epoch": 1.756816670361339,
+      "grad_norm": 0.09687721729278564,
+      "learning_rate": 7.733146349581144e-05,
+      "loss": 0.6232,
+      "step": 7925
+    },
+    {
+      "epoch": 1.7623586787851917,
+      "grad_norm": 0.1432383805513382,
+      "learning_rate": 7.674914039045076e-05,
+      "loss": 0.7351,
+      "step": 7950
+    },
+    {
+      "epoch": 1.7679006872090446,
+      "grad_norm": 0.09745761752128601,
+      "learning_rate": 7.61676495977692e-05,
+      "loss": 0.5918,
+      "step": 7975
+    },
+    {
+      "epoch": 1.7734426956328972,
+      "grad_norm": 0.1507551074028015,
+      "learning_rate": 7.558701193343419e-05,
+      "loss": 0.7384,
+      "step": 8000
+    },
+    {
+      "epoch": 1.7789847040567501,
+      "grad_norm": 0.09434136003255844,
+      "learning_rate": 7.500724818257351e-05,
+      "loss": 0.5987,
+      "step": 8025
+    },
+    {
+      "epoch": 1.784526712480603,
+      "grad_norm": 0.1321529746055603,
+      "learning_rate": 7.442837909903156e-05,
+      "loss": 0.7409,
+      "step": 8050
+    },
+    {
+      "epoch": 1.790068720904456,
+      "grad_norm": 0.09421923011541367,
+      "learning_rate": 7.385042540462615e-05,
+      "loss": 0.6042,
+      "step": 8075
+    },
+    {
+      "epoch": 1.7956107293283086,
+      "grad_norm": 0.1154310330748558,
+      "learning_rate": 7.32734077884069e-05,
+      "loss": 0.7319,
+      "step": 8100
+    },
+    {
+      "epoch": 1.8011527377521612,
+      "grad_norm": 0.09810709953308105,
+      "learning_rate": 7.272037071314008e-05,
+      "loss": 0.6196,
+      "step": 8125
+    },
+    {
+      "epoch": 1.806694746176014,
+      "grad_norm": 0.12382727861404419,
+      "learning_rate": 7.2145247695974e-05,
+      "loss": 0.7366,
+      "step": 8150
+    },
+    {
+      "epoch": 1.812236754599867,
+      "grad_norm": 0.10054343193769455,
+      "learning_rate": 7.157112179736207e-05,
+      "loss": 0.638,
+      "step": 8175
+    },
+    {
+      "epoch": 1.8177787630237199,
+      "grad_norm": 0.12496750056743622,
+      "learning_rate": 7.099801356933004e-05,
+      "loss": 0.7435,
+      "step": 8200
+    },
+    {
+      "epoch": 1.8233207714475728,
+      "grad_norm": 0.09787677973508835,
+      "learning_rate": 7.0425943527474e-05,
+      "loss": 0.5983,
+      "step": 8225
+    },
+    {
+      "epoch": 1.8288627798714254,
+      "grad_norm": 0.16083738207817078,
+      "learning_rate": 6.985493215022605e-05,
+      "loss": 0.719,
+      "step": 8250
+    },
+    {
+      "epoch": 1.834404788295278,
+      "grad_norm": 0.09314879029989243,
+      "learning_rate": 6.928499987812112e-05,
+      "loss": 0.6156,
+      "step": 8275
+    },
+    {
+      "epoch": 1.839946796719131,
+      "grad_norm": 0.1240190863609314,
+      "learning_rate": 6.871616711306545e-05,
+      "loss": 0.7312,
+      "step": 8300
+    },
+    {
+      "epoch": 1.8454888051429839,
+      "grad_norm": 0.10087582468986511,
+      "learning_rate": 6.814845421760602e-05,
+      "loss": 0.5953,
+      "step": 8325
+    },
+    {
+      "epoch": 1.8510308135668367,
+      "grad_norm": 0.15391846001148224,
+      "learning_rate": 6.758188151420189e-05,
+      "loss": 0.7372,
+      "step": 8350
+    },
+    {
+      "epoch": 1.8565728219906894,
+      "grad_norm": 0.09913575649261475,
+      "learning_rate": 6.701646928449646e-05,
+      "loss": 0.5859,
+      "step": 8375
+    },
+    {
+      "epoch": 1.862114830414542,
+      "grad_norm": 0.1197889968752861,
+      "learning_rate": 6.645223776859166e-05,
+      "loss": 0.7416,
+      "step": 8400
+    },
+    {
+      "epoch": 1.867656838838395,
+      "grad_norm": 0.09789609163999557,
+      "learning_rate": 6.588920716432329e-05,
+      "loss": 0.6422,
+      "step": 8425
+    },
+    {
+      "epoch": 1.8731988472622478,
+      "grad_norm": 0.134750634431839,
+      "learning_rate": 6.532739762653804e-05,
+      "loss": 0.7462,
+      "step": 8450
+    },
+    {
+      "epoch": 1.8787408556861007,
+      "grad_norm": 0.09457212686538696,
+      "learning_rate": 6.476682926637197e-05,
+      "loss": 0.6161,
+      "step": 8475
+    },
+    {
+      "epoch": 1.8842828641099536,
+      "grad_norm": 0.18783515691757202,
+      "learning_rate": 6.420752215053065e-05,
+      "loss": 0.7236,
+      "step": 8500
+    },
+    {
+      "epoch": 1.8898248725338063,
+      "grad_norm": 0.09566831588745117,
+      "learning_rate": 6.364949630057078e-05,
+      "loss": 0.5882,
+      "step": 8525
+    },
+    {
+      "epoch": 1.895366880957659,
+      "grad_norm": 0.12137682735919952,
+      "learning_rate": 6.30927716921835e-05,
+      "loss": 0.7284,
+      "step": 8550
+    },
+    {
+      "epoch": 1.9009088893815118,
+      "grad_norm": 0.09804505854845047,
+      "learning_rate": 6.25373682544793e-05,
+      "loss": 0.6165,
+      "step": 8575
+    },
+    {
+      "epoch": 1.9064508978053647,
+      "grad_norm": 0.20505575835704803,
+      "learning_rate": 6.198330586927463e-05,
+      "loss": 0.7224,
+      "step": 8600
+    },
+    {
+      "epoch": 1.9119929062292176,
+      "grad_norm": 0.10258720070123672,
+      "learning_rate": 6.14306043703802e-05,
+      "loss": 0.5683,
+      "step": 8625
+    },
+    {
+      "epoch": 1.9175349146530702,
+      "grad_norm": 0.14654862880706787,
+      "learning_rate": 6.087928354289103e-05,
+      "loss": 0.7336,
+      "step": 8650
+    },
+    {
+      "epoch": 1.9230769230769231,
+      "grad_norm": 0.09862499684095383,
+      "learning_rate": 6.0329363122478e-05,
+      "loss": 0.5824,
+      "step": 8675
+    },
+    {
+      "epoch": 1.9286189315007758,
+      "grad_norm": 0.135267972946167,
+      "learning_rate": 5.978086279468163e-05,
+      "loss": 0.744,
+      "step": 8700
+    },
+    {
+      "epoch": 1.9341609399246287,
+      "grad_norm": 0.1033608540892601,
+      "learning_rate": 5.923380219420729e-05,
+      "loss": 0.6134,
+      "step": 8725
+    },
+    {
+      "epoch": 1.9397029483484816,
+      "grad_norm": 0.1315843164920807,
+      "learning_rate": 5.8688200904222266e-05,
+      "loss": 0.7151,
+      "step": 8750
+    },
+    {
+      "epoch": 1.9452449567723344,
+      "grad_norm": 0.09951479732990265,
+      "learning_rate": 5.8144078455654846e-05,
+      "loss": 0.622,
+      "step": 8775
+    },
+    {
+      "epoch": 1.950786965196187,
+      "grad_norm": 0.2037511169910431,
+      "learning_rate": 5.760145432649515e-05,
+      "loss": 0.7239,
+      "step": 8800
+    },
+    {
+      "epoch": 1.9563289736200398,
+      "grad_norm": 0.10025127977132797,
+      "learning_rate": 5.706034794109778e-05,
+      "loss": 0.5922,
+      "step": 8825
+    },
+    {
+      "epoch": 1.9618709820438927,
+      "grad_norm": 0.18914011120796204,
+      "learning_rate": 5.65207786694866e-05,
+      "loss": 0.7205,
+      "step": 8850
+    },
+    {
+      "epoch": 1.9674129904677455,
+      "grad_norm": 0.10025196522474289,
+      "learning_rate": 5.5982765826661256e-05,
+      "loss": 0.5814,
+      "step": 8875
+    },
+    {
+      "epoch": 1.9729549988915984,
+      "grad_norm": 0.12357232719659805,
+      "learning_rate": 5.544632867190591e-05,
+      "loss": 0.7217,
+      "step": 8900
+    },
+    {
+      "epoch": 1.978497007315451,
+      "grad_norm": 0.09909965842962265,
+      "learning_rate": 5.491148640809962e-05,
+      "loss": 0.6102,
+      "step": 8925
+    },
+    {
+      "epoch": 1.984039015739304,
+      "grad_norm": 0.16131410002708435,
+      "learning_rate": 5.437825818102902e-05,
+      "loss": 0.7193,
+      "step": 8950
+    },
+    {
+      "epoch": 1.9895810241631566,
+      "grad_norm": 0.10195192694664001,
+      "learning_rate": 5.384666307870293e-05,
+      "loss": 0.5989,
+      "step": 8975
+    },
+    {
+      "epoch": 1.9951230325870095,
+      "grad_norm": 0.1269746869802475,
+      "learning_rate": 5.331672013066922e-05,
+      "loss": 0.7287,
+      "step": 9000
+    },
+    {
+      "epoch": 2.0006650410108624,
+      "grad_norm": 0.09312586486339569,
+      "learning_rate": 5.278844830733332e-05,
+      "loss": 0.6024,
+      "step": 9025
+    },
+    {
+      "epoch": 2.0062070494347153,
+      "grad_norm": 0.10967884957790375,
+      "learning_rate": 5.226186651927938e-05,
+      "loss": 0.7053,
+      "step": 9050
+    },
+    {
+      "epoch": 2.0117490578585677,
+      "grad_norm": 0.09102078527212143,
+      "learning_rate": 5.1736993616593165e-05,
+      "loss": 0.5861,
+      "step": 9075
+    },
+    {
+      "epoch": 2.0172910662824206,
+      "grad_norm": 0.10821503400802612,
+      "learning_rate": 5.121384838818746e-05,
+      "loss": 0.6865,
+      "step": 9100
+    },
+    {
+      "epoch": 2.0228330747062735,
+      "grad_norm": 0.09317923337221146,
+      "learning_rate": 5.0692449561129285e-05,
+      "loss": 0.5912,
+      "step": 9125
+    },
+    {
+      "epoch": 2.0283750831301264,
+      "grad_norm": 0.11409013718366623,
+      "learning_rate": 5.017281579996961e-05,
+      "loss": 0.6979,
+      "step": 9150
+    },
+    {
+      "epoch": 2.0339170915539793,
+      "grad_norm": 0.09311998635530472,
+      "learning_rate": 4.965496570607523e-05,
+      "loss": 0.6235,
+      "step": 9175
+    },
+    {
+      "epoch": 2.039459099977832,
+      "grad_norm": 0.12173454463481903,
+      "learning_rate": 4.913891781696285e-05,
+      "loss": 0.7002,
+      "step": 9200
+    },
+    {
+      "epoch": 2.0450011084016846,
+      "grad_norm": 0.10086531937122345,
+      "learning_rate": 4.8624690605635626e-05,
+      "loss": 0.5569,
+      "step": 9225
+    },
+    {
+      "epoch": 2.0505431168255375,
+      "grad_norm": 0.10775309801101685,
+      "learning_rate": 4.811230247992165e-05,
+      "loss": 0.6925,
+      "step": 9250
+    },
+    {
+      "epoch": 2.0560851252493904,
+      "grad_norm": 0.096441850066185,
+      "learning_rate": 4.760177178181521e-05,
+      "loss": 0.5906,
+      "step": 9275
+    },
+    {
+      "epoch": 2.0616271336732432,
+      "grad_norm": 0.11258859932422638,
+      "learning_rate": 4.709311678682005e-05,
+      "loss": 0.6883,
+      "step": 9300
+    },
+    {
+      "epoch": 2.067169142097096,
+      "grad_norm": 0.0907958596944809,
+      "learning_rate": 4.658635570329537e-05,
+      "loss": 0.6069,
+      "step": 9325
+    },
+    {
+      "epoch": 2.0727111505209486,
+      "grad_norm": 0.11246030032634735,
+      "learning_rate": 4.608150667180378e-05,
+      "loss": 0.6951,
+      "step": 9350
+    },
+    {
+      "epoch": 2.0782531589448014,
+      "grad_norm": 0.09199715405702591,
+      "learning_rate": 4.557858776446203e-05,
+      "loss": 0.5949,
+      "step": 9375
+    },
+    {
+      "epoch": 2.0837951673686543,
+      "grad_norm": 0.11862944066524506,
+      "learning_rate": 4.50776169842941e-05,
+      "loss": 0.6701,
+      "step": 9400
+    },
+    {
+      "epoch": 2.089337175792507,
+      "grad_norm": 0.09631045907735825,
+      "learning_rate": 4.457861226458678e-05,
+      "loss": 0.6158,
+      "step": 9425
+    },
+    {
+      "epoch": 2.09487918421636,
+      "grad_norm": 0.107430100440979,
+      "learning_rate": 4.408159146824756e-05,
+      "loss": 0.6998,
+      "step": 9450
+    },
+    {
+      "epoch": 2.100421192640213,
+      "grad_norm": 0.0933394506573677,
+      "learning_rate": 4.358657238716533e-05,
+      "loss": 0.5998,
+      "step": 9475
+    },
+    {
+      "epoch": 2.1059632010640654,
+      "grad_norm": 0.11312496662139893,
+      "learning_rate": 4.309357274157338e-05,
+      "loss": 0.6909,
+      "step": 9500
+    },
+    {
+      "epoch": 2.1115052094879183,
+      "grad_norm": 0.09570565819740295,
+      "learning_rate": 4.260261017941526e-05,
+      "loss": 0.5805,
+      "step": 9525
+    },
+    {
+      "epoch": 2.117047217911771,
+      "grad_norm": 0.11327219754457474,
+      "learning_rate": 4.211370227571276e-05,
+      "loss": 0.6968,
+      "step": 9550
+    },
+    {
+      "epoch": 2.122589226335624,
+      "grad_norm": 0.09823332726955414,
+      "learning_rate": 4.162686653193698e-05,
+      "loss": 0.5965,
+      "step": 9575
+    },
+    {
+      "epoch": 2.128131234759477,
+      "grad_norm": 0.11287786811590195,
+      "learning_rate": 4.11421203753817e-05,
+      "loss": 0.6936,
+      "step": 9600
+    },
+    {
+      "epoch": 2.1336732431833294,
+      "grad_norm": 0.09058432281017303,
+      "learning_rate": 4.065948115853973e-05,
+      "loss": 0.61,
+      "step": 9625
+    },
+    {
+      "epoch": 2.1392152516071823,
+      "grad_norm": 0.11232877522706985,
+      "learning_rate": 4.017896615848149e-05,
+      "loss": 0.6852,
+      "step": 9650
+    },
+    {
+      "epoch": 2.144757260031035,
+      "grad_norm": 0.09588344395160675,
+      "learning_rate": 3.9700592576236686e-05,
+      "loss": 0.6083,
+      "step": 9675
+    },
+    {
+      "epoch": 2.150299268454888,
+      "grad_norm": 0.11739671975374222,
+      "learning_rate": 3.922437753617856e-05,
+      "loss": 0.6997,
+      "step": 9700
+    },
+    {
+      "epoch": 2.155841276878741,
+      "grad_norm": 0.09523261338472366,
+      "learning_rate": 3.875033808541083e-05,
+      "loss": 0.5693,
+      "step": 9725
+    },
+    {
+      "epoch": 2.161383285302594,
+      "grad_norm": 0.11812377721071243,
+      "learning_rate": 3.827849119315755e-05,
+      "loss": 0.6907,
+      "step": 9750
+    },
+    {
+      "epoch": 2.1669252937264463,
+      "grad_norm": 0.09816546738147736,
+      "learning_rate": 3.780885375015549e-05,
+      "loss": 0.5891,
+      "step": 9775
+    },
+    {
+      "epoch": 2.172467302150299,
+      "grad_norm": 0.11008067429065704,
+      "learning_rate": 3.734144256804978e-05,
+      "loss": 0.691,
+      "step": 9800
+    },
+    {
+      "epoch": 2.178009310574152,
+      "grad_norm": 0.09762485325336456,
+      "learning_rate": 3.687627437879177e-05,
+      "loss": 0.5914,
+      "step": 9825
+    },
+    {
+      "epoch": 2.183551318998005,
+      "grad_norm": 0.11886761337518692,
+      "learning_rate": 3.6413365834040326e-05,
+      "loss": 0.6896,
+      "step": 9850
+    },
+    {
+      "epoch": 2.189093327421858,
+      "grad_norm": 0.09332608431577682,
+      "learning_rate": 3.595273350456557e-05,
+      "loss": 0.5844,
+      "step": 9875
+    },
+    {
+      "epoch": 2.1946353358457107,
+      "grad_norm": 0.1171206533908844,
+      "learning_rate": 3.549439387965592e-05,
+      "loss": 0.7006,
+      "step": 9900
+    },
+    {
+      "epoch": 2.200177344269563,
+      "grad_norm": 0.09177059680223465,
+      "learning_rate": 3.503836336652756e-05,
+      "loss": 0.589,
+      "step": 9925
+    },
+    {
+      "epoch": 2.205719352693416,
+      "grad_norm": 0.11849093437194824,
+      "learning_rate": 3.4584658289737296e-05,
+      "loss": 0.687,
+      "step": 9950
+    },
+    {
+      "epoch": 2.211261361117269,
+      "grad_norm": 0.09985481947660446,
+      "learning_rate": 3.4133294890598065e-05,
+      "loss": 0.6102,
+      "step": 9975
+    },
+    {
+      "epoch": 2.216803369541122,
+      "grad_norm": 0.11642364412546158,
+      "learning_rate": 3.3684289326597726e-05,
+      "loss": 0.6963,
+      "step": 10000
+    },
+    {
+      "epoch": 2.2223453779649747,
+      "grad_norm": 0.08828485757112503,
+      "learning_rate": 3.323765767082042e-05,
+      "loss": 0.5929,
+      "step": 10025
+    },
+    {
+      "epoch": 2.227887386388827,
+      "grad_norm": 0.11624643206596375,
+      "learning_rate": 3.2793415911371386e-05,
+      "loss": 0.7003,
+      "step": 10050
+    },
+    {
+      "epoch": 2.23342939481268,
+      "grad_norm": 0.09697126597166061,
+      "learning_rate": 3.235157995080451e-05,
+      "loss": 0.5912,
+      "step": 10075
+    },
+    {
+      "epoch": 2.238971403236533,
+      "grad_norm": 0.11310283094644547,
+      "learning_rate": 3.191216560555326e-05,
+      "loss": 0.672,
+      "step": 10100
+    },
+    {
+      "epoch": 2.2445134116603858,
+      "grad_norm": 0.09560606628656387,
+      "learning_rate": 3.147518860536422e-05,
+      "loss": 0.5817,
+      "step": 10125
+    },
+    {
+      "epoch": 2.2500554200842386,
+      "grad_norm": 0.12067476660013199,
+      "learning_rate": 3.105799826047285e-05,
+      "loss": 0.6943,
+      "step": 10150
+    },
+    {
+      "epoch": 2.255597428508091,
+      "grad_norm": 0.09630808234214783,
+      "learning_rate": 3.062584375085582e-05,
+      "loss": 0.5833,
+      "step": 10175
+    },
+    {
+      "epoch": 2.261139436931944,
+      "grad_norm": 0.11377058923244476,
+      "learning_rate": 3.0196172632856158e-05,
+      "loss": 0.6931,
+      "step": 10200
+    },
+    {
+      "epoch": 2.266681445355797,
+      "grad_norm": 0.09476067125797272,
+      "learning_rate": 2.9769000287441484e-05,
+      "loss": 0.5616,
+      "step": 10225
+    },
+    {
+      "epoch": 2.2722234537796497,
+      "grad_norm": 0.11672156304121017,
+      "learning_rate": 2.9344342006130754e-05,
+      "loss": 0.684,
+      "step": 10250
+    },
+    {
+      "epoch": 2.2777654622035026,
+      "grad_norm": 0.09658654034137726,
+      "learning_rate": 2.8922212990446716e-05,
+      "loss": 0.5865,
+      "step": 10275
+    },
+    {
+      "epoch": 2.2833074706273555,
+      "grad_norm": 0.11913245916366577,
+      "learning_rate": 2.8502628351371842e-05,
+      "loss": 0.6895,
+      "step": 10300
+    },
+    {
+      "epoch": 2.2888494790512084,
+      "grad_norm": 0.0930788666009903,
+      "learning_rate": 2.8102234790536696e-05,
+      "loss": 0.6014,
+      "step": 10325
+    },
+    {
+      "epoch": 2.294391487475061,
+      "grad_norm": 0.12074844539165497,
+      "learning_rate": 2.7687680614304466e-05,
+      "loss": 0.6919,
+      "step": 10350
+    },
+    {
+      "epoch": 2.2999334958989137,
+      "grad_norm": 0.0964696928858757,
+      "learning_rate": 2.72757150073249e-05,
+      "loss": 0.5843,
+      "step": 10375
+    },
+    {
+      "epoch": 2.3054755043227666,
+      "grad_norm": 0.11635053157806396,
+      "learning_rate": 2.6866352716760167e-05,
+      "loss": 0.697,
+      "step": 10400
+    },
+    {
+      "epoch": 2.3110175127466195,
+      "grad_norm": 0.09311337023973465,
+      "learning_rate": 2.6459608396581404e-05,
+      "loss": 0.5885,
+      "step": 10425
+    },
+    {
+      "epoch": 2.3165595211704724,
+      "grad_norm": 0.11619989573955536,
+      "learning_rate": 2.6055496607044018e-05,
+      "loss": 0.6825,
+      "step": 10450
+    },
+    {
+      "epoch": 2.322101529594325,
+      "grad_norm": 0.09799373149871826,
+      "learning_rate": 2.5654031814166524e-05,
+      "loss": 0.6016,
+      "step": 10475
+    },
+    {
+      "epoch": 2.3276435380181777,
+      "grad_norm": 0.11626887321472168,
+      "learning_rate": 2.5255228389212803e-05,
+      "loss": 0.6933,
+      "step": 10500
+    },
+    {
+      "epoch": 2.3331855464420306,
+      "grad_norm": 0.09651979058980942,
+      "learning_rate": 2.4859100608177456e-05,
+      "loss": 0.595,
+      "step": 10525
+    },
+    {
+      "epoch": 2.3387275548658835,
+      "grad_norm": 0.1168813407421112,
+      "learning_rate": 2.44656626512749e-05,
+      "loss": 0.6909,
+      "step": 10550
+    },
+    {
+      "epoch": 2.3442695632897363,
+      "grad_norm": 0.09738708287477493,
+      "learning_rate": 2.407492860243171e-05,
+      "loss": 0.5848,
+      "step": 10575
+    },
+    {
+      "epoch": 2.3498115717135892,
+      "grad_norm": 0.12800638377666473,
+      "learning_rate": 2.368691244878254e-05,
+      "loss": 0.6875,
+      "step": 10600
+    },
+    {
+      "epoch": 2.3553535801374417,
+      "grad_norm": 0.09715036302804947,
+      "learning_rate": 2.330162808016928e-05,
+      "loss": 0.5575,
+      "step": 10625
+    },
+    {
+      "epoch": 2.3608955885612946,
+      "grad_norm": 0.11949202418327332,
+      "learning_rate": 2.2919089288643948e-05,
+      "loss": 0.6873,
+      "step": 10650
+    },
+    {
+      "epoch": 2.3664375969851474,
+      "grad_norm": 0.10298886895179749,
+      "learning_rate": 2.253930976797489e-05,
+      "loss": 0.5973,
+      "step": 10675
+    },
+    {
+      "epoch": 2.3719796054090003,
+      "grad_norm": 0.1202768087387085,
+      "learning_rate": 2.216230311315677e-05,
+      "loss": 0.6889,
+      "step": 10700
+    },
+    {
+      "epoch": 2.377521613832853,
+      "grad_norm": 0.10042477399110794,
+      "learning_rate": 2.1788082819923637e-05,
+      "loss": 0.5959,
+      "step": 10725
+    },
+    {
+      "epoch": 2.3830636222567056,
+      "grad_norm": 0.11488287150859833,
+      "learning_rate": 2.141666228426602e-05,
+      "loss": 0.6842,
+      "step": 10750
+    },
+    {
+      "epoch": 2.3886056306805585,
+      "grad_norm": 0.10014986246824265,
+      "learning_rate": 2.104805480195128e-05,
+      "loss": 0.602,
+      "step": 10775
+    },
+    {
+      "epoch": 2.3941476391044114,
+      "grad_norm": 0.11838942021131516,
+      "learning_rate": 2.0682273568047806e-05,
+      "loss": 0.678,
+      "step": 10800
+    },
+    {
+      "epoch": 2.3996896475282643,
+      "grad_norm": 0.09891260415315628,
+      "learning_rate": 2.031933167645248e-05,
+      "loss": 0.6204,
+      "step": 10825
+    },
+    {
+      "epoch": 2.405231655952117,
+      "grad_norm": 0.11890105158090591,
+      "learning_rate": 1.9959242119422062e-05,
+      "loss": 0.6933,
+      "step": 10850
+    },
+    {
+      "epoch": 2.41077366437597,
+      "grad_norm": 0.10138825327157974,
+      "learning_rate": 1.9602017787108073e-05,
+      "loss": 0.5633,
+      "step": 10875
+    },
+    {
+      "epoch": 2.4163156727998225,
+      "grad_norm": 0.12140816450119019,
+      "learning_rate": 1.9247671467095464e-05,
+      "loss": 0.6962,
+      "step": 10900
+    },
+    {
+      "epoch": 2.4218576812236754,
+      "grad_norm": 0.09580956399440765,
+      "learning_rate": 1.8896215843944687e-05,
+      "loss": 0.5756,
+      "step": 10925
+    },
+    {
+      "epoch": 2.4273996896475283,
+      "grad_norm": 0.11642120033502579,
+      "learning_rate": 1.8547663498737744e-05,
+      "loss": 0.682,
+      "step": 10950
+    },
+    {
+      "epoch": 2.432941698071381,
+      "grad_norm": 0.09648188948631287,
+      "learning_rate": 1.8202026908627757e-05,
+      "loss": 0.6105,
+      "step": 10975
+    },
+    {
+      "epoch": 2.438483706495234,
+      "grad_norm": 0.11960633844137192,
+      "learning_rate": 1.7859318446392438e-05,
+      "loss": 0.6856,
+      "step": 11000
+    },
+    {
+      "epoch": 2.4440257149190865,
+      "grad_norm": 0.10111569613218307,
+      "learning_rate": 1.7519550379991022e-05,
+      "loss": 0.5844,
+      "step": 11025
+    },
+    {
+      "epoch": 2.4495677233429394,
+      "grad_norm": 0.11525440216064453,
+      "learning_rate": 1.7182734872125194e-05,
+      "loss": 0.6851,
+      "step": 11050
+    },
+    {
+      "epoch": 2.4551097317667923,
+      "grad_norm": 0.10440149903297424,
+      "learning_rate": 1.68488839798037e-05,
+      "loss": 0.5659,
+      "step": 11075
+    },
+    {
+      "epoch": 2.460651740190645,
+      "grad_norm": 0.11208430677652359,
+      "learning_rate": 1.651800965391076e-05,
+      "loss": 0.6895,
+      "step": 11100
+    },
+    {
+      "epoch": 2.466193748614498,
+      "grad_norm": 0.09721311926841736,
+      "learning_rate": 1.6190123738778195e-05,
+      "loss": 0.5975,
+      "step": 11125
+    },
+    {
+      "epoch": 2.471735757038351,
+      "grad_norm": 0.12069053202867508,
+      "learning_rate": 1.5865237971761483e-05,
+      "loss": 0.6999,
+      "step": 11150
+    },
+    {
+      "epoch": 2.4772777654622034,
+      "grad_norm": 0.09901247173547745,
+      "learning_rate": 1.5543363982819593e-05,
+      "loss": 0.5735,
+      "step": 11175
+    },
+    {
+      "epoch": 2.4828197738860562,
+      "grad_norm": 0.12278547137975693,
+      "learning_rate": 1.522451329409863e-05,
+      "loss": 0.6817,
+      "step": 11200
+    },
+    {
+      "epoch": 2.488361782309909,
+      "grad_norm": 0.09708017110824585,
+      "learning_rate": 1.4908697319519471e-05,
+      "loss": 0.587,
+      "step": 11225
+    },
+    {
+      "epoch": 2.493903790733762,
+      "grad_norm": 0.11575840413570404,
+      "learning_rate": 1.4595927364369067e-05,
+      "loss": 0.6873,
+      "step": 11250
+    },
+    {
+      "epoch": 2.499445799157615,
+      "grad_norm": 0.09546317160129547,
+      "learning_rate": 1.428621462489579e-05,
+      "loss": 0.5687,
+      "step": 11275
+    },
+    {
+      "epoch": 2.5049878075814673,
+      "grad_norm": 0.1179046705365181,
+      "learning_rate": 1.3979570187908642e-05,
+      "loss": 0.6869,
+      "step": 11300
+    },
+    {
+      "epoch": 2.51052981600532,
+      "grad_norm": 0.09939169883728027,
+      "learning_rate": 1.3676005030380434e-05,
+      "loss": 0.5748,
+      "step": 11325
+    },
+    {
+      "epoch": 2.516071824429173,
+      "grad_norm": 0.11648693680763245,
+      "learning_rate": 1.3375530019054727e-05,
+      "loss": 0.6955,
+      "step": 11350
+    },
+    {
+      "epoch": 2.521613832853026,
+      "grad_norm": 0.09718350321054459,
+      "learning_rate": 1.3078155910056933e-05,
+      "loss": 0.6082,
+      "step": 11375
+    },
+    {
+      "epoch": 2.527155841276879,
+      "grad_norm": 0.1167871356010437,
+      "learning_rate": 1.2783893348509201e-05,
+      "loss": 0.6818,
+      "step": 11400
+    },
+    {
+      "epoch": 2.5326978497007318,
+      "grad_norm": 0.09479407966136932,
+      "learning_rate": 1.2492752868149493e-05,
+      "loss": 0.5897,
+      "step": 11425
+    },
+    {
+      "epoch": 2.538239858124584,
+      "grad_norm": 0.11183246970176697,
+      "learning_rate": 1.2204744890954312e-05,
+      "loss": 0.677,
+      "step": 11450
+    },
+    {
+      "epoch": 2.543781866548437,
+      "grad_norm": 0.0982903316617012,
+      "learning_rate": 1.1919879726765704e-05,
+      "loss": 0.6022,
+      "step": 11475
+    },
+    {
+      "epoch": 2.54932387497229,
+      "grad_norm": 0.1157740131020546,
+      "learning_rate": 1.1638167572922331e-05,
+      "loss": 0.6883,
+      "step": 11500
+    },
+    {
+      "epoch": 2.554865883396143,
+      "grad_norm": 0.0996859073638916,
+      "learning_rate": 1.135961851389421e-05,
+      "loss": 0.5939,
+      "step": 11525
+    },
+    {
+      "epoch": 2.5604078918199957,
+      "grad_norm": 0.11946888267993927,
+      "learning_rate": 1.1084242520921884e-05,
+      "loss": 0.6865,
+      "step": 11550
+    },
+    {
+      "epoch": 2.565949900243848,
+      "grad_norm": 0.10295979678630829,
+      "learning_rate": 1.0812049451659367e-05,
+      "loss": 0.5887,
+      "step": 11575
+    },
+    {
+      "epoch": 2.571491908667701,
+      "grad_norm": 0.11419833451509476,
+      "learning_rate": 1.0543049049821429e-05,
+      "loss": 0.6946,
+      "step": 11600
+    },
+    {
+      "epoch": 2.577033917091554,
+      "grad_norm": 0.09568798542022705,
+      "learning_rate": 1.0277250944834594e-05,
+      "loss": 0.5311,
+      "step": 11625
+    },
+    {
+      "epoch": 2.582575925515407,
+      "grad_norm": 0.11659660935401917,
+      "learning_rate": 1.0014664651492588e-05,
+      "loss": 0.6942,
+      "step": 11650
+    },
+    {
+      "epoch": 2.5881179339392597,
+      "grad_norm": 0.09711961448192596,
+      "learning_rate": 9.755299569615661e-06,
+      "loss": 0.58,
+      "step": 11675
+    },
+    {
+      "epoch": 2.5936599423631126,
+      "grad_norm": 0.12282078713178635,
+      "learning_rate": 9.49916498371416e-06,
+      "loss": 0.6887,
+      "step": 11700
+    },
+    {
+      "epoch": 2.599201950786965,
+      "grad_norm": 0.09802839159965515,
+      "learning_rate": 9.246270062656104e-06,
+      "loss": 0.5524,
+      "step": 11725
+    },
+    {
+      "epoch": 2.604743959210818,
+      "grad_norm": 0.11624480038881302,
+      "learning_rate": 8.996623859339026e-06,
+      "loss": 0.6953,
+      "step": 11750
+    },
+    {
+      "epoch": 2.610285967634671,
+      "grad_norm": 0.09748586267232895,
+      "learning_rate": 8.75023531036584e-06,
+      "loss": 0.5844,
+      "step": 11775
+    },
+    {
+      "epoch": 2.6158279760585237,
+      "grad_norm": 0.12042013555765152,
+      "learning_rate": 8.50711323572504e-06,
+      "loss": 0.7066,
+      "step": 11800
+    },
+    {
+      "epoch": 2.6213699844823766,
+      "grad_norm": 0.09437406063079834,
+      "learning_rate": 8.267266338474833e-06,
+      "loss": 0.5993,
+      "step": 11825
+    },
+    {
+      "epoch": 2.626911992906229,
+      "grad_norm": 0.12176880985498428,
+      "learning_rate": 8.030703204431711e-06,
+      "loss": 0.6901,
+      "step": 11850
+    },
+    {
+      "epoch": 2.632454001330082,
+      "grad_norm": 0.09464902430772781,
+      "learning_rate": 7.797432301863029e-06,
+      "loss": 0.5777,
+      "step": 11875
+    },
+    {
+      "epoch": 2.637996009753935,
+      "grad_norm": 0.1198597401380539,
+      "learning_rate": 7.567461981183954e-06,
+      "loss": 0.6915,
+      "step": 11900
+    },
+    {
+      "epoch": 2.6435380181777877,
+      "grad_norm": 0.09965650737285614,
+      "learning_rate": 7.340800474658438e-06,
+      "loss": 0.6047,
+      "step": 11925
+    },
+    {
+      "epoch": 2.6490800266016405,
+      "grad_norm": 0.12205997854471207,
+      "learning_rate": 7.117455896104586e-06,
+      "loss": 0.6888,
+      "step": 11950
+    },
+    {
+      "epoch": 2.6546220350254934,
+      "grad_norm": 0.10341254621744156,
+      "learning_rate": 6.897436240604182e-06,
+      "loss": 0.5954,
+      "step": 11975
+    },
+    {
+      "epoch": 2.6601640434493463,
+      "grad_norm": 0.11666877567768097,
+      "learning_rate": 6.680749384216556e-06,
+      "loss": 0.6775,
+      "step": 12000
+    },
+    {
+      "epoch": 2.6657060518731988,
+      "grad_norm": 0.09655077755451202,
+      "learning_rate": 6.4674030836965435e-06,
+      "loss": 0.5951,
+      "step": 12025
+    },
+    {
+      "epoch": 2.6712480602970516,
+      "grad_norm": 0.11688444018363953,
+      "learning_rate": 6.257404976216863e-06,
+      "loss": 0.6762,
+      "step": 12050
+    },
+    {
+      "epoch": 2.6767900687209045,
+      "grad_norm": 0.09534773975610733,
+      "learning_rate": 6.050762579094727e-06,
+      "loss": 0.6063,
+      "step": 12075
+    },
+    {
+      "epoch": 2.6823320771447574,
+      "grad_norm": 0.11826001852750778,
+      "learning_rate": 5.847483289522804e-06,
+      "loss": 0.6893,
+      "step": 12100
+    },
+    {
+      "epoch": 2.68787408556861,
+      "grad_norm": 0.09770546853542328,
+      "learning_rate": 5.647574384304288e-06,
+      "loss": 0.5977,
+      "step": 12125
+    },
+    {
+      "epoch": 2.6934160939924627,
+      "grad_norm": 0.11583051085472107,
+      "learning_rate": 5.451043019592506e-06,
+      "loss": 0.6892,
+      "step": 12150
+    },
+    {
+      "epoch": 2.6989581024163156,
+      "grad_norm": 0.09545658528804779,
+      "learning_rate": 5.257896230634729e-06,
+      "loss": 0.5911,
+      "step": 12175
+    },
+    {
+      "epoch": 2.7045001108401685,
+      "grad_norm": 0.11835386604070663,
+      "learning_rate": 5.06814093152036e-06,
+      "loss": 0.6813,
+      "step": 12200
+    },
+    {
+      "epoch": 2.7100421192640214,
+      "grad_norm": 0.09876852482557297,
+      "learning_rate": 4.881783914933347e-06,
+      "loss": 0.5886,
+      "step": 12225
+    },
+    {
+      "epoch": 2.7155841276878743,
+      "grad_norm": 0.11607835441827774,
+      "learning_rate": 4.698831851909113e-06,
+      "loss": 0.6915,
+      "step": 12250
+    },
+    {
+      "epoch": 2.721126136111727,
+      "grad_norm": 0.09856165945529938,
+      "learning_rate": 4.519291291595673e-06,
+      "loss": 0.588,
+      "step": 12275
+    },
+    {
+      "epoch": 2.7266681445355796,
+      "grad_norm": 0.1222713515162468,
+      "learning_rate": 4.343168661019304e-06,
+      "loss": 0.7022,
+      "step": 12300
+    },
+    {
+      "epoch": 2.7322101529594325,
+      "grad_norm": 0.09814934432506561,
+      "learning_rate": 4.170470264854354e-06,
+      "loss": 0.5736,
+      "step": 12325
+    },
+    {
+      "epoch": 2.7377521613832854,
+      "grad_norm": 0.12004557251930237,
+      "learning_rate": 4.001202285197614e-06,
+      "loss": 0.6877,
+      "step": 12350
+    },
+    {
+      "epoch": 2.7432941698071382,
+      "grad_norm": 0.09881303459405899,
+      "learning_rate": 3.8353707813470255e-06,
+      "loss": 0.5771,
+      "step": 12375
+    },
+    {
+      "epoch": 2.7488361782309907,
+      "grad_norm": 0.11910282075405121,
+      "learning_rate": 3.6729816895847646e-06,
+      "loss": 0.6911,
+      "step": 12400
+    },
+    {
+      "epoch": 2.7543781866548436,
+      "grad_norm": 0.09742295742034912,
+      "learning_rate": 3.514040822964715e-06,
+      "loss": 0.6007,
+      "step": 12425
+    },
+    {
+      "epoch": 2.7599201950786965,
+      "grad_norm": 0.12169786542654037,
+      "learning_rate": 3.3585538711044197e-06,
+      "loss": 0.6907,
+      "step": 12450
+    },
+    {
+      "epoch": 2.7654622035025493,
+      "grad_norm": 0.09972374886274338,
+      "learning_rate": 3.206526399981358e-06,
+      "loss": 0.5764,
+      "step": 12475
+    },
+    {
+      "epoch": 2.7710042119264022,
+      "grad_norm": 0.11850964277982712,
+      "learning_rate": 3.057963851733803e-06,
+      "loss": 0.6742,
+      "step": 12500
+    },
+    {
+      "epoch": 2.776546220350255,
+      "grad_norm": 0.10190987586975098,
+      "learning_rate": 2.912871544465834e-06,
+      "loss": 0.5769,
+      "step": 12525
+    },
+    {
+      "epoch": 2.782088228774108,
+      "grad_norm": 0.11625930666923523,
+      "learning_rate": 2.771254672057144e-06,
+      "loss": 0.6827,
+      "step": 12550
+    },
+    {
+      "epoch": 2.7876302371979604,
+      "grad_norm": 0.10201703011989594,
+      "learning_rate": 2.6331183039769892e-06,
+      "loss": 0.5792,
+      "step": 12575
+    },
+    {
+      "epoch": 2.7931722456218133,
+      "grad_norm": 0.1190221905708313,
+      "learning_rate": 2.4984673851028095e-06,
+      "loss": 0.6815,
+      "step": 12600
+    },
+    {
+      "epoch": 2.798714254045666,
+      "grad_norm": 0.1018979623913765,
+      "learning_rate": 2.367306735543151e-06,
+      "loss": 0.5642,
+      "step": 12625
+    },
+    {
+      "epoch": 2.804256262469519,
+      "grad_norm": 0.11812682449817657,
+      "learning_rate": 2.2396410504651246e-06,
+      "loss": 0.684,
+      "step": 12650
+    },
+    {
+      "epoch": 2.8097982708933715,
+      "grad_norm": 0.09736626595258713,
+      "learning_rate": 2.115474899926351e-06,
+      "loss": 0.5792,
+      "step": 12675
+    },
+    {
+      "epoch": 2.8153402793172244,
+      "grad_norm": 0.11478708684444427,
+      "learning_rate": 1.994812728711404e-06,
+      "loss": 0.6799,
+      "step": 12700
+    },
+    {
+      "epoch": 2.8208822877410773,
+      "grad_norm": 0.09593895822763443,
+      "learning_rate": 1.8822775987230811e-06,
+      "loss": 0.583,
+      "step": 12725
+    },
+    {
+      "epoch": 2.82642429616493,
+      "grad_norm": 0.12742695212364197,
+      "learning_rate": 1.7684956399847686e-06,
+      "loss": 0.6926,
+      "step": 12750
+    },
+    {
+      "epoch": 2.831966304588783,
+      "grad_norm": 0.09630496799945831,
+      "learning_rate": 1.6582300814115714e-06,
+      "loss": 0.5821,
+      "step": 12775
+    },
+    {
+      "epoch": 2.837508313012636,
+      "grad_norm": 0.11476560682058334,
+      "learning_rate": 1.5514848701875473e-06,
+      "loss": 0.6813,
+      "step": 12800
+    },
+    {
+      "epoch": 2.843050321436489,
+      "grad_norm": 0.09966927021741867,
+      "learning_rate": 1.4482638274786354e-06,
+      "loss": 0.581,
+      "step": 12825
+    },
+    {
+      "epoch": 2.8485923298603413,
+      "grad_norm": 0.12046293914318085,
+      "learning_rate": 1.348570648295866e-06,
+      "loss": 0.6785,
+      "step": 12850
+    },
+    {
+      "epoch": 2.854134338284194,
+      "grad_norm": 0.09309112280607224,
+      "learning_rate": 1.252408901363078e-06,
+      "loss": 0.5868,
+      "step": 12875
+    },
+    {
+      "epoch": 2.859676346708047,
+      "grad_norm": 0.13479410111904144,
+      "learning_rate": 1.1597820289891758e-06,
+      "loss": 0.6804,
+      "step": 12900
+    },
+    {
+      "epoch": 2.8652183551319,
+      "grad_norm": 0.10119752585887909,
+      "learning_rate": 1.0706933469449288e-06,
+      "loss": 0.6251,
+      "step": 12925
+    },
+    {
+      "epoch": 2.8707603635557524,
+      "grad_norm": 0.12458885461091995,
+      "learning_rate": 9.851460443442317e-07,
+      "loss": 0.6895,
+      "step": 12950
+    },
+    {
+      "epoch": 2.8763023719796053,
+      "grad_norm": 0.09733375906944275,
+      "learning_rate": 9.031431835299753e-07,
+      "loss": 0.5619,
+      "step": 12975
+    },
+    {
+      "epoch": 2.881844380403458,
+      "grad_norm": 0.12155549973249435,
+      "learning_rate": 8.246876999644104e-07,
+      "loss": 0.6861,
+      "step": 13000
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 13533,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.239618339309722e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-13000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a13672a0401c8d0a53efc112f53ebd65f36fa9003e87b1710879c58d881a1e1
+size 5051

checkpoint-5000/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: meta-llama/Llama-2-13b-chat-hf
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.12.0

checkpoint-5000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-2-13b-chat-hf",
+  "bias": "lora_only",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.001,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-5000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d180b331707bf9e081c1c962dde68c522775724e3d268142d0146a7effc62426
+size 209736952

checkpoint-5000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e09ec34632c136b2763ae88c46570b2ac61d74bbf38e56b10bf22089ab85e48b
+size 419529285

checkpoint-5000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1bb037a5c61b129a7d4af16a4c1484215dc381b39d1964c770ac2c0e53d0f462
+size 14575

checkpoint-5000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:99bb8a615f620c4d1745253ce4c9bf1a0da431ee38b91bdc2f185c8450df5bcc
+size 627

checkpoint-5000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-5000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-5000/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-5000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-5000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1433 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.108401684770561,
+  "eval_steps": 500,
+  "global_step": 5000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.005542008423852805,
+      "grad_norm": 0.07243233174085617,
+      "learning_rate": 1.2315270935960592e-05,
+      "loss": 1.4594,
+      "step": 25
+    },
+    {
+      "epoch": 0.01108401684770561,
+      "grad_norm": 0.40484485030174255,
+      "learning_rate": 2.4630541871921184e-05,
+      "loss": 2.2032,
+      "step": 50
+    },
+    {
+      "epoch": 0.016626025271558414,
+      "grad_norm": 0.06850667297840118,
+      "learning_rate": 3.694581280788178e-05,
+      "loss": 1.2931,
+      "step": 75
+    },
+    {
+      "epoch": 0.02216803369541122,
+      "grad_norm": 0.4395073354244232,
+      "learning_rate": 4.926108374384237e-05,
+      "loss": 1.5698,
+      "step": 100
+    },
+    {
+      "epoch": 0.027710042119264023,
+      "grad_norm": 0.077068030834198,
+      "learning_rate": 6.157635467980296e-05,
+      "loss": 1.0537,
+      "step": 125
+    },
+    {
+      "epoch": 0.03325205054311683,
+      "grad_norm": 0.3282291293144226,
+      "learning_rate": 7.389162561576355e-05,
+      "loss": 0.9749,
+      "step": 150
+    },
+    {
+      "epoch": 0.03879405896696963,
+      "grad_norm": 0.0593000203371048,
+      "learning_rate": 8.620689655172413e-05,
+      "loss": 0.9349,
+      "step": 175
+    },
+    {
+      "epoch": 0.04433606739082244,
+      "grad_norm": 0.25612473487854004,
+      "learning_rate": 9.852216748768474e-05,
+      "loss": 0.8974,
+      "step": 200
+    },
+    {
+      "epoch": 0.04987807581467524,
+      "grad_norm": 0.0757347121834755,
+      "learning_rate": 0.00011083743842364534,
+      "loss": 0.9081,
+      "step": 225
+    },
+    {
+      "epoch": 0.055420084238528046,
+      "grad_norm": 0.14145499467849731,
+      "learning_rate": 0.00012315270935960593,
+      "loss": 0.8607,
+      "step": 250
+    },
+    {
+      "epoch": 0.06096209266238085,
+      "grad_norm": 0.07710155844688416,
+      "learning_rate": 0.00013546798029556652,
+      "loss": 0.8973,
+      "step": 275
+    },
+    {
+      "epoch": 0.06650410108623366,
+      "grad_norm": 0.14791467785835266,
+      "learning_rate": 0.0001477832512315271,
+      "loss": 0.7924,
+      "step": 300
+    },
+    {
+      "epoch": 0.07204610951008646,
+      "grad_norm": 0.07742594182491302,
+      "learning_rate": 0.00016009852216748767,
+      "loss": 0.8698,
+      "step": 325
+    },
+    {
+      "epoch": 0.07758811793393926,
+      "grad_norm": 0.14303487539291382,
+      "learning_rate": 0.00017241379310344826,
+      "loss": 0.786,
+      "step": 350
+    },
+    {
+      "epoch": 0.08313012635779206,
+      "grad_norm": 0.0865108072757721,
+      "learning_rate": 0.00018472906403940888,
+      "loss": 0.8606,
+      "step": 375
+    },
+    {
+      "epoch": 0.08867213478164487,
+      "grad_norm": 0.7533164024353027,
+      "learning_rate": 0.00019704433497536947,
+      "loss": 0.807,
+      "step": 400
+    },
+    {
+      "epoch": 0.09421414320549767,
+      "grad_norm": 0.08325570821762085,
+      "learning_rate": 0.00019999896617927833,
+      "loss": 0.8635,
+      "step": 425
+    },
+    {
+      "epoch": 0.09975615162935048,
+      "grad_norm": 0.1043543666601181,
+      "learning_rate": 0.0001999944557842899,
+      "loss": 0.7825,
+      "step": 450
+    },
+    {
+      "epoch": 0.10529816005320328,
+      "grad_norm": 0.07949995994567871,
+      "learning_rate": 0.0001999863658806385,
+      "loss": 0.8379,
+      "step": 475
+    },
+    {
+      "epoch": 0.11084016847705609,
+      "grad_norm": 0.12020070850849152,
+      "learning_rate": 0.00019997469675791905,
+      "loss": 0.768,
+      "step": 500
+    },
+    {
+      "epoch": 0.11638217690090889,
+      "grad_norm": 0.0803595781326294,
+      "learning_rate": 0.00019995944883385196,
+      "loss": 0.8487,
+      "step": 525
+    },
+    {
+      "epoch": 0.1219241853247617,
+      "grad_norm": 0.11509452760219574,
+      "learning_rate": 0.0001999406226542682,
+      "loss": 0.7787,
+      "step": 550
+    },
+    {
+      "epoch": 0.1274661937486145,
+      "grad_norm": 0.07928384840488434,
+      "learning_rate": 0.00019991821889308987,
+      "loss": 0.8357,
+      "step": 575
+    },
+    {
+      "epoch": 0.1330082021724673,
+      "grad_norm": 0.09423446655273438,
+      "learning_rate": 0.00019989223835230606,
+      "loss": 0.7564,
+      "step": 600
+    },
+    {
+      "epoch": 0.1385502105963201,
+      "grad_norm": 0.0835939422249794,
+      "learning_rate": 0.000199862681961944,
+      "loss": 0.8568,
+      "step": 625
+    },
+    {
+      "epoch": 0.1440922190201729,
+      "grad_norm": 0.09292898327112198,
+      "learning_rate": 0.0001998295507800359,
+      "loss": 0.7612,
+      "step": 650
+    },
+    {
+      "epoch": 0.1496342274440257,
+      "grad_norm": 0.07704215496778488,
+      "learning_rate": 0.00019979284599258107,
+      "loss": 0.8263,
+      "step": 675
+    },
+    {
+      "epoch": 0.15517623586787851,
+      "grad_norm": 0.10980474948883057,
+      "learning_rate": 0.0001997525689135034,
+      "loss": 0.7677,
+      "step": 700
+    },
+    {
+      "epoch": 0.16071824429173132,
+      "grad_norm": 0.08016064018011093,
+      "learning_rate": 0.0001997087209846043,
+      "loss": 0.8344,
+      "step": 725
+    },
+    {
+      "epoch": 0.16626025271558412,
+      "grad_norm": 0.0950881615281105,
+      "learning_rate": 0.0001996613037755113,
+      "loss": 0.769,
+      "step": 750
+    },
+    {
+      "epoch": 0.17180226113943692,
+      "grad_norm": 0.07932984828948975,
+      "learning_rate": 0.00019961031898362152,
+      "loss": 0.8156,
+      "step": 775
+    },
+    {
+      "epoch": 0.17734426956328975,
+      "grad_norm": 0.09336528927087784,
+      "learning_rate": 0.00019955576843404128,
+      "loss": 0.7767,
+      "step": 800
+    },
+    {
+      "epoch": 0.18288627798714255,
+      "grad_norm": 0.08560346812009811,
+      "learning_rate": 0.00019949765407952042,
+      "loss": 0.8228,
+      "step": 825
+    },
+    {
+      "epoch": 0.18842828641099535,
+      "grad_norm": 0.08475169539451599,
+      "learning_rate": 0.00019943597800038267,
+      "loss": 0.7669,
+      "step": 850
+    },
+    {
+      "epoch": 0.19397029483484815,
+      "grad_norm": 0.09038034081459045,
+      "learning_rate": 0.00019937074240445105,
+      "loss": 0.8182,
+      "step": 875
+    },
+    {
+      "epoch": 0.19951230325870095,
+      "grad_norm": 0.09195873886346817,
+      "learning_rate": 0.0001993019496269688,
+      "loss": 0.7598,
+      "step": 900
+    },
+    {
+      "epoch": 0.20505431168255375,
+      "grad_norm": 0.08655796200037003,
+      "learning_rate": 0.0001992296021305159,
+      "loss": 0.8167,
+      "step": 925
+    },
+    {
+      "epoch": 0.21059632010640655,
+      "grad_norm": 0.08353498578071594,
+      "learning_rate": 0.00019915370250492084,
+      "loss": 0.7486,
+      "step": 950
+    },
+    {
+      "epoch": 0.21613832853025935,
+      "grad_norm": 0.09225723147392273,
+      "learning_rate": 0.0001990742534671679,
+      "loss": 0.8138,
+      "step": 975
+    },
+    {
+      "epoch": 0.22168033695411218,
+      "grad_norm": 0.12104763090610504,
+      "learning_rate": 0.00019899125786129997,
+      "loss": 0.7153,
+      "step": 1000
+    },
+    {
+      "epoch": 0.22722234537796498,
+      "grad_norm": 0.0815986767411232,
+      "learning_rate": 0.00019890471865831669,
+      "loss": 0.7983,
+      "step": 1025
+    },
+    {
+      "epoch": 0.23276435380181779,
+      "grad_norm": 0.08845670521259308,
+      "learning_rate": 0.00019881463895606805,
+      "loss": 0.7187,
+      "step": 1050
+    },
+    {
+      "epoch": 0.2383063622256706,
+      "grad_norm": 0.0821809321641922,
+      "learning_rate": 0.00019872102197914359,
+      "loss": 0.804,
+      "step": 1075
+    },
+    {
+      "epoch": 0.2438483706495234,
+      "grad_norm": 0.08711609989404678,
+      "learning_rate": 0.00019862387107875688,
+      "loss": 0.7795,
+      "step": 1100
+    },
+    {
+      "epoch": 0.2493903790733762,
+      "grad_norm": 0.08517508953809738,
+      "learning_rate": 0.00019852318973262567,
+      "loss": 0.7937,
+      "step": 1125
+    },
+    {
+      "epoch": 0.254932387497229,
+      "grad_norm": 0.10830071568489075,
+      "learning_rate": 0.00019841898154484726,
+      "loss": 0.7458,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2604743959210818,
+      "grad_norm": 0.08541836589574814,
+      "learning_rate": 0.0001983112502457696,
+      "loss": 0.8131,
+      "step": 1175
+    },
+    {
+      "epoch": 0.2660164043449346,
+      "grad_norm": 0.08794037252664566,
+      "learning_rate": 0.00019819999969185762,
+      "loss": 0.7577,
+      "step": 1200
+    },
+    {
+      "epoch": 0.2715584127687874,
+      "grad_norm": 0.08078176528215408,
+      "learning_rate": 0.00019808523386555542,
+      "loss": 0.812,
+      "step": 1225
+    },
+    {
+      "epoch": 0.2771004211926402,
+      "grad_norm": 0.09263130277395248,
+      "learning_rate": 0.0001979669568751434,
+      "loss": 0.7582,
+      "step": 1250
+    },
+    {
+      "epoch": 0.282642429616493,
+      "grad_norm": 0.08198932558298111,
+      "learning_rate": 0.00019784517295459147,
+      "loss": 0.7958,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2881844380403458,
+      "grad_norm": 0.07858102023601532,
+      "learning_rate": 0.00019771988646340725,
+      "loss": 0.7744,
+      "step": 1300
+    },
+    {
+      "epoch": 0.2937264464641986,
+      "grad_norm": 0.0851408839225769,
+      "learning_rate": 0.00019759110188648026,
+      "loss": 0.7913,
+      "step": 1325
+    },
+    {
+      "epoch": 0.2992684548880514,
+      "grad_norm": 0.09252189099788666,
+      "learning_rate": 0.00019745882383392116,
+      "loss": 0.7675,
+      "step": 1350
+    },
+    {
+      "epoch": 0.30481046331190426,
+      "grad_norm": 0.08306555449962616,
+      "learning_rate": 0.0001973230570408968,
+      "loss": 0.8059,
+      "step": 1375
+    },
+    {
+      "epoch": 0.31035247173575703,
+      "grad_norm": 0.0797729641199112,
+      "learning_rate": 0.0001971838063674608,
+      "loss": 0.7424,
+      "step": 1400
+    },
+    {
+      "epoch": 0.31589448015960986,
+      "grad_norm": 0.08266165107488632,
+      "learning_rate": 0.0001970410767983794,
+      "loss": 0.7847,
+      "step": 1425
+    },
+    {
+      "epoch": 0.32143648858346263,
+      "grad_norm": 0.09364205598831177,
+      "learning_rate": 0.00019689487344295322,
+      "loss": 0.6924,
+      "step": 1450
+    },
+    {
+      "epoch": 0.32697849700731546,
+      "grad_norm": 0.08461842685937881,
+      "learning_rate": 0.00019674520153483414,
+      "loss": 0.8007,
+      "step": 1475
+    },
+    {
+      "epoch": 0.33252050543116823,
+      "grad_norm": 0.0840207040309906,
+      "learning_rate": 0.00019659206643183813,
+      "loss": 0.7139,
+      "step": 1500
+    },
+    {
+      "epoch": 0.33806251385502106,
+      "grad_norm": 0.08344192802906036,
+      "learning_rate": 0.00019643547361575343,
+      "loss": 0.7982,
+      "step": 1525
+    },
+    {
+      "epoch": 0.34360452227887384,
+      "grad_norm": 0.07934779673814774,
+      "learning_rate": 0.0001962754286921442,
+      "loss": 0.7164,
+      "step": 1550
+    },
+    {
+      "epoch": 0.34914653070272667,
+      "grad_norm": 0.08716201782226562,
+      "learning_rate": 0.00019611193739015,
+      "loss": 0.7846,
+      "step": 1575
+    },
+    {
+      "epoch": 0.3546885391265795,
+      "grad_norm": 0.08384064584970474,
+      "learning_rate": 0.0001959450055622806,
+      "loss": 0.7416,
+      "step": 1600
+    },
+    {
+      "epoch": 0.36023054755043227,
+      "grad_norm": 0.08661937713623047,
+      "learning_rate": 0.0001957746391842066,
+      "loss": 0.8075,
+      "step": 1625
+    },
+    {
+      "epoch": 0.3657725559742851,
+      "grad_norm": 0.09327207505702972,
+      "learning_rate": 0.00019560084435454536,
+      "loss": 0.7596,
+      "step": 1650
+    },
+    {
+      "epoch": 0.37131456439813787,
+      "grad_norm": 0.08391096442937851,
+      "learning_rate": 0.00019542362729464273,
+      "loss": 0.7794,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3768565728219907,
+      "grad_norm": 0.07694080471992493,
+      "learning_rate": 0.00019524299434835052,
+      "loss": 0.7424,
+      "step": 1700
+    },
+    {
+      "epoch": 0.38239858124584347,
+      "grad_norm": 0.08567491173744202,
+      "learning_rate": 0.00019505895198179912,
+      "loss": 0.7996,
+      "step": 1725
+    },
+    {
+      "epoch": 0.3879405896696963,
+      "grad_norm": 0.08828684687614441,
+      "learning_rate": 0.0001948715067831663,
+      "loss": 0.7394,
+      "step": 1750
+    },
+    {
+      "epoch": 0.39348259809354913,
+      "grad_norm": 0.08347714692354202,
+      "learning_rate": 0.00019468066546244117,
+      "loss": 0.7734,
+      "step": 1775
+    },
+    {
+      "epoch": 0.3990246065174019,
+      "grad_norm": 0.07736373692750931,
+      "learning_rate": 0.00019448643485118412,
+      "loss": 0.7134,
+      "step": 1800
+    },
+    {
+      "epoch": 0.40456661494125473,
+      "grad_norm": 0.0840897262096405,
+      "learning_rate": 0.00019428882190228216,
+      "loss": 0.787,
+      "step": 1825
+    },
+    {
+      "epoch": 0.4101086233651075,
+      "grad_norm": 0.08665871620178223,
+      "learning_rate": 0.0001940878336897001,
+      "loss": 0.7151,
+      "step": 1850
+    },
+    {
+      "epoch": 0.41565063178896033,
+      "grad_norm": 0.08358912914991379,
+      "learning_rate": 0.0001938834774082274,
+      "loss": 0.7982,
+      "step": 1875
+    },
+    {
+      "epoch": 0.4211926402128131,
+      "grad_norm": 0.07928963005542755,
+      "learning_rate": 0.0001936757603732203,
+      "loss": 0.7195,
+      "step": 1900
+    },
+    {
+      "epoch": 0.42673464863666594,
+      "grad_norm": 0.08886470645666122,
+      "learning_rate": 0.00019346469002034042,
+      "loss": 0.7762,
+      "step": 1925
+    },
+    {
+      "epoch": 0.4322766570605187,
+      "grad_norm": 0.1071886494755745,
+      "learning_rate": 0.00019325027390528822,
+      "loss": 0.7453,
+      "step": 1950
+    },
+    {
+      "epoch": 0.43781866548437154,
+      "grad_norm": 0.08474262803792953,
+      "learning_rate": 0.00019303251970353261,
+      "loss": 0.7839,
+      "step": 1975
+    },
+    {
+      "epoch": 0.44336067390822437,
+      "grad_norm": 0.08803894370794296,
+      "learning_rate": 0.0001928114352100363,
+      "loss": 0.7171,
+      "step": 2000
+    },
+    {
+      "epoch": 0.44890268233207714,
+      "grad_norm": 0.08429575711488724,
+      "learning_rate": 0.00019258702833897665,
+      "loss": 0.7781,
+      "step": 2025
+    },
+    {
+      "epoch": 0.45444469075592997,
+      "grad_norm": 0.08510231226682663,
+      "learning_rate": 0.00019235930712346248,
+      "loss": 0.6949,
+      "step": 2050
+    },
+    {
+      "epoch": 0.45998669917978274,
+      "grad_norm": 0.08167176693677902,
+      "learning_rate": 0.00019212827971524634,
+      "loss": 0.7722,
+      "step": 2075
+    },
+    {
+      "epoch": 0.46552870760363557,
+      "grad_norm": 0.06542418897151947,
+      "learning_rate": 0.00019189395438443278,
+      "loss": 0.7203,
+      "step": 2100
+    },
+    {
+      "epoch": 0.47107071602748835,
+      "grad_norm": 0.08293402194976807,
+      "learning_rate": 0.00019165633951918247,
+      "loss": 0.7735,
+      "step": 2125
+    },
+    {
+      "epoch": 0.4766127244513412,
+      "grad_norm": 0.0809284895658493,
+      "learning_rate": 0.00019141544362541162,
+      "loss": 0.7412,
+      "step": 2150
+    },
+    {
+      "epoch": 0.48215473287519395,
+      "grad_norm": 0.08212891221046448,
+      "learning_rate": 0.00019117127532648773,
+      "loss": 0.7629,
+      "step": 2175
+    },
+    {
+      "epoch": 0.4876967412990468,
+      "grad_norm": 0.08602219074964523,
+      "learning_rate": 0.0001909238433629208,
+      "loss": 0.6935,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4932387497228996,
+      "grad_norm": 0.08529417216777802,
+      "learning_rate": 0.0001906731565920505,
+      "loss": 0.7915,
+      "step": 2225
+    },
+    {
+      "epoch": 0.4987807581467524,
+      "grad_norm": 0.08774964511394501,
+      "learning_rate": 0.00019041922398772897,
+      "loss": 0.7359,
+      "step": 2250
+    },
+    {
+      "epoch": 0.5043227665706052,
+      "grad_norm": 0.08649475872516632,
+      "learning_rate": 0.00019016205463999984,
+      "loss": 0.7696,
+      "step": 2275
+    },
+    {
+      "epoch": 0.509864774994458,
+      "grad_norm": 0.0878506749868393,
+      "learning_rate": 0.00018990165775477252,
+      "loss": 0.7365,
+      "step": 2300
+    },
+    {
+      "epoch": 0.5154067834183108,
+      "grad_norm": 0.09131711721420288,
+      "learning_rate": 0.0001896380426534929,
+      "loss": 0.7809,
+      "step": 2325
+    },
+    {
+      "epoch": 0.5209487918421636,
+      "grad_norm": 0.07379825413227081,
+      "learning_rate": 0.00018937121877280957,
+      "loss": 0.7029,
+      "step": 2350
+    },
+    {
+      "epoch": 0.5264908002660164,
+      "grad_norm": 0.08535836637020111,
+      "learning_rate": 0.00018910119566423598,
+      "loss": 0.7679,
+      "step": 2375
+    },
+    {
+      "epoch": 0.5320328086898692,
+      "grad_norm": 0.06719771772623062,
+      "learning_rate": 0.00018882798299380864,
+      "loss": 0.7121,
+      "step": 2400
+    },
+    {
+      "epoch": 0.537574817113722,
+      "grad_norm": 0.09019796550273895,
+      "learning_rate": 0.00018855159054174093,
+      "loss": 0.7754,
+      "step": 2425
+    },
+    {
+      "epoch": 0.5431168255375748,
+      "grad_norm": 0.08144286274909973,
+      "learning_rate": 0.0001882720282020732,
+      "loss": 0.7255,
+      "step": 2450
+    },
+    {
+      "epoch": 0.5486588339614277,
+      "grad_norm": 0.08412271738052368,
+      "learning_rate": 0.0001879893059823185,
+      "loss": 0.7722,
+      "step": 2475
+    },
+    {
+      "epoch": 0.5542008423852804,
+      "grad_norm": 0.09016039222478867,
+      "learning_rate": 0.0001877034340031042,
+      "loss": 0.7275,
+      "step": 2500
+    },
+    {
+      "epoch": 0.5597428508091332,
+      "grad_norm": 0.08850298821926117,
+      "learning_rate": 0.00018741442249781,
+      "loss": 0.7828,
+      "step": 2525
+    },
+    {
+      "epoch": 0.565284859232986,
+      "grad_norm": 0.06989564746618271,
+      "learning_rate": 0.00018712228181220128,
+      "loss": 0.7111,
+      "step": 2550
+    },
+    {
+      "epoch": 0.5708268676568389,
+      "grad_norm": 0.09214618802070618,
+      "learning_rate": 0.00018682702240405906,
+      "loss": 0.7752,
+      "step": 2575
+    },
+    {
+      "epoch": 0.5763688760806917,
+      "grad_norm": 0.07766986638307571,
+      "learning_rate": 0.0001865286548428054,
+      "loss": 0.7108,
+      "step": 2600
+    },
+    {
+      "epoch": 0.5819108845045444,
+      "grad_norm": 0.07919591665267944,
+      "learning_rate": 0.00018622718980912514,
+      "loss": 0.775,
+      "step": 2625
+    },
+    {
+      "epoch": 0.5874528929283972,
+      "grad_norm": 0.07524783164262772,
+      "learning_rate": 0.00018592263809458361,
+      "loss": 0.6941,
+      "step": 2650
+    },
+    {
+      "epoch": 0.5929949013522501,
+      "grad_norm": 0.08549198508262634,
+      "learning_rate": 0.00018561501060124024,
+      "loss": 0.7718,
+      "step": 2675
+    },
+    {
+      "epoch": 0.5985369097761029,
+      "grad_norm": 0.08182788640260696,
+      "learning_rate": 0.0001853043183412584,
+      "loss": 0.7072,
+      "step": 2700
+    },
+    {
+      "epoch": 0.6040789181999556,
+      "grad_norm": 0.084741972386837,
+      "learning_rate": 0.00018499057243651096,
+      "loss": 0.7478,
+      "step": 2725
+    },
+    {
+      "epoch": 0.6096209266238085,
+      "grad_norm": 0.06824459880590439,
+      "learning_rate": 0.0001846737841181825,
+      "loss": 0.7238,
+      "step": 2750
+    },
+    {
+      "epoch": 0.6151629350476613,
+      "grad_norm": 0.08315033465623856,
+      "learning_rate": 0.00018435396472636704,
+      "loss": 0.7597,
+      "step": 2775
+    },
+    {
+      "epoch": 0.6207049434715141,
+      "grad_norm": 0.07116558402776718,
+      "learning_rate": 0.00018403112570966216,
+      "loss": 0.7096,
+      "step": 2800
+    },
+    {
+      "epoch": 0.6262469518953668,
+      "grad_norm": 0.08500215411186218,
+      "learning_rate": 0.00018370527862475916,
+      "loss": 0.756,
+      "step": 2825
+    },
+    {
+      "epoch": 0.6317889603192197,
+      "grad_norm": 0.07979004830121994,
+      "learning_rate": 0.00018337643513602933,
+      "loss": 0.6886,
+      "step": 2850
+    },
+    {
+      "epoch": 0.6373309687430725,
+      "grad_norm": 0.08140358328819275,
+      "learning_rate": 0.00018304460701510652,
+      "loss": 0.7648,
+      "step": 2875
+    },
+    {
+      "epoch": 0.6428729771669253,
+      "grad_norm": 0.07779423147439957,
+      "learning_rate": 0.0001827098061404656,
+      "loss": 0.7222,
+      "step": 2900
+    },
+    {
+      "epoch": 0.6484149855907781,
+      "grad_norm": 0.08853591978549957,
+      "learning_rate": 0.0001823720444969974,
+      "loss": 0.7736,
+      "step": 2925
+    },
+    {
+      "epoch": 0.6539569940146309,
+      "grad_norm": 0.07350102066993713,
+      "learning_rate": 0.0001820313341755795,
+      "loss": 0.7256,
+      "step": 2950
+    },
+    {
+      "epoch": 0.6594990024384837,
+      "grad_norm": 0.08152145147323608,
+      "learning_rate": 0.0001816876873726436,
+      "loss": 0.7598,
+      "step": 2975
+    },
+    {
+      "epoch": 0.6650410108623365,
+      "grad_norm": 0.08045897632837296,
+      "learning_rate": 0.00018134111638973876,
+      "loss": 0.7275,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6705830192861894,
+      "grad_norm": 0.08514434099197388,
+      "learning_rate": 0.00018099163363309123,
+      "loss": 0.7688,
+      "step": 3025
+    },
+    {
+      "epoch": 0.6761250277100421,
+      "grad_norm": 0.060850344598293304,
+      "learning_rate": 0.00018063925161316012,
+      "loss": 0.7019,
+      "step": 3050
+    },
+    {
+      "epoch": 0.6816670361338949,
+      "grad_norm": 0.08471492677927017,
+      "learning_rate": 0.00018028398294418977,
+      "loss": 0.7573,
+      "step": 3075
+    },
+    {
+      "epoch": 0.6872090445577477,
+      "grad_norm": 0.0642291009426117,
+      "learning_rate": 0.00017992584034375798,
+      "loss": 0.7108,
+      "step": 3100
+    },
+    {
+      "epoch": 0.6927510529816006,
+      "grad_norm": 0.09357668459415436,
+      "learning_rate": 0.000179564836632321,
+      "loss": 0.7478,
+      "step": 3125
+    },
+    {
+      "epoch": 0.6982930614054533,
+      "grad_norm": 0.07198700308799744,
+      "learning_rate": 0.00017920098473275445,
+      "loss": 0.6973,
+      "step": 3150
+    },
+    {
+      "epoch": 0.7038350698293061,
+      "grad_norm": 0.08420095592737198,
+      "learning_rate": 0.00017883429766989064,
+      "loss": 0.7487,
+      "step": 3175
+    },
+    {
+      "epoch": 0.709377078253159,
+      "grad_norm": 0.06639819592237473,
+      "learning_rate": 0.00017846478857005255,
+      "loss": 0.6741,
+      "step": 3200
+    },
+    {
+      "epoch": 0.7149190866770118,
+      "grad_norm": 0.08200914412736893,
+      "learning_rate": 0.00017809247066058378,
+      "loss": 0.7526,
+      "step": 3225
+    },
+    {
+      "epoch": 0.7204610951008645,
+      "grad_norm": 0.07311141490936279,
+      "learning_rate": 0.0001777173572693751,
+      "loss": 0.677,
+      "step": 3250
+    },
+    {
+      "epoch": 0.7260031035247173,
+      "grad_norm": 0.08722089231014252,
+      "learning_rate": 0.00017733946182438726,
+      "loss": 0.7585,
+      "step": 3275
+    },
+    {
+      "epoch": 0.7315451119485702,
+      "grad_norm": 0.06589449942111969,
+      "learning_rate": 0.00017695879785317048,
+      "loss": 0.708,
+      "step": 3300
+    },
+    {
+      "epoch": 0.737087120372423,
+      "grad_norm": 0.08262074738740921,
+      "learning_rate": 0.0001765753789823801,
+      "loss": 0.749,
+      "step": 3325
+    },
+    {
+      "epoch": 0.7426291287962757,
+      "grad_norm": 0.07514823973178864,
+      "learning_rate": 0.00017618921893728867,
+      "loss": 0.6918,
+      "step": 3350
+    },
+    {
+      "epoch": 0.7481711372201286,
+      "grad_norm": 0.08757175505161285,
+      "learning_rate": 0.00017580033154129503,
+      "loss": 0.7445,
+      "step": 3375
+    },
+    {
+      "epoch": 0.7537131456439814,
+      "grad_norm": 0.0716458335518837,
+      "learning_rate": 0.0001754087307154289,
+      "loss": 0.7122,
+      "step": 3400
+    },
+    {
+      "epoch": 0.7592551540678342,
+      "grad_norm": 0.08453212678432465,
+      "learning_rate": 0.00017501443047785296,
+      "loss": 0.7656,
+      "step": 3425
+    },
+    {
+      "epoch": 0.7647971624916869,
+      "grad_norm": 0.06761575490236282,
+      "learning_rate": 0.00017461744494336098,
+      "loss": 0.6673,
+      "step": 3450
+    },
+    {
+      "epoch": 0.7703391709155398,
+      "grad_norm": 0.08577297627925873,
+      "learning_rate": 0.0001742177883228724,
+      "loss": 0.7494,
+      "step": 3475
+    },
+    {
+      "epoch": 0.7758811793393926,
+      "grad_norm": 0.05691730976104736,
+      "learning_rate": 0.00017381547492292376,
+      "loss": 0.6972,
+      "step": 3500
+    },
+    {
+      "epoch": 0.7814231877632454,
+      "grad_norm": 0.09115194529294968,
+      "learning_rate": 0.00017341051914515656,
+      "loss": 0.7706,
+      "step": 3525
+    },
+    {
+      "epoch": 0.7869651961870983,
+      "grad_norm": 0.07214304804801941,
+      "learning_rate": 0.00017300293548580162,
+      "loss": 0.6807,
+      "step": 3550
+    },
+    {
+      "epoch": 0.792507204610951,
+      "grad_norm": 0.08448139578104019,
+      "learning_rate": 0.00017259273853516028,
+      "loss": 0.7661,
+      "step": 3575
+    },
+    {
+      "epoch": 0.7980492130348038,
+      "grad_norm": 0.08282499015331268,
+      "learning_rate": 0.00017217994297708195,
+      "loss": 0.7391,
+      "step": 3600
+    },
+    {
+      "epoch": 0.8035912214586566,
+      "grad_norm": 0.0804004818201065,
+      "learning_rate": 0.00017176456358843875,
+      "loss": 0.7402,
+      "step": 3625
+    },
+    {
+      "epoch": 0.8091332298825095,
+      "grad_norm": 0.07265755534172058,
+      "learning_rate": 0.00017134661523859622,
+      "loss": 0.7019,
+      "step": 3650
+    },
+    {
+      "epoch": 0.8146752383063622,
+      "grad_norm": 0.08803457766771317,
+      "learning_rate": 0.00017092611288888125,
+      "loss": 0.7572,
+      "step": 3675
+    },
+    {
+      "epoch": 0.820217246730215,
+      "grad_norm": 0.0652441680431366,
+      "learning_rate": 0.0001705030715920464,
+      "loss": 0.706,
+      "step": 3700
+    },
+    {
+      "epoch": 0.8257592551540678,
+      "grad_norm": 0.08185753971338272,
+      "learning_rate": 0.0001700775064917312,
+      "loss": 0.764,
+      "step": 3725
+    },
+    {
+      "epoch": 0.8313012635779207,
+      "grad_norm": 0.0859500914812088,
+      "learning_rate": 0.00016964943282191984,
+      "loss": 0.6927,
+      "step": 3750
+    },
+    {
+      "epoch": 0.8368432720017734,
+      "grad_norm": 0.09176376461982727,
+      "learning_rate": 0.00016921886590639602,
+      "loss": 0.7567,
+      "step": 3775
+    },
+    {
+      "epoch": 0.8423852804256262,
+      "grad_norm": 0.0646485984325409,
+      "learning_rate": 0.0001687858211581943,
+      "loss": 0.6848,
+      "step": 3800
+    },
+    {
+      "epoch": 0.8479272888494791,
+      "grad_norm": 0.08545655012130737,
+      "learning_rate": 0.00016835031407904839,
+      "loss": 0.7546,
+      "step": 3825
+    },
+    {
+      "epoch": 0.8534692972733319,
+      "grad_norm": 0.06338818371295929,
+      "learning_rate": 0.00016791236025883626,
+      "loss": 0.6655,
+      "step": 3850
+    },
+    {
+      "epoch": 0.8590113056971846,
+      "grad_norm": 0.08781229704618454,
+      "learning_rate": 0.00016747197537502205,
+      "loss": 0.7441,
+      "step": 3875
+    },
+    {
+      "epoch": 0.8645533141210374,
+      "grad_norm": 0.06220358610153198,
+      "learning_rate": 0.00016702917519209487,
+      "loss": 0.6795,
+      "step": 3900
+    },
+    {
+      "epoch": 0.8700953225448903,
+      "grad_norm": 0.08917712420225143,
+      "learning_rate": 0.0001665839755610044,
+      "loss": 0.7552,
+      "step": 3925
+    },
+    {
+      "epoch": 0.8756373309687431,
+      "grad_norm": 0.06624036282300949,
+      "learning_rate": 0.00016613639241859355,
+      "loss": 0.6632,
+      "step": 3950
+    },
+    {
+      "epoch": 0.8811793393925959,
+      "grad_norm": 0.08898719400167465,
+      "learning_rate": 0.00016568644178702803,
+      "loss": 0.757,
+      "step": 3975
+    },
+    {
+      "epoch": 0.8867213478164487,
+      "grad_norm": 0.05095354840159416,
+      "learning_rate": 0.0001652341397732227,
+      "loss": 0.6992,
+      "step": 4000
+    },
+    {
+      "epoch": 0.8922633562403015,
+      "grad_norm": 0.08842916786670685,
+      "learning_rate": 0.0001647795025682649,
+      "loss": 0.7504,
+      "step": 4025
+    },
+    {
+      "epoch": 0.8978053646641543,
+      "grad_norm": 0.0758206844329834,
+      "learning_rate": 0.00016432254644683516,
+      "loss": 0.7081,
+      "step": 4050
+    },
+    {
+      "epoch": 0.903347373088007,
+      "grad_norm": 0.0940496176481247,
+      "learning_rate": 0.0001638632877666243,
+      "loss": 0.746,
+      "step": 4075
+    },
+    {
+      "epoch": 0.9088893815118599,
+      "grad_norm": 0.06626766920089722,
+      "learning_rate": 0.00016340174296774804,
+      "loss": 0.6647,
+      "step": 4100
+    },
+    {
+      "epoch": 0.9144313899357127,
+      "grad_norm": 0.08919317275285721,
+      "learning_rate": 0.00016293792857215844,
+      "loss": 0.7516,
+      "step": 4125
+    },
+    {
+      "epoch": 0.9199733983595655,
+      "grad_norm": 0.06990760564804077,
+      "learning_rate": 0.00016247186118305252,
+      "loss": 0.7011,
+      "step": 4150
+    },
+    {
+      "epoch": 0.9255154067834183,
+      "grad_norm": 0.0870794802904129,
+      "learning_rate": 0.00016200355748427782,
+      "loss": 0.7529,
+      "step": 4175
+    },
+    {
+      "epoch": 0.9310574152072711,
+      "grad_norm": 0.06882854551076889,
+      "learning_rate": 0.00016153303423973526,
+      "loss": 0.7005,
+      "step": 4200
+    },
+    {
+      "epoch": 0.9365994236311239,
+      "grad_norm": 0.084992416203022,
+      "learning_rate": 0.0001610603082927789,
+      "loss": 0.7519,
+      "step": 4225
+    },
+    {
+      "epoch": 0.9421414320549767,
+      "grad_norm": 0.0638299211859703,
+      "learning_rate": 0.00016058539656561323,
+      "loss": 0.716,
+      "step": 4250
+    },
+    {
+      "epoch": 0.9476834404788296,
+      "grad_norm": 0.08899606764316559,
+      "learning_rate": 0.00016010831605868715,
+      "loss": 0.7257,
+      "step": 4275
+    },
+    {
+      "epoch": 0.9532254489026823,
+      "grad_norm": 0.06550378352403641,
+      "learning_rate": 0.00015962908385008565,
+      "loss": 0.7174,
+      "step": 4300
+    },
+    {
+      "epoch": 0.9587674573265351,
+      "grad_norm": 0.09001540392637253,
+      "learning_rate": 0.00015914771709491828,
+      "loss": 0.7271,
+      "step": 4325
+    },
+    {
+      "epoch": 0.9643094657503879,
+      "grad_norm": 0.06641615182161331,
+      "learning_rate": 0.000158664233024705,
+      "loss": 0.69,
+      "step": 4350
+    },
+    {
+      "epoch": 0.9698514741742408,
+      "grad_norm": 0.08917039632797241,
+      "learning_rate": 0.0001581786489467596,
+      "loss": 0.7483,
+      "step": 4375
+    },
+    {
+      "epoch": 0.9753934825980936,
+      "grad_norm": 0.05995697155594826,
+      "learning_rate": 0.00015769098224356992,
+      "loss": 0.7033,
+      "step": 4400
+    },
+    {
+      "epoch": 0.9809354910219463,
+      "grad_norm": 0.08998765051364899,
+      "learning_rate": 0.00015720125037217572,
+      "loss": 0.7462,
+      "step": 4425
+    },
+    {
+      "epoch": 0.9864774994457992,
+      "grad_norm": 0.05868702754378319,
+      "learning_rate": 0.00015670947086354376,
+      "loss": 0.6654,
+      "step": 4450
+    },
+    {
+      "epoch": 0.992019507869652,
+      "grad_norm": 0.0880926102399826,
+      "learning_rate": 0.00015621566132194005,
+      "loss": 0.752,
+      "step": 4475
+    },
+    {
+      "epoch": 0.9975615162935048,
+      "grad_norm": 0.08538970351219177,
+      "learning_rate": 0.00015571983942430005,
+      "loss": 0.7338,
+      "step": 4500
+    },
+    {
+      "epoch": 1.0031035247173576,
+      "grad_norm": 0.0827050730586052,
+      "learning_rate": 0.0001552220229195956,
+      "loss": 0.7174,
+      "step": 4525
+    },
+    {
+      "epoch": 1.0086455331412103,
+      "grad_norm": 0.10867294669151306,
+      "learning_rate": 0.00015472222962819955,
+      "loss": 0.7637,
+      "step": 4550
+    },
+    {
+      "epoch": 1.0141875415650632,
+      "grad_norm": 0.08738269656896591,
+      "learning_rate": 0.00015422047744124802,
+      "loss": 0.6247,
+      "step": 4575
+    },
+    {
+      "epoch": 1.019729549988916,
+      "grad_norm": 0.12865987420082092,
+      "learning_rate": 0.0001537167843199998,
+      "loss": 0.7424,
+      "step": 4600
+    },
+    {
+      "epoch": 1.0252715584127687,
+      "grad_norm": 0.08619695156812668,
+      "learning_rate": 0.00015321116829519345,
+      "loss": 0.6461,
+      "step": 4625
+    },
+    {
+      "epoch": 1.0308135668366216,
+      "grad_norm": 0.11726492643356323,
+      "learning_rate": 0.0001527036474664019,
+      "loss": 0.7433,
+      "step": 4650
+    },
+    {
+      "epoch": 1.0363555752604743,
+      "grad_norm": 0.08198727667331696,
+      "learning_rate": 0.0001521942400013844,
+      "loss": 0.6086,
+      "step": 4675
+    },
+    {
+      "epoch": 1.0418975836843272,
+      "grad_norm": 0.11951526254415512,
+      "learning_rate": 0.00015168296413543635,
+      "loss": 0.7521,
+      "step": 4700
+    },
+    {
+      "epoch": 1.04743959210818,
+      "grad_norm": 0.08714735507965088,
+      "learning_rate": 0.0001511698381707363,
+      "loss": 0.631,
+      "step": 4725
+    },
+    {
+      "epoch": 1.0529816005320327,
+      "grad_norm": 0.13869455456733704,
+      "learning_rate": 0.00015065488047569107,
+      "loss": 0.7524,
+      "step": 4750
+    },
+    {
+      "epoch": 1.0585236089558856,
+      "grad_norm": 0.08524268865585327,
+      "learning_rate": 0.00015013810948427794,
+      "loss": 0.6617,
+      "step": 4775
+    },
+    {
+      "epoch": 1.0640656173797385,
+      "grad_norm": 0.11017199605703354,
+      "learning_rate": 0.00014961954369538494,
+      "loss": 0.7598,
+      "step": 4800
+    },
+    {
+      "epoch": 1.0696076258035911,
+      "grad_norm": 0.0834374874830246,
+      "learning_rate": 0.00014909920167214858,
+      "loss": 0.627,
+      "step": 4825
+    },
+    {
+      "epoch": 1.075149634227444,
+      "grad_norm": 0.1357167363166809,
+      "learning_rate": 0.0001485771020412894,
+      "loss": 0.7466,
+      "step": 4850
+    },
+    {
+      "epoch": 1.080691642651297,
+      "grad_norm": 0.08910629153251648,
+      "learning_rate": 0.00014805326349244503,
+      "loss": 0.6238,
+      "step": 4875
+    },
+    {
+      "epoch": 1.0862336510751496,
+      "grad_norm": 0.10706546157598495,
+      "learning_rate": 0.00014752770477750144,
+      "loss": 0.7533,
+      "step": 4900
+    },
+    {
+      "epoch": 1.0917756594990025,
+      "grad_norm": 0.09201759845018387,
+      "learning_rate": 0.00014700044470992136,
+      "loss": 0.6521,
+      "step": 4925
+    },
+    {
+      "epoch": 1.0973176679228553,
+      "grad_norm": 0.14048361778259277,
+      "learning_rate": 0.00014647150216407106,
+      "loss": 0.7412,
+      "step": 4950
+    },
+    {
+      "epoch": 1.102859676346708,
+      "grad_norm": 0.08308299630880356,
+      "learning_rate": 0.00014594089607454454,
+      "loss": 0.6333,
+      "step": 4975
+    },
+    {
+      "epoch": 1.108401684770561,
+      "grad_norm": 0.12057497352361679,
+      "learning_rate": 0.00014540864543548582,
+      "loss": 0.7538,
+      "step": 5000
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 13533,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.4028324505774285e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}