wmaousley committed 22 days ago

Commit

dec813c

verified ·

1 Parent(s): 0f0df11

Upload folder using huggingface_hub

Browse files

Files changed (30) hide show

.gitattributes +1 -0
adapter_config.json +31 -0
adapter_model.safetensors +3 -0
checkpoint-40000/README.md +202 -0
checkpoint-40000/adapter_config.json +31 -0
checkpoint-40000/adapter_model.safetensors +3 -0
checkpoint-40000/optimizer.pt +3 -0
checkpoint-40000/rng_state.pth +3 -0
checkpoint-40000/scheduler.pt +3 -0
checkpoint-40000/trainer_state.json +3153 -0
checkpoint-40000/training_args.bin +3 -0
checkpoint-41000/adapter_config.json +31 -0
checkpoint-41000/adapter_model.safetensors +3 -0
checkpoint-41000/optimizer.pt +3 -0
checkpoint-41000/scheduler.pt +3 -0
checkpoint-41000/trainer_state.json +3231 -0
checkpoint-41000/training_args.bin +3 -0
checkpoint-41136/README.md +202 -0
checkpoint-41136/adapter_config.json +31 -0
checkpoint-41136/adapter_model.safetensors +3 -0
checkpoint-41136/optimizer.pt +3 -0
checkpoint-41136/rng_state.pth +3 -0
checkpoint-41136/scheduler.pt +3 -0
checkpoint-41136/trainer_state.json +3238 -0
checkpoint-41136/training_args.bin +3 -0
merges.txt +0 -0
special_tokens_map.json +14 -0
tokenizer.json +3 -0
tokenizer_config.json +43 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-0.5B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79d856104b781021c43fad86f0478030885797265c8d4fffb66447b5b720f4a7
+size 8676008

checkpoint-40000/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-0.5B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

checkpoint-40000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-0.5B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-40000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:acd9947a02ce4df1a5163b8a15fb20d09a8607496e30e80d355eff380d1c5318
+size 8676008

checkpoint-40000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9282d1426027801df62d80ab2f99ad7ad22f1962924811bd5beb4ccc57f70ffa
+size 17463051

checkpoint-40000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:564569ea28938ad2cee9368c34eacaf7d2105aa7bf36e9bf0b4711f73c4711d7
+size 14645

checkpoint-40000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d7e2760ee3e1c81b83ba7a0a9ee1ed6fc636d614401727fe15fe3af580dbab5c
+size 1465

checkpoint-40000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3153 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.917066233979326,
+  "eval_steps": 1000,
+  "global_step": 40000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007292483272866493,
+      "grad_norm": 2.1235318183898926,
+      "learning_rate": 4e-05,
+      "loss": 2.7429,
+      "step": 100
+    },
+    {
+      "epoch": 0.014584966545732986,
+      "grad_norm": 1.9533482789993286,
+      "learning_rate": 8e-05,
+      "loss": 1.4786,
+      "step": 200
+    },
+    {
+      "epoch": 0.02187744981859948,
+      "grad_norm": 1.5908012390136719,
+      "learning_rate": 0.00012,
+      "loss": 1.252,
+      "step": 300
+    },
+    {
+      "epoch": 0.029169933091465972,
+      "grad_norm": 1.592781662940979,
+      "learning_rate": 0.00016,
+      "loss": 1.1674,
+      "step": 400
+    },
+    {
+      "epoch": 0.036462416364332464,
+      "grad_norm": 1.4071415662765503,
+      "learning_rate": 0.0002,
+      "loss": 1.101,
+      "step": 500
+    },
+    {
+      "epoch": 0.04375489963719896,
+      "grad_norm": 1.4228886365890503,
+      "learning_rate": 0.0001995078255733832,
+      "loss": 1.0487,
+      "step": 600
+    },
+    {
+      "epoch": 0.05104738291006545,
+      "grad_norm": 1.2705847024917603,
+      "learning_rate": 0.00019901565114676642,
+      "loss": 1.0119,
+      "step": 700
+    },
+    {
+      "epoch": 0.058339866182931945,
+      "grad_norm": 1.1770137548446655,
+      "learning_rate": 0.00019852347672014964,
+      "loss": 0.9906,
+      "step": 800
+    },
+    {
+      "epoch": 0.06563234945579843,
+      "grad_norm": 1.1681164503097534,
+      "learning_rate": 0.00019803130229353283,
+      "loss": 0.9645,
+      "step": 900
+    },
+    {
+      "epoch": 0.07292483272866493,
+      "grad_norm": 1.020504117012024,
+      "learning_rate": 0.00019753912786691605,
+      "loss": 0.9525,
+      "step": 1000
+    },
+    {
+      "epoch": 0.07292483272866493,
+      "eval_loss": 0.9407642483711243,
+      "eval_runtime": 61.0906,
+      "eval_samples_per_second": 146.586,
+      "eval_steps_per_second": 18.333,
+      "step": 1000
+    },
+    {
+      "epoch": 0.08021731600153142,
+      "grad_norm": 1.079444408416748,
+      "learning_rate": 0.00019704695344029924,
+      "loss": 0.9414,
+      "step": 1100
+    },
+    {
+      "epoch": 0.08750979927439792,
+      "grad_norm": 1.057377576828003,
+      "learning_rate": 0.00019655477901368246,
+      "loss": 0.9231,
+      "step": 1200
+    },
+    {
+      "epoch": 0.0948022825472644,
+      "grad_norm": 1.068018913269043,
+      "learning_rate": 0.00019606260458706568,
+      "loss": 0.9168,
+      "step": 1300
+    },
+    {
+      "epoch": 0.1020947658201309,
+      "grad_norm": 0.9460920095443726,
+      "learning_rate": 0.00019557043016044887,
+      "loss": 0.9031,
+      "step": 1400
+    },
+    {
+      "epoch": 0.1093872490929974,
+      "grad_norm": 1.056226134300232,
+      "learning_rate": 0.00019507825573383206,
+      "loss": 0.8901,
+      "step": 1500
+    },
+    {
+      "epoch": 0.11667973236586389,
+      "grad_norm": 1.0429835319519043,
+      "learning_rate": 0.00019458608130721528,
+      "loss": 0.8928,
+      "step": 1600
+    },
+    {
+      "epoch": 0.12397221563873038,
+      "grad_norm": 1.050790548324585,
+      "learning_rate": 0.0001940939068805985,
+      "loss": 0.8803,
+      "step": 1700
+    },
+    {
+      "epoch": 0.13126469891159687,
+      "grad_norm": 0.9586555361747742,
+      "learning_rate": 0.0001936017324539817,
+      "loss": 0.8809,
+      "step": 1800
+    },
+    {
+      "epoch": 0.13855718218446336,
+      "grad_norm": 0.985379159450531,
+      "learning_rate": 0.00019310955802736491,
+      "loss": 0.8743,
+      "step": 1900
+    },
+    {
+      "epoch": 0.14584966545732986,
+      "grad_norm": 0.9307010769844055,
+      "learning_rate": 0.00019261738360074813,
+      "loss": 0.8727,
+      "step": 2000
+    },
+    {
+      "epoch": 0.14584966545732986,
+      "eval_loss": 0.86456698179245,
+      "eval_runtime": 60.6283,
+      "eval_samples_per_second": 147.703,
+      "eval_steps_per_second": 18.473,
+      "step": 2000
+    },
+    {
+      "epoch": 0.15314214873019635,
+      "grad_norm": 1.0384063720703125,
+      "learning_rate": 0.00019212520917413133,
+      "loss": 0.8742,
+      "step": 2100
+    },
+    {
+      "epoch": 0.16043463200306285,
+      "grad_norm": 0.9662402868270874,
+      "learning_rate": 0.00019163303474751452,
+      "loss": 0.8661,
+      "step": 2200
+    },
+    {
+      "epoch": 0.16772711527592934,
+      "grad_norm": 0.9773098230361938,
+      "learning_rate": 0.00019114086032089774,
+      "loss": 0.8576,
+      "step": 2300
+    },
+    {
+      "epoch": 0.17501959854879584,
+      "grad_norm": 0.9672012329101562,
+      "learning_rate": 0.00019064868589428093,
+      "loss": 0.8595,
+      "step": 2400
+    },
+    {
+      "epoch": 0.1823120818216623,
+      "grad_norm": 0.9758124351501465,
+      "learning_rate": 0.00019015651146766415,
+      "loss": 0.8524,
+      "step": 2500
+    },
+    {
+      "epoch": 0.1896045650945288,
+      "grad_norm": 0.972232460975647,
+      "learning_rate": 0.00018966433704104737,
+      "loss": 0.8468,
+      "step": 2600
+    },
+    {
+      "epoch": 0.1968970483673953,
+      "grad_norm": 0.9417553544044495,
+      "learning_rate": 0.00018917216261443056,
+      "loss": 0.8412,
+      "step": 2700
+    },
+    {
+      "epoch": 0.2041895316402618,
+      "grad_norm": 0.9395071864128113,
+      "learning_rate": 0.00018867998818781375,
+      "loss": 0.8413,
+      "step": 2800
+    },
+    {
+      "epoch": 0.2114820149131283,
+      "grad_norm": 0.9951208233833313,
+      "learning_rate": 0.000188187813761197,
+      "loss": 0.8345,
+      "step": 2900
+    },
+    {
+      "epoch": 0.2187744981859948,
+      "grad_norm": 0.9656242728233337,
+      "learning_rate": 0.0001876956393345802,
+      "loss": 0.8317,
+      "step": 3000
+    },
+    {
+      "epoch": 0.2187744981859948,
+      "eval_loss": 0.8318613767623901,
+      "eval_runtime": 61.1356,
+      "eval_samples_per_second": 146.478,
+      "eval_steps_per_second": 18.32,
+      "step": 3000
+    },
+    {
+      "epoch": 0.22606698145886128,
+      "grad_norm": 0.8810185194015503,
+      "learning_rate": 0.00018720346490796338,
+      "loss": 0.8321,
+      "step": 3100
+    },
+    {
+      "epoch": 0.23335946473172778,
+      "grad_norm": 0.9199262857437134,
+      "learning_rate": 0.0001867112904813466,
+      "loss": 0.8406,
+      "step": 3200
+    },
+    {
+      "epoch": 0.24065194800459427,
+      "grad_norm": 0.9557051658630371,
+      "learning_rate": 0.00018621911605472982,
+      "loss": 0.8277,
+      "step": 3300
+    },
+    {
+      "epoch": 0.24794443127746077,
+      "grad_norm": 0.9777804017066956,
+      "learning_rate": 0.000185726941628113,
+      "loss": 0.8272,
+      "step": 3400
+    },
+    {
+      "epoch": 0.25523691455032727,
+      "grad_norm": 0.8856322169303894,
+      "learning_rate": 0.00018523476720149623,
+      "loss": 0.8256,
+      "step": 3500
+    },
+    {
+      "epoch": 0.26252939782319373,
+      "grad_norm": 0.9196017980575562,
+      "learning_rate": 0.00018474259277487942,
+      "loss": 0.8234,
+      "step": 3600
+    },
+    {
+      "epoch": 0.26982188109606026,
+      "grad_norm": 0.9568464159965515,
+      "learning_rate": 0.00018425041834826264,
+      "loss": 0.8193,
+      "step": 3700
+    },
+    {
+      "epoch": 0.2771143643689267,
+      "grad_norm": 0.9552770256996155,
+      "learning_rate": 0.00018375824392164583,
+      "loss": 0.8179,
+      "step": 3800
+    },
+    {
+      "epoch": 0.28440684764179325,
+      "grad_norm": 0.8997077345848083,
+      "learning_rate": 0.00018326606949502905,
+      "loss": 0.8138,
+      "step": 3900
+    },
+    {
+      "epoch": 0.2916993309146597,
+      "grad_norm": 0.8896480202674866,
+      "learning_rate": 0.00018277389506841224,
+      "loss": 0.8172,
+      "step": 4000
+    },
+    {
+      "epoch": 0.2916993309146597,
+      "eval_loss": 0.8123040199279785,
+      "eval_runtime": 60.7914,
+      "eval_samples_per_second": 147.307,
+      "eval_steps_per_second": 18.424,
+      "step": 4000
+    },
+    {
+      "epoch": 0.2989918141875262,
+      "grad_norm": 0.9520764350891113,
+      "learning_rate": 0.00018228172064179546,
+      "loss": 0.8183,
+      "step": 4100
+    },
+    {
+      "epoch": 0.3062842974603927,
+      "grad_norm": 0.9373065233230591,
+      "learning_rate": 0.00018178954621517868,
+      "loss": 0.8132,
+      "step": 4200
+    },
+    {
+      "epoch": 0.3135767807332592,
+      "grad_norm": 0.8733066916465759,
+      "learning_rate": 0.00018129737178856187,
+      "loss": 0.811,
+      "step": 4300
+    },
+    {
+      "epoch": 0.3208692640061257,
+      "grad_norm": 0.8866516351699829,
+      "learning_rate": 0.00018080519736194507,
+      "loss": 0.8093,
+      "step": 4400
+    },
+    {
+      "epoch": 0.32816174727899217,
+      "grad_norm": 0.9394953846931458,
+      "learning_rate": 0.00018031302293532828,
+      "loss": 0.8035,
+      "step": 4500
+    },
+    {
+      "epoch": 0.3354542305518587,
+      "grad_norm": 0.9133720993995667,
+      "learning_rate": 0.0001798208485087115,
+      "loss": 0.8054,
+      "step": 4600
+    },
+    {
+      "epoch": 0.34274671382472516,
+      "grad_norm": 0.9428606629371643,
+      "learning_rate": 0.0001793286740820947,
+      "loss": 0.8076,
+      "step": 4700
+    },
+    {
+      "epoch": 0.3500391970975917,
+      "grad_norm": 0.8996593356132507,
+      "learning_rate": 0.00017883649965547792,
+      "loss": 0.812,
+      "step": 4800
+    },
+    {
+      "epoch": 0.35733168037045815,
+      "grad_norm": 0.9113749265670776,
+      "learning_rate": 0.0001783443252288611,
+      "loss": 0.8048,
+      "step": 4900
+    },
+    {
+      "epoch": 0.3646241636433246,
+      "grad_norm": 0.9185646176338196,
+      "learning_rate": 0.00017785215080224433,
+      "loss": 0.8023,
+      "step": 5000
+    },
+    {
+      "epoch": 0.3646241636433246,
+      "eval_loss": 0.7973803877830505,
+      "eval_runtime": 60.8068,
+      "eval_samples_per_second": 147.27,
+      "eval_steps_per_second": 18.419,
+      "step": 5000
+    },
+    {
+      "epoch": 0.37191664691619114,
+      "grad_norm": 0.8994658589363098,
+      "learning_rate": 0.00017735997637562755,
+      "loss": 0.8089,
+      "step": 5100
+    },
+    {
+      "epoch": 0.3792091301890576,
+      "grad_norm": 0.8724523782730103,
+      "learning_rate": 0.00017686780194901074,
+      "loss": 0.8015,
+      "step": 5200
+    },
+    {
+      "epoch": 0.38650161346192413,
+      "grad_norm": 0.8285540342330933,
+      "learning_rate": 0.00017637562752239393,
+      "loss": 0.7944,
+      "step": 5300
+    },
+    {
+      "epoch": 0.3937940967347906,
+      "grad_norm": 0.8982509970664978,
+      "learning_rate": 0.00017588345309577718,
+      "loss": 0.7952,
+      "step": 5400
+    },
+    {
+      "epoch": 0.4010865800076571,
+      "grad_norm": 0.9266172051429749,
+      "learning_rate": 0.00017539127866916037,
+      "loss": 0.7978,
+      "step": 5500
+    },
+    {
+      "epoch": 0.4083790632805236,
+      "grad_norm": 0.901662290096283,
+      "learning_rate": 0.00017489910424254356,
+      "loss": 0.7966,
+      "step": 5600
+    },
+    {
+      "epoch": 0.4156715465533901,
+      "grad_norm": 0.9309051036834717,
+      "learning_rate": 0.00017440692981592678,
+      "loss": 0.7975,
+      "step": 5700
+    },
+    {
+      "epoch": 0.4229640298262566,
+      "grad_norm": 0.8789328336715698,
+      "learning_rate": 0.00017391475538930997,
+      "loss": 0.7997,
+      "step": 5800
+    },
+    {
+      "epoch": 0.4302565130991231,
+      "grad_norm": 0.8636139035224915,
+      "learning_rate": 0.0001734225809626932,
+      "loss": 0.7914,
+      "step": 5900
+    },
+    {
+      "epoch": 0.4375489963719896,
+      "grad_norm": 0.9468287229537964,
+      "learning_rate": 0.00017293040653607638,
+      "loss": 0.7859,
+      "step": 6000
+    },
+    {
+      "epoch": 0.4375489963719896,
+      "eval_loss": 0.7869976162910461,
+      "eval_runtime": 60.7741,
+      "eval_samples_per_second": 147.349,
+      "eval_steps_per_second": 18.429,
+      "step": 6000
+    },
+    {
+      "epoch": 0.44484147964485604,
+      "grad_norm": 0.867158055305481,
+      "learning_rate": 0.0001724382321094596,
+      "loss": 0.7924,
+      "step": 6100
+    },
+    {
+      "epoch": 0.45213396291772256,
+      "grad_norm": 0.9379836320877075,
+      "learning_rate": 0.0001719460576828428,
+      "loss": 0.7902,
+      "step": 6200
+    },
+    {
+      "epoch": 0.45942644619058903,
+      "grad_norm": 0.8591951727867126,
+      "learning_rate": 0.000171453883256226,
+      "loss": 0.7926,
+      "step": 6300
+    },
+    {
+      "epoch": 0.46671892946345556,
+      "grad_norm": 0.9702317118644714,
+      "learning_rate": 0.00017096170882960923,
+      "loss": 0.7867,
+      "step": 6400
+    },
+    {
+      "epoch": 0.474011412736322,
+      "grad_norm": 0.902302086353302,
+      "learning_rate": 0.00017046953440299242,
+      "loss": 0.7897,
+      "step": 6500
+    },
+    {
+      "epoch": 0.48130389600918855,
+      "grad_norm": 0.889926552772522,
+      "learning_rate": 0.00016997735997637561,
+      "loss": 0.7857,
+      "step": 6600
+    },
+    {
+      "epoch": 0.488596379282055,
+      "grad_norm": 0.8906420469284058,
+      "learning_rate": 0.00016948518554975886,
+      "loss": 0.7878,
+      "step": 6700
+    },
+    {
+      "epoch": 0.49588886255492154,
+      "grad_norm": 0.919983983039856,
+      "learning_rate": 0.00016899301112314205,
+      "loss": 0.7876,
+      "step": 6800
+    },
+    {
+      "epoch": 0.5031813458277881,
+      "grad_norm": 0.8610624670982361,
+      "learning_rate": 0.00016850083669652524,
+      "loss": 0.7923,
+      "step": 6900
+    },
+    {
+      "epoch": 0.5104738291006545,
+      "grad_norm": 0.9339637160301208,
+      "learning_rate": 0.00016800866226990846,
+      "loss": 0.7837,
+      "step": 7000
+    },
+    {
+      "epoch": 0.5104738291006545,
+      "eval_loss": 0.7791191935539246,
+      "eval_runtime": 60.8878,
+      "eval_samples_per_second": 147.074,
+      "eval_steps_per_second": 18.395,
+      "step": 7000
+    },
+    {
+      "epoch": 0.517766312373521,
+      "grad_norm": 0.9073446393013,
+      "learning_rate": 0.00016751648784329168,
+      "loss": 0.7809,
+      "step": 7100
+    },
+    {
+      "epoch": 0.5250587956463875,
+      "grad_norm": 0.9348235726356506,
+      "learning_rate": 0.00016702431341667487,
+      "loss": 0.7793,
+      "step": 7200
+    },
+    {
+      "epoch": 0.5323512789192539,
+      "grad_norm": 0.9155163168907166,
+      "learning_rate": 0.0001665321389900581,
+      "loss": 0.7821,
+      "step": 7300
+    },
+    {
+      "epoch": 0.5396437621921205,
+      "grad_norm": 0.9328250885009766,
+      "learning_rate": 0.00016603996456344129,
+      "loss": 0.7806,
+      "step": 7400
+    },
+    {
+      "epoch": 0.546936245464987,
+      "grad_norm": 0.8911275863647461,
+      "learning_rate": 0.00016554779013682448,
+      "loss": 0.7782,
+      "step": 7500
+    },
+    {
+      "epoch": 0.5542287287378534,
+      "grad_norm": 0.8989250659942627,
+      "learning_rate": 0.00016505561571020772,
+      "loss": 0.779,
+      "step": 7600
+    },
+    {
+      "epoch": 0.5615212120107199,
+      "grad_norm": 0.8869723081588745,
+      "learning_rate": 0.00016456344128359092,
+      "loss": 0.7822,
+      "step": 7700
+    },
+    {
+      "epoch": 0.5688136952835865,
+      "grad_norm": 0.8631371259689331,
+      "learning_rate": 0.0001640712668569741,
+      "loss": 0.7768,
+      "step": 7800
+    },
+    {
+      "epoch": 0.576106178556453,
+      "grad_norm": 0.8868420720100403,
+      "learning_rate": 0.00016357909243035733,
+      "loss": 0.7834,
+      "step": 7900
+    },
+    {
+      "epoch": 0.5833986618293194,
+      "grad_norm": 0.9253202080726624,
+      "learning_rate": 0.00016308691800374055,
+      "loss": 0.773,
+      "step": 8000
+    },
+    {
+      "epoch": 0.5833986618293194,
+      "eval_loss": 0.7733862400054932,
+      "eval_runtime": 60.8911,
+      "eval_samples_per_second": 147.066,
+      "eval_steps_per_second": 18.394,
+      "step": 8000
+    },
+    {
+      "epoch": 0.5906911451021859,
+      "grad_norm": 0.830760657787323,
+      "learning_rate": 0.00016259474357712374,
+      "loss": 0.7756,
+      "step": 8100
+    },
+    {
+      "epoch": 0.5979836283750524,
+      "grad_norm": 0.9371838569641113,
+      "learning_rate": 0.00016210256915050696,
+      "loss": 0.776,
+      "step": 8200
+    },
+    {
+      "epoch": 0.605276111647919,
+      "grad_norm": 0.8486947417259216,
+      "learning_rate": 0.00016161039472389015,
+      "loss": 0.7758,
+      "step": 8300
+    },
+    {
+      "epoch": 0.6125685949207854,
+      "grad_norm": 0.8888623118400574,
+      "learning_rate": 0.00016111822029727337,
+      "loss": 0.783,
+      "step": 8400
+    },
+    {
+      "epoch": 0.6198610781936519,
+      "grad_norm": 0.9176976084709167,
+      "learning_rate": 0.00016062604587065656,
+      "loss": 0.7782,
+      "step": 8500
+    },
+    {
+      "epoch": 0.6271535614665184,
+      "grad_norm": 0.90993732213974,
+      "learning_rate": 0.00016013387144403978,
+      "loss": 0.7741,
+      "step": 8600
+    },
+    {
+      "epoch": 0.6344460447393849,
+      "grad_norm": 0.8461544513702393,
+      "learning_rate": 0.00015964169701742297,
+      "loss": 0.7782,
+      "step": 8700
+    },
+    {
+      "epoch": 0.6417385280122514,
+      "grad_norm": 0.8642047643661499,
+      "learning_rate": 0.0001591495225908062,
+      "loss": 0.7706,
+      "step": 8800
+    },
+    {
+      "epoch": 0.6490310112851179,
+      "grad_norm": 0.8944571018218994,
+      "learning_rate": 0.0001586573481641894,
+      "loss": 0.7727,
+      "step": 8900
+    },
+    {
+      "epoch": 0.6563234945579843,
+      "grad_norm": 0.9075286984443665,
+      "learning_rate": 0.0001581651737375726,
+      "loss": 0.7748,
+      "step": 9000
+    },
+    {
+      "epoch": 0.6563234945579843,
+      "eval_loss": 0.7666329741477966,
+      "eval_runtime": 60.5924,
+      "eval_samples_per_second": 147.791,
+      "eval_steps_per_second": 18.484,
+      "step": 9000
+    },
+    {
+      "epoch": 0.6636159778308508,
+      "grad_norm": 0.9164955615997314,
+      "learning_rate": 0.0001576729993109558,
+      "loss": 0.7792,
+      "step": 9100
+    },
+    {
+      "epoch": 0.6709084611037174,
+      "grad_norm": 0.8446054458618164,
+      "learning_rate": 0.000157180824884339,
+      "loss": 0.7661,
+      "step": 9200
+    },
+    {
+      "epoch": 0.6782009443765838,
+      "grad_norm": 0.8793991804122925,
+      "learning_rate": 0.00015668865045772223,
+      "loss": 0.7678,
+      "step": 9300
+    },
+    {
+      "epoch": 0.6854934276494503,
+      "grad_norm": 0.8772592544555664,
+      "learning_rate": 0.00015619647603110542,
+      "loss": 0.7708,
+      "step": 9400
+    },
+    {
+      "epoch": 0.6927859109223168,
+      "grad_norm": 0.854118824005127,
+      "learning_rate": 0.00015570430160448864,
+      "loss": 0.7616,
+      "step": 9500
+    },
+    {
+      "epoch": 0.7000783941951834,
+      "grad_norm": 0.8653910756111145,
+      "learning_rate": 0.00015521212717787183,
+      "loss": 0.767,
+      "step": 9600
+    },
+    {
+      "epoch": 0.7073708774680498,
+      "grad_norm": 0.8890120387077332,
+      "learning_rate": 0.00015471995275125505,
+      "loss": 0.7657,
+      "step": 9700
+    },
+    {
+      "epoch": 0.7146633607409163,
+      "grad_norm": 0.8451828360557556,
+      "learning_rate": 0.00015422777832463827,
+      "loss": 0.7656,
+      "step": 9800
+    },
+    {
+      "epoch": 0.7219558440137828,
+      "grad_norm": 0.9029329419136047,
+      "learning_rate": 0.00015373560389802146,
+      "loss": 0.7749,
+      "step": 9900
+    },
+    {
+      "epoch": 0.7292483272866492,
+      "grad_norm": 0.8538834452629089,
+      "learning_rate": 0.00015324342947140466,
+      "loss": 0.763,
+      "step": 10000
+    },
+    {
+      "epoch": 0.7292483272866492,
+      "eval_loss": 0.76123046875,
+      "eval_runtime": 60.847,
+      "eval_samples_per_second": 147.172,
+      "eval_steps_per_second": 18.407,
+      "step": 10000
+    },
+    {
+      "epoch": 0.7365408105595158,
+      "grad_norm": 0.8594367504119873,
+      "learning_rate": 0.00015275125504478788,
+      "loss": 0.7693,
+      "step": 10100
+    },
+    {
+      "epoch": 0.7438332938323823,
+      "grad_norm": 0.8748040199279785,
+      "learning_rate": 0.0001522590806181711,
+      "loss": 0.7684,
+      "step": 10200
+    },
+    {
+      "epoch": 0.7511257771052487,
+      "grad_norm": 0.9177483320236206,
+      "learning_rate": 0.0001517669061915543,
+      "loss": 0.7599,
+      "step": 10300
+    },
+    {
+      "epoch": 0.7584182603781152,
+      "grad_norm": 0.8988757729530334,
+      "learning_rate": 0.0001512747317649375,
+      "loss": 0.7648,
+      "step": 10400
+    },
+    {
+      "epoch": 0.7657107436509818,
+      "grad_norm": 0.8735676407814026,
+      "learning_rate": 0.00015078255733832073,
+      "loss": 0.7656,
+      "step": 10500
+    },
+    {
+      "epoch": 0.7730032269238483,
+      "grad_norm": 0.8750614523887634,
+      "learning_rate": 0.00015029038291170392,
+      "loss": 0.7632,
+      "step": 10600
+    },
+    {
+      "epoch": 0.7802957101967147,
+      "grad_norm": 0.8786306381225586,
+      "learning_rate": 0.0001497982084850871,
+      "loss": 0.7659,
+      "step": 10700
+    },
+    {
+      "epoch": 0.7875881934695812,
+      "grad_norm": 0.811834990978241,
+      "learning_rate": 0.00014930603405847033,
+      "loss": 0.7652,
+      "step": 10800
+    },
+    {
+      "epoch": 0.7948806767424477,
+      "grad_norm": 0.8844282031059265,
+      "learning_rate": 0.00014881385963185352,
+      "loss": 0.7623,
+      "step": 10900
+    },
+    {
+      "epoch": 0.8021731600153142,
+      "grad_norm": 0.8444844484329224,
+      "learning_rate": 0.00014832168520523674,
+      "loss": 0.7622,
+      "step": 11000
+    },
+    {
+      "epoch": 0.8021731600153142,
+      "eval_loss": 0.75812828540802,
+      "eval_runtime": 60.7569,
+      "eval_samples_per_second": 147.391,
+      "eval_steps_per_second": 18.434,
+      "step": 11000
+    },
+    {
+      "epoch": 0.8094656432881807,
+      "grad_norm": 0.8396947979927063,
+      "learning_rate": 0.00014782951077861996,
+      "loss": 0.7673,
+      "step": 11100
+    },
+    {
+      "epoch": 0.8167581265610472,
+      "grad_norm": 0.8890758752822876,
+      "learning_rate": 0.00014733733635200315,
+      "loss": 0.7551,
+      "step": 11200
+    },
+    {
+      "epoch": 0.8240506098339136,
+      "grad_norm": 0.8038908839225769,
+      "learning_rate": 0.00014684516192538634,
+      "loss": 0.7612,
+      "step": 11300
+    },
+    {
+      "epoch": 0.8313430931067802,
+      "grad_norm": 0.8224745392799377,
+      "learning_rate": 0.0001463529874987696,
+      "loss": 0.7618,
+      "step": 11400
+    },
+    {
+      "epoch": 0.8386355763796467,
+      "grad_norm": 0.8691264390945435,
+      "learning_rate": 0.00014586081307215278,
+      "loss": 0.7618,
+      "step": 11500
+    },
+    {
+      "epoch": 0.8459280596525132,
+      "grad_norm": 0.8442777395248413,
+      "learning_rate": 0.00014536863864553597,
+      "loss": 0.7671,
+      "step": 11600
+    },
+    {
+      "epoch": 0.8532205429253796,
+      "grad_norm": 0.8520532846450806,
+      "learning_rate": 0.0001448764642189192,
+      "loss": 0.7625,
+      "step": 11700
+    },
+    {
+      "epoch": 0.8605130261982462,
+      "grad_norm": 0.908760666847229,
+      "learning_rate": 0.0001443842897923024,
+      "loss": 0.7615,
+      "step": 11800
+    },
+    {
+      "epoch": 0.8678055094711127,
+      "grad_norm": 0.8004080057144165,
+      "learning_rate": 0.0001438921153656856,
+      "loss": 0.7632,
+      "step": 11900
+    },
+    {
+      "epoch": 0.8750979927439791,
+      "grad_norm": 0.8449864983558655,
+      "learning_rate": 0.00014339994093906882,
+      "loss": 0.7574,
+      "step": 12000
+    },
+    {
+      "epoch": 0.8750979927439791,
+      "eval_loss": 0.752128005027771,
+      "eval_runtime": 61.1399,
+      "eval_samples_per_second": 146.467,
+      "eval_steps_per_second": 18.319,
+      "step": 12000
+    },
+    {
+      "epoch": 0.8823904760168456,
+      "grad_norm": 0.8218274116516113,
+      "learning_rate": 0.00014290776651245201,
+      "loss": 0.7555,
+      "step": 12100
+    },
+    {
+      "epoch": 0.8896829592897121,
+      "grad_norm": 0.8944920897483826,
+      "learning_rate": 0.00014241559208583523,
+      "loss": 0.7594,
+      "step": 12200
+    },
+    {
+      "epoch": 0.8969754425625787,
+      "grad_norm": 0.9254937767982483,
+      "learning_rate": 0.00014192341765921845,
+      "loss": 0.7598,
+      "step": 12300
+    },
+    {
+      "epoch": 0.9042679258354451,
+      "grad_norm": 0.8887091875076294,
+      "learning_rate": 0.00014143124323260164,
+      "loss": 0.7625,
+      "step": 12400
+    },
+    {
+      "epoch": 0.9115604091083116,
+      "grad_norm": 0.8478124737739563,
+      "learning_rate": 0.00014093906880598484,
+      "loss": 0.756,
+      "step": 12500
+    },
+    {
+      "epoch": 0.9188528923811781,
+      "grad_norm": 0.9377927780151367,
+      "learning_rate": 0.00014044689437936805,
+      "loss": 0.7606,
+      "step": 12600
+    },
+    {
+      "epoch": 0.9261453756540446,
+      "grad_norm": 0.838175892829895,
+      "learning_rate": 0.00013995471995275127,
+      "loss": 0.7605,
+      "step": 12700
+    },
+    {
+      "epoch": 0.9334378589269111,
+      "grad_norm": 0.8345216512680054,
+      "learning_rate": 0.00013946254552613447,
+      "loss": 0.7568,
+      "step": 12800
+    },
+    {
+      "epoch": 0.9407303421997776,
+      "grad_norm": 0.894477367401123,
+      "learning_rate": 0.00013897037109951766,
+      "loss": 0.7535,
+      "step": 12900
+    },
+    {
+      "epoch": 0.948022825472644,
+      "grad_norm": 0.849010169506073,
+      "learning_rate": 0.00013847819667290088,
+      "loss": 0.7465,
+      "step": 13000
+    },
+    {
+      "epoch": 0.948022825472644,
+      "eval_loss": 0.7492165565490723,
+      "eval_runtime": 60.7079,
+      "eval_samples_per_second": 147.51,
+      "eval_steps_per_second": 18.449,
+      "step": 13000
+    },
+    {
+      "epoch": 0.9553153087455105,
+      "grad_norm": 0.8754207491874695,
+      "learning_rate": 0.0001379860222462841,
+      "loss": 0.7576,
+      "step": 13100
+    },
+    {
+      "epoch": 0.9626077920183771,
+      "grad_norm": 0.8984807133674622,
+      "learning_rate": 0.0001374938478196673,
+      "loss": 0.7493,
+      "step": 13200
+    },
+    {
+      "epoch": 0.9699002752912436,
+      "grad_norm": 0.8458361029624939,
+      "learning_rate": 0.0001370016733930505,
+      "loss": 0.7468,
+      "step": 13300
+    },
+    {
+      "epoch": 0.97719275856411,
+      "grad_norm": 0.9169609546661377,
+      "learning_rate": 0.0001365094989664337,
+      "loss": 0.7515,
+      "step": 13400
+    },
+    {
+      "epoch": 0.9844852418369765,
+      "grad_norm": 0.8027638792991638,
+      "learning_rate": 0.00013601732453981692,
+      "loss": 0.7551,
+      "step": 13500
+    },
+    {
+      "epoch": 0.9917777251098431,
+      "grad_norm": 0.8572927117347717,
+      "learning_rate": 0.00013552515011320014,
+      "loss": 0.7481,
+      "step": 13600
+    },
+    {
+      "epoch": 0.9990702083827095,
+      "grad_norm": 0.8624053001403809,
+      "learning_rate": 0.00013503297568658333,
+      "loss": 0.7481,
+      "step": 13700
+    },
+    {
+      "epoch": 1.0063991540719404,
+      "grad_norm": 0.8915347456932068,
+      "learning_rate": 0.00013454080125996652,
+      "loss": 0.7463,
+      "step": 13800
+    },
+    {
+      "epoch": 1.0136916373448068,
+      "grad_norm": 0.8233557939529419,
+      "learning_rate": 0.00013404862683334977,
+      "loss": 0.7398,
+      "step": 13900
+    },
+    {
+      "epoch": 1.0209841206176733,
+      "grad_norm": 0.8467598557472229,
+      "learning_rate": 0.00013355645240673296,
+      "loss": 0.7402,
+      "step": 14000
+    },
+    {
+      "epoch": 1.0209841206176733,
+      "eval_loss": 0.7458442449569702,
+      "eval_runtime": 60.6887,
+      "eval_samples_per_second": 147.556,
+      "eval_steps_per_second": 18.455,
+      "step": 14000
+    },
+    {
+      "epoch": 1.0282766038905398,
+      "grad_norm": 0.852739691734314,
+      "learning_rate": 0.00013306427798011615,
+      "loss": 0.7436,
+      "step": 14100
+    },
+    {
+      "epoch": 1.0355690871634062,
+      "grad_norm": 0.8501101136207581,
+      "learning_rate": 0.00013257210355349937,
+      "loss": 0.7472,
+      "step": 14200
+    },
+    {
+      "epoch": 1.0428615704362727,
+      "grad_norm": 0.8830447793006897,
+      "learning_rate": 0.0001320799291268826,
+      "loss": 0.7438,
+      "step": 14300
+    },
+    {
+      "epoch": 1.0501540537091394,
+      "grad_norm": 0.8827272057533264,
+      "learning_rate": 0.00013158775470026578,
+      "loss": 0.7439,
+      "step": 14400
+    },
+    {
+      "epoch": 1.0574465369820059,
+      "grad_norm": 0.7875618934631348,
+      "learning_rate": 0.000131095580273649,
+      "loss": 0.7426,
+      "step": 14500
+    },
+    {
+      "epoch": 1.0647390202548723,
+      "grad_norm": 0.9906949996948242,
+      "learning_rate": 0.0001306034058470322,
+      "loss": 0.7418,
+      "step": 14600
+    },
+    {
+      "epoch": 1.0720315035277388,
+      "grad_norm": 0.8803852200508118,
+      "learning_rate": 0.00013011123142041538,
+      "loss": 0.7421,
+      "step": 14700
+    },
+    {
+      "epoch": 1.0793239868006053,
+      "grad_norm": 0.8951194286346436,
+      "learning_rate": 0.0001296190569937986,
+      "loss": 0.7429,
+      "step": 14800
+    },
+    {
+      "epoch": 1.0866164700734717,
+      "grad_norm": 0.8548495769500732,
+      "learning_rate": 0.00012912688256718182,
+      "loss": 0.7462,
+      "step": 14900
+    },
+    {
+      "epoch": 1.0939089533463382,
+      "grad_norm": 0.9326722025871277,
+      "learning_rate": 0.00012863470814056501,
+      "loss": 0.7515,
+      "step": 15000
+    },
+    {
+      "epoch": 1.0939089533463382,
+      "eval_loss": 0.7423983812332153,
+      "eval_runtime": 61.1091,
+      "eval_samples_per_second": 146.541,
+      "eval_steps_per_second": 18.328,
+      "step": 15000
+    },
+    {
+      "epoch": 1.1012014366192047,
+      "grad_norm": 0.8803513646125793,
+      "learning_rate": 0.00012814253371394823,
+      "loss": 0.7369,
+      "step": 15100
+    },
+    {
+      "epoch": 1.1084939198920711,
+      "grad_norm": 0.8555076122283936,
+      "learning_rate": 0.00012765035928733145,
+      "loss": 0.7414,
+      "step": 15200
+    },
+    {
+      "epoch": 1.1157864031649378,
+      "grad_norm": 0.8760358691215515,
+      "learning_rate": 0.00012715818486071464,
+      "loss": 0.741,
+      "step": 15300
+    },
+    {
+      "epoch": 1.1230788864378043,
+      "grad_norm": 0.8444579839706421,
+      "learning_rate": 0.00012666601043409784,
+      "loss": 0.7448,
+      "step": 15400
+    },
+    {
+      "epoch": 1.1303713697106708,
+      "grad_norm": 0.8995528221130371,
+      "learning_rate": 0.00012617383600748106,
+      "loss": 0.7436,
+      "step": 15500
+    },
+    {
+      "epoch": 1.1376638529835372,
+      "grad_norm": 0.8966475129127502,
+      "learning_rate": 0.00012568166158086427,
+      "loss": 0.7485,
+      "step": 15600
+    },
+    {
+      "epoch": 1.1449563362564037,
+      "grad_norm": 0.8527953028678894,
+      "learning_rate": 0.00012518948715424747,
+      "loss": 0.7303,
+      "step": 15700
+    },
+    {
+      "epoch": 1.1522488195292702,
+      "grad_norm": 0.8657513856887817,
+      "learning_rate": 0.00012469731272763069,
+      "loss": 0.7431,
+      "step": 15800
+    },
+    {
+      "epoch": 1.1595413028021366,
+      "grad_norm": 0.8745185136795044,
+      "learning_rate": 0.00012420513830101388,
+      "loss": 0.7426,
+      "step": 15900
+    },
+    {
+      "epoch": 1.166833786075003,
+      "grad_norm": 0.8729378581047058,
+      "learning_rate": 0.0001237129638743971,
+      "loss": 0.7389,
+      "step": 16000
+    },
+    {
+      "epoch": 1.166833786075003,
+      "eval_loss": 0.740699291229248,
+      "eval_runtime": 60.635,
+      "eval_samples_per_second": 147.687,
+      "eval_steps_per_second": 18.471,
+      "step": 16000
+    },
+    {
+      "epoch": 1.1741262693478696,
+      "grad_norm": 0.8877021670341492,
+      "learning_rate": 0.00012322078944778032,
+      "loss": 0.7419,
+      "step": 16100
+    },
+    {
+      "epoch": 1.1814187526207363,
+      "grad_norm": 0.9095293283462524,
+      "learning_rate": 0.0001227286150211635,
+      "loss": 0.7365,
+      "step": 16200
+    },
+    {
+      "epoch": 1.1887112358936027,
+      "grad_norm": 0.8597880601882935,
+      "learning_rate": 0.0001222364405945467,
+      "loss": 0.7336,
+      "step": 16300
+    },
+    {
+      "epoch": 1.1960037191664692,
+      "grad_norm": 0.9574359059333801,
+      "learning_rate": 0.0001217442661679299,
+      "loss": 0.7394,
+      "step": 16400
+    },
+    {
+      "epoch": 1.2032962024393357,
+      "grad_norm": 0.8484875559806824,
+      "learning_rate": 0.00012125209174131314,
+      "loss": 0.7392,
+      "step": 16500
+    },
+    {
+      "epoch": 1.2105886857122021,
+      "grad_norm": 0.8847618699073792,
+      "learning_rate": 0.00012075991731469633,
+      "loss": 0.7427,
+      "step": 16600
+    },
+    {
+      "epoch": 1.2178811689850686,
+      "grad_norm": 0.8780632019042969,
+      "learning_rate": 0.00012026774288807954,
+      "loss": 0.7399,
+      "step": 16700
+    },
+    {
+      "epoch": 1.225173652257935,
+      "grad_norm": 0.8698965311050415,
+      "learning_rate": 0.00011977556846146274,
+      "loss": 0.7395,
+      "step": 16800
+    },
+    {
+      "epoch": 1.2324661355308015,
+      "grad_norm": 0.8717935085296631,
+      "learning_rate": 0.00011928339403484596,
+      "loss": 0.7404,
+      "step": 16900
+    },
+    {
+      "epoch": 1.239758618803668,
+      "grad_norm": 0.8375683426856995,
+      "learning_rate": 0.00011879121960822917,
+      "loss": 0.7405,
+      "step": 17000
+    },
+    {
+      "epoch": 1.239758618803668,
+      "eval_loss": 0.7371787428855896,
+      "eval_runtime": 60.9373,
+      "eval_samples_per_second": 146.954,
+      "eval_steps_per_second": 18.38,
+      "step": 17000
+    },
+    {
+      "epoch": 1.2470511020765347,
+      "grad_norm": 0.8756095170974731,
+      "learning_rate": 0.00011829904518161237,
+      "loss": 0.736,
+      "step": 17100
+    },
+    {
+      "epoch": 1.2543435853494012,
+      "grad_norm": 0.8513076901435852,
+      "learning_rate": 0.00011780687075499556,
+      "loss": 0.7399,
+      "step": 17200
+    },
+    {
+      "epoch": 1.2616360686222676,
+      "grad_norm": 0.8297843337059021,
+      "learning_rate": 0.0001173146963283788,
+      "loss": 0.7406,
+      "step": 17300
+    },
+    {
+      "epoch": 1.268928551895134,
+      "grad_norm": 0.8896269202232361,
+      "learning_rate": 0.00011682252190176199,
+      "loss": 0.7346,
+      "step": 17400
+    },
+    {
+      "epoch": 1.2762210351680006,
+      "grad_norm": 0.874168336391449,
+      "learning_rate": 0.0001163303474751452,
+      "loss": 0.736,
+      "step": 17500
+    },
+    {
+      "epoch": 1.283513518440867,
+      "grad_norm": 0.9101394414901733,
+      "learning_rate": 0.0001158381730485284,
+      "loss": 0.7376,
+      "step": 17600
+    },
+    {
+      "epoch": 1.2908060017137335,
+      "grad_norm": 0.9011333584785461,
+      "learning_rate": 0.00011534599862191162,
+      "loss": 0.7361,
+      "step": 17700
+    },
+    {
+      "epoch": 1.2980984849866002,
+      "grad_norm": 0.8839349746704102,
+      "learning_rate": 0.00011485382419529482,
+      "loss": 0.7373,
+      "step": 17800
+    },
+    {
+      "epoch": 1.3053909682594664,
+      "grad_norm": 0.830528974533081,
+      "learning_rate": 0.00011436164976867803,
+      "loss": 0.7336,
+      "step": 17900
+    },
+    {
+      "epoch": 1.3126834515323331,
+      "grad_norm": 0.8777081370353699,
+      "learning_rate": 0.00011386947534206122,
+      "loss": 0.7379,
+      "step": 18000
+    },
+    {
+      "epoch": 1.3126834515323331,
+      "eval_loss": 0.7359282970428467,
+      "eval_runtime": 60.8023,
+      "eval_samples_per_second": 147.281,
+      "eval_steps_per_second": 18.42,
+      "step": 18000
+    },
+    {
+      "epoch": 1.3199759348051996,
+      "grad_norm": 0.8853510022163391,
+      "learning_rate": 0.00011337730091544443,
+      "loss": 0.7376,
+      "step": 18100
+    },
+    {
+      "epoch": 1.327268418078066,
+      "grad_norm": 0.9219810366630554,
+      "learning_rate": 0.00011288512648882766,
+      "loss": 0.7399,
+      "step": 18200
+    },
+    {
+      "epoch": 1.3345609013509325,
+      "grad_norm": 0.9233282208442688,
+      "learning_rate": 0.00011239295206221085,
+      "loss": 0.7399,
+      "step": 18300
+    },
+    {
+      "epoch": 1.341853384623799,
+      "grad_norm": 0.8359719514846802,
+      "learning_rate": 0.00011190077763559406,
+      "loss": 0.7366,
+      "step": 18400
+    },
+    {
+      "epoch": 1.3491458678966655,
+      "grad_norm": 0.8673479557037354,
+      "learning_rate": 0.00011140860320897726,
+      "loss": 0.7398,
+      "step": 18500
+    },
+    {
+      "epoch": 1.356438351169532,
+      "grad_norm": 0.8565610647201538,
+      "learning_rate": 0.00011091642878236048,
+      "loss": 0.7278,
+      "step": 18600
+    },
+    {
+      "epoch": 1.3637308344423986,
+      "grad_norm": 0.8547226190567017,
+      "learning_rate": 0.00011042425435574369,
+      "loss": 0.7381,
+      "step": 18700
+    },
+    {
+      "epoch": 1.3710233177152649,
+      "grad_norm": 0.897081732749939,
+      "learning_rate": 0.00010993207992912688,
+      "loss": 0.7339,
+      "step": 18800
+    },
+    {
+      "epoch": 1.3783158009881316,
+      "grad_norm": 0.8852410912513733,
+      "learning_rate": 0.00010943990550251008,
+      "loss": 0.7342,
+      "step": 18900
+    },
+    {
+      "epoch": 1.385608284260998,
+      "grad_norm": 0.9213690161705017,
+      "learning_rate": 0.00010894773107589332,
+      "loss": 0.7389,
+      "step": 19000
+    },
+    {
+      "epoch": 1.385608284260998,
+      "eval_loss": 0.7335625886917114,
+      "eval_runtime": 60.8231,
+      "eval_samples_per_second": 147.23,
+      "eval_steps_per_second": 18.414,
+      "step": 19000
+    },
+    {
+      "epoch": 1.3929007675338645,
+      "grad_norm": 0.8398423790931702,
+      "learning_rate": 0.00010845555664927651,
+      "loss": 0.7274,
+      "step": 19100
+    },
+    {
+      "epoch": 1.400193250806731,
+      "grad_norm": 0.8863806128501892,
+      "learning_rate": 0.00010796338222265971,
+      "loss": 0.7331,
+      "step": 19200
+    },
+    {
+      "epoch": 1.4074857340795974,
+      "grad_norm": 0.8836521506309509,
+      "learning_rate": 0.00010747120779604292,
+      "loss": 0.7334,
+      "step": 19300
+    },
+    {
+      "epoch": 1.414778217352464,
+      "grad_norm": 0.8278964757919312,
+      "learning_rate": 0.00010697903336942614,
+      "loss": 0.7281,
+      "step": 19400
+    },
+    {
+      "epoch": 1.4220707006253304,
+      "grad_norm": 0.8681420087814331,
+      "learning_rate": 0.00010648685894280934,
+      "loss": 0.7345,
+      "step": 19500
+    },
+    {
+      "epoch": 1.429363183898197,
+      "grad_norm": 0.8721694946289062,
+      "learning_rate": 0.00010599468451619255,
+      "loss": 0.7246,
+      "step": 19600
+    },
+    {
+      "epoch": 1.4366556671710633,
+      "grad_norm": 0.8880037665367126,
+      "learning_rate": 0.00010550251008957574,
+      "loss": 0.7321,
+      "step": 19700
+    },
+    {
+      "epoch": 1.44394815044393,
+      "grad_norm": 0.8522552251815796,
+      "learning_rate": 0.00010501033566295895,
+      "loss": 0.734,
+      "step": 19800
+    },
+    {
+      "epoch": 1.4512406337167965,
+      "grad_norm": 0.8816943168640137,
+      "learning_rate": 0.00010451816123634217,
+      "loss": 0.7333,
+      "step": 19900
+    },
+    {
+      "epoch": 1.458533116989663,
+      "grad_norm": 0.8068501949310303,
+      "learning_rate": 0.00010402598680972537,
+      "loss": 0.7267,
+      "step": 20000
+    },
+    {
+      "epoch": 1.458533116989663,
+      "eval_loss": 0.731645405292511,
+      "eval_runtime": 61.0998,
+      "eval_samples_per_second": 146.563,
+      "eval_steps_per_second": 18.331,
+      "step": 20000
+    },
+    {
+      "epoch": 1.4658256002625294,
+      "grad_norm": 0.8473337888717651,
+      "learning_rate": 0.00010353381238310858,
+      "loss": 0.7328,
+      "step": 20100
+    },
+    {
+      "epoch": 1.4731180835353959,
+      "grad_norm": 0.9009122252464294,
+      "learning_rate": 0.00010304163795649177,
+      "loss": 0.733,
+      "step": 20200
+    },
+    {
+      "epoch": 1.4804105668082623,
+      "grad_norm": 0.8225035667419434,
+      "learning_rate": 0.000102549463529875,
+      "loss": 0.7311,
+      "step": 20300
+    },
+    {
+      "epoch": 1.4877030500811288,
+      "grad_norm": 0.8552617430686951,
+      "learning_rate": 0.00010205728910325821,
+      "loss": 0.7282,
+      "step": 20400
+    },
+    {
+      "epoch": 1.4949955333539955,
+      "grad_norm": 0.8690235614776611,
+      "learning_rate": 0.0001015651146766414,
+      "loss": 0.7329,
+      "step": 20500
+    },
+    {
+      "epoch": 1.5022880166268617,
+      "grad_norm": 0.8566781878471375,
+      "learning_rate": 0.0001010729402500246,
+      "loss": 0.7358,
+      "step": 20600
+    },
+    {
+      "epoch": 1.5095804998997284,
+      "grad_norm": 0.9174933433532715,
+      "learning_rate": 0.00010058076582340782,
+      "loss": 0.7266,
+      "step": 20700
+    },
+    {
+      "epoch": 1.516872983172595,
+      "grad_norm": 0.9414506554603577,
+      "learning_rate": 0.00010008859139679103,
+      "loss": 0.7321,
+      "step": 20800
+    },
+    {
+      "epoch": 1.5241654664454614,
+      "grad_norm": 0.9433586001396179,
+      "learning_rate": 9.959641697017424e-05,
+      "loss": 0.7355,
+      "step": 20900
+    },
+    {
+      "epoch": 1.5314579497183278,
+      "grad_norm": 0.8544315695762634,
+      "learning_rate": 9.910424254355744e-05,
+      "loss": 0.7313,
+      "step": 21000
+    },
+    {
+      "epoch": 1.5314579497183278,
+      "eval_loss": 0.7285299301147461,
+      "eval_runtime": 60.6886,
+      "eval_samples_per_second": 147.557,
+      "eval_steps_per_second": 18.455,
+      "step": 21000
+    },
+    {
+      "epoch": 1.5387504329911943,
+      "grad_norm": 0.893223762512207,
+      "learning_rate": 9.861206811694065e-05,
+      "loss": 0.7329,
+      "step": 21100
+    },
+    {
+      "epoch": 1.546042916264061,
+      "grad_norm": 0.8868634104728699,
+      "learning_rate": 9.811989369032387e-05,
+      "loss": 0.7276,
+      "step": 21200
+    },
+    {
+      "epoch": 1.5533353995369272,
+      "grad_norm": 0.8362566232681274,
+      "learning_rate": 9.762771926370706e-05,
+      "loss": 0.723,
+      "step": 21300
+    },
+    {
+      "epoch": 1.560627882809794,
+      "grad_norm": 0.8852083086967468,
+      "learning_rate": 9.713554483709026e-05,
+      "loss": 0.7281,
+      "step": 21400
+    },
+    {
+      "epoch": 1.5679203660826602,
+      "grad_norm": 0.8901813626289368,
+      "learning_rate": 9.664337041047348e-05,
+      "loss": 0.7307,
+      "step": 21500
+    },
+    {
+      "epoch": 1.5752128493555269,
+      "grad_norm": 0.8210172057151794,
+      "learning_rate": 9.615119598385667e-05,
+      "loss": 0.7245,
+      "step": 21600
+    },
+    {
+      "epoch": 1.5825053326283933,
+      "grad_norm": 0.8676414489746094,
+      "learning_rate": 9.56590215572399e-05,
+      "loss": 0.7294,
+      "step": 21700
+    },
+    {
+      "epoch": 1.5897978159012598,
+      "grad_norm": 0.8923740983009338,
+      "learning_rate": 9.51668471306231e-05,
+      "loss": 0.7242,
+      "step": 21800
+    },
+    {
+      "epoch": 1.5970902991741263,
+      "grad_norm": 0.8402920365333557,
+      "learning_rate": 9.46746727040063e-05,
+      "loss": 0.7258,
+      "step": 21900
+    },
+    {
+      "epoch": 1.6043827824469927,
+      "grad_norm": 0.8525983691215515,
+      "learning_rate": 9.418249827738951e-05,
+      "loss": 0.7294,
+      "step": 22000
+    },
+    {
+      "epoch": 1.6043827824469927,
+      "eval_loss": 0.7267495393753052,
+      "eval_runtime": 61.1086,
+      "eval_samples_per_second": 146.542,
+      "eval_steps_per_second": 18.328,
+      "step": 22000
+    },
+    {
+      "epoch": 1.6116752657198594,
+      "grad_norm": 0.8605002164840698,
+      "learning_rate": 9.369032385077272e-05,
+      "loss": 0.7259,
+      "step": 22100
+    },
+    {
+      "epoch": 1.6189677489927257,
+      "grad_norm": 0.8606895208358765,
+      "learning_rate": 9.319814942415592e-05,
+      "loss": 0.7275,
+      "step": 22200
+    },
+    {
+      "epoch": 1.6262602322655924,
+      "grad_norm": 0.8824227452278137,
+      "learning_rate": 9.270597499753914e-05,
+      "loss": 0.7245,
+      "step": 22300
+    },
+    {
+      "epoch": 1.6335527155384586,
+      "grad_norm": 0.8670118451118469,
+      "learning_rate": 9.221380057092233e-05,
+      "loss": 0.719,
+      "step": 22400
+    },
+    {
+      "epoch": 1.6408451988113253,
+      "grad_norm": 0.92063307762146,
+      "learning_rate": 9.172162614430555e-05,
+      "loss": 0.7293,
+      "step": 22500
+    },
+    {
+      "epoch": 1.6481376820841918,
+      "grad_norm": 0.8425260782241821,
+      "learning_rate": 9.122945171768876e-05,
+      "loss": 0.728,
+      "step": 22600
+    },
+    {
+      "epoch": 1.6554301653570582,
+      "grad_norm": 0.9162302017211914,
+      "learning_rate": 9.073727729107196e-05,
+      "loss": 0.7265,
+      "step": 22700
+    },
+    {
+      "epoch": 1.6627226486299247,
+      "grad_norm": 0.8905067443847656,
+      "learning_rate": 9.024510286445517e-05,
+      "loss": 0.7256,
+      "step": 22800
+    },
+    {
+      "epoch": 1.6700151319027912,
+      "grad_norm": 0.874357283115387,
+      "learning_rate": 8.975292843783837e-05,
+      "loss": 0.7249,
+      "step": 22900
+    },
+    {
+      "epoch": 1.6773076151756579,
+      "grad_norm": 0.842005729675293,
+      "learning_rate": 8.926075401122158e-05,
+      "loss": 0.7268,
+      "step": 23000
+    },
+    {
+      "epoch": 1.6773076151756579,
+      "eval_loss": 0.7241798639297485,
+      "eval_runtime": 60.7958,
+      "eval_samples_per_second": 147.296,
+      "eval_steps_per_second": 18.422,
+      "step": 23000
+    },
+    {
+      "epoch": 1.684600098448524,
+      "grad_norm": 0.8695193529129028,
+      "learning_rate": 8.876857958460478e-05,
+      "loss": 0.7262,
+      "step": 23100
+    },
+    {
+      "epoch": 1.6918925817213908,
+      "grad_norm": 0.8673058748245239,
+      "learning_rate": 8.827640515798799e-05,
+      "loss": 0.7303,
+      "step": 23200
+    },
+    {
+      "epoch": 1.699185064994257,
+      "grad_norm": 0.9276596307754517,
+      "learning_rate": 8.77842307313712e-05,
+      "loss": 0.729,
+      "step": 23300
+    },
+    {
+      "epoch": 1.7064775482671237,
+      "grad_norm": 0.8023722171783447,
+      "learning_rate": 8.729205630475441e-05,
+      "loss": 0.7212,
+      "step": 23400
+    },
+    {
+      "epoch": 1.7137700315399902,
+      "grad_norm": 0.910897433757782,
+      "learning_rate": 8.67998818781376e-05,
+      "loss": 0.7252,
+      "step": 23500
+    },
+    {
+      "epoch": 1.7210625148128567,
+      "grad_norm": 0.8714926838874817,
+      "learning_rate": 8.630770745152083e-05,
+      "loss": 0.7306,
+      "step": 23600
+    },
+    {
+      "epoch": 1.7283549980857231,
+      "grad_norm": 0.8875166773796082,
+      "learning_rate": 8.581553302490403e-05,
+      "loss": 0.7235,
+      "step": 23700
+    },
+    {
+      "epoch": 1.7356474813585896,
+      "grad_norm": 0.9132345914840698,
+      "learning_rate": 8.532335859828724e-05,
+      "loss": 0.7331,
+      "step": 23800
+    },
+    {
+      "epoch": 1.7429399646314563,
+      "grad_norm": 0.8562710285186768,
+      "learning_rate": 8.483118417167044e-05,
+      "loss": 0.7282,
+      "step": 23900
+    },
+    {
+      "epoch": 1.7502324479043225,
+      "grad_norm": 0.867508590221405,
+      "learning_rate": 8.433900974505365e-05,
+      "loss": 0.7256,
+      "step": 24000
+    },
+    {
+      "epoch": 1.7502324479043225,
+      "eval_loss": 0.7232645153999329,
+      "eval_runtime": 60.377,
+      "eval_samples_per_second": 148.318,
+      "eval_steps_per_second": 18.55,
+      "step": 24000
+    },
+    {
+      "epoch": 1.7575249311771892,
+      "grad_norm": 0.8258200287818909,
+      "learning_rate": 8.384683531843685e-05,
+      "loss": 0.7254,
+      "step": 24100
+    },
+    {
+      "epoch": 1.7648174144500555,
+      "grad_norm": 0.9109018445014954,
+      "learning_rate": 8.335466089182007e-05,
+      "loss": 0.7315,
+      "step": 24200
+    },
+    {
+      "epoch": 1.7721098977229222,
+      "grad_norm": 0.8500842452049255,
+      "learning_rate": 8.286248646520326e-05,
+      "loss": 0.7265,
+      "step": 24300
+    },
+    {
+      "epoch": 1.7794023809957886,
+      "grad_norm": 0.9286713600158691,
+      "learning_rate": 8.237031203858648e-05,
+      "loss": 0.7247,
+      "step": 24400
+    },
+    {
+      "epoch": 1.786694864268655,
+      "grad_norm": 0.8746926188468933,
+      "learning_rate": 8.187813761196969e-05,
+      "loss": 0.7261,
+      "step": 24500
+    },
+    {
+      "epoch": 1.7939873475415216,
+      "grad_norm": 0.8702288866043091,
+      "learning_rate": 8.13859631853529e-05,
+      "loss": 0.7207,
+      "step": 24600
+    },
+    {
+      "epoch": 1.801279830814388,
+      "grad_norm": 0.9746344089508057,
+      "learning_rate": 8.08937887587361e-05,
+      "loss": 0.728,
+      "step": 24700
+    },
+    {
+      "epoch": 1.8085723140872547,
+      "grad_norm": 0.8815904259681702,
+      "learning_rate": 8.04016143321193e-05,
+      "loss": 0.7174,
+      "step": 24800
+    },
+    {
+      "epoch": 1.815864797360121,
+      "grad_norm": 0.870474100112915,
+      "learning_rate": 7.990943990550251e-05,
+      "loss": 0.7316,
+      "step": 24900
+    },
+    {
+      "epoch": 1.8231572806329877,
+      "grad_norm": 0.8451401591300964,
+      "learning_rate": 7.941726547888572e-05,
+      "loss": 0.7202,
+      "step": 25000
+    },
+    {
+      "epoch": 1.8231572806329877,
+      "eval_loss": 0.721147358417511,
+      "eval_runtime": 60.8906,
+      "eval_samples_per_second": 147.067,
+      "eval_steps_per_second": 18.394,
+      "step": 25000
+    },
+    {
+      "epoch": 1.830449763905854,
+      "grad_norm": 0.8878180980682373,
+      "learning_rate": 7.892509105226894e-05,
+      "loss": 0.7236,
+      "step": 25100
+    },
+    {
+      "epoch": 1.8377422471787206,
+      "grad_norm": 0.859920859336853,
+      "learning_rate": 7.843291662565213e-05,
+      "loss": 0.7257,
+      "step": 25200
+    },
+    {
+      "epoch": 1.845034730451587,
+      "grad_norm": 0.9358228445053101,
+      "learning_rate": 7.794074219903535e-05,
+      "loss": 0.7175,
+      "step": 25300
+    },
+    {
+      "epoch": 1.8523272137244535,
+      "grad_norm": 0.858906626701355,
+      "learning_rate": 7.744856777241854e-05,
+      "loss": 0.7217,
+      "step": 25400
+    },
+    {
+      "epoch": 1.85961969699732,
+      "grad_norm": 0.9508287310600281,
+      "learning_rate": 7.695639334580176e-05,
+      "loss": 0.7211,
+      "step": 25500
+    },
+    {
+      "epoch": 1.8669121802701865,
+      "grad_norm": 0.9340062141418457,
+      "learning_rate": 7.646421891918496e-05,
+      "loss": 0.7254,
+      "step": 25600
+    },
+    {
+      "epoch": 1.8742046635430532,
+      "grad_norm": 0.9350687861442566,
+      "learning_rate": 7.597204449256817e-05,
+      "loss": 0.7247,
+      "step": 25700
+    },
+    {
+      "epoch": 1.8814971468159194,
+      "grad_norm": 0.9614841938018799,
+      "learning_rate": 7.547987006595137e-05,
+      "loss": 0.7283,
+      "step": 25800
+    },
+    {
+      "epoch": 1.888789630088786,
+      "grad_norm": 0.848640501499176,
+      "learning_rate": 7.49876956393346e-05,
+      "loss": 0.7221,
+      "step": 25900
+    },
+    {
+      "epoch": 1.8960821133616523,
+      "grad_norm": 0.8105534315109253,
+      "learning_rate": 7.449552121271779e-05,
+      "loss": 0.7205,
+      "step": 26000
+    },
+    {
+      "epoch": 1.8960821133616523,
+      "eval_loss": 0.7193262577056885,
+      "eval_runtime": 61.1614,
+      "eval_samples_per_second": 146.416,
+      "eval_steps_per_second": 18.312,
+      "step": 26000
+    },
+    {
+      "epoch": 1.903374596634519,
+      "grad_norm": 0.8522207736968994,
+      "learning_rate": 7.4003346786101e-05,
+      "loss": 0.7223,
+      "step": 26100
+    },
+    {
+      "epoch": 1.9106670799073855,
+      "grad_norm": 0.8983740210533142,
+      "learning_rate": 7.351117235948421e-05,
+      "loss": 0.7208,
+      "step": 26200
+    },
+    {
+      "epoch": 1.917959563180252,
+      "grad_norm": 0.8596473336219788,
+      "learning_rate": 7.301899793286742e-05,
+      "loss": 0.7184,
+      "step": 26300
+    },
+    {
+      "epoch": 1.9252520464531184,
+      "grad_norm": 0.9175098538398743,
+      "learning_rate": 7.252682350625062e-05,
+      "loss": 0.7213,
+      "step": 26400
+    },
+    {
+      "epoch": 1.932544529725985,
+      "grad_norm": 0.8626872897148132,
+      "learning_rate": 7.203464907963383e-05,
+      "loss": 0.7242,
+      "step": 26500
+    },
+    {
+      "epoch": 1.9398370129988516,
+      "grad_norm": 0.859780490398407,
+      "learning_rate": 7.154247465301703e-05,
+      "loss": 0.7197,
+      "step": 26600
+    },
+    {
+      "epoch": 1.9471294962717178,
+      "grad_norm": 0.8713703751564026,
+      "learning_rate": 7.105030022640024e-05,
+      "loss": 0.7231,
+      "step": 26700
+    },
+    {
+      "epoch": 1.9544219795445845,
+      "grad_norm": 0.8976535797119141,
+      "learning_rate": 7.055812579978344e-05,
+      "loss": 0.7233,
+      "step": 26800
+    },
+    {
+      "epoch": 1.9617144628174508,
+      "grad_norm": 0.9257802367210388,
+      "learning_rate": 7.006595137316665e-05,
+      "loss": 0.7221,
+      "step": 26900
+    },
+    {
+      "epoch": 1.9690069460903175,
+      "grad_norm": 0.8592785596847534,
+      "learning_rate": 6.957377694654987e-05,
+      "loss": 0.7168,
+      "step": 27000
+    },
+    {
+      "epoch": 1.9690069460903175,
+      "eval_loss": 0.7180259227752686,
+      "eval_runtime": 60.5352,
+      "eval_samples_per_second": 147.931,
+      "eval_steps_per_second": 18.502,
+      "step": 27000
+    },
+    {
+      "epoch": 1.976299429363184,
+      "grad_norm": 0.8931472897529602,
+      "learning_rate": 6.908160251993306e-05,
+      "loss": 0.7204,
+      "step": 27100
+    },
+    {
+      "epoch": 1.9835919126360504,
+      "grad_norm": 0.8821597695350647,
+      "learning_rate": 6.858942809331628e-05,
+      "loss": 0.7163,
+      "step": 27200
+    },
+    {
+      "epoch": 1.9908843959089169,
+      "grad_norm": 0.8749621510505676,
+      "learning_rate": 6.809725366669948e-05,
+      "loss": 0.711,
+      "step": 27300
+    },
+    {
+      "epoch": 1.9981768791817833,
+      "grad_norm": 0.903332531452179,
+      "learning_rate": 6.760507924008269e-05,
+      "loss": 0.7176,
+      "step": 27400
+    },
+    {
+      "epoch": 2.005505824871014,
+      "grad_norm": 0.854773759841919,
+      "learning_rate": 6.71129048134659e-05,
+      "loss": 0.7187,
+      "step": 27500
+    },
+    {
+      "epoch": 2.0127983081438807,
+      "grad_norm": 0.9489893913269043,
+      "learning_rate": 6.66207303868491e-05,
+      "loss": 0.7096,
+      "step": 27600
+    },
+    {
+      "epoch": 2.020090791416747,
+      "grad_norm": 0.8944621682167053,
+      "learning_rate": 6.61285559602323e-05,
+      "loss": 0.7104,
+      "step": 27700
+    },
+    {
+      "epoch": 2.0273832746896137,
+      "grad_norm": 0.8567011952400208,
+      "learning_rate": 6.563638153361553e-05,
+      "loss": 0.7124,
+      "step": 27800
+    },
+    {
+      "epoch": 2.0346757579624803,
+      "grad_norm": 0.8737155199050903,
+      "learning_rate": 6.514420710699872e-05,
+      "loss": 0.7127,
+      "step": 27900
+    },
+    {
+      "epoch": 2.0419682412353466,
+      "grad_norm": 0.8935887813568115,
+      "learning_rate": 6.465203268038194e-05,
+      "loss": 0.7122,
+      "step": 28000
+    },
+    {
+      "epoch": 2.0419682412353466,
+      "eval_loss": 0.716705858707428,
+      "eval_runtime": 60.7739,
+      "eval_samples_per_second": 147.349,
+      "eval_steps_per_second": 18.429,
+      "step": 28000
+    },
+    {
+      "epoch": 2.0492607245082133,
+      "grad_norm": 0.9452987313270569,
+      "learning_rate": 6.415985825376514e-05,
+      "loss": 0.7112,
+      "step": 28100
+    },
+    {
+      "epoch": 2.0565532077810795,
+      "grad_norm": 0.8650675415992737,
+      "learning_rate": 6.366768382714833e-05,
+      "loss": 0.7079,
+      "step": 28200
+    },
+    {
+      "epoch": 2.063845691053946,
+      "grad_norm": 0.8913034796714783,
+      "learning_rate": 6.317550940053155e-05,
+      "loss": 0.713,
+      "step": 28300
+    },
+    {
+      "epoch": 2.0711381743268125,
+      "grad_norm": 0.9072710275650024,
+      "learning_rate": 6.268333497391476e-05,
+      "loss": 0.7094,
+      "step": 28400
+    },
+    {
+      "epoch": 2.078430657599679,
+      "grad_norm": 0.854245126247406,
+      "learning_rate": 6.219116054729796e-05,
+      "loss": 0.7077,
+      "step": 28500
+    },
+    {
+      "epoch": 2.0857231408725454,
+      "grad_norm": 0.929263174533844,
+      "learning_rate": 6.169898612068117e-05,
+      "loss": 0.7086,
+      "step": 28600
+    },
+    {
+      "epoch": 2.093015624145412,
+      "grad_norm": 0.9356215596199036,
+      "learning_rate": 6.120681169406438e-05,
+      "loss": 0.7157,
+      "step": 28700
+    },
+    {
+      "epoch": 2.100308107418279,
+      "grad_norm": 0.9242870211601257,
+      "learning_rate": 6.071463726744758e-05,
+      "loss": 0.71,
+      "step": 28800
+    },
+    {
+      "epoch": 2.107600590691145,
+      "grad_norm": 0.9065095782279968,
+      "learning_rate": 6.022246284083079e-05,
+      "loss": 0.7095,
+      "step": 28900
+    },
+    {
+      "epoch": 2.1148930739640117,
+      "grad_norm": 0.9081276059150696,
+      "learning_rate": 5.9730288414214e-05,
+      "loss": 0.7096,
+      "step": 29000
+    },
+    {
+      "epoch": 2.1148930739640117,
+      "eval_loss": 0.7152244448661804,
+      "eval_runtime": 60.7986,
+      "eval_samples_per_second": 147.29,
+      "eval_steps_per_second": 18.421,
+      "step": 29000
+    },
+    {
+      "epoch": 2.122185557236878,
+      "grad_norm": 0.8326215744018555,
+      "learning_rate": 5.923811398759721e-05,
+      "loss": 0.7147,
+      "step": 29100
+    },
+    {
+      "epoch": 2.1294780405097447,
+      "grad_norm": 0.9274723529815674,
+      "learning_rate": 5.874593956098041e-05,
+      "loss": 0.7111,
+      "step": 29200
+    },
+    {
+      "epoch": 2.136770523782611,
+      "grad_norm": 0.8282331824302673,
+      "learning_rate": 5.825376513436362e-05,
+      "loss": 0.7137,
+      "step": 29300
+    },
+    {
+      "epoch": 2.1440630070554776,
+      "grad_norm": 0.9081612229347229,
+      "learning_rate": 5.776159070774683e-05,
+      "loss": 0.7115,
+      "step": 29400
+    },
+    {
+      "epoch": 2.151355490328344,
+      "grad_norm": 0.9531508684158325,
+      "learning_rate": 5.726941628113004e-05,
+      "loss": 0.708,
+      "step": 29500
+    },
+    {
+      "epoch": 2.1586479736012105,
+      "grad_norm": 0.9125275611877441,
+      "learning_rate": 5.677724185451324e-05,
+      "loss": 0.7123,
+      "step": 29600
+    },
+    {
+      "epoch": 2.165940456874077,
+      "grad_norm": 0.9363859295845032,
+      "learning_rate": 5.628506742789645e-05,
+      "loss": 0.7146,
+      "step": 29700
+    },
+    {
+      "epoch": 2.1732329401469435,
+      "grad_norm": 0.9164854884147644,
+      "learning_rate": 5.579289300127966e-05,
+      "loss": 0.7121,
+      "step": 29800
+    },
+    {
+      "epoch": 2.18052542341981,
+      "grad_norm": 0.941330075263977,
+      "learning_rate": 5.530071857466287e-05,
+      "loss": 0.7086,
+      "step": 29900
+    },
+    {
+      "epoch": 2.1878179066926764,
+      "grad_norm": 0.9006567597389221,
+      "learning_rate": 5.480854414804607e-05,
+      "loss": 0.7097,
+      "step": 30000
+    },
+    {
+      "epoch": 2.1878179066926764,
+      "eval_loss": 0.7143043875694275,
+      "eval_runtime": 61.0555,
+      "eval_samples_per_second": 146.67,
+      "eval_steps_per_second": 18.344,
+      "step": 30000
+    },
+    {
+      "epoch": 2.195110389965543,
+      "grad_norm": 0.8913944363594055,
+      "learning_rate": 5.431636972142927e-05,
+      "loss": 0.7066,
+      "step": 30100
+    },
+    {
+      "epoch": 2.2024028732384093,
+      "grad_norm": 0.9200546145439148,
+      "learning_rate": 5.3824195294812486e-05,
+      "loss": 0.7076,
+      "step": 30200
+    },
+    {
+      "epoch": 2.209695356511276,
+      "grad_norm": 0.924148440361023,
+      "learning_rate": 5.3332020868195684e-05,
+      "loss": 0.7058,
+      "step": 30300
+    },
+    {
+      "epoch": 2.2169878397841423,
+      "grad_norm": 0.922255277633667,
+      "learning_rate": 5.2839846441578897e-05,
+      "loss": 0.7108,
+      "step": 30400
+    },
+    {
+      "epoch": 2.224280323057009,
+      "grad_norm": 0.9039818644523621,
+      "learning_rate": 5.23476720149621e-05,
+      "loss": 0.7091,
+      "step": 30500
+    },
+    {
+      "epoch": 2.2315728063298756,
+      "grad_norm": 0.963845431804657,
+      "learning_rate": 5.1855497588345314e-05,
+      "loss": 0.7065,
+      "step": 30600
+    },
+    {
+      "epoch": 2.238865289602742,
+      "grad_norm": 0.8838880658149719,
+      "learning_rate": 5.136332316172851e-05,
+      "loss": 0.7113,
+      "step": 30700
+    },
+    {
+      "epoch": 2.2461577728756086,
+      "grad_norm": 0.9642555117607117,
+      "learning_rate": 5.0871148735111725e-05,
+      "loss": 0.7062,
+      "step": 30800
+    },
+    {
+      "epoch": 2.253450256148475,
+      "grad_norm": 0.9088276624679565,
+      "learning_rate": 5.037897430849493e-05,
+      "loss": 0.7071,
+      "step": 30900
+    },
+    {
+      "epoch": 2.2607427394213415,
+      "grad_norm": 0.9083282351493835,
+      "learning_rate": 4.9886799881878137e-05,
+      "loss": 0.7126,
+      "step": 31000
+    },
+    {
+      "epoch": 2.2607427394213415,
+      "eval_loss": 0.7129958868026733,
+      "eval_runtime": 60.7821,
+      "eval_samples_per_second": 147.33,
+      "eval_steps_per_second": 18.426,
+      "step": 31000
+    },
+    {
+      "epoch": 2.2680352226942078,
+      "grad_norm": 0.886710524559021,
+      "learning_rate": 4.939462545526134e-05,
+      "loss": 0.7043,
+      "step": 31100
+    },
+    {
+      "epoch": 2.2753277059670745,
+      "grad_norm": 0.8600069880485535,
+      "learning_rate": 4.8902451028644554e-05,
+      "loss": 0.7074,
+      "step": 31200
+    },
+    {
+      "epoch": 2.2826201892399407,
+      "grad_norm": 0.8897703289985657,
+      "learning_rate": 4.841027660202776e-05,
+      "loss": 0.7068,
+      "step": 31300
+    },
+    {
+      "epoch": 2.2899126725128074,
+      "grad_norm": 0.8638718724250793,
+      "learning_rate": 4.7918102175410965e-05,
+      "loss": 0.7062,
+      "step": 31400
+    },
+    {
+      "epoch": 2.297205155785674,
+      "grad_norm": 0.8973529934883118,
+      "learning_rate": 4.742592774879418e-05,
+      "loss": 0.7073,
+      "step": 31500
+    },
+    {
+      "epoch": 2.3044976390585403,
+      "grad_norm": 0.9759765267372131,
+      "learning_rate": 4.693375332217738e-05,
+      "loss": 0.7087,
+      "step": 31600
+    },
+    {
+      "epoch": 2.311790122331407,
+      "grad_norm": 0.9061428904533386,
+      "learning_rate": 4.644157889556059e-05,
+      "loss": 0.708,
+      "step": 31700
+    },
+    {
+      "epoch": 2.3190826056042733,
+      "grad_norm": 0.8808257579803467,
+      "learning_rate": 4.5949404468943794e-05,
+      "loss": 0.7086,
+      "step": 31800
+    },
+    {
+      "epoch": 2.32637508887714,
+      "grad_norm": 0.9116071462631226,
+      "learning_rate": 4.545723004232701e-05,
+      "loss": 0.7118,
+      "step": 31900
+    },
+    {
+      "epoch": 2.333667572150006,
+      "grad_norm": 0.9131873846054077,
+      "learning_rate": 4.496505561571021e-05,
+      "loss": 0.7043,
+      "step": 32000
+    },
+    {
+      "epoch": 2.333667572150006,
+      "eval_loss": 0.7112506031990051,
+      "eval_runtime": 61.1535,
+      "eval_samples_per_second": 146.435,
+      "eval_steps_per_second": 18.315,
+      "step": 32000
+    },
+    {
+      "epoch": 2.340960055422873,
+      "grad_norm": 0.9860331416130066,
+      "learning_rate": 4.447288118909342e-05,
+      "loss": 0.7063,
+      "step": 32100
+    },
+    {
+      "epoch": 2.348252538695739,
+      "grad_norm": 0.933958888053894,
+      "learning_rate": 4.398070676247662e-05,
+      "loss": 0.708,
+      "step": 32200
+    },
+    {
+      "epoch": 2.355545021968606,
+      "grad_norm": 0.8994225859642029,
+      "learning_rate": 4.3488532335859836e-05,
+      "loss": 0.7089,
+      "step": 32300
+    },
+    {
+      "epoch": 2.3628375052414725,
+      "grad_norm": 0.9435915946960449,
+      "learning_rate": 4.299635790924304e-05,
+      "loss": 0.7057,
+      "step": 32400
+    },
+    {
+      "epoch": 2.3701299885143388,
+      "grad_norm": 0.888438880443573,
+      "learning_rate": 4.2504183482626247e-05,
+      "loss": 0.7012,
+      "step": 32500
+    },
+    {
+      "epoch": 2.3774224717872054,
+      "grad_norm": 0.8772885799407959,
+      "learning_rate": 4.201200905600945e-05,
+      "loss": 0.7071,
+      "step": 32600
+    },
+    {
+      "epoch": 2.3847149550600717,
+      "grad_norm": 0.9333481788635254,
+      "learning_rate": 4.151983462939266e-05,
+      "loss": 0.7095,
+      "step": 32700
+    },
+    {
+      "epoch": 2.3920074383329384,
+      "grad_norm": 0.9497707486152649,
+      "learning_rate": 4.102766020277586e-05,
+      "loss": 0.7115,
+      "step": 32800
+    },
+    {
+      "epoch": 2.3992999216058046,
+      "grad_norm": 0.9641472697257996,
+      "learning_rate": 4.053548577615907e-05,
+      "loss": 0.712,
+      "step": 32900
+    },
+    {
+      "epoch": 2.4065924048786713,
+      "grad_norm": 0.8958153128623962,
+      "learning_rate": 4.004331134954228e-05,
+      "loss": 0.7035,
+      "step": 33000
+    },
+    {
+      "epoch": 2.4065924048786713,
+      "eval_loss": 0.7100856304168701,
+      "eval_runtime": 61.2325,
+      "eval_samples_per_second": 146.246,
+      "eval_steps_per_second": 18.291,
+      "step": 33000
+    },
+    {
+      "epoch": 2.4138848881515376,
+      "grad_norm": 0.8818393349647522,
+      "learning_rate": 3.9551136922925487e-05,
+      "loss": 0.7052,
+      "step": 33100
+    },
+    {
+      "epoch": 2.4211773714244043,
+      "grad_norm": 0.8973012566566467,
+      "learning_rate": 3.905896249630869e-05,
+      "loss": 0.706,
+      "step": 33200
+    },
+    {
+      "epoch": 2.428469854697271,
+      "grad_norm": 0.8582873344421387,
+      "learning_rate": 3.85667880696919e-05,
+      "loss": 0.7088,
+      "step": 33300
+    },
+    {
+      "epoch": 2.435762337970137,
+      "grad_norm": 0.9306252002716064,
+      "learning_rate": 3.807461364307511e-05,
+      "loss": 0.7062,
+      "step": 33400
+    },
+    {
+      "epoch": 2.443054821243004,
+      "grad_norm": 0.8586992025375366,
+      "learning_rate": 3.7582439216458315e-05,
+      "loss": 0.7086,
+      "step": 33500
+    },
+    {
+      "epoch": 2.45034730451587,
+      "grad_norm": 0.9076369404792786,
+      "learning_rate": 3.709026478984152e-05,
+      "loss": 0.7052,
+      "step": 33600
+    },
+    {
+      "epoch": 2.457639787788737,
+      "grad_norm": 0.8954334855079651,
+      "learning_rate": 3.6598090363224727e-05,
+      "loss": 0.7082,
+      "step": 33700
+    },
+    {
+      "epoch": 2.464932271061603,
+      "grad_norm": 0.9315345287322998,
+      "learning_rate": 3.610591593660794e-05,
+      "loss": 0.7058,
+      "step": 33800
+    },
+    {
+      "epoch": 2.4722247543344698,
+      "grad_norm": 0.9223620295524597,
+      "learning_rate": 3.5613741509991144e-05,
+      "loss": 0.6992,
+      "step": 33900
+    },
+    {
+      "epoch": 2.479517237607336,
+      "grad_norm": 0.9349290132522583,
+      "learning_rate": 3.512156708337435e-05,
+      "loss": 0.7084,
+      "step": 34000
+    },
+    {
+      "epoch": 2.479517237607336,
+      "eval_loss": 0.7087690234184265,
+      "eval_runtime": 60.8859,
+      "eval_samples_per_second": 147.078,
+      "eval_steps_per_second": 18.395,
+      "step": 34000
+    },
+    {
+      "epoch": 2.4868097208802027,
+      "grad_norm": 0.883210301399231,
+      "learning_rate": 3.462939265675756e-05,
+      "loss": 0.7061,
+      "step": 34100
+    },
+    {
+      "epoch": 2.4941022041530694,
+      "grad_norm": 0.920868456363678,
+      "learning_rate": 3.413721823014077e-05,
+      "loss": 0.7069,
+      "step": 34200
+    },
+    {
+      "epoch": 2.5013946874259356,
+      "grad_norm": 0.9177393913269043,
+      "learning_rate": 3.3645043803523966e-05,
+      "loss": 0.7071,
+      "step": 34300
+    },
+    {
+      "epoch": 2.5086871706988023,
+      "grad_norm": 0.9114101529121399,
+      "learning_rate": 3.315286937690717e-05,
+      "loss": 0.7072,
+      "step": 34400
+    },
+    {
+      "epoch": 2.5159796539716686,
+      "grad_norm": 0.9645174145698547,
+      "learning_rate": 3.2660694950290384e-05,
+      "loss": 0.7028,
+      "step": 34500
+    },
+    {
+      "epoch": 2.5232721372445353,
+      "grad_norm": 0.8982295989990234,
+      "learning_rate": 3.216852052367359e-05,
+      "loss": 0.7085,
+      "step": 34600
+    },
+    {
+      "epoch": 2.530564620517402,
+      "grad_norm": 0.8964338898658752,
+      "learning_rate": 3.1676346097056795e-05,
+      "loss": 0.7069,
+      "step": 34700
+    },
+    {
+      "epoch": 2.537857103790268,
+      "grad_norm": 0.9609666466712952,
+      "learning_rate": 3.118417167044001e-05,
+      "loss": 0.7057,
+      "step": 34800
+    },
+    {
+      "epoch": 2.5451495870631344,
+      "grad_norm": 0.9131038188934326,
+      "learning_rate": 3.069199724382321e-05,
+      "loss": 0.7031,
+      "step": 34900
+    },
+    {
+      "epoch": 2.552442070336001,
+      "grad_norm": 0.9127321839332581,
+      "learning_rate": 3.019982281720642e-05,
+      "loss": 0.6979,
+      "step": 35000
+    },
+    {
+      "epoch": 2.552442070336001,
+      "eval_loss": 0.7076790928840637,
+      "eval_runtime": 61.0966,
+      "eval_samples_per_second": 146.571,
+      "eval_steps_per_second": 18.332,
+      "step": 35000
+    },
+    {
+      "epoch": 2.559734553608868,
+      "grad_norm": 0.9567495584487915,
+      "learning_rate": 2.9707648390589628e-05,
+      "loss": 0.7053,
+      "step": 35100
+    },
+    {
+      "epoch": 2.567027036881734,
+      "grad_norm": 0.9740573763847351,
+      "learning_rate": 2.9215473963972833e-05,
+      "loss": 0.7077,
+      "step": 35200
+    },
+    {
+      "epoch": 2.5743195201546007,
+      "grad_norm": 0.8982974886894226,
+      "learning_rate": 2.8723299537356042e-05,
+      "loss": 0.6983,
+      "step": 35300
+    },
+    {
+      "epoch": 2.581612003427467,
+      "grad_norm": 1.0185188055038452,
+      "learning_rate": 2.8231125110739248e-05,
+      "loss": 0.7069,
+      "step": 35400
+    },
+    {
+      "epoch": 2.5889044867003337,
+      "grad_norm": 0.94049471616745,
+      "learning_rate": 2.7738950684122457e-05,
+      "loss": 0.7054,
+      "step": 35500
+    },
+    {
+      "epoch": 2.5961969699732004,
+      "grad_norm": 0.8923749923706055,
+      "learning_rate": 2.7246776257505662e-05,
+      "loss": 0.7015,
+      "step": 35600
+    },
+    {
+      "epoch": 2.6034894532460666,
+      "grad_norm": 0.9568887948989868,
+      "learning_rate": 2.675460183088887e-05,
+      "loss": 0.7025,
+      "step": 35700
+    },
+    {
+      "epoch": 2.610781936518933,
+      "grad_norm": 0.9106321334838867,
+      "learning_rate": 2.6262427404272077e-05,
+      "loss": 0.7049,
+      "step": 35800
+    },
+    {
+      "epoch": 2.6180744197917996,
+      "grad_norm": 0.9499268531799316,
+      "learning_rate": 2.5770252977655285e-05,
+      "loss": 0.7021,
+      "step": 35900
+    },
+    {
+      "epoch": 2.6253669030646662,
+      "grad_norm": 0.8965421915054321,
+      "learning_rate": 2.5278078551038488e-05,
+      "loss": 0.7036,
+      "step": 36000
+    },
+    {
+      "epoch": 2.6253669030646662,
+      "eval_loss": 0.7065343856811523,
+      "eval_runtime": 61.0446,
+      "eval_samples_per_second": 146.696,
+      "eval_steps_per_second": 18.347,
+      "step": 36000
+    },
+    {
+      "epoch": 2.6326593863375325,
+      "grad_norm": 0.94576096534729,
+      "learning_rate": 2.4785904124421696e-05,
+      "loss": 0.71,
+      "step": 36100
+    },
+    {
+      "epoch": 2.639951869610399,
+      "grad_norm": 0.962692141532898,
+      "learning_rate": 2.4293729697804905e-05,
+      "loss": 0.6953,
+      "step": 36200
+    },
+    {
+      "epoch": 2.6472443528832654,
+      "grad_norm": 0.9457094669342041,
+      "learning_rate": 2.380155527118811e-05,
+      "loss": 0.7011,
+      "step": 36300
+    },
+    {
+      "epoch": 2.654536836156132,
+      "grad_norm": 0.9523045420646667,
+      "learning_rate": 2.330938084457132e-05,
+      "loss": 0.7093,
+      "step": 36400
+    },
+    {
+      "epoch": 2.661829319428999,
+      "grad_norm": 0.9255204796791077,
+      "learning_rate": 2.2817206417954522e-05,
+      "loss": 0.6979,
+      "step": 36500
+    },
+    {
+      "epoch": 2.669121802701865,
+      "grad_norm": 1.015286922454834,
+      "learning_rate": 2.232503199133773e-05,
+      "loss": 0.7044,
+      "step": 36600
+    },
+    {
+      "epoch": 2.6764142859747313,
+      "grad_norm": 0.8911315202713013,
+      "learning_rate": 2.1832857564720936e-05,
+      "loss": 0.7031,
+      "step": 36700
+    },
+    {
+      "epoch": 2.683706769247598,
+      "grad_norm": 0.9372689127922058,
+      "learning_rate": 2.1340683138104145e-05,
+      "loss": 0.7019,
+      "step": 36800
+    },
+    {
+      "epoch": 2.6909992525204647,
+      "grad_norm": 0.9245051145553589,
+      "learning_rate": 2.084850871148735e-05,
+      "loss": 0.7065,
+      "step": 36900
+    },
+    {
+      "epoch": 2.698291735793331,
+      "grad_norm": 0.917607843875885,
+      "learning_rate": 2.035633428487056e-05,
+      "loss": 0.7016,
+      "step": 37000
+    },
+    {
+      "epoch": 2.698291735793331,
+      "eval_loss": 0.7054994702339172,
+      "eval_runtime": 60.6541,
+      "eval_samples_per_second": 147.64,
+      "eval_steps_per_second": 18.465,
+      "step": 37000
+    },
+    {
+      "epoch": 2.7055842190661976,
+      "grad_norm": 0.9054610729217529,
+      "learning_rate": 1.9864159858253765e-05,
+      "loss": 0.7034,
+      "step": 37100
+    },
+    {
+      "epoch": 2.712876702339064,
+      "grad_norm": 0.960075855255127,
+      "learning_rate": 1.9371985431636974e-05,
+      "loss": 0.7097,
+      "step": 37200
+    },
+    {
+      "epoch": 2.7201691856119306,
+      "grad_norm": 0.9454420208930969,
+      "learning_rate": 1.887981100502018e-05,
+      "loss": 0.7046,
+      "step": 37300
+    },
+    {
+      "epoch": 2.7274616688847972,
+      "grad_norm": 0.8761453628540039,
+      "learning_rate": 1.8387636578403385e-05,
+      "loss": 0.7068,
+      "step": 37400
+    },
+    {
+      "epoch": 2.7347541521576635,
+      "grad_norm": 0.9231957793235779,
+      "learning_rate": 1.7895462151786594e-05,
+      "loss": 0.6983,
+      "step": 37500
+    },
+    {
+      "epoch": 2.7420466354305297,
+      "grad_norm": 0.8630309104919434,
+      "learning_rate": 1.74032877251698e-05,
+      "loss": 0.6984,
+      "step": 37600
+    },
+    {
+      "epoch": 2.7493391187033964,
+      "grad_norm": 0.9077728986740112,
+      "learning_rate": 1.691111329855301e-05,
+      "loss": 0.7097,
+      "step": 37700
+    },
+    {
+      "epoch": 2.756631601976263,
+      "grad_norm": 0.9849316477775574,
+      "learning_rate": 1.6418938871936214e-05,
+      "loss": 0.7025,
+      "step": 37800
+    },
+    {
+      "epoch": 2.7639240852491294,
+      "grad_norm": 0.9101927280426025,
+      "learning_rate": 1.5926764445319423e-05,
+      "loss": 0.7127,
+      "step": 37900
+    },
+    {
+      "epoch": 2.771216568521996,
+      "grad_norm": 0.9624613523483276,
+      "learning_rate": 1.543459001870263e-05,
+      "loss": 0.7038,
+      "step": 38000
+    },
+    {
+      "epoch": 2.771216568521996,
+      "eval_loss": 0.7042670845985413,
+      "eval_runtime": 60.6288,
+      "eval_samples_per_second": 147.702,
+      "eval_steps_per_second": 18.473,
+      "step": 38000
+    },
+    {
+      "epoch": 2.7785090517948623,
+      "grad_norm": 0.8926946520805359,
+      "learning_rate": 1.4942415592085838e-05,
+      "loss": 0.6955,
+      "step": 38100
+    },
+    {
+      "epoch": 2.785801535067729,
+      "grad_norm": 0.9353916645050049,
+      "learning_rate": 1.4450241165469041e-05,
+      "loss": 0.7003,
+      "step": 38200
+    },
+    {
+      "epoch": 2.7930940183405957,
+      "grad_norm": 0.9394625425338745,
+      "learning_rate": 1.3958066738852249e-05,
+      "loss": 0.6963,
+      "step": 38300
+    },
+    {
+      "epoch": 2.800386501613462,
+      "grad_norm": 0.8811284303665161,
+      "learning_rate": 1.3465892312235456e-05,
+      "loss": 0.7057,
+      "step": 38400
+    },
+    {
+      "epoch": 2.807678984886328,
+      "grad_norm": 0.9111167788505554,
+      "learning_rate": 1.2973717885618663e-05,
+      "loss": 0.6905,
+      "step": 38500
+    },
+    {
+      "epoch": 2.814971468159195,
+      "grad_norm": 0.9061198830604553,
+      "learning_rate": 1.248154345900187e-05,
+      "loss": 0.6966,
+      "step": 38600
+    },
+    {
+      "epoch": 2.8222639514320615,
+      "grad_norm": 0.917921781539917,
+      "learning_rate": 1.1989369032385078e-05,
+      "loss": 0.7055,
+      "step": 38700
+    },
+    {
+      "epoch": 2.829556434704928,
+      "grad_norm": 0.9210913777351379,
+      "learning_rate": 1.1497194605768285e-05,
+      "loss": 0.7004,
+      "step": 38800
+    },
+    {
+      "epoch": 2.8368489179777945,
+      "grad_norm": 0.9152899384498596,
+      "learning_rate": 1.1005020179151492e-05,
+      "loss": 0.7065,
+      "step": 38900
+    },
+    {
+      "epoch": 2.8441414012506607,
+      "grad_norm": 0.9237668514251709,
+      "learning_rate": 1.05128457525347e-05,
+      "loss": 0.7027,
+      "step": 39000
+    },
+    {
+      "epoch": 2.8441414012506607,
+      "eval_loss": 0.7034493088722229,
+      "eval_runtime": 60.6775,
+      "eval_samples_per_second": 147.583,
+      "eval_steps_per_second": 18.458,
+      "step": 39000
+    },
+    {
+      "epoch": 2.8514338845235274,
+      "grad_norm": 0.9577778577804565,
+      "learning_rate": 1.0020671325917906e-05,
+      "loss": 0.7064,
+      "step": 39100
+    },
+    {
+      "epoch": 2.858726367796394,
+      "grad_norm": 0.9955913424491882,
+      "learning_rate": 9.528496899301114e-06,
+      "loss": 0.7017,
+      "step": 39200
+    },
+    {
+      "epoch": 2.8660188510692604,
+      "grad_norm": 0.9187660217285156,
+      "learning_rate": 9.03632247268432e-06,
+      "loss": 0.6998,
+      "step": 39300
+    },
+    {
+      "epoch": 2.8733113343421266,
+      "grad_norm": 0.9275550842285156,
+      "learning_rate": 8.544148046067526e-06,
+      "loss": 0.7002,
+      "step": 39400
+    },
+    {
+      "epoch": 2.8806038176149933,
+      "grad_norm": 0.9114721417427063,
+      "learning_rate": 8.051973619450734e-06,
+      "loss": 0.7027,
+      "step": 39500
+    },
+    {
+      "epoch": 2.88789630088786,
+      "grad_norm": 0.9408327341079712,
+      "learning_rate": 7.559799192833941e-06,
+      "loss": 0.7034,
+      "step": 39600
+    },
+    {
+      "epoch": 2.8951887841607262,
+      "grad_norm": 0.9538366198539734,
+      "learning_rate": 7.067624766217147e-06,
+      "loss": 0.7007,
+      "step": 39700
+    },
+    {
+      "epoch": 2.902481267433593,
+      "grad_norm": 0.923864483833313,
+      "learning_rate": 6.5754503396003544e-06,
+      "loss": 0.6972,
+      "step": 39800
+    },
+    {
+      "epoch": 2.909773750706459,
+      "grad_norm": 0.9156636595726013,
+      "learning_rate": 6.083275912983562e-06,
+      "loss": 0.7064,
+      "step": 39900
+    },
+    {
+      "epoch": 2.917066233979326,
+      "grad_norm": 0.9568312168121338,
+      "learning_rate": 5.591101486366768e-06,
+      "loss": 0.6969,
+      "step": 40000
+    },
+    {
+      "epoch": 2.917066233979326,
+      "eval_loss": 0.7027888894081116,
+      "eval_runtime": 61.1155,
+      "eval_samples_per_second": 146.526,
+      "eval_steps_per_second": 18.326,
+      "step": 40000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 41136,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.07908337664e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-40000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ffa18fa243cccfbf729510f7d83fcb184f78dfbd7718a3073ec148d996a46094
+size 5713

checkpoint-41000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-0.5B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-41000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:017b6d860b5fcff897c917d1a4d7a6873cc30ff17c4c60eb2ea531f2316f5089
+size 8676008

checkpoint-41000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c58566385e27a03801d627eadfb8c636afe591b8b5676a485b59a3c481f608a
+size 8878080

checkpoint-41000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ceccadd7f022265a48e33d102c69d7418bb357fd78b83f973f75d540e8752845
+size 1465

checkpoint-41000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3231 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.989991066707991,
+  "eval_steps": 1000,
+  "global_step": 41000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007292483272866493,
+      "grad_norm": 2.1235318183898926,
+      "learning_rate": 4e-05,
+      "loss": 2.7429,
+      "step": 100
+    },
+    {
+      "epoch": 0.014584966545732986,
+      "grad_norm": 1.9533482789993286,
+      "learning_rate": 8e-05,
+      "loss": 1.4786,
+      "step": 200
+    },
+    {
+      "epoch": 0.02187744981859948,
+      "grad_norm": 1.5908012390136719,
+      "learning_rate": 0.00012,
+      "loss": 1.252,
+      "step": 300
+    },
+    {
+      "epoch": 0.029169933091465972,
+      "grad_norm": 1.592781662940979,
+      "learning_rate": 0.00016,
+      "loss": 1.1674,
+      "step": 400
+    },
+    {
+      "epoch": 0.036462416364332464,
+      "grad_norm": 1.4071415662765503,
+      "learning_rate": 0.0002,
+      "loss": 1.101,
+      "step": 500
+    },
+    {
+      "epoch": 0.04375489963719896,
+      "grad_norm": 1.4228886365890503,
+      "learning_rate": 0.0001995078255733832,
+      "loss": 1.0487,
+      "step": 600
+    },
+    {
+      "epoch": 0.05104738291006545,
+      "grad_norm": 1.2705847024917603,
+      "learning_rate": 0.00019901565114676642,
+      "loss": 1.0119,
+      "step": 700
+    },
+    {
+      "epoch": 0.058339866182931945,
+      "grad_norm": 1.1770137548446655,
+      "learning_rate": 0.00019852347672014964,
+      "loss": 0.9906,
+      "step": 800
+    },
+    {
+      "epoch": 0.06563234945579843,
+      "grad_norm": 1.1681164503097534,
+      "learning_rate": 0.00019803130229353283,
+      "loss": 0.9645,
+      "step": 900
+    },
+    {
+      "epoch": 0.07292483272866493,
+      "grad_norm": 1.020504117012024,
+      "learning_rate": 0.00019753912786691605,
+      "loss": 0.9525,
+      "step": 1000
+    },
+    {
+      "epoch": 0.07292483272866493,
+      "eval_loss": 0.9407642483711243,
+      "eval_runtime": 61.0906,
+      "eval_samples_per_second": 146.586,
+      "eval_steps_per_second": 18.333,
+      "step": 1000
+    },
+    {
+      "epoch": 0.08021731600153142,
+      "grad_norm": 1.079444408416748,
+      "learning_rate": 0.00019704695344029924,
+      "loss": 0.9414,
+      "step": 1100
+    },
+    {
+      "epoch": 0.08750979927439792,
+      "grad_norm": 1.057377576828003,
+      "learning_rate": 0.00019655477901368246,
+      "loss": 0.9231,
+      "step": 1200
+    },
+    {
+      "epoch": 0.0948022825472644,
+      "grad_norm": 1.068018913269043,
+      "learning_rate": 0.00019606260458706568,
+      "loss": 0.9168,
+      "step": 1300
+    },
+    {
+      "epoch": 0.1020947658201309,
+      "grad_norm": 0.9460920095443726,
+      "learning_rate": 0.00019557043016044887,
+      "loss": 0.9031,
+      "step": 1400
+    },
+    {
+      "epoch": 0.1093872490929974,
+      "grad_norm": 1.056226134300232,
+      "learning_rate": 0.00019507825573383206,
+      "loss": 0.8901,
+      "step": 1500
+    },
+    {
+      "epoch": 0.11667973236586389,
+      "grad_norm": 1.0429835319519043,
+      "learning_rate": 0.00019458608130721528,
+      "loss": 0.8928,
+      "step": 1600
+    },
+    {
+      "epoch": 0.12397221563873038,
+      "grad_norm": 1.050790548324585,
+      "learning_rate": 0.0001940939068805985,
+      "loss": 0.8803,
+      "step": 1700
+    },
+    {
+      "epoch": 0.13126469891159687,
+      "grad_norm": 0.9586555361747742,
+      "learning_rate": 0.0001936017324539817,
+      "loss": 0.8809,
+      "step": 1800
+    },
+    {
+      "epoch": 0.13855718218446336,
+      "grad_norm": 0.985379159450531,
+      "learning_rate": 0.00019310955802736491,
+      "loss": 0.8743,
+      "step": 1900
+    },
+    {
+      "epoch": 0.14584966545732986,
+      "grad_norm": 0.9307010769844055,
+      "learning_rate": 0.00019261738360074813,
+      "loss": 0.8727,
+      "step": 2000
+    },
+    {
+      "epoch": 0.14584966545732986,
+      "eval_loss": 0.86456698179245,
+      "eval_runtime": 60.6283,
+      "eval_samples_per_second": 147.703,
+      "eval_steps_per_second": 18.473,
+      "step": 2000
+    },
+    {
+      "epoch": 0.15314214873019635,
+      "grad_norm": 1.0384063720703125,
+      "learning_rate": 0.00019212520917413133,
+      "loss": 0.8742,
+      "step": 2100
+    },
+    {
+      "epoch": 0.16043463200306285,
+      "grad_norm": 0.9662402868270874,
+      "learning_rate": 0.00019163303474751452,
+      "loss": 0.8661,
+      "step": 2200
+    },
+    {
+      "epoch": 0.16772711527592934,
+      "grad_norm": 0.9773098230361938,
+      "learning_rate": 0.00019114086032089774,
+      "loss": 0.8576,
+      "step": 2300
+    },
+    {
+      "epoch": 0.17501959854879584,
+      "grad_norm": 0.9672012329101562,
+      "learning_rate": 0.00019064868589428093,
+      "loss": 0.8595,
+      "step": 2400
+    },
+    {
+      "epoch": 0.1823120818216623,
+      "grad_norm": 0.9758124351501465,
+      "learning_rate": 0.00019015651146766415,
+      "loss": 0.8524,
+      "step": 2500
+    },
+    {
+      "epoch": 0.1896045650945288,
+      "grad_norm": 0.972232460975647,
+      "learning_rate": 0.00018966433704104737,
+      "loss": 0.8468,
+      "step": 2600
+    },
+    {
+      "epoch": 0.1968970483673953,
+      "grad_norm": 0.9417553544044495,
+      "learning_rate": 0.00018917216261443056,
+      "loss": 0.8412,
+      "step": 2700
+    },
+    {
+      "epoch": 0.2041895316402618,
+      "grad_norm": 0.9395071864128113,
+      "learning_rate": 0.00018867998818781375,
+      "loss": 0.8413,
+      "step": 2800
+    },
+    {
+      "epoch": 0.2114820149131283,
+      "grad_norm": 0.9951208233833313,
+      "learning_rate": 0.000188187813761197,
+      "loss": 0.8345,
+      "step": 2900
+    },
+    {
+      "epoch": 0.2187744981859948,
+      "grad_norm": 0.9656242728233337,
+      "learning_rate": 0.0001876956393345802,
+      "loss": 0.8317,
+      "step": 3000
+    },
+    {
+      "epoch": 0.2187744981859948,
+      "eval_loss": 0.8318613767623901,
+      "eval_runtime": 61.1356,
+      "eval_samples_per_second": 146.478,
+      "eval_steps_per_second": 18.32,
+      "step": 3000
+    },
+    {
+      "epoch": 0.22606698145886128,
+      "grad_norm": 0.8810185194015503,
+      "learning_rate": 0.00018720346490796338,
+      "loss": 0.8321,
+      "step": 3100
+    },
+    {
+      "epoch": 0.23335946473172778,
+      "grad_norm": 0.9199262857437134,
+      "learning_rate": 0.0001867112904813466,
+      "loss": 0.8406,
+      "step": 3200
+    },
+    {
+      "epoch": 0.24065194800459427,
+      "grad_norm": 0.9557051658630371,
+      "learning_rate": 0.00018621911605472982,
+      "loss": 0.8277,
+      "step": 3300
+    },
+    {
+      "epoch": 0.24794443127746077,
+      "grad_norm": 0.9777804017066956,
+      "learning_rate": 0.000185726941628113,
+      "loss": 0.8272,
+      "step": 3400
+    },
+    {
+      "epoch": 0.25523691455032727,
+      "grad_norm": 0.8856322169303894,
+      "learning_rate": 0.00018523476720149623,
+      "loss": 0.8256,
+      "step": 3500
+    },
+    {
+      "epoch": 0.26252939782319373,
+      "grad_norm": 0.9196017980575562,
+      "learning_rate": 0.00018474259277487942,
+      "loss": 0.8234,
+      "step": 3600
+    },
+    {
+      "epoch": 0.26982188109606026,
+      "grad_norm": 0.9568464159965515,
+      "learning_rate": 0.00018425041834826264,
+      "loss": 0.8193,
+      "step": 3700
+    },
+    {
+      "epoch": 0.2771143643689267,
+      "grad_norm": 0.9552770256996155,
+      "learning_rate": 0.00018375824392164583,
+      "loss": 0.8179,
+      "step": 3800
+    },
+    {
+      "epoch": 0.28440684764179325,
+      "grad_norm": 0.8997077345848083,
+      "learning_rate": 0.00018326606949502905,
+      "loss": 0.8138,
+      "step": 3900
+    },
+    {
+      "epoch": 0.2916993309146597,
+      "grad_norm": 0.8896480202674866,
+      "learning_rate": 0.00018277389506841224,
+      "loss": 0.8172,
+      "step": 4000
+    },
+    {
+      "epoch": 0.2916993309146597,
+      "eval_loss": 0.8123040199279785,
+      "eval_runtime": 60.7914,
+      "eval_samples_per_second": 147.307,
+      "eval_steps_per_second": 18.424,
+      "step": 4000
+    },
+    {
+      "epoch": 0.2989918141875262,
+      "grad_norm": 0.9520764350891113,
+      "learning_rate": 0.00018228172064179546,
+      "loss": 0.8183,
+      "step": 4100
+    },
+    {
+      "epoch": 0.3062842974603927,
+      "grad_norm": 0.9373065233230591,
+      "learning_rate": 0.00018178954621517868,
+      "loss": 0.8132,
+      "step": 4200
+    },
+    {
+      "epoch": 0.3135767807332592,
+      "grad_norm": 0.8733066916465759,
+      "learning_rate": 0.00018129737178856187,
+      "loss": 0.811,
+      "step": 4300
+    },
+    {
+      "epoch": 0.3208692640061257,
+      "grad_norm": 0.8866516351699829,
+      "learning_rate": 0.00018080519736194507,
+      "loss": 0.8093,
+      "step": 4400
+    },
+    {
+      "epoch": 0.32816174727899217,
+      "grad_norm": 0.9394953846931458,
+      "learning_rate": 0.00018031302293532828,
+      "loss": 0.8035,
+      "step": 4500
+    },
+    {
+      "epoch": 0.3354542305518587,
+      "grad_norm": 0.9133720993995667,
+      "learning_rate": 0.0001798208485087115,
+      "loss": 0.8054,
+      "step": 4600
+    },
+    {
+      "epoch": 0.34274671382472516,
+      "grad_norm": 0.9428606629371643,
+      "learning_rate": 0.0001793286740820947,
+      "loss": 0.8076,
+      "step": 4700
+    },
+    {
+      "epoch": 0.3500391970975917,
+      "grad_norm": 0.8996593356132507,
+      "learning_rate": 0.00017883649965547792,
+      "loss": 0.812,
+      "step": 4800
+    },
+    {
+      "epoch": 0.35733168037045815,
+      "grad_norm": 0.9113749265670776,
+      "learning_rate": 0.0001783443252288611,
+      "loss": 0.8048,
+      "step": 4900
+    },
+    {
+      "epoch": 0.3646241636433246,
+      "grad_norm": 0.9185646176338196,
+      "learning_rate": 0.00017785215080224433,
+      "loss": 0.8023,
+      "step": 5000
+    },
+    {
+      "epoch": 0.3646241636433246,
+      "eval_loss": 0.7973803877830505,
+      "eval_runtime": 60.8068,
+      "eval_samples_per_second": 147.27,
+      "eval_steps_per_second": 18.419,
+      "step": 5000
+    },
+    {
+      "epoch": 0.37191664691619114,
+      "grad_norm": 0.8994658589363098,
+      "learning_rate": 0.00017735997637562755,
+      "loss": 0.8089,
+      "step": 5100
+    },
+    {
+      "epoch": 0.3792091301890576,
+      "grad_norm": 0.8724523782730103,
+      "learning_rate": 0.00017686780194901074,
+      "loss": 0.8015,
+      "step": 5200
+    },
+    {
+      "epoch": 0.38650161346192413,
+      "grad_norm": 0.8285540342330933,
+      "learning_rate": 0.00017637562752239393,
+      "loss": 0.7944,
+      "step": 5300
+    },
+    {
+      "epoch": 0.3937940967347906,
+      "grad_norm": 0.8982509970664978,
+      "learning_rate": 0.00017588345309577718,
+      "loss": 0.7952,
+      "step": 5400
+    },
+    {
+      "epoch": 0.4010865800076571,
+      "grad_norm": 0.9266172051429749,
+      "learning_rate": 0.00017539127866916037,
+      "loss": 0.7978,
+      "step": 5500
+    },
+    {
+      "epoch": 0.4083790632805236,
+      "grad_norm": 0.901662290096283,
+      "learning_rate": 0.00017489910424254356,
+      "loss": 0.7966,
+      "step": 5600
+    },
+    {
+      "epoch": 0.4156715465533901,
+      "grad_norm": 0.9309051036834717,
+      "learning_rate": 0.00017440692981592678,
+      "loss": 0.7975,
+      "step": 5700
+    },
+    {
+      "epoch": 0.4229640298262566,
+      "grad_norm": 0.8789328336715698,
+      "learning_rate": 0.00017391475538930997,
+      "loss": 0.7997,
+      "step": 5800
+    },
+    {
+      "epoch": 0.4302565130991231,
+      "grad_norm": 0.8636139035224915,
+      "learning_rate": 0.0001734225809626932,
+      "loss": 0.7914,
+      "step": 5900
+    },
+    {
+      "epoch": 0.4375489963719896,
+      "grad_norm": 0.9468287229537964,
+      "learning_rate": 0.00017293040653607638,
+      "loss": 0.7859,
+      "step": 6000
+    },
+    {
+      "epoch": 0.4375489963719896,
+      "eval_loss": 0.7869976162910461,
+      "eval_runtime": 60.7741,
+      "eval_samples_per_second": 147.349,
+      "eval_steps_per_second": 18.429,
+      "step": 6000
+    },
+    {
+      "epoch": 0.44484147964485604,
+      "grad_norm": 0.867158055305481,
+      "learning_rate": 0.0001724382321094596,
+      "loss": 0.7924,
+      "step": 6100
+    },
+    {
+      "epoch": 0.45213396291772256,
+      "grad_norm": 0.9379836320877075,
+      "learning_rate": 0.0001719460576828428,
+      "loss": 0.7902,
+      "step": 6200
+    },
+    {
+      "epoch": 0.45942644619058903,
+      "grad_norm": 0.8591951727867126,
+      "learning_rate": 0.000171453883256226,
+      "loss": 0.7926,
+      "step": 6300
+    },
+    {
+      "epoch": 0.46671892946345556,
+      "grad_norm": 0.9702317118644714,
+      "learning_rate": 0.00017096170882960923,
+      "loss": 0.7867,
+      "step": 6400
+    },
+    {
+      "epoch": 0.474011412736322,
+      "grad_norm": 0.902302086353302,
+      "learning_rate": 0.00017046953440299242,
+      "loss": 0.7897,
+      "step": 6500
+    },
+    {
+      "epoch": 0.48130389600918855,
+      "grad_norm": 0.889926552772522,
+      "learning_rate": 0.00016997735997637561,
+      "loss": 0.7857,
+      "step": 6600
+    },
+    {
+      "epoch": 0.488596379282055,
+      "grad_norm": 0.8906420469284058,
+      "learning_rate": 0.00016948518554975886,
+      "loss": 0.7878,
+      "step": 6700
+    },
+    {
+      "epoch": 0.49588886255492154,
+      "grad_norm": 0.919983983039856,
+      "learning_rate": 0.00016899301112314205,
+      "loss": 0.7876,
+      "step": 6800
+    },
+    {
+      "epoch": 0.5031813458277881,
+      "grad_norm": 0.8610624670982361,
+      "learning_rate": 0.00016850083669652524,
+      "loss": 0.7923,
+      "step": 6900
+    },
+    {
+      "epoch": 0.5104738291006545,
+      "grad_norm": 0.9339637160301208,
+      "learning_rate": 0.00016800866226990846,
+      "loss": 0.7837,
+      "step": 7000
+    },
+    {
+      "epoch": 0.5104738291006545,
+      "eval_loss": 0.7791191935539246,
+      "eval_runtime": 60.8878,
+      "eval_samples_per_second": 147.074,
+      "eval_steps_per_second": 18.395,
+      "step": 7000
+    },
+    {
+      "epoch": 0.517766312373521,
+      "grad_norm": 0.9073446393013,
+      "learning_rate": 0.00016751648784329168,
+      "loss": 0.7809,
+      "step": 7100
+    },
+    {
+      "epoch": 0.5250587956463875,
+      "grad_norm": 0.9348235726356506,
+      "learning_rate": 0.00016702431341667487,
+      "loss": 0.7793,
+      "step": 7200
+    },
+    {
+      "epoch": 0.5323512789192539,
+      "grad_norm": 0.9155163168907166,
+      "learning_rate": 0.0001665321389900581,
+      "loss": 0.7821,
+      "step": 7300
+    },
+    {
+      "epoch": 0.5396437621921205,
+      "grad_norm": 0.9328250885009766,
+      "learning_rate": 0.00016603996456344129,
+      "loss": 0.7806,
+      "step": 7400
+    },
+    {
+      "epoch": 0.546936245464987,
+      "grad_norm": 0.8911275863647461,
+      "learning_rate": 0.00016554779013682448,
+      "loss": 0.7782,
+      "step": 7500
+    },
+    {
+      "epoch": 0.5542287287378534,
+      "grad_norm": 0.8989250659942627,
+      "learning_rate": 0.00016505561571020772,
+      "loss": 0.779,
+      "step": 7600
+    },
+    {
+      "epoch": 0.5615212120107199,
+      "grad_norm": 0.8869723081588745,
+      "learning_rate": 0.00016456344128359092,
+      "loss": 0.7822,
+      "step": 7700
+    },
+    {
+      "epoch": 0.5688136952835865,
+      "grad_norm": 0.8631371259689331,
+      "learning_rate": 0.0001640712668569741,
+      "loss": 0.7768,
+      "step": 7800
+    },
+    {
+      "epoch": 0.576106178556453,
+      "grad_norm": 0.8868420720100403,
+      "learning_rate": 0.00016357909243035733,
+      "loss": 0.7834,
+      "step": 7900
+    },
+    {
+      "epoch": 0.5833986618293194,
+      "grad_norm": 0.9253202080726624,
+      "learning_rate": 0.00016308691800374055,
+      "loss": 0.773,
+      "step": 8000
+    },
+    {
+      "epoch": 0.5833986618293194,
+      "eval_loss": 0.7733862400054932,
+      "eval_runtime": 60.8911,
+      "eval_samples_per_second": 147.066,
+      "eval_steps_per_second": 18.394,
+      "step": 8000
+    },
+    {
+      "epoch": 0.5906911451021859,
+      "grad_norm": 0.830760657787323,
+      "learning_rate": 0.00016259474357712374,
+      "loss": 0.7756,
+      "step": 8100
+    },
+    {
+      "epoch": 0.5979836283750524,
+      "grad_norm": 0.9371838569641113,
+      "learning_rate": 0.00016210256915050696,
+      "loss": 0.776,
+      "step": 8200
+    },
+    {
+      "epoch": 0.605276111647919,
+      "grad_norm": 0.8486947417259216,
+      "learning_rate": 0.00016161039472389015,
+      "loss": 0.7758,
+      "step": 8300
+    },
+    {
+      "epoch": 0.6125685949207854,
+      "grad_norm": 0.8888623118400574,
+      "learning_rate": 0.00016111822029727337,
+      "loss": 0.783,
+      "step": 8400
+    },
+    {
+      "epoch": 0.6198610781936519,
+      "grad_norm": 0.9176976084709167,
+      "learning_rate": 0.00016062604587065656,
+      "loss": 0.7782,
+      "step": 8500
+    },
+    {
+      "epoch": 0.6271535614665184,
+      "grad_norm": 0.90993732213974,
+      "learning_rate": 0.00016013387144403978,
+      "loss": 0.7741,
+      "step": 8600
+    },
+    {
+      "epoch": 0.6344460447393849,
+      "grad_norm": 0.8461544513702393,
+      "learning_rate": 0.00015964169701742297,
+      "loss": 0.7782,
+      "step": 8700
+    },
+    {
+      "epoch": 0.6417385280122514,
+      "grad_norm": 0.8642047643661499,
+      "learning_rate": 0.0001591495225908062,
+      "loss": 0.7706,
+      "step": 8800
+    },
+    {
+      "epoch": 0.6490310112851179,
+      "grad_norm": 0.8944571018218994,
+      "learning_rate": 0.0001586573481641894,
+      "loss": 0.7727,
+      "step": 8900
+    },
+    {
+      "epoch": 0.6563234945579843,
+      "grad_norm": 0.9075286984443665,
+      "learning_rate": 0.0001581651737375726,
+      "loss": 0.7748,
+      "step": 9000
+    },
+    {
+      "epoch": 0.6563234945579843,
+      "eval_loss": 0.7666329741477966,
+      "eval_runtime": 60.5924,
+      "eval_samples_per_second": 147.791,
+      "eval_steps_per_second": 18.484,
+      "step": 9000
+    },
+    {
+      "epoch": 0.6636159778308508,
+      "grad_norm": 0.9164955615997314,
+      "learning_rate": 0.0001576729993109558,
+      "loss": 0.7792,
+      "step": 9100
+    },
+    {
+      "epoch": 0.6709084611037174,
+      "grad_norm": 0.8446054458618164,
+      "learning_rate": 0.000157180824884339,
+      "loss": 0.7661,
+      "step": 9200
+    },
+    {
+      "epoch": 0.6782009443765838,
+      "grad_norm": 0.8793991804122925,
+      "learning_rate": 0.00015668865045772223,
+      "loss": 0.7678,
+      "step": 9300
+    },
+    {
+      "epoch": 0.6854934276494503,
+      "grad_norm": 0.8772592544555664,
+      "learning_rate": 0.00015619647603110542,
+      "loss": 0.7708,
+      "step": 9400
+    },
+    {
+      "epoch": 0.6927859109223168,
+      "grad_norm": 0.854118824005127,
+      "learning_rate": 0.00015570430160448864,
+      "loss": 0.7616,
+      "step": 9500
+    },
+    {
+      "epoch": 0.7000783941951834,
+      "grad_norm": 0.8653910756111145,
+      "learning_rate": 0.00015521212717787183,
+      "loss": 0.767,
+      "step": 9600
+    },
+    {
+      "epoch": 0.7073708774680498,
+      "grad_norm": 0.8890120387077332,
+      "learning_rate": 0.00015471995275125505,
+      "loss": 0.7657,
+      "step": 9700
+    },
+    {
+      "epoch": 0.7146633607409163,
+      "grad_norm": 0.8451828360557556,
+      "learning_rate": 0.00015422777832463827,
+      "loss": 0.7656,
+      "step": 9800
+    },
+    {
+      "epoch": 0.7219558440137828,
+      "grad_norm": 0.9029329419136047,
+      "learning_rate": 0.00015373560389802146,
+      "loss": 0.7749,
+      "step": 9900
+    },
+    {
+      "epoch": 0.7292483272866492,
+      "grad_norm": 0.8538834452629089,
+      "learning_rate": 0.00015324342947140466,
+      "loss": 0.763,
+      "step": 10000
+    },
+    {
+      "epoch": 0.7292483272866492,
+      "eval_loss": 0.76123046875,
+      "eval_runtime": 60.847,
+      "eval_samples_per_second": 147.172,
+      "eval_steps_per_second": 18.407,
+      "step": 10000
+    },
+    {
+      "epoch": 0.7365408105595158,
+      "grad_norm": 0.8594367504119873,
+      "learning_rate": 0.00015275125504478788,
+      "loss": 0.7693,
+      "step": 10100
+    },
+    {
+      "epoch": 0.7438332938323823,
+      "grad_norm": 0.8748040199279785,
+      "learning_rate": 0.0001522590806181711,
+      "loss": 0.7684,
+      "step": 10200
+    },
+    {
+      "epoch": 0.7511257771052487,
+      "grad_norm": 0.9177483320236206,
+      "learning_rate": 0.0001517669061915543,
+      "loss": 0.7599,
+      "step": 10300
+    },
+    {
+      "epoch": 0.7584182603781152,
+      "grad_norm": 0.8988757729530334,
+      "learning_rate": 0.0001512747317649375,
+      "loss": 0.7648,
+      "step": 10400
+    },
+    {
+      "epoch": 0.7657107436509818,
+      "grad_norm": 0.8735676407814026,
+      "learning_rate": 0.00015078255733832073,
+      "loss": 0.7656,
+      "step": 10500
+    },
+    {
+      "epoch": 0.7730032269238483,
+      "grad_norm": 0.8750614523887634,
+      "learning_rate": 0.00015029038291170392,
+      "loss": 0.7632,
+      "step": 10600
+    },
+    {
+      "epoch": 0.7802957101967147,
+      "grad_norm": 0.8786306381225586,
+      "learning_rate": 0.0001497982084850871,
+      "loss": 0.7659,
+      "step": 10700
+    },
+    {
+      "epoch": 0.7875881934695812,
+      "grad_norm": 0.811834990978241,
+      "learning_rate": 0.00014930603405847033,
+      "loss": 0.7652,
+      "step": 10800
+    },
+    {
+      "epoch": 0.7948806767424477,
+      "grad_norm": 0.8844282031059265,
+      "learning_rate": 0.00014881385963185352,
+      "loss": 0.7623,
+      "step": 10900
+    },
+    {
+      "epoch": 0.8021731600153142,
+      "grad_norm": 0.8444844484329224,
+      "learning_rate": 0.00014832168520523674,
+      "loss": 0.7622,
+      "step": 11000
+    },
+    {
+      "epoch": 0.8021731600153142,
+      "eval_loss": 0.75812828540802,
+      "eval_runtime": 60.7569,
+      "eval_samples_per_second": 147.391,
+      "eval_steps_per_second": 18.434,
+      "step": 11000
+    },
+    {
+      "epoch": 0.8094656432881807,
+      "grad_norm": 0.8396947979927063,
+      "learning_rate": 0.00014782951077861996,
+      "loss": 0.7673,
+      "step": 11100
+    },
+    {
+      "epoch": 0.8167581265610472,
+      "grad_norm": 0.8890758752822876,
+      "learning_rate": 0.00014733733635200315,
+      "loss": 0.7551,
+      "step": 11200
+    },
+    {
+      "epoch": 0.8240506098339136,
+      "grad_norm": 0.8038908839225769,
+      "learning_rate": 0.00014684516192538634,
+      "loss": 0.7612,
+      "step": 11300
+    },
+    {
+      "epoch": 0.8313430931067802,
+      "grad_norm": 0.8224745392799377,
+      "learning_rate": 0.0001463529874987696,
+      "loss": 0.7618,
+      "step": 11400
+    },
+    {
+      "epoch": 0.8386355763796467,
+      "grad_norm": 0.8691264390945435,
+      "learning_rate": 0.00014586081307215278,
+      "loss": 0.7618,
+      "step": 11500
+    },
+    {
+      "epoch": 0.8459280596525132,
+      "grad_norm": 0.8442777395248413,
+      "learning_rate": 0.00014536863864553597,
+      "loss": 0.7671,
+      "step": 11600
+    },
+    {
+      "epoch": 0.8532205429253796,
+      "grad_norm": 0.8520532846450806,
+      "learning_rate": 0.0001448764642189192,
+      "loss": 0.7625,
+      "step": 11700
+    },
+    {
+      "epoch": 0.8605130261982462,
+      "grad_norm": 0.908760666847229,
+      "learning_rate": 0.0001443842897923024,
+      "loss": 0.7615,
+      "step": 11800
+    },
+    {
+      "epoch": 0.8678055094711127,
+      "grad_norm": 0.8004080057144165,
+      "learning_rate": 0.0001438921153656856,
+      "loss": 0.7632,
+      "step": 11900
+    },
+    {
+      "epoch": 0.8750979927439791,
+      "grad_norm": 0.8449864983558655,
+      "learning_rate": 0.00014339994093906882,
+      "loss": 0.7574,
+      "step": 12000
+    },
+    {
+      "epoch": 0.8750979927439791,
+      "eval_loss": 0.752128005027771,
+      "eval_runtime": 61.1399,
+      "eval_samples_per_second": 146.467,
+      "eval_steps_per_second": 18.319,
+      "step": 12000
+    },
+    {
+      "epoch": 0.8823904760168456,
+      "grad_norm": 0.8218274116516113,
+      "learning_rate": 0.00014290776651245201,
+      "loss": 0.7555,
+      "step": 12100
+    },
+    {
+      "epoch": 0.8896829592897121,
+      "grad_norm": 0.8944920897483826,
+      "learning_rate": 0.00014241559208583523,
+      "loss": 0.7594,
+      "step": 12200
+    },
+    {
+      "epoch": 0.8969754425625787,
+      "grad_norm": 0.9254937767982483,
+      "learning_rate": 0.00014192341765921845,
+      "loss": 0.7598,
+      "step": 12300
+    },
+    {
+      "epoch": 0.9042679258354451,
+      "grad_norm": 0.8887091875076294,
+      "learning_rate": 0.00014143124323260164,
+      "loss": 0.7625,
+      "step": 12400
+    },
+    {
+      "epoch": 0.9115604091083116,
+      "grad_norm": 0.8478124737739563,
+      "learning_rate": 0.00014093906880598484,
+      "loss": 0.756,
+      "step": 12500
+    },
+    {
+      "epoch": 0.9188528923811781,
+      "grad_norm": 0.9377927780151367,
+      "learning_rate": 0.00014044689437936805,
+      "loss": 0.7606,
+      "step": 12600
+    },
+    {
+      "epoch": 0.9261453756540446,
+      "grad_norm": 0.838175892829895,
+      "learning_rate": 0.00013995471995275127,
+      "loss": 0.7605,
+      "step": 12700
+    },
+    {
+      "epoch": 0.9334378589269111,
+      "grad_norm": 0.8345216512680054,
+      "learning_rate": 0.00013946254552613447,
+      "loss": 0.7568,
+      "step": 12800
+    },
+    {
+      "epoch": 0.9407303421997776,
+      "grad_norm": 0.894477367401123,
+      "learning_rate": 0.00013897037109951766,
+      "loss": 0.7535,
+      "step": 12900
+    },
+    {
+      "epoch": 0.948022825472644,
+      "grad_norm": 0.849010169506073,
+      "learning_rate": 0.00013847819667290088,
+      "loss": 0.7465,
+      "step": 13000
+    },
+    {
+      "epoch": 0.948022825472644,
+      "eval_loss": 0.7492165565490723,
+      "eval_runtime": 60.7079,
+      "eval_samples_per_second": 147.51,
+      "eval_steps_per_second": 18.449,
+      "step": 13000
+    },
+    {
+      "epoch": 0.9553153087455105,
+      "grad_norm": 0.8754207491874695,
+      "learning_rate": 0.0001379860222462841,
+      "loss": 0.7576,
+      "step": 13100
+    },
+    {
+      "epoch": 0.9626077920183771,
+      "grad_norm": 0.8984807133674622,
+      "learning_rate": 0.0001374938478196673,
+      "loss": 0.7493,
+      "step": 13200
+    },
+    {
+      "epoch": 0.9699002752912436,
+      "grad_norm": 0.8458361029624939,
+      "learning_rate": 0.0001370016733930505,
+      "loss": 0.7468,
+      "step": 13300
+    },
+    {
+      "epoch": 0.97719275856411,
+      "grad_norm": 0.9169609546661377,
+      "learning_rate": 0.0001365094989664337,
+      "loss": 0.7515,
+      "step": 13400
+    },
+    {
+      "epoch": 0.9844852418369765,
+      "grad_norm": 0.8027638792991638,
+      "learning_rate": 0.00013601732453981692,
+      "loss": 0.7551,
+      "step": 13500
+    },
+    {
+      "epoch": 0.9917777251098431,
+      "grad_norm": 0.8572927117347717,
+      "learning_rate": 0.00013552515011320014,
+      "loss": 0.7481,
+      "step": 13600
+    },
+    {
+      "epoch": 0.9990702083827095,
+      "grad_norm": 0.8624053001403809,
+      "learning_rate": 0.00013503297568658333,
+      "loss": 0.7481,
+      "step": 13700
+    },
+    {
+      "epoch": 1.0063991540719404,
+      "grad_norm": 0.8915347456932068,
+      "learning_rate": 0.00013454080125996652,
+      "loss": 0.7463,
+      "step": 13800
+    },
+    {
+      "epoch": 1.0136916373448068,
+      "grad_norm": 0.8233557939529419,
+      "learning_rate": 0.00013404862683334977,
+      "loss": 0.7398,
+      "step": 13900
+    },
+    {
+      "epoch": 1.0209841206176733,
+      "grad_norm": 0.8467598557472229,
+      "learning_rate": 0.00013355645240673296,
+      "loss": 0.7402,
+      "step": 14000
+    },
+    {
+      "epoch": 1.0209841206176733,
+      "eval_loss": 0.7458442449569702,
+      "eval_runtime": 60.6887,
+      "eval_samples_per_second": 147.556,
+      "eval_steps_per_second": 18.455,
+      "step": 14000
+    },
+    {
+      "epoch": 1.0282766038905398,
+      "grad_norm": 0.852739691734314,
+      "learning_rate": 0.00013306427798011615,
+      "loss": 0.7436,
+      "step": 14100
+    },
+    {
+      "epoch": 1.0355690871634062,
+      "grad_norm": 0.8501101136207581,
+      "learning_rate": 0.00013257210355349937,
+      "loss": 0.7472,
+      "step": 14200
+    },
+    {
+      "epoch": 1.0428615704362727,
+      "grad_norm": 0.8830447793006897,
+      "learning_rate": 0.0001320799291268826,
+      "loss": 0.7438,
+      "step": 14300
+    },
+    {
+      "epoch": 1.0501540537091394,
+      "grad_norm": 0.8827272057533264,
+      "learning_rate": 0.00013158775470026578,
+      "loss": 0.7439,
+      "step": 14400
+    },
+    {
+      "epoch": 1.0574465369820059,
+      "grad_norm": 0.7875618934631348,
+      "learning_rate": 0.000131095580273649,
+      "loss": 0.7426,
+      "step": 14500
+    },
+    {
+      "epoch": 1.0647390202548723,
+      "grad_norm": 0.9906949996948242,
+      "learning_rate": 0.0001306034058470322,
+      "loss": 0.7418,
+      "step": 14600
+    },
+    {
+      "epoch": 1.0720315035277388,
+      "grad_norm": 0.8803852200508118,
+      "learning_rate": 0.00013011123142041538,
+      "loss": 0.7421,
+      "step": 14700
+    },
+    {
+      "epoch": 1.0793239868006053,
+      "grad_norm": 0.8951194286346436,
+      "learning_rate": 0.0001296190569937986,
+      "loss": 0.7429,
+      "step": 14800
+    },
+    {
+      "epoch": 1.0866164700734717,
+      "grad_norm": 0.8548495769500732,
+      "learning_rate": 0.00012912688256718182,
+      "loss": 0.7462,
+      "step": 14900
+    },
+    {
+      "epoch": 1.0939089533463382,
+      "grad_norm": 0.9326722025871277,
+      "learning_rate": 0.00012863470814056501,
+      "loss": 0.7515,
+      "step": 15000
+    },
+    {
+      "epoch": 1.0939089533463382,
+      "eval_loss": 0.7423983812332153,
+      "eval_runtime": 61.1091,
+      "eval_samples_per_second": 146.541,
+      "eval_steps_per_second": 18.328,
+      "step": 15000
+    },
+    {
+      "epoch": 1.1012014366192047,
+      "grad_norm": 0.8803513646125793,
+      "learning_rate": 0.00012814253371394823,
+      "loss": 0.7369,
+      "step": 15100
+    },
+    {
+      "epoch": 1.1084939198920711,
+      "grad_norm": 0.8555076122283936,
+      "learning_rate": 0.00012765035928733145,
+      "loss": 0.7414,
+      "step": 15200
+    },
+    {
+      "epoch": 1.1157864031649378,
+      "grad_norm": 0.8760358691215515,
+      "learning_rate": 0.00012715818486071464,
+      "loss": 0.741,
+      "step": 15300
+    },
+    {
+      "epoch": 1.1230788864378043,
+      "grad_norm": 0.8444579839706421,
+      "learning_rate": 0.00012666601043409784,
+      "loss": 0.7448,
+      "step": 15400
+    },
+    {
+      "epoch": 1.1303713697106708,
+      "grad_norm": 0.8995528221130371,
+      "learning_rate": 0.00012617383600748106,
+      "loss": 0.7436,
+      "step": 15500
+    },
+    {
+      "epoch": 1.1376638529835372,
+      "grad_norm": 0.8966475129127502,
+      "learning_rate": 0.00012568166158086427,
+      "loss": 0.7485,
+      "step": 15600
+    },
+    {
+      "epoch": 1.1449563362564037,
+      "grad_norm": 0.8527953028678894,
+      "learning_rate": 0.00012518948715424747,
+      "loss": 0.7303,
+      "step": 15700
+    },
+    {
+      "epoch": 1.1522488195292702,
+      "grad_norm": 0.8657513856887817,
+      "learning_rate": 0.00012469731272763069,
+      "loss": 0.7431,
+      "step": 15800
+    },
+    {
+      "epoch": 1.1595413028021366,
+      "grad_norm": 0.8745185136795044,
+      "learning_rate": 0.00012420513830101388,
+      "loss": 0.7426,
+      "step": 15900
+    },
+    {
+      "epoch": 1.166833786075003,
+      "grad_norm": 0.8729378581047058,
+      "learning_rate": 0.0001237129638743971,
+      "loss": 0.7389,
+      "step": 16000
+    },
+    {
+      "epoch": 1.166833786075003,
+      "eval_loss": 0.740699291229248,
+      "eval_runtime": 60.635,
+      "eval_samples_per_second": 147.687,
+      "eval_steps_per_second": 18.471,
+      "step": 16000
+    },
+    {
+      "epoch": 1.1741262693478696,
+      "grad_norm": 0.8877021670341492,
+      "learning_rate": 0.00012322078944778032,
+      "loss": 0.7419,
+      "step": 16100
+    },
+    {
+      "epoch": 1.1814187526207363,
+      "grad_norm": 0.9095293283462524,
+      "learning_rate": 0.0001227286150211635,
+      "loss": 0.7365,
+      "step": 16200
+    },
+    {
+      "epoch": 1.1887112358936027,
+      "grad_norm": 0.8597880601882935,
+      "learning_rate": 0.0001222364405945467,
+      "loss": 0.7336,
+      "step": 16300
+    },
+    {
+      "epoch": 1.1960037191664692,
+      "grad_norm": 0.9574359059333801,
+      "learning_rate": 0.0001217442661679299,
+      "loss": 0.7394,
+      "step": 16400
+    },
+    {
+      "epoch": 1.2032962024393357,
+      "grad_norm": 0.8484875559806824,
+      "learning_rate": 0.00012125209174131314,
+      "loss": 0.7392,
+      "step": 16500
+    },
+    {
+      "epoch": 1.2105886857122021,
+      "grad_norm": 0.8847618699073792,
+      "learning_rate": 0.00012075991731469633,
+      "loss": 0.7427,
+      "step": 16600
+    },
+    {
+      "epoch": 1.2178811689850686,
+      "grad_norm": 0.8780632019042969,
+      "learning_rate": 0.00012026774288807954,
+      "loss": 0.7399,
+      "step": 16700
+    },
+    {
+      "epoch": 1.225173652257935,
+      "grad_norm": 0.8698965311050415,
+      "learning_rate": 0.00011977556846146274,
+      "loss": 0.7395,
+      "step": 16800
+    },
+    {
+      "epoch": 1.2324661355308015,
+      "grad_norm": 0.8717935085296631,
+      "learning_rate": 0.00011928339403484596,
+      "loss": 0.7404,
+      "step": 16900
+    },
+    {
+      "epoch": 1.239758618803668,
+      "grad_norm": 0.8375683426856995,
+      "learning_rate": 0.00011879121960822917,
+      "loss": 0.7405,
+      "step": 17000
+    },
+    {
+      "epoch": 1.239758618803668,
+      "eval_loss": 0.7371787428855896,
+      "eval_runtime": 60.9373,
+      "eval_samples_per_second": 146.954,
+      "eval_steps_per_second": 18.38,
+      "step": 17000
+    },
+    {
+      "epoch": 1.2470511020765347,
+      "grad_norm": 0.8756095170974731,
+      "learning_rate": 0.00011829904518161237,
+      "loss": 0.736,
+      "step": 17100
+    },
+    {
+      "epoch": 1.2543435853494012,
+      "grad_norm": 0.8513076901435852,
+      "learning_rate": 0.00011780687075499556,
+      "loss": 0.7399,
+      "step": 17200
+    },
+    {
+      "epoch": 1.2616360686222676,
+      "grad_norm": 0.8297843337059021,
+      "learning_rate": 0.0001173146963283788,
+      "loss": 0.7406,
+      "step": 17300
+    },
+    {
+      "epoch": 1.268928551895134,
+      "grad_norm": 0.8896269202232361,
+      "learning_rate": 0.00011682252190176199,
+      "loss": 0.7346,
+      "step": 17400
+    },
+    {
+      "epoch": 1.2762210351680006,
+      "grad_norm": 0.874168336391449,
+      "learning_rate": 0.0001163303474751452,
+      "loss": 0.736,
+      "step": 17500
+    },
+    {
+      "epoch": 1.283513518440867,
+      "grad_norm": 0.9101394414901733,
+      "learning_rate": 0.0001158381730485284,
+      "loss": 0.7376,
+      "step": 17600
+    },
+    {
+      "epoch": 1.2908060017137335,
+      "grad_norm": 0.9011333584785461,
+      "learning_rate": 0.00011534599862191162,
+      "loss": 0.7361,
+      "step": 17700
+    },
+    {
+      "epoch": 1.2980984849866002,
+      "grad_norm": 0.8839349746704102,
+      "learning_rate": 0.00011485382419529482,
+      "loss": 0.7373,
+      "step": 17800
+    },
+    {
+      "epoch": 1.3053909682594664,
+      "grad_norm": 0.830528974533081,
+      "learning_rate": 0.00011436164976867803,
+      "loss": 0.7336,
+      "step": 17900
+    },
+    {
+      "epoch": 1.3126834515323331,
+      "grad_norm": 0.8777081370353699,
+      "learning_rate": 0.00011386947534206122,
+      "loss": 0.7379,
+      "step": 18000
+    },
+    {
+      "epoch": 1.3126834515323331,
+      "eval_loss": 0.7359282970428467,
+      "eval_runtime": 60.8023,
+      "eval_samples_per_second": 147.281,
+      "eval_steps_per_second": 18.42,
+      "step": 18000
+    },
+    {
+      "epoch": 1.3199759348051996,
+      "grad_norm": 0.8853510022163391,
+      "learning_rate": 0.00011337730091544443,
+      "loss": 0.7376,
+      "step": 18100
+    },
+    {
+      "epoch": 1.327268418078066,
+      "grad_norm": 0.9219810366630554,
+      "learning_rate": 0.00011288512648882766,
+      "loss": 0.7399,
+      "step": 18200
+    },
+    {
+      "epoch": 1.3345609013509325,
+      "grad_norm": 0.9233282208442688,
+      "learning_rate": 0.00011239295206221085,
+      "loss": 0.7399,
+      "step": 18300
+    },
+    {
+      "epoch": 1.341853384623799,
+      "grad_norm": 0.8359719514846802,
+      "learning_rate": 0.00011190077763559406,
+      "loss": 0.7366,
+      "step": 18400
+    },
+    {
+      "epoch": 1.3491458678966655,
+      "grad_norm": 0.8673479557037354,
+      "learning_rate": 0.00011140860320897726,
+      "loss": 0.7398,
+      "step": 18500
+    },
+    {
+      "epoch": 1.356438351169532,
+      "grad_norm": 0.8565610647201538,
+      "learning_rate": 0.00011091642878236048,
+      "loss": 0.7278,
+      "step": 18600
+    },
+    {
+      "epoch": 1.3637308344423986,
+      "grad_norm": 0.8547226190567017,
+      "learning_rate": 0.00011042425435574369,
+      "loss": 0.7381,
+      "step": 18700
+    },
+    {
+      "epoch": 1.3710233177152649,
+      "grad_norm": 0.897081732749939,
+      "learning_rate": 0.00010993207992912688,
+      "loss": 0.7339,
+      "step": 18800
+    },
+    {
+      "epoch": 1.3783158009881316,
+      "grad_norm": 0.8852410912513733,
+      "learning_rate": 0.00010943990550251008,
+      "loss": 0.7342,
+      "step": 18900
+    },
+    {
+      "epoch": 1.385608284260998,
+      "grad_norm": 0.9213690161705017,
+      "learning_rate": 0.00010894773107589332,
+      "loss": 0.7389,
+      "step": 19000
+    },
+    {
+      "epoch": 1.385608284260998,
+      "eval_loss": 0.7335625886917114,
+      "eval_runtime": 60.8231,
+      "eval_samples_per_second": 147.23,
+      "eval_steps_per_second": 18.414,
+      "step": 19000
+    },
+    {
+      "epoch": 1.3929007675338645,
+      "grad_norm": 0.8398423790931702,
+      "learning_rate": 0.00010845555664927651,
+      "loss": 0.7274,
+      "step": 19100
+    },
+    {
+      "epoch": 1.400193250806731,
+      "grad_norm": 0.8863806128501892,
+      "learning_rate": 0.00010796338222265971,
+      "loss": 0.7331,
+      "step": 19200
+    },
+    {
+      "epoch": 1.4074857340795974,
+      "grad_norm": 0.8836521506309509,
+      "learning_rate": 0.00010747120779604292,
+      "loss": 0.7334,
+      "step": 19300
+    },
+    {
+      "epoch": 1.414778217352464,
+      "grad_norm": 0.8278964757919312,
+      "learning_rate": 0.00010697903336942614,
+      "loss": 0.7281,
+      "step": 19400
+    },
+    {
+      "epoch": 1.4220707006253304,
+      "grad_norm": 0.8681420087814331,
+      "learning_rate": 0.00010648685894280934,
+      "loss": 0.7345,
+      "step": 19500
+    },
+    {
+      "epoch": 1.429363183898197,
+      "grad_norm": 0.8721694946289062,
+      "learning_rate": 0.00010599468451619255,
+      "loss": 0.7246,
+      "step": 19600
+    },
+    {
+      "epoch": 1.4366556671710633,
+      "grad_norm": 0.8880037665367126,
+      "learning_rate": 0.00010550251008957574,
+      "loss": 0.7321,
+      "step": 19700
+    },
+    {
+      "epoch": 1.44394815044393,
+      "grad_norm": 0.8522552251815796,
+      "learning_rate": 0.00010501033566295895,
+      "loss": 0.734,
+      "step": 19800
+    },
+    {
+      "epoch": 1.4512406337167965,
+      "grad_norm": 0.8816943168640137,
+      "learning_rate": 0.00010451816123634217,
+      "loss": 0.7333,
+      "step": 19900
+    },
+    {
+      "epoch": 1.458533116989663,
+      "grad_norm": 0.8068501949310303,
+      "learning_rate": 0.00010402598680972537,
+      "loss": 0.7267,
+      "step": 20000
+    },
+    {
+      "epoch": 1.458533116989663,
+      "eval_loss": 0.731645405292511,
+      "eval_runtime": 61.0998,
+      "eval_samples_per_second": 146.563,
+      "eval_steps_per_second": 18.331,
+      "step": 20000
+    },
+    {
+      "epoch": 1.4658256002625294,
+      "grad_norm": 0.8473337888717651,
+      "learning_rate": 0.00010353381238310858,
+      "loss": 0.7328,
+      "step": 20100
+    },
+    {
+      "epoch": 1.4731180835353959,
+      "grad_norm": 0.9009122252464294,
+      "learning_rate": 0.00010304163795649177,
+      "loss": 0.733,
+      "step": 20200
+    },
+    {
+      "epoch": 1.4804105668082623,
+      "grad_norm": 0.8225035667419434,
+      "learning_rate": 0.000102549463529875,
+      "loss": 0.7311,
+      "step": 20300
+    },
+    {
+      "epoch": 1.4877030500811288,
+      "grad_norm": 0.8552617430686951,
+      "learning_rate": 0.00010205728910325821,
+      "loss": 0.7282,
+      "step": 20400
+    },
+    {
+      "epoch": 1.4949955333539955,
+      "grad_norm": 0.8690235614776611,
+      "learning_rate": 0.0001015651146766414,
+      "loss": 0.7329,
+      "step": 20500
+    },
+    {
+      "epoch": 1.5022880166268617,
+      "grad_norm": 0.8566781878471375,
+      "learning_rate": 0.0001010729402500246,
+      "loss": 0.7358,
+      "step": 20600
+    },
+    {
+      "epoch": 1.5095804998997284,
+      "grad_norm": 0.9174933433532715,
+      "learning_rate": 0.00010058076582340782,
+      "loss": 0.7266,
+      "step": 20700
+    },
+    {
+      "epoch": 1.516872983172595,
+      "grad_norm": 0.9414506554603577,
+      "learning_rate": 0.00010008859139679103,
+      "loss": 0.7321,
+      "step": 20800
+    },
+    {
+      "epoch": 1.5241654664454614,
+      "grad_norm": 0.9433586001396179,
+      "learning_rate": 9.959641697017424e-05,
+      "loss": 0.7355,
+      "step": 20900
+    },
+    {
+      "epoch": 1.5314579497183278,
+      "grad_norm": 0.8544315695762634,
+      "learning_rate": 9.910424254355744e-05,
+      "loss": 0.7313,
+      "step": 21000
+    },
+    {
+      "epoch": 1.5314579497183278,
+      "eval_loss": 0.7285299301147461,
+      "eval_runtime": 60.6886,
+      "eval_samples_per_second": 147.557,
+      "eval_steps_per_second": 18.455,
+      "step": 21000
+    },
+    {
+      "epoch": 1.5387504329911943,
+      "grad_norm": 0.893223762512207,
+      "learning_rate": 9.861206811694065e-05,
+      "loss": 0.7329,
+      "step": 21100
+    },
+    {
+      "epoch": 1.546042916264061,
+      "grad_norm": 0.8868634104728699,
+      "learning_rate": 9.811989369032387e-05,
+      "loss": 0.7276,
+      "step": 21200
+    },
+    {
+      "epoch": 1.5533353995369272,
+      "grad_norm": 0.8362566232681274,
+      "learning_rate": 9.762771926370706e-05,
+      "loss": 0.723,
+      "step": 21300
+    },
+    {
+      "epoch": 1.560627882809794,
+      "grad_norm": 0.8852083086967468,
+      "learning_rate": 9.713554483709026e-05,
+      "loss": 0.7281,
+      "step": 21400
+    },
+    {
+      "epoch": 1.5679203660826602,
+      "grad_norm": 0.8901813626289368,
+      "learning_rate": 9.664337041047348e-05,
+      "loss": 0.7307,
+      "step": 21500
+    },
+    {
+      "epoch": 1.5752128493555269,
+      "grad_norm": 0.8210172057151794,
+      "learning_rate": 9.615119598385667e-05,
+      "loss": 0.7245,
+      "step": 21600
+    },
+    {
+      "epoch": 1.5825053326283933,
+      "grad_norm": 0.8676414489746094,
+      "learning_rate": 9.56590215572399e-05,
+      "loss": 0.7294,
+      "step": 21700
+    },
+    {
+      "epoch": 1.5897978159012598,
+      "grad_norm": 0.8923740983009338,
+      "learning_rate": 9.51668471306231e-05,
+      "loss": 0.7242,
+      "step": 21800
+    },
+    {
+      "epoch": 1.5970902991741263,
+      "grad_norm": 0.8402920365333557,
+      "learning_rate": 9.46746727040063e-05,
+      "loss": 0.7258,
+      "step": 21900
+    },
+    {
+      "epoch": 1.6043827824469927,
+      "grad_norm": 0.8525983691215515,
+      "learning_rate": 9.418249827738951e-05,
+      "loss": 0.7294,
+      "step": 22000
+    },
+    {
+      "epoch": 1.6043827824469927,
+      "eval_loss": 0.7267495393753052,
+      "eval_runtime": 61.1086,
+      "eval_samples_per_second": 146.542,
+      "eval_steps_per_second": 18.328,
+      "step": 22000
+    },
+    {
+      "epoch": 1.6116752657198594,
+      "grad_norm": 0.8605002164840698,
+      "learning_rate": 9.369032385077272e-05,
+      "loss": 0.7259,
+      "step": 22100
+    },
+    {
+      "epoch": 1.6189677489927257,
+      "grad_norm": 0.8606895208358765,
+      "learning_rate": 9.319814942415592e-05,
+      "loss": 0.7275,
+      "step": 22200
+    },
+    {
+      "epoch": 1.6262602322655924,
+      "grad_norm": 0.8824227452278137,
+      "learning_rate": 9.270597499753914e-05,
+      "loss": 0.7245,
+      "step": 22300
+    },
+    {
+      "epoch": 1.6335527155384586,
+      "grad_norm": 0.8670118451118469,
+      "learning_rate": 9.221380057092233e-05,
+      "loss": 0.719,
+      "step": 22400
+    },
+    {
+      "epoch": 1.6408451988113253,
+      "grad_norm": 0.92063307762146,
+      "learning_rate": 9.172162614430555e-05,
+      "loss": 0.7293,
+      "step": 22500
+    },
+    {
+      "epoch": 1.6481376820841918,
+      "grad_norm": 0.8425260782241821,
+      "learning_rate": 9.122945171768876e-05,
+      "loss": 0.728,
+      "step": 22600
+    },
+    {
+      "epoch": 1.6554301653570582,
+      "grad_norm": 0.9162302017211914,
+      "learning_rate": 9.073727729107196e-05,
+      "loss": 0.7265,
+      "step": 22700
+    },
+    {
+      "epoch": 1.6627226486299247,
+      "grad_norm": 0.8905067443847656,
+      "learning_rate": 9.024510286445517e-05,
+      "loss": 0.7256,
+      "step": 22800
+    },
+    {
+      "epoch": 1.6700151319027912,
+      "grad_norm": 0.874357283115387,
+      "learning_rate": 8.975292843783837e-05,
+      "loss": 0.7249,
+      "step": 22900
+    },
+    {
+      "epoch": 1.6773076151756579,
+      "grad_norm": 0.842005729675293,
+      "learning_rate": 8.926075401122158e-05,
+      "loss": 0.7268,
+      "step": 23000
+    },
+    {
+      "epoch": 1.6773076151756579,
+      "eval_loss": 0.7241798639297485,
+      "eval_runtime": 60.7958,
+      "eval_samples_per_second": 147.296,
+      "eval_steps_per_second": 18.422,
+      "step": 23000
+    },
+    {
+      "epoch": 1.684600098448524,
+      "grad_norm": 0.8695193529129028,
+      "learning_rate": 8.876857958460478e-05,
+      "loss": 0.7262,
+      "step": 23100
+    },
+    {
+      "epoch": 1.6918925817213908,
+      "grad_norm": 0.8673058748245239,
+      "learning_rate": 8.827640515798799e-05,
+      "loss": 0.7303,
+      "step": 23200
+    },
+    {
+      "epoch": 1.699185064994257,
+      "grad_norm": 0.9276596307754517,
+      "learning_rate": 8.77842307313712e-05,
+      "loss": 0.729,
+      "step": 23300
+    },
+    {
+      "epoch": 1.7064775482671237,
+      "grad_norm": 0.8023722171783447,
+      "learning_rate": 8.729205630475441e-05,
+      "loss": 0.7212,
+      "step": 23400
+    },
+    {
+      "epoch": 1.7137700315399902,
+      "grad_norm": 0.910897433757782,
+      "learning_rate": 8.67998818781376e-05,
+      "loss": 0.7252,
+      "step": 23500
+    },
+    {
+      "epoch": 1.7210625148128567,
+      "grad_norm": 0.8714926838874817,
+      "learning_rate": 8.630770745152083e-05,
+      "loss": 0.7306,
+      "step": 23600
+    },
+    {
+      "epoch": 1.7283549980857231,
+      "grad_norm": 0.8875166773796082,
+      "learning_rate": 8.581553302490403e-05,
+      "loss": 0.7235,
+      "step": 23700
+    },
+    {
+      "epoch": 1.7356474813585896,
+      "grad_norm": 0.9132345914840698,
+      "learning_rate": 8.532335859828724e-05,
+      "loss": 0.7331,
+      "step": 23800
+    },
+    {
+      "epoch": 1.7429399646314563,
+      "grad_norm": 0.8562710285186768,
+      "learning_rate": 8.483118417167044e-05,
+      "loss": 0.7282,
+      "step": 23900
+    },
+    {
+      "epoch": 1.7502324479043225,
+      "grad_norm": 0.867508590221405,
+      "learning_rate": 8.433900974505365e-05,
+      "loss": 0.7256,
+      "step": 24000
+    },
+    {
+      "epoch": 1.7502324479043225,
+      "eval_loss": 0.7232645153999329,
+      "eval_runtime": 60.377,
+      "eval_samples_per_second": 148.318,
+      "eval_steps_per_second": 18.55,
+      "step": 24000
+    },
+    {
+      "epoch": 1.7575249311771892,
+      "grad_norm": 0.8258200287818909,
+      "learning_rate": 8.384683531843685e-05,
+      "loss": 0.7254,
+      "step": 24100
+    },
+    {
+      "epoch": 1.7648174144500555,
+      "grad_norm": 0.9109018445014954,
+      "learning_rate": 8.335466089182007e-05,
+      "loss": 0.7315,
+      "step": 24200
+    },
+    {
+      "epoch": 1.7721098977229222,
+      "grad_norm": 0.8500842452049255,
+      "learning_rate": 8.286248646520326e-05,
+      "loss": 0.7265,
+      "step": 24300
+    },
+    {
+      "epoch": 1.7794023809957886,
+      "grad_norm": 0.9286713600158691,
+      "learning_rate": 8.237031203858648e-05,
+      "loss": 0.7247,
+      "step": 24400
+    },
+    {
+      "epoch": 1.786694864268655,
+      "grad_norm": 0.8746926188468933,
+      "learning_rate": 8.187813761196969e-05,
+      "loss": 0.7261,
+      "step": 24500
+    },
+    {
+      "epoch": 1.7939873475415216,
+      "grad_norm": 0.8702288866043091,
+      "learning_rate": 8.13859631853529e-05,
+      "loss": 0.7207,
+      "step": 24600
+    },
+    {
+      "epoch": 1.801279830814388,
+      "grad_norm": 0.9746344089508057,
+      "learning_rate": 8.08937887587361e-05,
+      "loss": 0.728,
+      "step": 24700
+    },
+    {
+      "epoch": 1.8085723140872547,
+      "grad_norm": 0.8815904259681702,
+      "learning_rate": 8.04016143321193e-05,
+      "loss": 0.7174,
+      "step": 24800
+    },
+    {
+      "epoch": 1.815864797360121,
+      "grad_norm": 0.870474100112915,
+      "learning_rate": 7.990943990550251e-05,
+      "loss": 0.7316,
+      "step": 24900
+    },
+    {
+      "epoch": 1.8231572806329877,
+      "grad_norm": 0.8451401591300964,
+      "learning_rate": 7.941726547888572e-05,
+      "loss": 0.7202,
+      "step": 25000
+    },
+    {
+      "epoch": 1.8231572806329877,
+      "eval_loss": 0.721147358417511,
+      "eval_runtime": 60.8906,
+      "eval_samples_per_second": 147.067,
+      "eval_steps_per_second": 18.394,
+      "step": 25000
+    },
+    {
+      "epoch": 1.830449763905854,
+      "grad_norm": 0.8878180980682373,
+      "learning_rate": 7.892509105226894e-05,
+      "loss": 0.7236,
+      "step": 25100
+    },
+    {
+      "epoch": 1.8377422471787206,
+      "grad_norm": 0.859920859336853,
+      "learning_rate": 7.843291662565213e-05,
+      "loss": 0.7257,
+      "step": 25200
+    },
+    {
+      "epoch": 1.845034730451587,
+      "grad_norm": 0.9358228445053101,
+      "learning_rate": 7.794074219903535e-05,
+      "loss": 0.7175,
+      "step": 25300
+    },
+    {
+      "epoch": 1.8523272137244535,
+      "grad_norm": 0.858906626701355,
+      "learning_rate": 7.744856777241854e-05,
+      "loss": 0.7217,
+      "step": 25400
+    },
+    {
+      "epoch": 1.85961969699732,
+      "grad_norm": 0.9508287310600281,
+      "learning_rate": 7.695639334580176e-05,
+      "loss": 0.7211,
+      "step": 25500
+    },
+    {
+      "epoch": 1.8669121802701865,
+      "grad_norm": 0.9340062141418457,
+      "learning_rate": 7.646421891918496e-05,
+      "loss": 0.7254,
+      "step": 25600
+    },
+    {
+      "epoch": 1.8742046635430532,
+      "grad_norm": 0.9350687861442566,
+      "learning_rate": 7.597204449256817e-05,
+      "loss": 0.7247,
+      "step": 25700
+    },
+    {
+      "epoch": 1.8814971468159194,
+      "grad_norm": 0.9614841938018799,
+      "learning_rate": 7.547987006595137e-05,
+      "loss": 0.7283,
+      "step": 25800
+    },
+    {
+      "epoch": 1.888789630088786,
+      "grad_norm": 0.848640501499176,
+      "learning_rate": 7.49876956393346e-05,
+      "loss": 0.7221,
+      "step": 25900
+    },
+    {
+      "epoch": 1.8960821133616523,
+      "grad_norm": 0.8105534315109253,
+      "learning_rate": 7.449552121271779e-05,
+      "loss": 0.7205,
+      "step": 26000
+    },
+    {
+      "epoch": 1.8960821133616523,
+      "eval_loss": 0.7193262577056885,
+      "eval_runtime": 61.1614,
+      "eval_samples_per_second": 146.416,
+      "eval_steps_per_second": 18.312,
+      "step": 26000
+    },
+    {
+      "epoch": 1.903374596634519,
+      "grad_norm": 0.8522207736968994,
+      "learning_rate": 7.4003346786101e-05,
+      "loss": 0.7223,
+      "step": 26100
+    },
+    {
+      "epoch": 1.9106670799073855,
+      "grad_norm": 0.8983740210533142,
+      "learning_rate": 7.351117235948421e-05,
+      "loss": 0.7208,
+      "step": 26200
+    },
+    {
+      "epoch": 1.917959563180252,
+      "grad_norm": 0.8596473336219788,
+      "learning_rate": 7.301899793286742e-05,
+      "loss": 0.7184,
+      "step": 26300
+    },
+    {
+      "epoch": 1.9252520464531184,
+      "grad_norm": 0.9175098538398743,
+      "learning_rate": 7.252682350625062e-05,
+      "loss": 0.7213,
+      "step": 26400
+    },
+    {
+      "epoch": 1.932544529725985,
+      "grad_norm": 0.8626872897148132,
+      "learning_rate": 7.203464907963383e-05,
+      "loss": 0.7242,
+      "step": 26500
+    },
+    {
+      "epoch": 1.9398370129988516,
+      "grad_norm": 0.859780490398407,
+      "learning_rate": 7.154247465301703e-05,
+      "loss": 0.7197,
+      "step": 26600
+    },
+    {
+      "epoch": 1.9471294962717178,
+      "grad_norm": 0.8713703751564026,
+      "learning_rate": 7.105030022640024e-05,
+      "loss": 0.7231,
+      "step": 26700
+    },
+    {
+      "epoch": 1.9544219795445845,
+      "grad_norm": 0.8976535797119141,
+      "learning_rate": 7.055812579978344e-05,
+      "loss": 0.7233,
+      "step": 26800
+    },
+    {
+      "epoch": 1.9617144628174508,
+      "grad_norm": 0.9257802367210388,
+      "learning_rate": 7.006595137316665e-05,
+      "loss": 0.7221,
+      "step": 26900
+    },
+    {
+      "epoch": 1.9690069460903175,
+      "grad_norm": 0.8592785596847534,
+      "learning_rate": 6.957377694654987e-05,
+      "loss": 0.7168,
+      "step": 27000
+    },
+    {
+      "epoch": 1.9690069460903175,
+      "eval_loss": 0.7180259227752686,
+      "eval_runtime": 60.5352,
+      "eval_samples_per_second": 147.931,
+      "eval_steps_per_second": 18.502,
+      "step": 27000
+    },
+    {
+      "epoch": 1.976299429363184,
+      "grad_norm": 0.8931472897529602,
+      "learning_rate": 6.908160251993306e-05,
+      "loss": 0.7204,
+      "step": 27100
+    },
+    {
+      "epoch": 1.9835919126360504,
+      "grad_norm": 0.8821597695350647,
+      "learning_rate": 6.858942809331628e-05,
+      "loss": 0.7163,
+      "step": 27200
+    },
+    {
+      "epoch": 1.9908843959089169,
+      "grad_norm": 0.8749621510505676,
+      "learning_rate": 6.809725366669948e-05,
+      "loss": 0.711,
+      "step": 27300
+    },
+    {
+      "epoch": 1.9981768791817833,
+      "grad_norm": 0.903332531452179,
+      "learning_rate": 6.760507924008269e-05,
+      "loss": 0.7176,
+      "step": 27400
+    },
+    {
+      "epoch": 2.005505824871014,
+      "grad_norm": 0.854773759841919,
+      "learning_rate": 6.71129048134659e-05,
+      "loss": 0.7187,
+      "step": 27500
+    },
+    {
+      "epoch": 2.0127983081438807,
+      "grad_norm": 0.9489893913269043,
+      "learning_rate": 6.66207303868491e-05,
+      "loss": 0.7096,
+      "step": 27600
+    },
+    {
+      "epoch": 2.020090791416747,
+      "grad_norm": 0.8944621682167053,
+      "learning_rate": 6.61285559602323e-05,
+      "loss": 0.7104,
+      "step": 27700
+    },
+    {
+      "epoch": 2.0273832746896137,
+      "grad_norm": 0.8567011952400208,
+      "learning_rate": 6.563638153361553e-05,
+      "loss": 0.7124,
+      "step": 27800
+    },
+    {
+      "epoch": 2.0346757579624803,
+      "grad_norm": 0.8737155199050903,
+      "learning_rate": 6.514420710699872e-05,
+      "loss": 0.7127,
+      "step": 27900
+    },
+    {
+      "epoch": 2.0419682412353466,
+      "grad_norm": 0.8935887813568115,
+      "learning_rate": 6.465203268038194e-05,
+      "loss": 0.7122,
+      "step": 28000
+    },
+    {
+      "epoch": 2.0419682412353466,
+      "eval_loss": 0.716705858707428,
+      "eval_runtime": 60.7739,
+      "eval_samples_per_second": 147.349,
+      "eval_steps_per_second": 18.429,
+      "step": 28000
+    },
+    {
+      "epoch": 2.0492607245082133,
+      "grad_norm": 0.9452987313270569,
+      "learning_rate": 6.415985825376514e-05,
+      "loss": 0.7112,
+      "step": 28100
+    },
+    {
+      "epoch": 2.0565532077810795,
+      "grad_norm": 0.8650675415992737,
+      "learning_rate": 6.366768382714833e-05,
+      "loss": 0.7079,
+      "step": 28200
+    },
+    {
+      "epoch": 2.063845691053946,
+      "grad_norm": 0.8913034796714783,
+      "learning_rate": 6.317550940053155e-05,
+      "loss": 0.713,
+      "step": 28300
+    },
+    {
+      "epoch": 2.0711381743268125,
+      "grad_norm": 0.9072710275650024,
+      "learning_rate": 6.268333497391476e-05,
+      "loss": 0.7094,
+      "step": 28400
+    },
+    {
+      "epoch": 2.078430657599679,
+      "grad_norm": 0.854245126247406,
+      "learning_rate": 6.219116054729796e-05,
+      "loss": 0.7077,
+      "step": 28500
+    },
+    {
+      "epoch": 2.0857231408725454,
+      "grad_norm": 0.929263174533844,
+      "learning_rate": 6.169898612068117e-05,
+      "loss": 0.7086,
+      "step": 28600
+    },
+    {
+      "epoch": 2.093015624145412,
+      "grad_norm": 0.9356215596199036,
+      "learning_rate": 6.120681169406438e-05,
+      "loss": 0.7157,
+      "step": 28700
+    },
+    {
+      "epoch": 2.100308107418279,
+      "grad_norm": 0.9242870211601257,
+      "learning_rate": 6.071463726744758e-05,
+      "loss": 0.71,
+      "step": 28800
+    },
+    {
+      "epoch": 2.107600590691145,
+      "grad_norm": 0.9065095782279968,
+      "learning_rate": 6.022246284083079e-05,
+      "loss": 0.7095,
+      "step": 28900
+    },
+    {
+      "epoch": 2.1148930739640117,
+      "grad_norm": 0.9081276059150696,
+      "learning_rate": 5.9730288414214e-05,
+      "loss": 0.7096,
+      "step": 29000
+    },
+    {
+      "epoch": 2.1148930739640117,
+      "eval_loss": 0.7152244448661804,
+      "eval_runtime": 60.7986,
+      "eval_samples_per_second": 147.29,
+      "eval_steps_per_second": 18.421,
+      "step": 29000
+    },
+    {
+      "epoch": 2.122185557236878,
+      "grad_norm": 0.8326215744018555,
+      "learning_rate": 5.923811398759721e-05,
+      "loss": 0.7147,
+      "step": 29100
+    },
+    {
+      "epoch": 2.1294780405097447,
+      "grad_norm": 0.9274723529815674,
+      "learning_rate": 5.874593956098041e-05,
+      "loss": 0.7111,
+      "step": 29200
+    },
+    {
+      "epoch": 2.136770523782611,
+      "grad_norm": 0.8282331824302673,
+      "learning_rate": 5.825376513436362e-05,
+      "loss": 0.7137,
+      "step": 29300
+    },
+    {
+      "epoch": 2.1440630070554776,
+      "grad_norm": 0.9081612229347229,
+      "learning_rate": 5.776159070774683e-05,
+      "loss": 0.7115,
+      "step": 29400
+    },
+    {
+      "epoch": 2.151355490328344,
+      "grad_norm": 0.9531508684158325,
+      "learning_rate": 5.726941628113004e-05,
+      "loss": 0.708,
+      "step": 29500
+    },
+    {
+      "epoch": 2.1586479736012105,
+      "grad_norm": 0.9125275611877441,
+      "learning_rate": 5.677724185451324e-05,
+      "loss": 0.7123,
+      "step": 29600
+    },
+    {
+      "epoch": 2.165940456874077,
+      "grad_norm": 0.9363859295845032,
+      "learning_rate": 5.628506742789645e-05,
+      "loss": 0.7146,
+      "step": 29700
+    },
+    {
+      "epoch": 2.1732329401469435,
+      "grad_norm": 0.9164854884147644,
+      "learning_rate": 5.579289300127966e-05,
+      "loss": 0.7121,
+      "step": 29800
+    },
+    {
+      "epoch": 2.18052542341981,
+      "grad_norm": 0.941330075263977,
+      "learning_rate": 5.530071857466287e-05,
+      "loss": 0.7086,
+      "step": 29900
+    },
+    {
+      "epoch": 2.1878179066926764,
+      "grad_norm": 0.9006567597389221,
+      "learning_rate": 5.480854414804607e-05,
+      "loss": 0.7097,
+      "step": 30000
+    },
+    {
+      "epoch": 2.1878179066926764,
+      "eval_loss": 0.7143043875694275,
+      "eval_runtime": 61.0555,
+      "eval_samples_per_second": 146.67,
+      "eval_steps_per_second": 18.344,
+      "step": 30000
+    },
+    {
+      "epoch": 2.195110389965543,
+      "grad_norm": 0.8913944363594055,
+      "learning_rate": 5.431636972142927e-05,
+      "loss": 0.7066,
+      "step": 30100
+    },
+    {
+      "epoch": 2.2024028732384093,
+      "grad_norm": 0.9200546145439148,
+      "learning_rate": 5.3824195294812486e-05,
+      "loss": 0.7076,
+      "step": 30200
+    },
+    {
+      "epoch": 2.209695356511276,
+      "grad_norm": 0.924148440361023,
+      "learning_rate": 5.3332020868195684e-05,
+      "loss": 0.7058,
+      "step": 30300
+    },
+    {
+      "epoch": 2.2169878397841423,
+      "grad_norm": 0.922255277633667,
+      "learning_rate": 5.2839846441578897e-05,
+      "loss": 0.7108,
+      "step": 30400
+    },
+    {
+      "epoch": 2.224280323057009,
+      "grad_norm": 0.9039818644523621,
+      "learning_rate": 5.23476720149621e-05,
+      "loss": 0.7091,
+      "step": 30500
+    },
+    {
+      "epoch": 2.2315728063298756,
+      "grad_norm": 0.963845431804657,
+      "learning_rate": 5.1855497588345314e-05,
+      "loss": 0.7065,
+      "step": 30600
+    },
+    {
+      "epoch": 2.238865289602742,
+      "grad_norm": 0.8838880658149719,
+      "learning_rate": 5.136332316172851e-05,
+      "loss": 0.7113,
+      "step": 30700
+    },
+    {
+      "epoch": 2.2461577728756086,
+      "grad_norm": 0.9642555117607117,
+      "learning_rate": 5.0871148735111725e-05,
+      "loss": 0.7062,
+      "step": 30800
+    },
+    {
+      "epoch": 2.253450256148475,
+      "grad_norm": 0.9088276624679565,
+      "learning_rate": 5.037897430849493e-05,
+      "loss": 0.7071,
+      "step": 30900
+    },
+    {
+      "epoch": 2.2607427394213415,
+      "grad_norm": 0.9083282351493835,
+      "learning_rate": 4.9886799881878137e-05,
+      "loss": 0.7126,
+      "step": 31000
+    },
+    {
+      "epoch": 2.2607427394213415,
+      "eval_loss": 0.7129958868026733,
+      "eval_runtime": 60.7821,
+      "eval_samples_per_second": 147.33,
+      "eval_steps_per_second": 18.426,
+      "step": 31000
+    },
+    {
+      "epoch": 2.2680352226942078,
+      "grad_norm": 0.886710524559021,
+      "learning_rate": 4.939462545526134e-05,
+      "loss": 0.7043,
+      "step": 31100
+    },
+    {
+      "epoch": 2.2753277059670745,
+      "grad_norm": 0.8600069880485535,
+      "learning_rate": 4.8902451028644554e-05,
+      "loss": 0.7074,
+      "step": 31200
+    },
+    {
+      "epoch": 2.2826201892399407,
+      "grad_norm": 0.8897703289985657,
+      "learning_rate": 4.841027660202776e-05,
+      "loss": 0.7068,
+      "step": 31300
+    },
+    {
+      "epoch": 2.2899126725128074,
+      "grad_norm": 0.8638718724250793,
+      "learning_rate": 4.7918102175410965e-05,
+      "loss": 0.7062,
+      "step": 31400
+    },
+    {
+      "epoch": 2.297205155785674,
+      "grad_norm": 0.8973529934883118,
+      "learning_rate": 4.742592774879418e-05,
+      "loss": 0.7073,
+      "step": 31500
+    },
+    {
+      "epoch": 2.3044976390585403,
+      "grad_norm": 0.9759765267372131,
+      "learning_rate": 4.693375332217738e-05,
+      "loss": 0.7087,
+      "step": 31600
+    },
+    {
+      "epoch": 2.311790122331407,
+      "grad_norm": 0.9061428904533386,
+      "learning_rate": 4.644157889556059e-05,
+      "loss": 0.708,
+      "step": 31700
+    },
+    {
+      "epoch": 2.3190826056042733,
+      "grad_norm": 0.8808257579803467,
+      "learning_rate": 4.5949404468943794e-05,
+      "loss": 0.7086,
+      "step": 31800
+    },
+    {
+      "epoch": 2.32637508887714,
+      "grad_norm": 0.9116071462631226,
+      "learning_rate": 4.545723004232701e-05,
+      "loss": 0.7118,
+      "step": 31900
+    },
+    {
+      "epoch": 2.333667572150006,
+      "grad_norm": 0.9131873846054077,
+      "learning_rate": 4.496505561571021e-05,
+      "loss": 0.7043,
+      "step": 32000
+    },
+    {
+      "epoch": 2.333667572150006,
+      "eval_loss": 0.7112506031990051,
+      "eval_runtime": 61.1535,
+      "eval_samples_per_second": 146.435,
+      "eval_steps_per_second": 18.315,
+      "step": 32000
+    },
+    {
+      "epoch": 2.340960055422873,
+      "grad_norm": 0.9860331416130066,
+      "learning_rate": 4.447288118909342e-05,
+      "loss": 0.7063,
+      "step": 32100
+    },
+    {
+      "epoch": 2.348252538695739,
+      "grad_norm": 0.933958888053894,
+      "learning_rate": 4.398070676247662e-05,
+      "loss": 0.708,
+      "step": 32200
+    },
+    {
+      "epoch": 2.355545021968606,
+      "grad_norm": 0.8994225859642029,
+      "learning_rate": 4.3488532335859836e-05,
+      "loss": 0.7089,
+      "step": 32300
+    },
+    {
+      "epoch": 2.3628375052414725,
+      "grad_norm": 0.9435915946960449,
+      "learning_rate": 4.299635790924304e-05,
+      "loss": 0.7057,
+      "step": 32400
+    },
+    {
+      "epoch": 2.3701299885143388,
+      "grad_norm": 0.888438880443573,
+      "learning_rate": 4.2504183482626247e-05,
+      "loss": 0.7012,
+      "step": 32500
+    },
+    {
+      "epoch": 2.3774224717872054,
+      "grad_norm": 0.8772885799407959,
+      "learning_rate": 4.201200905600945e-05,
+      "loss": 0.7071,
+      "step": 32600
+    },
+    {
+      "epoch": 2.3847149550600717,
+      "grad_norm": 0.9333481788635254,
+      "learning_rate": 4.151983462939266e-05,
+      "loss": 0.7095,
+      "step": 32700
+    },
+    {
+      "epoch": 2.3920074383329384,
+      "grad_norm": 0.9497707486152649,
+      "learning_rate": 4.102766020277586e-05,
+      "loss": 0.7115,
+      "step": 32800
+    },
+    {
+      "epoch": 2.3992999216058046,
+      "grad_norm": 0.9641472697257996,
+      "learning_rate": 4.053548577615907e-05,
+      "loss": 0.712,
+      "step": 32900
+    },
+    {
+      "epoch": 2.4065924048786713,
+      "grad_norm": 0.8958153128623962,
+      "learning_rate": 4.004331134954228e-05,
+      "loss": 0.7035,
+      "step": 33000
+    },
+    {
+      "epoch": 2.4065924048786713,
+      "eval_loss": 0.7100856304168701,
+      "eval_runtime": 61.2325,
+      "eval_samples_per_second": 146.246,
+      "eval_steps_per_second": 18.291,
+      "step": 33000
+    },
+    {
+      "epoch": 2.4138848881515376,
+      "grad_norm": 0.8818393349647522,
+      "learning_rate": 3.9551136922925487e-05,
+      "loss": 0.7052,
+      "step": 33100
+    },
+    {
+      "epoch": 2.4211773714244043,
+      "grad_norm": 0.8973012566566467,
+      "learning_rate": 3.905896249630869e-05,
+      "loss": 0.706,
+      "step": 33200
+    },
+    {
+      "epoch": 2.428469854697271,
+      "grad_norm": 0.8582873344421387,
+      "learning_rate": 3.85667880696919e-05,
+      "loss": 0.7088,
+      "step": 33300
+    },
+    {
+      "epoch": 2.435762337970137,
+      "grad_norm": 0.9306252002716064,
+      "learning_rate": 3.807461364307511e-05,
+      "loss": 0.7062,
+      "step": 33400
+    },
+    {
+      "epoch": 2.443054821243004,
+      "grad_norm": 0.8586992025375366,
+      "learning_rate": 3.7582439216458315e-05,
+      "loss": 0.7086,
+      "step": 33500
+    },
+    {
+      "epoch": 2.45034730451587,
+      "grad_norm": 0.9076369404792786,
+      "learning_rate": 3.709026478984152e-05,
+      "loss": 0.7052,
+      "step": 33600
+    },
+    {
+      "epoch": 2.457639787788737,
+      "grad_norm": 0.8954334855079651,
+      "learning_rate": 3.6598090363224727e-05,
+      "loss": 0.7082,
+      "step": 33700
+    },
+    {
+      "epoch": 2.464932271061603,
+      "grad_norm": 0.9315345287322998,
+      "learning_rate": 3.610591593660794e-05,
+      "loss": 0.7058,
+      "step": 33800
+    },
+    {
+      "epoch": 2.4722247543344698,
+      "grad_norm": 0.9223620295524597,
+      "learning_rate": 3.5613741509991144e-05,
+      "loss": 0.6992,
+      "step": 33900
+    },
+    {
+      "epoch": 2.479517237607336,
+      "grad_norm": 0.9349290132522583,
+      "learning_rate": 3.512156708337435e-05,
+      "loss": 0.7084,
+      "step": 34000
+    },
+    {
+      "epoch": 2.479517237607336,
+      "eval_loss": 0.7087690234184265,
+      "eval_runtime": 60.8859,
+      "eval_samples_per_second": 147.078,
+      "eval_steps_per_second": 18.395,
+      "step": 34000
+    },
+    {
+      "epoch": 2.4868097208802027,
+      "grad_norm": 0.883210301399231,
+      "learning_rate": 3.462939265675756e-05,
+      "loss": 0.7061,
+      "step": 34100
+    },
+    {
+      "epoch": 2.4941022041530694,
+      "grad_norm": 0.920868456363678,
+      "learning_rate": 3.413721823014077e-05,
+      "loss": 0.7069,
+      "step": 34200
+    },
+    {
+      "epoch": 2.5013946874259356,
+      "grad_norm": 0.9177393913269043,
+      "learning_rate": 3.3645043803523966e-05,
+      "loss": 0.7071,
+      "step": 34300
+    },
+    {
+      "epoch": 2.5086871706988023,
+      "grad_norm": 0.9114101529121399,
+      "learning_rate": 3.315286937690717e-05,
+      "loss": 0.7072,
+      "step": 34400
+    },
+    {
+      "epoch": 2.5159796539716686,
+      "grad_norm": 0.9645174145698547,
+      "learning_rate": 3.2660694950290384e-05,
+      "loss": 0.7028,
+      "step": 34500
+    },
+    {
+      "epoch": 2.5232721372445353,
+      "grad_norm": 0.8982295989990234,
+      "learning_rate": 3.216852052367359e-05,
+      "loss": 0.7085,
+      "step": 34600
+    },
+    {
+      "epoch": 2.530564620517402,
+      "grad_norm": 0.8964338898658752,
+      "learning_rate": 3.1676346097056795e-05,
+      "loss": 0.7069,
+      "step": 34700
+    },
+    {
+      "epoch": 2.537857103790268,
+      "grad_norm": 0.9609666466712952,
+      "learning_rate": 3.118417167044001e-05,
+      "loss": 0.7057,
+      "step": 34800
+    },
+    {
+      "epoch": 2.5451495870631344,
+      "grad_norm": 0.9131038188934326,
+      "learning_rate": 3.069199724382321e-05,
+      "loss": 0.7031,
+      "step": 34900
+    },
+    {
+      "epoch": 2.552442070336001,
+      "grad_norm": 0.9127321839332581,
+      "learning_rate": 3.019982281720642e-05,
+      "loss": 0.6979,
+      "step": 35000
+    },
+    {
+      "epoch": 2.552442070336001,
+      "eval_loss": 0.7076790928840637,
+      "eval_runtime": 61.0966,
+      "eval_samples_per_second": 146.571,
+      "eval_steps_per_second": 18.332,
+      "step": 35000
+    },
+    {
+      "epoch": 2.559734553608868,
+      "grad_norm": 0.9567495584487915,
+      "learning_rate": 2.9707648390589628e-05,
+      "loss": 0.7053,
+      "step": 35100
+    },
+    {
+      "epoch": 2.567027036881734,
+      "grad_norm": 0.9740573763847351,
+      "learning_rate": 2.9215473963972833e-05,
+      "loss": 0.7077,
+      "step": 35200
+    },
+    {
+      "epoch": 2.5743195201546007,
+      "grad_norm": 0.8982974886894226,
+      "learning_rate": 2.8723299537356042e-05,
+      "loss": 0.6983,
+      "step": 35300
+    },
+    {
+      "epoch": 2.581612003427467,
+      "grad_norm": 1.0185188055038452,
+      "learning_rate": 2.8231125110739248e-05,
+      "loss": 0.7069,
+      "step": 35400
+    },
+    {
+      "epoch": 2.5889044867003337,
+      "grad_norm": 0.94049471616745,
+      "learning_rate": 2.7738950684122457e-05,
+      "loss": 0.7054,
+      "step": 35500
+    },
+    {
+      "epoch": 2.5961969699732004,
+      "grad_norm": 0.8923749923706055,
+      "learning_rate": 2.7246776257505662e-05,
+      "loss": 0.7015,
+      "step": 35600
+    },
+    {
+      "epoch": 2.6034894532460666,
+      "grad_norm": 0.9568887948989868,
+      "learning_rate": 2.675460183088887e-05,
+      "loss": 0.7025,
+      "step": 35700
+    },
+    {
+      "epoch": 2.610781936518933,
+      "grad_norm": 0.9106321334838867,
+      "learning_rate": 2.6262427404272077e-05,
+      "loss": 0.7049,
+      "step": 35800
+    },
+    {
+      "epoch": 2.6180744197917996,
+      "grad_norm": 0.9499268531799316,
+      "learning_rate": 2.5770252977655285e-05,
+      "loss": 0.7021,
+      "step": 35900
+    },
+    {
+      "epoch": 2.6253669030646662,
+      "grad_norm": 0.8965421915054321,
+      "learning_rate": 2.5278078551038488e-05,
+      "loss": 0.7036,
+      "step": 36000
+    },
+    {
+      "epoch": 2.6253669030646662,
+      "eval_loss": 0.7065343856811523,
+      "eval_runtime": 61.0446,
+      "eval_samples_per_second": 146.696,
+      "eval_steps_per_second": 18.347,
+      "step": 36000
+    },
+    {
+      "epoch": 2.6326593863375325,
+      "grad_norm": 0.94576096534729,
+      "learning_rate": 2.4785904124421696e-05,
+      "loss": 0.71,
+      "step": 36100
+    },
+    {
+      "epoch": 2.639951869610399,
+      "grad_norm": 0.962692141532898,
+      "learning_rate": 2.4293729697804905e-05,
+      "loss": 0.6953,
+      "step": 36200
+    },
+    {
+      "epoch": 2.6472443528832654,
+      "grad_norm": 0.9457094669342041,
+      "learning_rate": 2.380155527118811e-05,
+      "loss": 0.7011,
+      "step": 36300
+    },
+    {
+      "epoch": 2.654536836156132,
+      "grad_norm": 0.9523045420646667,
+      "learning_rate": 2.330938084457132e-05,
+      "loss": 0.7093,
+      "step": 36400
+    },
+    {
+      "epoch": 2.661829319428999,
+      "grad_norm": 0.9255204796791077,
+      "learning_rate": 2.2817206417954522e-05,
+      "loss": 0.6979,
+      "step": 36500
+    },
+    {
+      "epoch": 2.669121802701865,
+      "grad_norm": 1.015286922454834,
+      "learning_rate": 2.232503199133773e-05,
+      "loss": 0.7044,
+      "step": 36600
+    },
+    {
+      "epoch": 2.6764142859747313,
+      "grad_norm": 0.8911315202713013,
+      "learning_rate": 2.1832857564720936e-05,
+      "loss": 0.7031,
+      "step": 36700
+    },
+    {
+      "epoch": 2.683706769247598,
+      "grad_norm": 0.9372689127922058,
+      "learning_rate": 2.1340683138104145e-05,
+      "loss": 0.7019,
+      "step": 36800
+    },
+    {
+      "epoch": 2.6909992525204647,
+      "grad_norm": 0.9245051145553589,
+      "learning_rate": 2.084850871148735e-05,
+      "loss": 0.7065,
+      "step": 36900
+    },
+    {
+      "epoch": 2.698291735793331,
+      "grad_norm": 0.917607843875885,
+      "learning_rate": 2.035633428487056e-05,
+      "loss": 0.7016,
+      "step": 37000
+    },
+    {
+      "epoch": 2.698291735793331,
+      "eval_loss": 0.7054994702339172,
+      "eval_runtime": 60.6541,
+      "eval_samples_per_second": 147.64,
+      "eval_steps_per_second": 18.465,
+      "step": 37000
+    },
+    {
+      "epoch": 2.7055842190661976,
+      "grad_norm": 0.9054610729217529,
+      "learning_rate": 1.9864159858253765e-05,
+      "loss": 0.7034,
+      "step": 37100
+    },
+    {
+      "epoch": 2.712876702339064,
+      "grad_norm": 0.960075855255127,
+      "learning_rate": 1.9371985431636974e-05,
+      "loss": 0.7097,
+      "step": 37200
+    },
+    {
+      "epoch": 2.7201691856119306,
+      "grad_norm": 0.9454420208930969,
+      "learning_rate": 1.887981100502018e-05,
+      "loss": 0.7046,
+      "step": 37300
+    },
+    {
+      "epoch": 2.7274616688847972,
+      "grad_norm": 0.8761453628540039,
+      "learning_rate": 1.8387636578403385e-05,
+      "loss": 0.7068,
+      "step": 37400
+    },
+    {
+      "epoch": 2.7347541521576635,
+      "grad_norm": 0.9231957793235779,
+      "learning_rate": 1.7895462151786594e-05,
+      "loss": 0.6983,
+      "step": 37500
+    },
+    {
+      "epoch": 2.7420466354305297,
+      "grad_norm": 0.8630309104919434,
+      "learning_rate": 1.74032877251698e-05,
+      "loss": 0.6984,
+      "step": 37600
+    },
+    {
+      "epoch": 2.7493391187033964,
+      "grad_norm": 0.9077728986740112,
+      "learning_rate": 1.691111329855301e-05,
+      "loss": 0.7097,
+      "step": 37700
+    },
+    {
+      "epoch": 2.756631601976263,
+      "grad_norm": 0.9849316477775574,
+      "learning_rate": 1.6418938871936214e-05,
+      "loss": 0.7025,
+      "step": 37800
+    },
+    {
+      "epoch": 2.7639240852491294,
+      "grad_norm": 0.9101927280426025,
+      "learning_rate": 1.5926764445319423e-05,
+      "loss": 0.7127,
+      "step": 37900
+    },
+    {
+      "epoch": 2.771216568521996,
+      "grad_norm": 0.9624613523483276,
+      "learning_rate": 1.543459001870263e-05,
+      "loss": 0.7038,
+      "step": 38000
+    },
+    {
+      "epoch": 2.771216568521996,
+      "eval_loss": 0.7042670845985413,
+      "eval_runtime": 60.6288,
+      "eval_samples_per_second": 147.702,
+      "eval_steps_per_second": 18.473,
+      "step": 38000
+    },
+    {
+      "epoch": 2.7785090517948623,
+      "grad_norm": 0.8926946520805359,
+      "learning_rate": 1.4942415592085838e-05,
+      "loss": 0.6955,
+      "step": 38100
+    },
+    {
+      "epoch": 2.785801535067729,
+      "grad_norm": 0.9353916645050049,
+      "learning_rate": 1.4450241165469041e-05,
+      "loss": 0.7003,
+      "step": 38200
+    },
+    {
+      "epoch": 2.7930940183405957,
+      "grad_norm": 0.9394625425338745,
+      "learning_rate": 1.3958066738852249e-05,
+      "loss": 0.6963,
+      "step": 38300
+    },
+    {
+      "epoch": 2.800386501613462,
+      "grad_norm": 0.8811284303665161,
+      "learning_rate": 1.3465892312235456e-05,
+      "loss": 0.7057,
+      "step": 38400
+    },
+    {
+      "epoch": 2.807678984886328,
+      "grad_norm": 0.9111167788505554,
+      "learning_rate": 1.2973717885618663e-05,
+      "loss": 0.6905,
+      "step": 38500
+    },
+    {
+      "epoch": 2.814971468159195,
+      "grad_norm": 0.9061198830604553,
+      "learning_rate": 1.248154345900187e-05,
+      "loss": 0.6966,
+      "step": 38600
+    },
+    {
+      "epoch": 2.8222639514320615,
+      "grad_norm": 0.917921781539917,
+      "learning_rate": 1.1989369032385078e-05,
+      "loss": 0.7055,
+      "step": 38700
+    },
+    {
+      "epoch": 2.829556434704928,
+      "grad_norm": 0.9210913777351379,
+      "learning_rate": 1.1497194605768285e-05,
+      "loss": 0.7004,
+      "step": 38800
+    },
+    {
+      "epoch": 2.8368489179777945,
+      "grad_norm": 0.9152899384498596,
+      "learning_rate": 1.1005020179151492e-05,
+      "loss": 0.7065,
+      "step": 38900
+    },
+    {
+      "epoch": 2.8441414012506607,
+      "grad_norm": 0.9237668514251709,
+      "learning_rate": 1.05128457525347e-05,
+      "loss": 0.7027,
+      "step": 39000
+    },
+    {
+      "epoch": 2.8441414012506607,
+      "eval_loss": 0.7034493088722229,
+      "eval_runtime": 60.6775,
+      "eval_samples_per_second": 147.583,
+      "eval_steps_per_second": 18.458,
+      "step": 39000
+    },
+    {
+      "epoch": 2.8514338845235274,
+      "grad_norm": 0.9577778577804565,
+      "learning_rate": 1.0020671325917906e-05,
+      "loss": 0.7064,
+      "step": 39100
+    },
+    {
+      "epoch": 2.858726367796394,
+      "grad_norm": 0.9955913424491882,
+      "learning_rate": 9.528496899301114e-06,
+      "loss": 0.7017,
+      "step": 39200
+    },
+    {
+      "epoch": 2.8660188510692604,
+      "grad_norm": 0.9187660217285156,
+      "learning_rate": 9.03632247268432e-06,
+      "loss": 0.6998,
+      "step": 39300
+    },
+    {
+      "epoch": 2.8733113343421266,
+      "grad_norm": 0.9275550842285156,
+      "learning_rate": 8.544148046067526e-06,
+      "loss": 0.7002,
+      "step": 39400
+    },
+    {
+      "epoch": 2.8806038176149933,
+      "grad_norm": 0.9114721417427063,
+      "learning_rate": 8.051973619450734e-06,
+      "loss": 0.7027,
+      "step": 39500
+    },
+    {
+      "epoch": 2.88789630088786,
+      "grad_norm": 0.9408327341079712,
+      "learning_rate": 7.559799192833941e-06,
+      "loss": 0.7034,
+      "step": 39600
+    },
+    {
+      "epoch": 2.8951887841607262,
+      "grad_norm": 0.9538366198539734,
+      "learning_rate": 7.067624766217147e-06,
+      "loss": 0.7007,
+      "step": 39700
+    },
+    {
+      "epoch": 2.902481267433593,
+      "grad_norm": 0.923864483833313,
+      "learning_rate": 6.5754503396003544e-06,
+      "loss": 0.6972,
+      "step": 39800
+    },
+    {
+      "epoch": 2.909773750706459,
+      "grad_norm": 0.9156636595726013,
+      "learning_rate": 6.083275912983562e-06,
+      "loss": 0.7064,
+      "step": 39900
+    },
+    {
+      "epoch": 2.917066233979326,
+      "grad_norm": 0.9568312168121338,
+      "learning_rate": 5.591101486366768e-06,
+      "loss": 0.6969,
+      "step": 40000
+    },
+    {
+      "epoch": 2.917066233979326,
+      "eval_loss": 0.7027888894081116,
+      "eval_runtime": 61.1155,
+      "eval_samples_per_second": 146.526,
+      "eval_steps_per_second": 18.326,
+      "step": 40000
+    },
+    {
+      "epoch": 2.9243587172521925,
+      "grad_norm": 0.9376012086868286,
+      "learning_rate": 5.098927059749975e-06,
+      "loss": 0.7,
+      "step": 40100
+    },
+    {
+      "epoch": 2.931651200525059,
+      "grad_norm": 0.9648913145065308,
+      "learning_rate": 4.6067526331331825e-06,
+      "loss": 0.7042,
+      "step": 40200
+    },
+    {
+      "epoch": 2.938943683797925,
+      "grad_norm": 0.9452090263366699,
+      "learning_rate": 4.11457820651639e-06,
+      "loss": 0.7041,
+      "step": 40300
+    },
+    {
+      "epoch": 2.9462361670707917,
+      "grad_norm": 0.9553784728050232,
+      "learning_rate": 3.622403779899597e-06,
+      "loss": 0.7005,
+      "step": 40400
+    },
+    {
+      "epoch": 2.9535286503436584,
+      "grad_norm": 0.8788447380065918,
+      "learning_rate": 3.1302293532828033e-06,
+      "loss": 0.6974,
+      "step": 40500
+    },
+    {
+      "epoch": 2.9608211336165247,
+      "grad_norm": 0.9146846532821655,
+      "learning_rate": 2.6380549266660105e-06,
+      "loss": 0.7004,
+      "step": 40600
+    },
+    {
+      "epoch": 2.9681136168893913,
+      "grad_norm": 0.9674293398857117,
+      "learning_rate": 2.1458805000492173e-06,
+      "loss": 0.7028,
+      "step": 40700
+    },
+    {
+      "epoch": 2.9754061001622576,
+      "grad_norm": 0.9374125599861145,
+      "learning_rate": 1.6537060734324243e-06,
+      "loss": 0.7008,
+      "step": 40800
+    },
+    {
+      "epoch": 2.9826985834351243,
+      "grad_norm": 0.9554013013839722,
+      "learning_rate": 1.1615316468156316e-06,
+      "loss": 0.7011,
+      "step": 40900
+    },
+    {
+      "epoch": 2.989991066707991,
+      "grad_norm": 0.8910831212997437,
+      "learning_rate": 6.693572201988385e-07,
+      "loss": 0.6992,
+      "step": 41000
+    },
+    {
+      "epoch": 2.989991066707991,
+      "eval_loss": 0.7023043632507324,
+      "eval_runtime": 61.2519,
+      "eval_samples_per_second": 146.2,
+      "eval_steps_per_second": 18.285,
+      "step": 41000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 41136,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.256060461056e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-41000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ffa18fa243cccfbf729510f7d83fcb184f78dfbd7718a3073ec148d996a46094
+size 5713

checkpoint-41136/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-0.5B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

checkpoint-41136/adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-0.5B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-41136/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79d856104b781021c43fad86f0478030885797265c8d4fffb66447b5b720f4a7
+size 8676008

checkpoint-41136/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b917ab4eb3f1678171bb7eaa0d0d28bebed4e333ef3afbe6e446040d02f3ad16
+size 17463051

checkpoint-41136/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cec9bafeaad5c1a3d2ac3267c62cccb4920f46c34a30d0fe9af9cbf6364bd451
+size 14645

checkpoint-41136/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0feaf77daa922912d1b993492fc3c7917a22158db4f0d3267724782a332eb520
+size 1465

checkpoint-41136/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3238 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9999088439590893,
+  "eval_steps": 1000,
+  "global_step": 41136,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007292483272866493,
+      "grad_norm": 2.1235318183898926,
+      "learning_rate": 4e-05,
+      "loss": 2.7429,
+      "step": 100
+    },
+    {
+      "epoch": 0.014584966545732986,
+      "grad_norm": 1.9533482789993286,
+      "learning_rate": 8e-05,
+      "loss": 1.4786,
+      "step": 200
+    },
+    {
+      "epoch": 0.02187744981859948,
+      "grad_norm": 1.5908012390136719,
+      "learning_rate": 0.00012,
+      "loss": 1.252,
+      "step": 300
+    },
+    {
+      "epoch": 0.029169933091465972,
+      "grad_norm": 1.592781662940979,
+      "learning_rate": 0.00016,
+      "loss": 1.1674,
+      "step": 400
+    },
+    {
+      "epoch": 0.036462416364332464,
+      "grad_norm": 1.4071415662765503,
+      "learning_rate": 0.0002,
+      "loss": 1.101,
+      "step": 500
+    },
+    {
+      "epoch": 0.04375489963719896,
+      "grad_norm": 1.4228886365890503,
+      "learning_rate": 0.0001995078255733832,
+      "loss": 1.0487,
+      "step": 600
+    },
+    {
+      "epoch": 0.05104738291006545,
+      "grad_norm": 1.2705847024917603,
+      "learning_rate": 0.00019901565114676642,
+      "loss": 1.0119,
+      "step": 700
+    },
+    {
+      "epoch": 0.058339866182931945,
+      "grad_norm": 1.1770137548446655,
+      "learning_rate": 0.00019852347672014964,
+      "loss": 0.9906,
+      "step": 800
+    },
+    {
+      "epoch": 0.06563234945579843,
+      "grad_norm": 1.1681164503097534,
+      "learning_rate": 0.00019803130229353283,
+      "loss": 0.9645,
+      "step": 900
+    },
+    {
+      "epoch": 0.07292483272866493,
+      "grad_norm": 1.020504117012024,
+      "learning_rate": 0.00019753912786691605,
+      "loss": 0.9525,
+      "step": 1000
+    },
+    {
+      "epoch": 0.07292483272866493,
+      "eval_loss": 0.9407642483711243,
+      "eval_runtime": 61.0906,
+      "eval_samples_per_second": 146.586,
+      "eval_steps_per_second": 18.333,
+      "step": 1000
+    },
+    {
+      "epoch": 0.08021731600153142,
+      "grad_norm": 1.079444408416748,
+      "learning_rate": 0.00019704695344029924,
+      "loss": 0.9414,
+      "step": 1100
+    },
+    {
+      "epoch": 0.08750979927439792,
+      "grad_norm": 1.057377576828003,
+      "learning_rate": 0.00019655477901368246,
+      "loss": 0.9231,
+      "step": 1200
+    },
+    {
+      "epoch": 0.0948022825472644,
+      "grad_norm": 1.068018913269043,
+      "learning_rate": 0.00019606260458706568,
+      "loss": 0.9168,
+      "step": 1300
+    },
+    {
+      "epoch": 0.1020947658201309,
+      "grad_norm": 0.9460920095443726,
+      "learning_rate": 0.00019557043016044887,
+      "loss": 0.9031,
+      "step": 1400
+    },
+    {
+      "epoch": 0.1093872490929974,
+      "grad_norm": 1.056226134300232,
+      "learning_rate": 0.00019507825573383206,
+      "loss": 0.8901,
+      "step": 1500
+    },
+    {
+      "epoch": 0.11667973236586389,
+      "grad_norm": 1.0429835319519043,
+      "learning_rate": 0.00019458608130721528,
+      "loss": 0.8928,
+      "step": 1600
+    },
+    {
+      "epoch": 0.12397221563873038,
+      "grad_norm": 1.050790548324585,
+      "learning_rate": 0.0001940939068805985,
+      "loss": 0.8803,
+      "step": 1700
+    },
+    {
+      "epoch": 0.13126469891159687,
+      "grad_norm": 0.9586555361747742,
+      "learning_rate": 0.0001936017324539817,
+      "loss": 0.8809,
+      "step": 1800
+    },
+    {
+      "epoch": 0.13855718218446336,
+      "grad_norm": 0.985379159450531,
+      "learning_rate": 0.00019310955802736491,
+      "loss": 0.8743,
+      "step": 1900
+    },
+    {
+      "epoch": 0.14584966545732986,
+      "grad_norm": 0.9307010769844055,
+      "learning_rate": 0.00019261738360074813,
+      "loss": 0.8727,
+      "step": 2000
+    },
+    {
+      "epoch": 0.14584966545732986,
+      "eval_loss": 0.86456698179245,
+      "eval_runtime": 60.6283,
+      "eval_samples_per_second": 147.703,
+      "eval_steps_per_second": 18.473,
+      "step": 2000
+    },
+    {
+      "epoch": 0.15314214873019635,
+      "grad_norm": 1.0384063720703125,
+      "learning_rate": 0.00019212520917413133,
+      "loss": 0.8742,
+      "step": 2100
+    },
+    {
+      "epoch": 0.16043463200306285,
+      "grad_norm": 0.9662402868270874,
+      "learning_rate": 0.00019163303474751452,
+      "loss": 0.8661,
+      "step": 2200
+    },
+    {
+      "epoch": 0.16772711527592934,
+      "grad_norm": 0.9773098230361938,
+      "learning_rate": 0.00019114086032089774,
+      "loss": 0.8576,
+      "step": 2300
+    },
+    {
+      "epoch": 0.17501959854879584,
+      "grad_norm": 0.9672012329101562,
+      "learning_rate": 0.00019064868589428093,
+      "loss": 0.8595,
+      "step": 2400
+    },
+    {
+      "epoch": 0.1823120818216623,
+      "grad_norm": 0.9758124351501465,
+      "learning_rate": 0.00019015651146766415,
+      "loss": 0.8524,
+      "step": 2500
+    },
+    {
+      "epoch": 0.1896045650945288,
+      "grad_norm": 0.972232460975647,
+      "learning_rate": 0.00018966433704104737,
+      "loss": 0.8468,
+      "step": 2600
+    },
+    {
+      "epoch": 0.1968970483673953,
+      "grad_norm": 0.9417553544044495,
+      "learning_rate": 0.00018917216261443056,
+      "loss": 0.8412,
+      "step": 2700
+    },
+    {
+      "epoch": 0.2041895316402618,
+      "grad_norm": 0.9395071864128113,
+      "learning_rate": 0.00018867998818781375,
+      "loss": 0.8413,
+      "step": 2800
+    },
+    {
+      "epoch": 0.2114820149131283,
+      "grad_norm": 0.9951208233833313,
+      "learning_rate": 0.000188187813761197,
+      "loss": 0.8345,
+      "step": 2900
+    },
+    {
+      "epoch": 0.2187744981859948,
+      "grad_norm": 0.9656242728233337,
+      "learning_rate": 0.0001876956393345802,
+      "loss": 0.8317,
+      "step": 3000
+    },
+    {
+      "epoch": 0.2187744981859948,
+      "eval_loss": 0.8318613767623901,
+      "eval_runtime": 61.1356,
+      "eval_samples_per_second": 146.478,
+      "eval_steps_per_second": 18.32,
+      "step": 3000
+    },
+    {
+      "epoch": 0.22606698145886128,
+      "grad_norm": 0.8810185194015503,
+      "learning_rate": 0.00018720346490796338,
+      "loss": 0.8321,
+      "step": 3100
+    },
+    {
+      "epoch": 0.23335946473172778,
+      "grad_norm": 0.9199262857437134,
+      "learning_rate": 0.0001867112904813466,
+      "loss": 0.8406,
+      "step": 3200
+    },
+    {
+      "epoch": 0.24065194800459427,
+      "grad_norm": 0.9557051658630371,
+      "learning_rate": 0.00018621911605472982,
+      "loss": 0.8277,
+      "step": 3300
+    },
+    {
+      "epoch": 0.24794443127746077,
+      "grad_norm": 0.9777804017066956,
+      "learning_rate": 0.000185726941628113,
+      "loss": 0.8272,
+      "step": 3400
+    },
+    {
+      "epoch": 0.25523691455032727,
+      "grad_norm": 0.8856322169303894,
+      "learning_rate": 0.00018523476720149623,
+      "loss": 0.8256,
+      "step": 3500
+    },
+    {
+      "epoch": 0.26252939782319373,
+      "grad_norm": 0.9196017980575562,
+      "learning_rate": 0.00018474259277487942,
+      "loss": 0.8234,
+      "step": 3600
+    },
+    {
+      "epoch": 0.26982188109606026,
+      "grad_norm": 0.9568464159965515,
+      "learning_rate": 0.00018425041834826264,
+      "loss": 0.8193,
+      "step": 3700
+    },
+    {
+      "epoch": 0.2771143643689267,
+      "grad_norm": 0.9552770256996155,
+      "learning_rate": 0.00018375824392164583,
+      "loss": 0.8179,
+      "step": 3800
+    },
+    {
+      "epoch": 0.28440684764179325,
+      "grad_norm": 0.8997077345848083,
+      "learning_rate": 0.00018326606949502905,
+      "loss": 0.8138,
+      "step": 3900
+    },
+    {
+      "epoch": 0.2916993309146597,
+      "grad_norm": 0.8896480202674866,
+      "learning_rate": 0.00018277389506841224,
+      "loss": 0.8172,
+      "step": 4000
+    },
+    {
+      "epoch": 0.2916993309146597,
+      "eval_loss": 0.8123040199279785,
+      "eval_runtime": 60.7914,
+      "eval_samples_per_second": 147.307,
+      "eval_steps_per_second": 18.424,
+      "step": 4000
+    },
+    {
+      "epoch": 0.2989918141875262,
+      "grad_norm": 0.9520764350891113,
+      "learning_rate": 0.00018228172064179546,
+      "loss": 0.8183,
+      "step": 4100
+    },
+    {
+      "epoch": 0.3062842974603927,
+      "grad_norm": 0.9373065233230591,
+      "learning_rate": 0.00018178954621517868,
+      "loss": 0.8132,
+      "step": 4200
+    },
+    {
+      "epoch": 0.3135767807332592,
+      "grad_norm": 0.8733066916465759,
+      "learning_rate": 0.00018129737178856187,
+      "loss": 0.811,
+      "step": 4300
+    },
+    {
+      "epoch": 0.3208692640061257,
+      "grad_norm": 0.8866516351699829,
+      "learning_rate": 0.00018080519736194507,
+      "loss": 0.8093,
+      "step": 4400
+    },
+    {
+      "epoch": 0.32816174727899217,
+      "grad_norm": 0.9394953846931458,
+      "learning_rate": 0.00018031302293532828,
+      "loss": 0.8035,
+      "step": 4500
+    },
+    {
+      "epoch": 0.3354542305518587,
+      "grad_norm": 0.9133720993995667,
+      "learning_rate": 0.0001798208485087115,
+      "loss": 0.8054,
+      "step": 4600
+    },
+    {
+      "epoch": 0.34274671382472516,
+      "grad_norm": 0.9428606629371643,
+      "learning_rate": 0.0001793286740820947,
+      "loss": 0.8076,
+      "step": 4700
+    },
+    {
+      "epoch": 0.3500391970975917,
+      "grad_norm": 0.8996593356132507,
+      "learning_rate": 0.00017883649965547792,
+      "loss": 0.812,
+      "step": 4800
+    },
+    {
+      "epoch": 0.35733168037045815,
+      "grad_norm": 0.9113749265670776,
+      "learning_rate": 0.0001783443252288611,
+      "loss": 0.8048,
+      "step": 4900
+    },
+    {
+      "epoch": 0.3646241636433246,
+      "grad_norm": 0.9185646176338196,
+      "learning_rate": 0.00017785215080224433,
+      "loss": 0.8023,
+      "step": 5000
+    },
+    {
+      "epoch": 0.3646241636433246,
+      "eval_loss": 0.7973803877830505,
+      "eval_runtime": 60.8068,
+      "eval_samples_per_second": 147.27,
+      "eval_steps_per_second": 18.419,
+      "step": 5000
+    },
+    {
+      "epoch": 0.37191664691619114,
+      "grad_norm": 0.8994658589363098,
+      "learning_rate": 0.00017735997637562755,
+      "loss": 0.8089,
+      "step": 5100
+    },
+    {
+      "epoch": 0.3792091301890576,
+      "grad_norm": 0.8724523782730103,
+      "learning_rate": 0.00017686780194901074,
+      "loss": 0.8015,
+      "step": 5200
+    },
+    {
+      "epoch": 0.38650161346192413,
+      "grad_norm": 0.8285540342330933,
+      "learning_rate": 0.00017637562752239393,
+      "loss": 0.7944,
+      "step": 5300
+    },
+    {
+      "epoch": 0.3937940967347906,
+      "grad_norm": 0.8982509970664978,
+      "learning_rate": 0.00017588345309577718,
+      "loss": 0.7952,
+      "step": 5400
+    },
+    {
+      "epoch": 0.4010865800076571,
+      "grad_norm": 0.9266172051429749,
+      "learning_rate": 0.00017539127866916037,
+      "loss": 0.7978,
+      "step": 5500
+    },
+    {
+      "epoch": 0.4083790632805236,
+      "grad_norm": 0.901662290096283,
+      "learning_rate": 0.00017489910424254356,
+      "loss": 0.7966,
+      "step": 5600
+    },
+    {
+      "epoch": 0.4156715465533901,
+      "grad_norm": 0.9309051036834717,
+      "learning_rate": 0.00017440692981592678,
+      "loss": 0.7975,
+      "step": 5700
+    },
+    {
+      "epoch": 0.4229640298262566,
+      "grad_norm": 0.8789328336715698,
+      "learning_rate": 0.00017391475538930997,
+      "loss": 0.7997,
+      "step": 5800
+    },
+    {
+      "epoch": 0.4302565130991231,
+      "grad_norm": 0.8636139035224915,
+      "learning_rate": 0.0001734225809626932,
+      "loss": 0.7914,
+      "step": 5900
+    },
+    {
+      "epoch": 0.4375489963719896,
+      "grad_norm": 0.9468287229537964,
+      "learning_rate": 0.00017293040653607638,
+      "loss": 0.7859,
+      "step": 6000
+    },
+    {
+      "epoch": 0.4375489963719896,
+      "eval_loss": 0.7869976162910461,
+      "eval_runtime": 60.7741,
+      "eval_samples_per_second": 147.349,
+      "eval_steps_per_second": 18.429,
+      "step": 6000
+    },
+    {
+      "epoch": 0.44484147964485604,
+      "grad_norm": 0.867158055305481,
+      "learning_rate": 0.0001724382321094596,
+      "loss": 0.7924,
+      "step": 6100
+    },
+    {
+      "epoch": 0.45213396291772256,
+      "grad_norm": 0.9379836320877075,
+      "learning_rate": 0.0001719460576828428,
+      "loss": 0.7902,
+      "step": 6200
+    },
+    {
+      "epoch": 0.45942644619058903,
+      "grad_norm": 0.8591951727867126,
+      "learning_rate": 0.000171453883256226,
+      "loss": 0.7926,
+      "step": 6300
+    },
+    {
+      "epoch": 0.46671892946345556,
+      "grad_norm": 0.9702317118644714,
+      "learning_rate": 0.00017096170882960923,
+      "loss": 0.7867,
+      "step": 6400
+    },
+    {
+      "epoch": 0.474011412736322,
+      "grad_norm": 0.902302086353302,
+      "learning_rate": 0.00017046953440299242,
+      "loss": 0.7897,
+      "step": 6500
+    },
+    {
+      "epoch": 0.48130389600918855,
+      "grad_norm": 0.889926552772522,
+      "learning_rate": 0.00016997735997637561,
+      "loss": 0.7857,
+      "step": 6600
+    },
+    {
+      "epoch": 0.488596379282055,
+      "grad_norm": 0.8906420469284058,
+      "learning_rate": 0.00016948518554975886,
+      "loss": 0.7878,
+      "step": 6700
+    },
+    {
+      "epoch": 0.49588886255492154,
+      "grad_norm": 0.919983983039856,
+      "learning_rate": 0.00016899301112314205,
+      "loss": 0.7876,
+      "step": 6800
+    },
+    {
+      "epoch": 0.5031813458277881,
+      "grad_norm": 0.8610624670982361,
+      "learning_rate": 0.00016850083669652524,
+      "loss": 0.7923,
+      "step": 6900
+    },
+    {
+      "epoch": 0.5104738291006545,
+      "grad_norm": 0.9339637160301208,
+      "learning_rate": 0.00016800866226990846,
+      "loss": 0.7837,
+      "step": 7000
+    },
+    {
+      "epoch": 0.5104738291006545,
+      "eval_loss": 0.7791191935539246,
+      "eval_runtime": 60.8878,
+      "eval_samples_per_second": 147.074,
+      "eval_steps_per_second": 18.395,
+      "step": 7000
+    },
+    {
+      "epoch": 0.517766312373521,
+      "grad_norm": 0.9073446393013,
+      "learning_rate": 0.00016751648784329168,
+      "loss": 0.7809,
+      "step": 7100
+    },
+    {
+      "epoch": 0.5250587956463875,
+      "grad_norm": 0.9348235726356506,
+      "learning_rate": 0.00016702431341667487,
+      "loss": 0.7793,
+      "step": 7200
+    },
+    {
+      "epoch": 0.5323512789192539,
+      "grad_norm": 0.9155163168907166,
+      "learning_rate": 0.0001665321389900581,
+      "loss": 0.7821,
+      "step": 7300
+    },
+    {
+      "epoch": 0.5396437621921205,
+      "grad_norm": 0.9328250885009766,
+      "learning_rate": 0.00016603996456344129,
+      "loss": 0.7806,
+      "step": 7400
+    },
+    {
+      "epoch": 0.546936245464987,
+      "grad_norm": 0.8911275863647461,
+      "learning_rate": 0.00016554779013682448,
+      "loss": 0.7782,
+      "step": 7500
+    },
+    {
+      "epoch": 0.5542287287378534,
+      "grad_norm": 0.8989250659942627,
+      "learning_rate": 0.00016505561571020772,
+      "loss": 0.779,
+      "step": 7600
+    },
+    {
+      "epoch": 0.5615212120107199,
+      "grad_norm": 0.8869723081588745,
+      "learning_rate": 0.00016456344128359092,
+      "loss": 0.7822,
+      "step": 7700
+    },
+    {
+      "epoch": 0.5688136952835865,
+      "grad_norm": 0.8631371259689331,
+      "learning_rate": 0.0001640712668569741,
+      "loss": 0.7768,
+      "step": 7800
+    },
+    {
+      "epoch": 0.576106178556453,
+      "grad_norm": 0.8868420720100403,
+      "learning_rate": 0.00016357909243035733,
+      "loss": 0.7834,
+      "step": 7900
+    },
+    {
+      "epoch": 0.5833986618293194,
+      "grad_norm": 0.9253202080726624,
+      "learning_rate": 0.00016308691800374055,
+      "loss": 0.773,
+      "step": 8000
+    },
+    {
+      "epoch": 0.5833986618293194,
+      "eval_loss": 0.7733862400054932,
+      "eval_runtime": 60.8911,
+      "eval_samples_per_second": 147.066,
+      "eval_steps_per_second": 18.394,
+      "step": 8000
+    },
+    {
+      "epoch": 0.5906911451021859,
+      "grad_norm": 0.830760657787323,
+      "learning_rate": 0.00016259474357712374,
+      "loss": 0.7756,
+      "step": 8100
+    },
+    {
+      "epoch": 0.5979836283750524,
+      "grad_norm": 0.9371838569641113,
+      "learning_rate": 0.00016210256915050696,
+      "loss": 0.776,
+      "step": 8200
+    },
+    {
+      "epoch": 0.605276111647919,
+      "grad_norm": 0.8486947417259216,
+      "learning_rate": 0.00016161039472389015,
+      "loss": 0.7758,
+      "step": 8300
+    },
+    {
+      "epoch": 0.6125685949207854,
+      "grad_norm": 0.8888623118400574,
+      "learning_rate": 0.00016111822029727337,
+      "loss": 0.783,
+      "step": 8400
+    },
+    {
+      "epoch": 0.6198610781936519,
+      "grad_norm": 0.9176976084709167,
+      "learning_rate": 0.00016062604587065656,
+      "loss": 0.7782,
+      "step": 8500
+    },
+    {
+      "epoch": 0.6271535614665184,
+      "grad_norm": 0.90993732213974,
+      "learning_rate": 0.00016013387144403978,
+      "loss": 0.7741,
+      "step": 8600
+    },
+    {
+      "epoch": 0.6344460447393849,
+      "grad_norm": 0.8461544513702393,
+      "learning_rate": 0.00015964169701742297,
+      "loss": 0.7782,
+      "step": 8700
+    },
+    {
+      "epoch": 0.6417385280122514,
+      "grad_norm": 0.8642047643661499,
+      "learning_rate": 0.0001591495225908062,
+      "loss": 0.7706,
+      "step": 8800
+    },
+    {
+      "epoch": 0.6490310112851179,
+      "grad_norm": 0.8944571018218994,
+      "learning_rate": 0.0001586573481641894,
+      "loss": 0.7727,
+      "step": 8900
+    },
+    {
+      "epoch": 0.6563234945579843,
+      "grad_norm": 0.9075286984443665,
+      "learning_rate": 0.0001581651737375726,
+      "loss": 0.7748,
+      "step": 9000
+    },
+    {
+      "epoch": 0.6563234945579843,
+      "eval_loss": 0.7666329741477966,
+      "eval_runtime": 60.5924,
+      "eval_samples_per_second": 147.791,
+      "eval_steps_per_second": 18.484,
+      "step": 9000
+    },
+    {
+      "epoch": 0.6636159778308508,
+      "grad_norm": 0.9164955615997314,
+      "learning_rate": 0.0001576729993109558,
+      "loss": 0.7792,
+      "step": 9100
+    },
+    {
+      "epoch": 0.6709084611037174,
+      "grad_norm": 0.8446054458618164,
+      "learning_rate": 0.000157180824884339,
+      "loss": 0.7661,
+      "step": 9200
+    },
+    {
+      "epoch": 0.6782009443765838,
+      "grad_norm": 0.8793991804122925,
+      "learning_rate": 0.00015668865045772223,
+      "loss": 0.7678,
+      "step": 9300
+    },
+    {
+      "epoch": 0.6854934276494503,
+      "grad_norm": 0.8772592544555664,
+      "learning_rate": 0.00015619647603110542,
+      "loss": 0.7708,
+      "step": 9400
+    },
+    {
+      "epoch": 0.6927859109223168,
+      "grad_norm": 0.854118824005127,
+      "learning_rate": 0.00015570430160448864,
+      "loss": 0.7616,
+      "step": 9500
+    },
+    {
+      "epoch": 0.7000783941951834,
+      "grad_norm": 0.8653910756111145,
+      "learning_rate": 0.00015521212717787183,
+      "loss": 0.767,
+      "step": 9600
+    },
+    {
+      "epoch": 0.7073708774680498,
+      "grad_norm": 0.8890120387077332,
+      "learning_rate": 0.00015471995275125505,
+      "loss": 0.7657,
+      "step": 9700
+    },
+    {
+      "epoch": 0.7146633607409163,
+      "grad_norm": 0.8451828360557556,
+      "learning_rate": 0.00015422777832463827,
+      "loss": 0.7656,
+      "step": 9800
+    },
+    {
+      "epoch": 0.7219558440137828,
+      "grad_norm": 0.9029329419136047,
+      "learning_rate": 0.00015373560389802146,
+      "loss": 0.7749,
+      "step": 9900
+    },
+    {
+      "epoch": 0.7292483272866492,
+      "grad_norm": 0.8538834452629089,
+      "learning_rate": 0.00015324342947140466,
+      "loss": 0.763,
+      "step": 10000
+    },
+    {
+      "epoch": 0.7292483272866492,
+      "eval_loss": 0.76123046875,
+      "eval_runtime": 60.847,
+      "eval_samples_per_second": 147.172,
+      "eval_steps_per_second": 18.407,
+      "step": 10000
+    },
+    {
+      "epoch": 0.7365408105595158,
+      "grad_norm": 0.8594367504119873,
+      "learning_rate": 0.00015275125504478788,
+      "loss": 0.7693,
+      "step": 10100
+    },
+    {
+      "epoch": 0.7438332938323823,
+      "grad_norm": 0.8748040199279785,
+      "learning_rate": 0.0001522590806181711,
+      "loss": 0.7684,
+      "step": 10200
+    },
+    {
+      "epoch": 0.7511257771052487,
+      "grad_norm": 0.9177483320236206,
+      "learning_rate": 0.0001517669061915543,
+      "loss": 0.7599,
+      "step": 10300
+    },
+    {
+      "epoch": 0.7584182603781152,
+      "grad_norm": 0.8988757729530334,
+      "learning_rate": 0.0001512747317649375,
+      "loss": 0.7648,
+      "step": 10400
+    },
+    {
+      "epoch": 0.7657107436509818,
+      "grad_norm": 0.8735676407814026,
+      "learning_rate": 0.00015078255733832073,
+      "loss": 0.7656,
+      "step": 10500
+    },
+    {
+      "epoch": 0.7730032269238483,
+      "grad_norm": 0.8750614523887634,
+      "learning_rate": 0.00015029038291170392,
+      "loss": 0.7632,
+      "step": 10600
+    },
+    {
+      "epoch": 0.7802957101967147,
+      "grad_norm": 0.8786306381225586,
+      "learning_rate": 0.0001497982084850871,
+      "loss": 0.7659,
+      "step": 10700
+    },
+    {
+      "epoch": 0.7875881934695812,
+      "grad_norm": 0.811834990978241,
+      "learning_rate": 0.00014930603405847033,
+      "loss": 0.7652,
+      "step": 10800
+    },
+    {
+      "epoch": 0.7948806767424477,
+      "grad_norm": 0.8844282031059265,
+      "learning_rate": 0.00014881385963185352,
+      "loss": 0.7623,
+      "step": 10900
+    },
+    {
+      "epoch": 0.8021731600153142,
+      "grad_norm": 0.8444844484329224,
+      "learning_rate": 0.00014832168520523674,
+      "loss": 0.7622,
+      "step": 11000
+    },
+    {
+      "epoch": 0.8021731600153142,
+      "eval_loss": 0.75812828540802,
+      "eval_runtime": 60.7569,
+      "eval_samples_per_second": 147.391,
+      "eval_steps_per_second": 18.434,
+      "step": 11000
+    },
+    {
+      "epoch": 0.8094656432881807,
+      "grad_norm": 0.8396947979927063,
+      "learning_rate": 0.00014782951077861996,
+      "loss": 0.7673,
+      "step": 11100
+    },
+    {
+      "epoch": 0.8167581265610472,
+      "grad_norm": 0.8890758752822876,
+      "learning_rate": 0.00014733733635200315,
+      "loss": 0.7551,
+      "step": 11200
+    },
+    {
+      "epoch": 0.8240506098339136,
+      "grad_norm": 0.8038908839225769,
+      "learning_rate": 0.00014684516192538634,
+      "loss": 0.7612,
+      "step": 11300
+    },
+    {
+      "epoch": 0.8313430931067802,
+      "grad_norm": 0.8224745392799377,
+      "learning_rate": 0.0001463529874987696,
+      "loss": 0.7618,
+      "step": 11400
+    },
+    {
+      "epoch": 0.8386355763796467,
+      "grad_norm": 0.8691264390945435,
+      "learning_rate": 0.00014586081307215278,
+      "loss": 0.7618,
+      "step": 11500
+    },
+    {
+      "epoch": 0.8459280596525132,
+      "grad_norm": 0.8442777395248413,
+      "learning_rate": 0.00014536863864553597,
+      "loss": 0.7671,
+      "step": 11600
+    },
+    {
+      "epoch": 0.8532205429253796,
+      "grad_norm": 0.8520532846450806,
+      "learning_rate": 0.0001448764642189192,
+      "loss": 0.7625,
+      "step": 11700
+    },
+    {
+      "epoch": 0.8605130261982462,
+      "grad_norm": 0.908760666847229,
+      "learning_rate": 0.0001443842897923024,
+      "loss": 0.7615,
+      "step": 11800
+    },
+    {
+      "epoch": 0.8678055094711127,
+      "grad_norm": 0.8004080057144165,
+      "learning_rate": 0.0001438921153656856,
+      "loss": 0.7632,
+      "step": 11900
+    },
+    {
+      "epoch": 0.8750979927439791,
+      "grad_norm": 0.8449864983558655,
+      "learning_rate": 0.00014339994093906882,
+      "loss": 0.7574,
+      "step": 12000
+    },
+    {
+      "epoch": 0.8750979927439791,
+      "eval_loss": 0.752128005027771,
+      "eval_runtime": 61.1399,
+      "eval_samples_per_second": 146.467,
+      "eval_steps_per_second": 18.319,
+      "step": 12000
+    },
+    {
+      "epoch": 0.8823904760168456,
+      "grad_norm": 0.8218274116516113,
+      "learning_rate": 0.00014290776651245201,
+      "loss": 0.7555,
+      "step": 12100
+    },
+    {
+      "epoch": 0.8896829592897121,
+      "grad_norm": 0.8944920897483826,
+      "learning_rate": 0.00014241559208583523,
+      "loss": 0.7594,
+      "step": 12200
+    },
+    {
+      "epoch": 0.8969754425625787,
+      "grad_norm": 0.9254937767982483,
+      "learning_rate": 0.00014192341765921845,
+      "loss": 0.7598,
+      "step": 12300
+    },
+    {
+      "epoch": 0.9042679258354451,
+      "grad_norm": 0.8887091875076294,
+      "learning_rate": 0.00014143124323260164,
+      "loss": 0.7625,
+      "step": 12400
+    },
+    {
+      "epoch": 0.9115604091083116,
+      "grad_norm": 0.8478124737739563,
+      "learning_rate": 0.00014093906880598484,
+      "loss": 0.756,
+      "step": 12500
+    },
+    {
+      "epoch": 0.9188528923811781,
+      "grad_norm": 0.9377927780151367,
+      "learning_rate": 0.00014044689437936805,
+      "loss": 0.7606,
+      "step": 12600
+    },
+    {
+      "epoch": 0.9261453756540446,
+      "grad_norm": 0.838175892829895,
+      "learning_rate": 0.00013995471995275127,
+      "loss": 0.7605,
+      "step": 12700
+    },
+    {
+      "epoch": 0.9334378589269111,
+      "grad_norm": 0.8345216512680054,
+      "learning_rate": 0.00013946254552613447,
+      "loss": 0.7568,
+      "step": 12800
+    },
+    {
+      "epoch": 0.9407303421997776,
+      "grad_norm": 0.894477367401123,
+      "learning_rate": 0.00013897037109951766,
+      "loss": 0.7535,
+      "step": 12900
+    },
+    {
+      "epoch": 0.948022825472644,
+      "grad_norm": 0.849010169506073,
+      "learning_rate": 0.00013847819667290088,
+      "loss": 0.7465,
+      "step": 13000
+    },
+    {
+      "epoch": 0.948022825472644,
+      "eval_loss": 0.7492165565490723,
+      "eval_runtime": 60.7079,
+      "eval_samples_per_second": 147.51,
+      "eval_steps_per_second": 18.449,
+      "step": 13000
+    },
+    {
+      "epoch": 0.9553153087455105,
+      "grad_norm": 0.8754207491874695,
+      "learning_rate": 0.0001379860222462841,
+      "loss": 0.7576,
+      "step": 13100
+    },
+    {
+      "epoch": 0.9626077920183771,
+      "grad_norm": 0.8984807133674622,
+      "learning_rate": 0.0001374938478196673,
+      "loss": 0.7493,
+      "step": 13200
+    },
+    {
+      "epoch": 0.9699002752912436,
+      "grad_norm": 0.8458361029624939,
+      "learning_rate": 0.0001370016733930505,
+      "loss": 0.7468,
+      "step": 13300
+    },
+    {
+      "epoch": 0.97719275856411,
+      "grad_norm": 0.9169609546661377,
+      "learning_rate": 0.0001365094989664337,
+      "loss": 0.7515,
+      "step": 13400
+    },
+    {
+      "epoch": 0.9844852418369765,
+      "grad_norm": 0.8027638792991638,
+      "learning_rate": 0.00013601732453981692,
+      "loss": 0.7551,
+      "step": 13500
+    },
+    {
+      "epoch": 0.9917777251098431,
+      "grad_norm": 0.8572927117347717,
+      "learning_rate": 0.00013552515011320014,
+      "loss": 0.7481,
+      "step": 13600
+    },
+    {
+      "epoch": 0.9990702083827095,
+      "grad_norm": 0.8624053001403809,
+      "learning_rate": 0.00013503297568658333,
+      "loss": 0.7481,
+      "step": 13700
+    },
+    {
+      "epoch": 1.0063991540719404,
+      "grad_norm": 0.8915347456932068,
+      "learning_rate": 0.00013454080125996652,
+      "loss": 0.7463,
+      "step": 13800
+    },
+    {
+      "epoch": 1.0136916373448068,
+      "grad_norm": 0.8233557939529419,
+      "learning_rate": 0.00013404862683334977,
+      "loss": 0.7398,
+      "step": 13900
+    },
+    {
+      "epoch": 1.0209841206176733,
+      "grad_norm": 0.8467598557472229,
+      "learning_rate": 0.00013355645240673296,
+      "loss": 0.7402,
+      "step": 14000
+    },
+    {
+      "epoch": 1.0209841206176733,
+      "eval_loss": 0.7458442449569702,
+      "eval_runtime": 60.6887,
+      "eval_samples_per_second": 147.556,
+      "eval_steps_per_second": 18.455,
+      "step": 14000
+    },
+    {
+      "epoch": 1.0282766038905398,
+      "grad_norm": 0.852739691734314,
+      "learning_rate": 0.00013306427798011615,
+      "loss": 0.7436,
+      "step": 14100
+    },
+    {
+      "epoch": 1.0355690871634062,
+      "grad_norm": 0.8501101136207581,
+      "learning_rate": 0.00013257210355349937,
+      "loss": 0.7472,
+      "step": 14200
+    },
+    {
+      "epoch": 1.0428615704362727,
+      "grad_norm": 0.8830447793006897,
+      "learning_rate": 0.0001320799291268826,
+      "loss": 0.7438,
+      "step": 14300
+    },
+    {
+      "epoch": 1.0501540537091394,
+      "grad_norm": 0.8827272057533264,
+      "learning_rate": 0.00013158775470026578,
+      "loss": 0.7439,
+      "step": 14400
+    },
+    {
+      "epoch": 1.0574465369820059,
+      "grad_norm": 0.7875618934631348,
+      "learning_rate": 0.000131095580273649,
+      "loss": 0.7426,
+      "step": 14500
+    },
+    {
+      "epoch": 1.0647390202548723,
+      "grad_norm": 0.9906949996948242,
+      "learning_rate": 0.0001306034058470322,
+      "loss": 0.7418,
+      "step": 14600
+    },
+    {
+      "epoch": 1.0720315035277388,
+      "grad_norm": 0.8803852200508118,
+      "learning_rate": 0.00013011123142041538,
+      "loss": 0.7421,
+      "step": 14700
+    },
+    {
+      "epoch": 1.0793239868006053,
+      "grad_norm": 0.8951194286346436,
+      "learning_rate": 0.0001296190569937986,
+      "loss": 0.7429,
+      "step": 14800
+    },
+    {
+      "epoch": 1.0866164700734717,
+      "grad_norm": 0.8548495769500732,
+      "learning_rate": 0.00012912688256718182,
+      "loss": 0.7462,
+      "step": 14900
+    },
+    {
+      "epoch": 1.0939089533463382,
+      "grad_norm": 0.9326722025871277,
+      "learning_rate": 0.00012863470814056501,
+      "loss": 0.7515,
+      "step": 15000
+    },
+    {
+      "epoch": 1.0939089533463382,
+      "eval_loss": 0.7423983812332153,
+      "eval_runtime": 61.1091,
+      "eval_samples_per_second": 146.541,
+      "eval_steps_per_second": 18.328,
+      "step": 15000
+    },
+    {
+      "epoch": 1.1012014366192047,
+      "grad_norm": 0.8803513646125793,
+      "learning_rate": 0.00012814253371394823,
+      "loss": 0.7369,
+      "step": 15100
+    },
+    {
+      "epoch": 1.1084939198920711,
+      "grad_norm": 0.8555076122283936,
+      "learning_rate": 0.00012765035928733145,
+      "loss": 0.7414,
+      "step": 15200
+    },
+    {
+      "epoch": 1.1157864031649378,
+      "grad_norm": 0.8760358691215515,
+      "learning_rate": 0.00012715818486071464,
+      "loss": 0.741,
+      "step": 15300
+    },
+    {
+      "epoch": 1.1230788864378043,
+      "grad_norm": 0.8444579839706421,
+      "learning_rate": 0.00012666601043409784,
+      "loss": 0.7448,
+      "step": 15400
+    },
+    {
+      "epoch": 1.1303713697106708,
+      "grad_norm": 0.8995528221130371,
+      "learning_rate": 0.00012617383600748106,
+      "loss": 0.7436,
+      "step": 15500
+    },
+    {
+      "epoch": 1.1376638529835372,
+      "grad_norm": 0.8966475129127502,
+      "learning_rate": 0.00012568166158086427,
+      "loss": 0.7485,
+      "step": 15600
+    },
+    {
+      "epoch": 1.1449563362564037,
+      "grad_norm": 0.8527953028678894,
+      "learning_rate": 0.00012518948715424747,
+      "loss": 0.7303,
+      "step": 15700
+    },
+    {
+      "epoch": 1.1522488195292702,
+      "grad_norm": 0.8657513856887817,
+      "learning_rate": 0.00012469731272763069,
+      "loss": 0.7431,
+      "step": 15800
+    },
+    {
+      "epoch": 1.1595413028021366,
+      "grad_norm": 0.8745185136795044,
+      "learning_rate": 0.00012420513830101388,
+      "loss": 0.7426,
+      "step": 15900
+    },
+    {
+      "epoch": 1.166833786075003,
+      "grad_norm": 0.8729378581047058,
+      "learning_rate": 0.0001237129638743971,
+      "loss": 0.7389,
+      "step": 16000
+    },
+    {
+      "epoch": 1.166833786075003,
+      "eval_loss": 0.740699291229248,
+      "eval_runtime": 60.635,
+      "eval_samples_per_second": 147.687,
+      "eval_steps_per_second": 18.471,
+      "step": 16000
+    },
+    {
+      "epoch": 1.1741262693478696,
+      "grad_norm": 0.8877021670341492,
+      "learning_rate": 0.00012322078944778032,
+      "loss": 0.7419,
+      "step": 16100
+    },
+    {
+      "epoch": 1.1814187526207363,
+      "grad_norm": 0.9095293283462524,
+      "learning_rate": 0.0001227286150211635,
+      "loss": 0.7365,
+      "step": 16200
+    },
+    {
+      "epoch": 1.1887112358936027,
+      "grad_norm": 0.8597880601882935,
+      "learning_rate": 0.0001222364405945467,
+      "loss": 0.7336,
+      "step": 16300
+    },
+    {
+      "epoch": 1.1960037191664692,
+      "grad_norm": 0.9574359059333801,
+      "learning_rate": 0.0001217442661679299,
+      "loss": 0.7394,
+      "step": 16400
+    },
+    {
+      "epoch": 1.2032962024393357,
+      "grad_norm": 0.8484875559806824,
+      "learning_rate": 0.00012125209174131314,
+      "loss": 0.7392,
+      "step": 16500
+    },
+    {
+      "epoch": 1.2105886857122021,
+      "grad_norm": 0.8847618699073792,
+      "learning_rate": 0.00012075991731469633,
+      "loss": 0.7427,
+      "step": 16600
+    },
+    {
+      "epoch": 1.2178811689850686,
+      "grad_norm": 0.8780632019042969,
+      "learning_rate": 0.00012026774288807954,
+      "loss": 0.7399,
+      "step": 16700
+    },
+    {
+      "epoch": 1.225173652257935,
+      "grad_norm": 0.8698965311050415,
+      "learning_rate": 0.00011977556846146274,
+      "loss": 0.7395,
+      "step": 16800
+    },
+    {
+      "epoch": 1.2324661355308015,
+      "grad_norm": 0.8717935085296631,
+      "learning_rate": 0.00011928339403484596,
+      "loss": 0.7404,
+      "step": 16900
+    },
+    {
+      "epoch": 1.239758618803668,
+      "grad_norm": 0.8375683426856995,
+      "learning_rate": 0.00011879121960822917,
+      "loss": 0.7405,
+      "step": 17000
+    },
+    {
+      "epoch": 1.239758618803668,
+      "eval_loss": 0.7371787428855896,
+      "eval_runtime": 60.9373,
+      "eval_samples_per_second": 146.954,
+      "eval_steps_per_second": 18.38,
+      "step": 17000
+    },
+    {
+      "epoch": 1.2470511020765347,
+      "grad_norm": 0.8756095170974731,
+      "learning_rate": 0.00011829904518161237,
+      "loss": 0.736,
+      "step": 17100
+    },
+    {
+      "epoch": 1.2543435853494012,
+      "grad_norm": 0.8513076901435852,
+      "learning_rate": 0.00011780687075499556,
+      "loss": 0.7399,
+      "step": 17200
+    },
+    {
+      "epoch": 1.2616360686222676,
+      "grad_norm": 0.8297843337059021,
+      "learning_rate": 0.0001173146963283788,
+      "loss": 0.7406,
+      "step": 17300
+    },
+    {
+      "epoch": 1.268928551895134,
+      "grad_norm": 0.8896269202232361,
+      "learning_rate": 0.00011682252190176199,
+      "loss": 0.7346,
+      "step": 17400
+    },
+    {
+      "epoch": 1.2762210351680006,
+      "grad_norm": 0.874168336391449,
+      "learning_rate": 0.0001163303474751452,
+      "loss": 0.736,
+      "step": 17500
+    },
+    {
+      "epoch": 1.283513518440867,
+      "grad_norm": 0.9101394414901733,
+      "learning_rate": 0.0001158381730485284,
+      "loss": 0.7376,
+      "step": 17600
+    },
+    {
+      "epoch": 1.2908060017137335,
+      "grad_norm": 0.9011333584785461,
+      "learning_rate": 0.00011534599862191162,
+      "loss": 0.7361,
+      "step": 17700
+    },
+    {
+      "epoch": 1.2980984849866002,
+      "grad_norm": 0.8839349746704102,
+      "learning_rate": 0.00011485382419529482,
+      "loss": 0.7373,
+      "step": 17800
+    },
+    {
+      "epoch": 1.3053909682594664,
+      "grad_norm": 0.830528974533081,
+      "learning_rate": 0.00011436164976867803,
+      "loss": 0.7336,
+      "step": 17900
+    },
+    {
+      "epoch": 1.3126834515323331,
+      "grad_norm": 0.8777081370353699,
+      "learning_rate": 0.00011386947534206122,
+      "loss": 0.7379,
+      "step": 18000
+    },
+    {
+      "epoch": 1.3126834515323331,
+      "eval_loss": 0.7359282970428467,
+      "eval_runtime": 60.8023,
+      "eval_samples_per_second": 147.281,
+      "eval_steps_per_second": 18.42,
+      "step": 18000
+    },
+    {
+      "epoch": 1.3199759348051996,
+      "grad_norm": 0.8853510022163391,
+      "learning_rate": 0.00011337730091544443,
+      "loss": 0.7376,
+      "step": 18100
+    },
+    {
+      "epoch": 1.327268418078066,
+      "grad_norm": 0.9219810366630554,
+      "learning_rate": 0.00011288512648882766,
+      "loss": 0.7399,
+      "step": 18200
+    },
+    {
+      "epoch": 1.3345609013509325,
+      "grad_norm": 0.9233282208442688,
+      "learning_rate": 0.00011239295206221085,
+      "loss": 0.7399,
+      "step": 18300
+    },
+    {
+      "epoch": 1.341853384623799,
+      "grad_norm": 0.8359719514846802,
+      "learning_rate": 0.00011190077763559406,
+      "loss": 0.7366,
+      "step": 18400
+    },
+    {
+      "epoch": 1.3491458678966655,
+      "grad_norm": 0.8673479557037354,
+      "learning_rate": 0.00011140860320897726,
+      "loss": 0.7398,
+      "step": 18500
+    },
+    {
+      "epoch": 1.356438351169532,
+      "grad_norm": 0.8565610647201538,
+      "learning_rate": 0.00011091642878236048,
+      "loss": 0.7278,
+      "step": 18600
+    },
+    {
+      "epoch": 1.3637308344423986,
+      "grad_norm": 0.8547226190567017,
+      "learning_rate": 0.00011042425435574369,
+      "loss": 0.7381,
+      "step": 18700
+    },
+    {
+      "epoch": 1.3710233177152649,
+      "grad_norm": 0.897081732749939,
+      "learning_rate": 0.00010993207992912688,
+      "loss": 0.7339,
+      "step": 18800
+    },
+    {
+      "epoch": 1.3783158009881316,
+      "grad_norm": 0.8852410912513733,
+      "learning_rate": 0.00010943990550251008,
+      "loss": 0.7342,
+      "step": 18900
+    },
+    {
+      "epoch": 1.385608284260998,
+      "grad_norm": 0.9213690161705017,
+      "learning_rate": 0.00010894773107589332,
+      "loss": 0.7389,
+      "step": 19000
+    },
+    {
+      "epoch": 1.385608284260998,
+      "eval_loss": 0.7335625886917114,
+      "eval_runtime": 60.8231,
+      "eval_samples_per_second": 147.23,
+      "eval_steps_per_second": 18.414,
+      "step": 19000
+    },
+    {
+      "epoch": 1.3929007675338645,
+      "grad_norm": 0.8398423790931702,
+      "learning_rate": 0.00010845555664927651,
+      "loss": 0.7274,
+      "step": 19100
+    },
+    {
+      "epoch": 1.400193250806731,
+      "grad_norm": 0.8863806128501892,
+      "learning_rate": 0.00010796338222265971,
+      "loss": 0.7331,
+      "step": 19200
+    },
+    {
+      "epoch": 1.4074857340795974,
+      "grad_norm": 0.8836521506309509,
+      "learning_rate": 0.00010747120779604292,
+      "loss": 0.7334,
+      "step": 19300
+    },
+    {
+      "epoch": 1.414778217352464,
+      "grad_norm": 0.8278964757919312,
+      "learning_rate": 0.00010697903336942614,
+      "loss": 0.7281,
+      "step": 19400
+    },
+    {
+      "epoch": 1.4220707006253304,
+      "grad_norm": 0.8681420087814331,
+      "learning_rate": 0.00010648685894280934,
+      "loss": 0.7345,
+      "step": 19500
+    },
+    {
+      "epoch": 1.429363183898197,
+      "grad_norm": 0.8721694946289062,
+      "learning_rate": 0.00010599468451619255,
+      "loss": 0.7246,
+      "step": 19600
+    },
+    {
+      "epoch": 1.4366556671710633,
+      "grad_norm": 0.8880037665367126,
+      "learning_rate": 0.00010550251008957574,
+      "loss": 0.7321,
+      "step": 19700
+    },
+    {
+      "epoch": 1.44394815044393,
+      "grad_norm": 0.8522552251815796,
+      "learning_rate": 0.00010501033566295895,
+      "loss": 0.734,
+      "step": 19800
+    },
+    {
+      "epoch": 1.4512406337167965,
+      "grad_norm": 0.8816943168640137,
+      "learning_rate": 0.00010451816123634217,
+      "loss": 0.7333,
+      "step": 19900
+    },
+    {
+      "epoch": 1.458533116989663,
+      "grad_norm": 0.8068501949310303,
+      "learning_rate": 0.00010402598680972537,
+      "loss": 0.7267,
+      "step": 20000
+    },
+    {
+      "epoch": 1.458533116989663,
+      "eval_loss": 0.731645405292511,
+      "eval_runtime": 61.0998,
+      "eval_samples_per_second": 146.563,
+      "eval_steps_per_second": 18.331,
+      "step": 20000
+    },
+    {
+      "epoch": 1.4658256002625294,
+      "grad_norm": 0.8473337888717651,
+      "learning_rate": 0.00010353381238310858,
+      "loss": 0.7328,
+      "step": 20100
+    },
+    {
+      "epoch": 1.4731180835353959,
+      "grad_norm": 0.9009122252464294,
+      "learning_rate": 0.00010304163795649177,
+      "loss": 0.733,
+      "step": 20200
+    },
+    {
+      "epoch": 1.4804105668082623,
+      "grad_norm": 0.8225035667419434,
+      "learning_rate": 0.000102549463529875,
+      "loss": 0.7311,
+      "step": 20300
+    },
+    {
+      "epoch": 1.4877030500811288,
+      "grad_norm": 0.8552617430686951,
+      "learning_rate": 0.00010205728910325821,
+      "loss": 0.7282,
+      "step": 20400
+    },
+    {
+      "epoch": 1.4949955333539955,
+      "grad_norm": 0.8690235614776611,
+      "learning_rate": 0.0001015651146766414,
+      "loss": 0.7329,
+      "step": 20500
+    },
+    {
+      "epoch": 1.5022880166268617,
+      "grad_norm": 0.8566781878471375,
+      "learning_rate": 0.0001010729402500246,
+      "loss": 0.7358,
+      "step": 20600
+    },
+    {
+      "epoch": 1.5095804998997284,
+      "grad_norm": 0.9174933433532715,
+      "learning_rate": 0.00010058076582340782,
+      "loss": 0.7266,
+      "step": 20700
+    },
+    {
+      "epoch": 1.516872983172595,
+      "grad_norm": 0.9414506554603577,
+      "learning_rate": 0.00010008859139679103,
+      "loss": 0.7321,
+      "step": 20800
+    },
+    {
+      "epoch": 1.5241654664454614,
+      "grad_norm": 0.9433586001396179,
+      "learning_rate": 9.959641697017424e-05,
+      "loss": 0.7355,
+      "step": 20900
+    },
+    {
+      "epoch": 1.5314579497183278,
+      "grad_norm": 0.8544315695762634,
+      "learning_rate": 9.910424254355744e-05,
+      "loss": 0.7313,
+      "step": 21000
+    },
+    {
+      "epoch": 1.5314579497183278,
+      "eval_loss": 0.7285299301147461,
+      "eval_runtime": 60.6886,
+      "eval_samples_per_second": 147.557,
+      "eval_steps_per_second": 18.455,
+      "step": 21000
+    },
+    {
+      "epoch": 1.5387504329911943,
+      "grad_norm": 0.893223762512207,
+      "learning_rate": 9.861206811694065e-05,
+      "loss": 0.7329,
+      "step": 21100
+    },
+    {
+      "epoch": 1.546042916264061,
+      "grad_norm": 0.8868634104728699,
+      "learning_rate": 9.811989369032387e-05,
+      "loss": 0.7276,
+      "step": 21200
+    },
+    {
+      "epoch": 1.5533353995369272,
+      "grad_norm": 0.8362566232681274,
+      "learning_rate": 9.762771926370706e-05,
+      "loss": 0.723,
+      "step": 21300
+    },
+    {
+      "epoch": 1.560627882809794,
+      "grad_norm": 0.8852083086967468,
+      "learning_rate": 9.713554483709026e-05,
+      "loss": 0.7281,
+      "step": 21400
+    },
+    {
+      "epoch": 1.5679203660826602,
+      "grad_norm": 0.8901813626289368,
+      "learning_rate": 9.664337041047348e-05,
+      "loss": 0.7307,
+      "step": 21500
+    },
+    {
+      "epoch": 1.5752128493555269,
+      "grad_norm": 0.8210172057151794,
+      "learning_rate": 9.615119598385667e-05,
+      "loss": 0.7245,
+      "step": 21600
+    },
+    {
+      "epoch": 1.5825053326283933,
+      "grad_norm": 0.8676414489746094,
+      "learning_rate": 9.56590215572399e-05,
+      "loss": 0.7294,
+      "step": 21700
+    },
+    {
+      "epoch": 1.5897978159012598,
+      "grad_norm": 0.8923740983009338,
+      "learning_rate": 9.51668471306231e-05,
+      "loss": 0.7242,
+      "step": 21800
+    },
+    {
+      "epoch": 1.5970902991741263,
+      "grad_norm": 0.8402920365333557,
+      "learning_rate": 9.46746727040063e-05,
+      "loss": 0.7258,
+      "step": 21900
+    },
+    {
+      "epoch": 1.6043827824469927,
+      "grad_norm": 0.8525983691215515,
+      "learning_rate": 9.418249827738951e-05,
+      "loss": 0.7294,
+      "step": 22000
+    },
+    {
+      "epoch": 1.6043827824469927,
+      "eval_loss": 0.7267495393753052,
+      "eval_runtime": 61.1086,
+      "eval_samples_per_second": 146.542,
+      "eval_steps_per_second": 18.328,
+      "step": 22000
+    },
+    {
+      "epoch": 1.6116752657198594,
+      "grad_norm": 0.8605002164840698,
+      "learning_rate": 9.369032385077272e-05,
+      "loss": 0.7259,
+      "step": 22100
+    },
+    {
+      "epoch": 1.6189677489927257,
+      "grad_norm": 0.8606895208358765,
+      "learning_rate": 9.319814942415592e-05,
+      "loss": 0.7275,
+      "step": 22200
+    },
+    {
+      "epoch": 1.6262602322655924,
+      "grad_norm": 0.8824227452278137,
+      "learning_rate": 9.270597499753914e-05,
+      "loss": 0.7245,
+      "step": 22300
+    },
+    {
+      "epoch": 1.6335527155384586,
+      "grad_norm": 0.8670118451118469,
+      "learning_rate": 9.221380057092233e-05,
+      "loss": 0.719,
+      "step": 22400
+    },
+    {
+      "epoch": 1.6408451988113253,
+      "grad_norm": 0.92063307762146,
+      "learning_rate": 9.172162614430555e-05,
+      "loss": 0.7293,
+      "step": 22500
+    },
+    {
+      "epoch": 1.6481376820841918,
+      "grad_norm": 0.8425260782241821,
+      "learning_rate": 9.122945171768876e-05,
+      "loss": 0.728,
+      "step": 22600
+    },
+    {
+      "epoch": 1.6554301653570582,
+      "grad_norm": 0.9162302017211914,
+      "learning_rate": 9.073727729107196e-05,
+      "loss": 0.7265,
+      "step": 22700
+    },
+    {
+      "epoch": 1.6627226486299247,
+      "grad_norm": 0.8905067443847656,
+      "learning_rate": 9.024510286445517e-05,
+      "loss": 0.7256,
+      "step": 22800
+    },
+    {
+      "epoch": 1.6700151319027912,
+      "grad_norm": 0.874357283115387,
+      "learning_rate": 8.975292843783837e-05,
+      "loss": 0.7249,
+      "step": 22900
+    },
+    {
+      "epoch": 1.6773076151756579,
+      "grad_norm": 0.842005729675293,
+      "learning_rate": 8.926075401122158e-05,
+      "loss": 0.7268,
+      "step": 23000
+    },
+    {
+      "epoch": 1.6773076151756579,
+      "eval_loss": 0.7241798639297485,
+      "eval_runtime": 60.7958,
+      "eval_samples_per_second": 147.296,
+      "eval_steps_per_second": 18.422,
+      "step": 23000
+    },
+    {
+      "epoch": 1.684600098448524,
+      "grad_norm": 0.8695193529129028,
+      "learning_rate": 8.876857958460478e-05,
+      "loss": 0.7262,
+      "step": 23100
+    },
+    {
+      "epoch": 1.6918925817213908,
+      "grad_norm": 0.8673058748245239,
+      "learning_rate": 8.827640515798799e-05,
+      "loss": 0.7303,
+      "step": 23200
+    },
+    {
+      "epoch": 1.699185064994257,
+      "grad_norm": 0.9276596307754517,
+      "learning_rate": 8.77842307313712e-05,
+      "loss": 0.729,
+      "step": 23300
+    },
+    {
+      "epoch": 1.7064775482671237,
+      "grad_norm": 0.8023722171783447,
+      "learning_rate": 8.729205630475441e-05,
+      "loss": 0.7212,
+      "step": 23400
+    },
+    {
+      "epoch": 1.7137700315399902,
+      "grad_norm": 0.910897433757782,
+      "learning_rate": 8.67998818781376e-05,
+      "loss": 0.7252,
+      "step": 23500
+    },
+    {
+      "epoch": 1.7210625148128567,
+      "grad_norm": 0.8714926838874817,
+      "learning_rate": 8.630770745152083e-05,
+      "loss": 0.7306,
+      "step": 23600
+    },
+    {
+      "epoch": 1.7283549980857231,
+      "grad_norm": 0.8875166773796082,
+      "learning_rate": 8.581553302490403e-05,
+      "loss": 0.7235,
+      "step": 23700
+    },
+    {
+      "epoch": 1.7356474813585896,
+      "grad_norm": 0.9132345914840698,
+      "learning_rate": 8.532335859828724e-05,
+      "loss": 0.7331,
+      "step": 23800
+    },
+    {
+      "epoch": 1.7429399646314563,
+      "grad_norm": 0.8562710285186768,
+      "learning_rate": 8.483118417167044e-05,
+      "loss": 0.7282,
+      "step": 23900
+    },
+    {
+      "epoch": 1.7502324479043225,
+      "grad_norm": 0.867508590221405,
+      "learning_rate": 8.433900974505365e-05,
+      "loss": 0.7256,
+      "step": 24000
+    },
+    {
+      "epoch": 1.7502324479043225,
+      "eval_loss": 0.7232645153999329,
+      "eval_runtime": 60.377,
+      "eval_samples_per_second": 148.318,
+      "eval_steps_per_second": 18.55,
+      "step": 24000
+    },
+    {
+      "epoch": 1.7575249311771892,
+      "grad_norm": 0.8258200287818909,
+      "learning_rate": 8.384683531843685e-05,
+      "loss": 0.7254,
+      "step": 24100
+    },
+    {
+      "epoch": 1.7648174144500555,
+      "grad_norm": 0.9109018445014954,
+      "learning_rate": 8.335466089182007e-05,
+      "loss": 0.7315,
+      "step": 24200
+    },
+    {
+      "epoch": 1.7721098977229222,
+      "grad_norm": 0.8500842452049255,
+      "learning_rate": 8.286248646520326e-05,
+      "loss": 0.7265,
+      "step": 24300
+    },
+    {
+      "epoch": 1.7794023809957886,
+      "grad_norm": 0.9286713600158691,
+      "learning_rate": 8.237031203858648e-05,
+      "loss": 0.7247,
+      "step": 24400
+    },
+    {
+      "epoch": 1.786694864268655,
+      "grad_norm": 0.8746926188468933,
+      "learning_rate": 8.187813761196969e-05,
+      "loss": 0.7261,
+      "step": 24500
+    },
+    {
+      "epoch": 1.7939873475415216,
+      "grad_norm": 0.8702288866043091,
+      "learning_rate": 8.13859631853529e-05,
+      "loss": 0.7207,
+      "step": 24600
+    },
+    {
+      "epoch": 1.801279830814388,
+      "grad_norm": 0.9746344089508057,
+      "learning_rate": 8.08937887587361e-05,
+      "loss": 0.728,
+      "step": 24700
+    },
+    {
+      "epoch": 1.8085723140872547,
+      "grad_norm": 0.8815904259681702,
+      "learning_rate": 8.04016143321193e-05,
+      "loss": 0.7174,
+      "step": 24800
+    },
+    {
+      "epoch": 1.815864797360121,
+      "grad_norm": 0.870474100112915,
+      "learning_rate": 7.990943990550251e-05,
+      "loss": 0.7316,
+      "step": 24900
+    },
+    {
+      "epoch": 1.8231572806329877,
+      "grad_norm": 0.8451401591300964,
+      "learning_rate": 7.941726547888572e-05,
+      "loss": 0.7202,
+      "step": 25000
+    },
+    {
+      "epoch": 1.8231572806329877,
+      "eval_loss": 0.721147358417511,
+      "eval_runtime": 60.8906,
+      "eval_samples_per_second": 147.067,
+      "eval_steps_per_second": 18.394,
+      "step": 25000
+    },
+    {
+      "epoch": 1.830449763905854,
+      "grad_norm": 0.8878180980682373,
+      "learning_rate": 7.892509105226894e-05,
+      "loss": 0.7236,
+      "step": 25100
+    },
+    {
+      "epoch": 1.8377422471787206,
+      "grad_norm": 0.859920859336853,
+      "learning_rate": 7.843291662565213e-05,
+      "loss": 0.7257,
+      "step": 25200
+    },
+    {
+      "epoch": 1.845034730451587,
+      "grad_norm": 0.9358228445053101,
+      "learning_rate": 7.794074219903535e-05,
+      "loss": 0.7175,
+      "step": 25300
+    },
+    {
+      "epoch": 1.8523272137244535,
+      "grad_norm": 0.858906626701355,
+      "learning_rate": 7.744856777241854e-05,
+      "loss": 0.7217,
+      "step": 25400
+    },
+    {
+      "epoch": 1.85961969699732,
+      "grad_norm": 0.9508287310600281,
+      "learning_rate": 7.695639334580176e-05,
+      "loss": 0.7211,
+      "step": 25500
+    },
+    {
+      "epoch": 1.8669121802701865,
+      "grad_norm": 0.9340062141418457,
+      "learning_rate": 7.646421891918496e-05,
+      "loss": 0.7254,
+      "step": 25600
+    },
+    {
+      "epoch": 1.8742046635430532,
+      "grad_norm": 0.9350687861442566,
+      "learning_rate": 7.597204449256817e-05,
+      "loss": 0.7247,
+      "step": 25700
+    },
+    {
+      "epoch": 1.8814971468159194,
+      "grad_norm": 0.9614841938018799,
+      "learning_rate": 7.547987006595137e-05,
+      "loss": 0.7283,
+      "step": 25800
+    },
+    {
+      "epoch": 1.888789630088786,
+      "grad_norm": 0.848640501499176,
+      "learning_rate": 7.49876956393346e-05,
+      "loss": 0.7221,
+      "step": 25900
+    },
+    {
+      "epoch": 1.8960821133616523,
+      "grad_norm": 0.8105534315109253,
+      "learning_rate": 7.449552121271779e-05,
+      "loss": 0.7205,
+      "step": 26000
+    },
+    {
+      "epoch": 1.8960821133616523,
+      "eval_loss": 0.7193262577056885,
+      "eval_runtime": 61.1614,
+      "eval_samples_per_second": 146.416,
+      "eval_steps_per_second": 18.312,
+      "step": 26000
+    },
+    {
+      "epoch": 1.903374596634519,
+      "grad_norm": 0.8522207736968994,
+      "learning_rate": 7.4003346786101e-05,
+      "loss": 0.7223,
+      "step": 26100
+    },
+    {
+      "epoch": 1.9106670799073855,
+      "grad_norm": 0.8983740210533142,
+      "learning_rate": 7.351117235948421e-05,
+      "loss": 0.7208,
+      "step": 26200
+    },
+    {
+      "epoch": 1.917959563180252,
+      "grad_norm": 0.8596473336219788,
+      "learning_rate": 7.301899793286742e-05,
+      "loss": 0.7184,
+      "step": 26300
+    },
+    {
+      "epoch": 1.9252520464531184,
+      "grad_norm": 0.9175098538398743,
+      "learning_rate": 7.252682350625062e-05,
+      "loss": 0.7213,
+      "step": 26400
+    },
+    {
+      "epoch": 1.932544529725985,
+      "grad_norm": 0.8626872897148132,
+      "learning_rate": 7.203464907963383e-05,
+      "loss": 0.7242,
+      "step": 26500
+    },
+    {
+      "epoch": 1.9398370129988516,
+      "grad_norm": 0.859780490398407,
+      "learning_rate": 7.154247465301703e-05,
+      "loss": 0.7197,
+      "step": 26600
+    },
+    {
+      "epoch": 1.9471294962717178,
+      "grad_norm": 0.8713703751564026,
+      "learning_rate": 7.105030022640024e-05,
+      "loss": 0.7231,
+      "step": 26700
+    },
+    {
+      "epoch": 1.9544219795445845,
+      "grad_norm": 0.8976535797119141,
+      "learning_rate": 7.055812579978344e-05,
+      "loss": 0.7233,
+      "step": 26800
+    },
+    {
+      "epoch": 1.9617144628174508,
+      "grad_norm": 0.9257802367210388,
+      "learning_rate": 7.006595137316665e-05,
+      "loss": 0.7221,
+      "step": 26900
+    },
+    {
+      "epoch": 1.9690069460903175,
+      "grad_norm": 0.8592785596847534,
+      "learning_rate": 6.957377694654987e-05,
+      "loss": 0.7168,
+      "step": 27000
+    },
+    {
+      "epoch": 1.9690069460903175,
+      "eval_loss": 0.7180259227752686,
+      "eval_runtime": 60.5352,
+      "eval_samples_per_second": 147.931,
+      "eval_steps_per_second": 18.502,
+      "step": 27000
+    },
+    {
+      "epoch": 1.976299429363184,
+      "grad_norm": 0.8931472897529602,
+      "learning_rate": 6.908160251993306e-05,
+      "loss": 0.7204,
+      "step": 27100
+    },
+    {
+      "epoch": 1.9835919126360504,
+      "grad_norm": 0.8821597695350647,
+      "learning_rate": 6.858942809331628e-05,
+      "loss": 0.7163,
+      "step": 27200
+    },
+    {
+      "epoch": 1.9908843959089169,
+      "grad_norm": 0.8749621510505676,
+      "learning_rate": 6.809725366669948e-05,
+      "loss": 0.711,
+      "step": 27300
+    },
+    {
+      "epoch": 1.9981768791817833,
+      "grad_norm": 0.903332531452179,
+      "learning_rate": 6.760507924008269e-05,
+      "loss": 0.7176,
+      "step": 27400
+    },
+    {
+      "epoch": 2.005505824871014,
+      "grad_norm": 0.854773759841919,
+      "learning_rate": 6.71129048134659e-05,
+      "loss": 0.7187,
+      "step": 27500
+    },
+    {
+      "epoch": 2.0127983081438807,
+      "grad_norm": 0.9489893913269043,
+      "learning_rate": 6.66207303868491e-05,
+      "loss": 0.7096,
+      "step": 27600
+    },
+    {
+      "epoch": 2.020090791416747,
+      "grad_norm": 0.8944621682167053,
+      "learning_rate": 6.61285559602323e-05,
+      "loss": 0.7104,
+      "step": 27700
+    },
+    {
+      "epoch": 2.0273832746896137,
+      "grad_norm": 0.8567011952400208,
+      "learning_rate": 6.563638153361553e-05,
+      "loss": 0.7124,
+      "step": 27800
+    },
+    {
+      "epoch": 2.0346757579624803,
+      "grad_norm": 0.8737155199050903,
+      "learning_rate": 6.514420710699872e-05,
+      "loss": 0.7127,
+      "step": 27900
+    },
+    {
+      "epoch": 2.0419682412353466,
+      "grad_norm": 0.8935887813568115,
+      "learning_rate": 6.465203268038194e-05,
+      "loss": 0.7122,
+      "step": 28000
+    },
+    {
+      "epoch": 2.0419682412353466,
+      "eval_loss": 0.716705858707428,
+      "eval_runtime": 60.7739,
+      "eval_samples_per_second": 147.349,
+      "eval_steps_per_second": 18.429,
+      "step": 28000
+    },
+    {
+      "epoch": 2.0492607245082133,
+      "grad_norm": 0.9452987313270569,
+      "learning_rate": 6.415985825376514e-05,
+      "loss": 0.7112,
+      "step": 28100
+    },
+    {
+      "epoch": 2.0565532077810795,
+      "grad_norm": 0.8650675415992737,
+      "learning_rate": 6.366768382714833e-05,
+      "loss": 0.7079,
+      "step": 28200
+    },
+    {
+      "epoch": 2.063845691053946,
+      "grad_norm": 0.8913034796714783,
+      "learning_rate": 6.317550940053155e-05,
+      "loss": 0.713,
+      "step": 28300
+    },
+    {
+      "epoch": 2.0711381743268125,
+      "grad_norm": 0.9072710275650024,
+      "learning_rate": 6.268333497391476e-05,
+      "loss": 0.7094,
+      "step": 28400
+    },
+    {
+      "epoch": 2.078430657599679,
+      "grad_norm": 0.854245126247406,
+      "learning_rate": 6.219116054729796e-05,
+      "loss": 0.7077,
+      "step": 28500
+    },
+    {
+      "epoch": 2.0857231408725454,
+      "grad_norm": 0.929263174533844,
+      "learning_rate": 6.169898612068117e-05,
+      "loss": 0.7086,
+      "step": 28600
+    },
+    {
+      "epoch": 2.093015624145412,
+      "grad_norm": 0.9356215596199036,
+      "learning_rate": 6.120681169406438e-05,
+      "loss": 0.7157,
+      "step": 28700
+    },
+    {
+      "epoch": 2.100308107418279,
+      "grad_norm": 0.9242870211601257,
+      "learning_rate": 6.071463726744758e-05,
+      "loss": 0.71,
+      "step": 28800
+    },
+    {
+      "epoch": 2.107600590691145,
+      "grad_norm": 0.9065095782279968,
+      "learning_rate": 6.022246284083079e-05,
+      "loss": 0.7095,
+      "step": 28900
+    },
+    {
+      "epoch": 2.1148930739640117,
+      "grad_norm": 0.9081276059150696,
+      "learning_rate": 5.9730288414214e-05,
+      "loss": 0.7096,
+      "step": 29000
+    },
+    {
+      "epoch": 2.1148930739640117,
+      "eval_loss": 0.7152244448661804,
+      "eval_runtime": 60.7986,
+      "eval_samples_per_second": 147.29,
+      "eval_steps_per_second": 18.421,
+      "step": 29000
+    },
+    {
+      "epoch": 2.122185557236878,
+      "grad_norm": 0.8326215744018555,
+      "learning_rate": 5.923811398759721e-05,
+      "loss": 0.7147,
+      "step": 29100
+    },
+    {
+      "epoch": 2.1294780405097447,
+      "grad_norm": 0.9274723529815674,
+      "learning_rate": 5.874593956098041e-05,
+      "loss": 0.7111,
+      "step": 29200
+    },
+    {
+      "epoch": 2.136770523782611,
+      "grad_norm": 0.8282331824302673,
+      "learning_rate": 5.825376513436362e-05,
+      "loss": 0.7137,
+      "step": 29300
+    },
+    {
+      "epoch": 2.1440630070554776,
+      "grad_norm": 0.9081612229347229,
+      "learning_rate": 5.776159070774683e-05,
+      "loss": 0.7115,
+      "step": 29400
+    },
+    {
+      "epoch": 2.151355490328344,
+      "grad_norm": 0.9531508684158325,
+      "learning_rate": 5.726941628113004e-05,
+      "loss": 0.708,
+      "step": 29500
+    },
+    {
+      "epoch": 2.1586479736012105,
+      "grad_norm": 0.9125275611877441,
+      "learning_rate": 5.677724185451324e-05,
+      "loss": 0.7123,
+      "step": 29600
+    },
+    {
+      "epoch": 2.165940456874077,
+      "grad_norm": 0.9363859295845032,
+      "learning_rate": 5.628506742789645e-05,
+      "loss": 0.7146,
+      "step": 29700
+    },
+    {
+      "epoch": 2.1732329401469435,
+      "grad_norm": 0.9164854884147644,
+      "learning_rate": 5.579289300127966e-05,
+      "loss": 0.7121,
+      "step": 29800
+    },
+    {
+      "epoch": 2.18052542341981,
+      "grad_norm": 0.941330075263977,
+      "learning_rate": 5.530071857466287e-05,
+      "loss": 0.7086,
+      "step": 29900
+    },
+    {
+      "epoch": 2.1878179066926764,
+      "grad_norm": 0.9006567597389221,
+      "learning_rate": 5.480854414804607e-05,
+      "loss": 0.7097,
+      "step": 30000
+    },
+    {
+      "epoch": 2.1878179066926764,
+      "eval_loss": 0.7143043875694275,
+      "eval_runtime": 61.0555,
+      "eval_samples_per_second": 146.67,
+      "eval_steps_per_second": 18.344,
+      "step": 30000
+    },
+    {
+      "epoch": 2.195110389965543,
+      "grad_norm": 0.8913944363594055,
+      "learning_rate": 5.431636972142927e-05,
+      "loss": 0.7066,
+      "step": 30100
+    },
+    {
+      "epoch": 2.2024028732384093,
+      "grad_norm": 0.9200546145439148,
+      "learning_rate": 5.3824195294812486e-05,
+      "loss": 0.7076,
+      "step": 30200
+    },
+    {
+      "epoch": 2.209695356511276,
+      "grad_norm": 0.924148440361023,
+      "learning_rate": 5.3332020868195684e-05,
+      "loss": 0.7058,
+      "step": 30300
+    },
+    {
+      "epoch": 2.2169878397841423,
+      "grad_norm": 0.922255277633667,
+      "learning_rate": 5.2839846441578897e-05,
+      "loss": 0.7108,
+      "step": 30400
+    },
+    {
+      "epoch": 2.224280323057009,
+      "grad_norm": 0.9039818644523621,
+      "learning_rate": 5.23476720149621e-05,
+      "loss": 0.7091,
+      "step": 30500
+    },
+    {
+      "epoch": 2.2315728063298756,
+      "grad_norm": 0.963845431804657,
+      "learning_rate": 5.1855497588345314e-05,
+      "loss": 0.7065,
+      "step": 30600
+    },
+    {
+      "epoch": 2.238865289602742,
+      "grad_norm": 0.8838880658149719,
+      "learning_rate": 5.136332316172851e-05,
+      "loss": 0.7113,
+      "step": 30700
+    },
+    {
+      "epoch": 2.2461577728756086,
+      "grad_norm": 0.9642555117607117,
+      "learning_rate": 5.0871148735111725e-05,
+      "loss": 0.7062,
+      "step": 30800
+    },
+    {
+      "epoch": 2.253450256148475,
+      "grad_norm": 0.9088276624679565,
+      "learning_rate": 5.037897430849493e-05,
+      "loss": 0.7071,
+      "step": 30900
+    },
+    {
+      "epoch": 2.2607427394213415,
+      "grad_norm": 0.9083282351493835,
+      "learning_rate": 4.9886799881878137e-05,
+      "loss": 0.7126,
+      "step": 31000
+    },
+    {
+      "epoch": 2.2607427394213415,
+      "eval_loss": 0.7129958868026733,
+      "eval_runtime": 60.7821,
+      "eval_samples_per_second": 147.33,
+      "eval_steps_per_second": 18.426,
+      "step": 31000
+    },
+    {
+      "epoch": 2.2680352226942078,
+      "grad_norm": 0.886710524559021,
+      "learning_rate": 4.939462545526134e-05,
+      "loss": 0.7043,
+      "step": 31100
+    },
+    {
+      "epoch": 2.2753277059670745,
+      "grad_norm": 0.8600069880485535,
+      "learning_rate": 4.8902451028644554e-05,
+      "loss": 0.7074,
+      "step": 31200
+    },
+    {
+      "epoch": 2.2826201892399407,
+      "grad_norm": 0.8897703289985657,
+      "learning_rate": 4.841027660202776e-05,
+      "loss": 0.7068,
+      "step": 31300
+    },
+    {
+      "epoch": 2.2899126725128074,
+      "grad_norm": 0.8638718724250793,
+      "learning_rate": 4.7918102175410965e-05,
+      "loss": 0.7062,
+      "step": 31400
+    },
+    {
+      "epoch": 2.297205155785674,
+      "grad_norm": 0.8973529934883118,
+      "learning_rate": 4.742592774879418e-05,
+      "loss": 0.7073,
+      "step": 31500
+    },
+    {
+      "epoch": 2.3044976390585403,
+      "grad_norm": 0.9759765267372131,
+      "learning_rate": 4.693375332217738e-05,
+      "loss": 0.7087,
+      "step": 31600
+    },
+    {
+      "epoch": 2.311790122331407,
+      "grad_norm": 0.9061428904533386,
+      "learning_rate": 4.644157889556059e-05,
+      "loss": 0.708,
+      "step": 31700
+    },
+    {
+      "epoch": 2.3190826056042733,
+      "grad_norm": 0.8808257579803467,
+      "learning_rate": 4.5949404468943794e-05,
+      "loss": 0.7086,
+      "step": 31800
+    },
+    {
+      "epoch": 2.32637508887714,
+      "grad_norm": 0.9116071462631226,
+      "learning_rate": 4.545723004232701e-05,
+      "loss": 0.7118,
+      "step": 31900
+    },
+    {
+      "epoch": 2.333667572150006,
+      "grad_norm": 0.9131873846054077,
+      "learning_rate": 4.496505561571021e-05,
+      "loss": 0.7043,
+      "step": 32000
+    },
+    {
+      "epoch": 2.333667572150006,
+      "eval_loss": 0.7112506031990051,
+      "eval_runtime": 61.1535,
+      "eval_samples_per_second": 146.435,
+      "eval_steps_per_second": 18.315,
+      "step": 32000
+    },
+    {
+      "epoch": 2.340960055422873,
+      "grad_norm": 0.9860331416130066,
+      "learning_rate": 4.447288118909342e-05,
+      "loss": 0.7063,
+      "step": 32100
+    },
+    {
+      "epoch": 2.348252538695739,
+      "grad_norm": 0.933958888053894,
+      "learning_rate": 4.398070676247662e-05,
+      "loss": 0.708,
+      "step": 32200
+    },
+    {
+      "epoch": 2.355545021968606,
+      "grad_norm": 0.8994225859642029,
+      "learning_rate": 4.3488532335859836e-05,
+      "loss": 0.7089,
+      "step": 32300
+    },
+    {
+      "epoch": 2.3628375052414725,
+      "grad_norm": 0.9435915946960449,
+      "learning_rate": 4.299635790924304e-05,
+      "loss": 0.7057,
+      "step": 32400
+    },
+    {
+      "epoch": 2.3701299885143388,
+      "grad_norm": 0.888438880443573,
+      "learning_rate": 4.2504183482626247e-05,
+      "loss": 0.7012,
+      "step": 32500
+    },
+    {
+      "epoch": 2.3774224717872054,
+      "grad_norm": 0.8772885799407959,
+      "learning_rate": 4.201200905600945e-05,
+      "loss": 0.7071,
+      "step": 32600
+    },
+    {
+      "epoch": 2.3847149550600717,
+      "grad_norm": 0.9333481788635254,
+      "learning_rate": 4.151983462939266e-05,
+      "loss": 0.7095,
+      "step": 32700
+    },
+    {
+      "epoch": 2.3920074383329384,
+      "grad_norm": 0.9497707486152649,
+      "learning_rate": 4.102766020277586e-05,
+      "loss": 0.7115,
+      "step": 32800
+    },
+    {
+      "epoch": 2.3992999216058046,
+      "grad_norm": 0.9641472697257996,
+      "learning_rate": 4.053548577615907e-05,
+      "loss": 0.712,
+      "step": 32900
+    },
+    {
+      "epoch": 2.4065924048786713,
+      "grad_norm": 0.8958153128623962,
+      "learning_rate": 4.004331134954228e-05,
+      "loss": 0.7035,
+      "step": 33000
+    },
+    {
+      "epoch": 2.4065924048786713,
+      "eval_loss": 0.7100856304168701,
+      "eval_runtime": 61.2325,
+      "eval_samples_per_second": 146.246,
+      "eval_steps_per_second": 18.291,
+      "step": 33000
+    },
+    {
+      "epoch": 2.4138848881515376,
+      "grad_norm": 0.8818393349647522,
+      "learning_rate": 3.9551136922925487e-05,
+      "loss": 0.7052,
+      "step": 33100
+    },
+    {
+      "epoch": 2.4211773714244043,
+      "grad_norm": 0.8973012566566467,
+      "learning_rate": 3.905896249630869e-05,
+      "loss": 0.706,
+      "step": 33200
+    },
+    {
+      "epoch": 2.428469854697271,
+      "grad_norm": 0.8582873344421387,
+      "learning_rate": 3.85667880696919e-05,
+      "loss": 0.7088,
+      "step": 33300
+    },
+    {
+      "epoch": 2.435762337970137,
+      "grad_norm": 0.9306252002716064,
+      "learning_rate": 3.807461364307511e-05,
+      "loss": 0.7062,
+      "step": 33400
+    },
+    {
+      "epoch": 2.443054821243004,
+      "grad_norm": 0.8586992025375366,
+      "learning_rate": 3.7582439216458315e-05,
+      "loss": 0.7086,
+      "step": 33500
+    },
+    {
+      "epoch": 2.45034730451587,
+      "grad_norm": 0.9076369404792786,
+      "learning_rate": 3.709026478984152e-05,
+      "loss": 0.7052,
+      "step": 33600
+    },
+    {
+      "epoch": 2.457639787788737,
+      "grad_norm": 0.8954334855079651,
+      "learning_rate": 3.6598090363224727e-05,
+      "loss": 0.7082,
+      "step": 33700
+    },
+    {
+      "epoch": 2.464932271061603,
+      "grad_norm": 0.9315345287322998,
+      "learning_rate": 3.610591593660794e-05,
+      "loss": 0.7058,
+      "step": 33800
+    },
+    {
+      "epoch": 2.4722247543344698,
+      "grad_norm": 0.9223620295524597,
+      "learning_rate": 3.5613741509991144e-05,
+      "loss": 0.6992,
+      "step": 33900
+    },
+    {
+      "epoch": 2.479517237607336,
+      "grad_norm": 0.9349290132522583,
+      "learning_rate": 3.512156708337435e-05,
+      "loss": 0.7084,
+      "step": 34000
+    },
+    {
+      "epoch": 2.479517237607336,
+      "eval_loss": 0.7087690234184265,
+      "eval_runtime": 60.8859,
+      "eval_samples_per_second": 147.078,
+      "eval_steps_per_second": 18.395,
+      "step": 34000
+    },
+    {
+      "epoch": 2.4868097208802027,
+      "grad_norm": 0.883210301399231,
+      "learning_rate": 3.462939265675756e-05,
+      "loss": 0.7061,
+      "step": 34100
+    },
+    {
+      "epoch": 2.4941022041530694,
+      "grad_norm": 0.920868456363678,
+      "learning_rate": 3.413721823014077e-05,
+      "loss": 0.7069,
+      "step": 34200
+    },
+    {
+      "epoch": 2.5013946874259356,
+      "grad_norm": 0.9177393913269043,
+      "learning_rate": 3.3645043803523966e-05,
+      "loss": 0.7071,
+      "step": 34300
+    },
+    {
+      "epoch": 2.5086871706988023,
+      "grad_norm": 0.9114101529121399,
+      "learning_rate": 3.315286937690717e-05,
+      "loss": 0.7072,
+      "step": 34400
+    },
+    {
+      "epoch": 2.5159796539716686,
+      "grad_norm": 0.9645174145698547,
+      "learning_rate": 3.2660694950290384e-05,
+      "loss": 0.7028,
+      "step": 34500
+    },
+    {
+      "epoch": 2.5232721372445353,
+      "grad_norm": 0.8982295989990234,
+      "learning_rate": 3.216852052367359e-05,
+      "loss": 0.7085,
+      "step": 34600
+    },
+    {
+      "epoch": 2.530564620517402,
+      "grad_norm": 0.8964338898658752,
+      "learning_rate": 3.1676346097056795e-05,
+      "loss": 0.7069,
+      "step": 34700
+    },
+    {
+      "epoch": 2.537857103790268,
+      "grad_norm": 0.9609666466712952,
+      "learning_rate": 3.118417167044001e-05,
+      "loss": 0.7057,
+      "step": 34800
+    },
+    {
+      "epoch": 2.5451495870631344,
+      "grad_norm": 0.9131038188934326,
+      "learning_rate": 3.069199724382321e-05,
+      "loss": 0.7031,
+      "step": 34900
+    },
+    {
+      "epoch": 2.552442070336001,
+      "grad_norm": 0.9127321839332581,
+      "learning_rate": 3.019982281720642e-05,
+      "loss": 0.6979,
+      "step": 35000
+    },
+    {
+      "epoch": 2.552442070336001,
+      "eval_loss": 0.7076790928840637,
+      "eval_runtime": 61.0966,
+      "eval_samples_per_second": 146.571,
+      "eval_steps_per_second": 18.332,
+      "step": 35000
+    },
+    {
+      "epoch": 2.559734553608868,
+      "grad_norm": 0.9567495584487915,
+      "learning_rate": 2.9707648390589628e-05,
+      "loss": 0.7053,
+      "step": 35100
+    },
+    {
+      "epoch": 2.567027036881734,
+      "grad_norm": 0.9740573763847351,
+      "learning_rate": 2.9215473963972833e-05,
+      "loss": 0.7077,
+      "step": 35200
+    },
+    {
+      "epoch": 2.5743195201546007,
+      "grad_norm": 0.8982974886894226,
+      "learning_rate": 2.8723299537356042e-05,
+      "loss": 0.6983,
+      "step": 35300
+    },
+    {
+      "epoch": 2.581612003427467,
+      "grad_norm": 1.0185188055038452,
+      "learning_rate": 2.8231125110739248e-05,
+      "loss": 0.7069,
+      "step": 35400
+    },
+    {
+      "epoch": 2.5889044867003337,
+      "grad_norm": 0.94049471616745,
+      "learning_rate": 2.7738950684122457e-05,
+      "loss": 0.7054,
+      "step": 35500
+    },
+    {
+      "epoch": 2.5961969699732004,
+      "grad_norm": 0.8923749923706055,
+      "learning_rate": 2.7246776257505662e-05,
+      "loss": 0.7015,
+      "step": 35600
+    },
+    {
+      "epoch": 2.6034894532460666,
+      "grad_norm": 0.9568887948989868,
+      "learning_rate": 2.675460183088887e-05,
+      "loss": 0.7025,
+      "step": 35700
+    },
+    {
+      "epoch": 2.610781936518933,
+      "grad_norm": 0.9106321334838867,
+      "learning_rate": 2.6262427404272077e-05,
+      "loss": 0.7049,
+      "step": 35800
+    },
+    {
+      "epoch": 2.6180744197917996,
+      "grad_norm": 0.9499268531799316,
+      "learning_rate": 2.5770252977655285e-05,
+      "loss": 0.7021,
+      "step": 35900
+    },
+    {
+      "epoch": 2.6253669030646662,
+      "grad_norm": 0.8965421915054321,
+      "learning_rate": 2.5278078551038488e-05,
+      "loss": 0.7036,
+      "step": 36000
+    },
+    {
+      "epoch": 2.6253669030646662,
+      "eval_loss": 0.7065343856811523,
+      "eval_runtime": 61.0446,
+      "eval_samples_per_second": 146.696,
+      "eval_steps_per_second": 18.347,
+      "step": 36000
+    },
+    {
+      "epoch": 2.6326593863375325,
+      "grad_norm": 0.94576096534729,
+      "learning_rate": 2.4785904124421696e-05,
+      "loss": 0.71,
+      "step": 36100
+    },
+    {
+      "epoch": 2.639951869610399,
+      "grad_norm": 0.962692141532898,
+      "learning_rate": 2.4293729697804905e-05,
+      "loss": 0.6953,
+      "step": 36200
+    },
+    {
+      "epoch": 2.6472443528832654,
+      "grad_norm": 0.9457094669342041,
+      "learning_rate": 2.380155527118811e-05,
+      "loss": 0.7011,
+      "step": 36300
+    },
+    {
+      "epoch": 2.654536836156132,
+      "grad_norm": 0.9523045420646667,
+      "learning_rate": 2.330938084457132e-05,
+      "loss": 0.7093,
+      "step": 36400
+    },
+    {
+      "epoch": 2.661829319428999,
+      "grad_norm": 0.9255204796791077,
+      "learning_rate": 2.2817206417954522e-05,
+      "loss": 0.6979,
+      "step": 36500
+    },
+    {
+      "epoch": 2.669121802701865,
+      "grad_norm": 1.015286922454834,
+      "learning_rate": 2.232503199133773e-05,
+      "loss": 0.7044,
+      "step": 36600
+    },
+    {
+      "epoch": 2.6764142859747313,
+      "grad_norm": 0.8911315202713013,
+      "learning_rate": 2.1832857564720936e-05,
+      "loss": 0.7031,
+      "step": 36700
+    },
+    {
+      "epoch": 2.683706769247598,
+      "grad_norm": 0.9372689127922058,
+      "learning_rate": 2.1340683138104145e-05,
+      "loss": 0.7019,
+      "step": 36800
+    },
+    {
+      "epoch": 2.6909992525204647,
+      "grad_norm": 0.9245051145553589,
+      "learning_rate": 2.084850871148735e-05,
+      "loss": 0.7065,
+      "step": 36900
+    },
+    {
+      "epoch": 2.698291735793331,
+      "grad_norm": 0.917607843875885,
+      "learning_rate": 2.035633428487056e-05,
+      "loss": 0.7016,
+      "step": 37000
+    },
+    {
+      "epoch": 2.698291735793331,
+      "eval_loss": 0.7054994702339172,
+      "eval_runtime": 60.6541,
+      "eval_samples_per_second": 147.64,
+      "eval_steps_per_second": 18.465,
+      "step": 37000
+    },
+    {
+      "epoch": 2.7055842190661976,
+      "grad_norm": 0.9054610729217529,
+      "learning_rate": 1.9864159858253765e-05,
+      "loss": 0.7034,
+      "step": 37100
+    },
+    {
+      "epoch": 2.712876702339064,
+      "grad_norm": 0.960075855255127,
+      "learning_rate": 1.9371985431636974e-05,
+      "loss": 0.7097,
+      "step": 37200
+    },
+    {
+      "epoch": 2.7201691856119306,
+      "grad_norm": 0.9454420208930969,
+      "learning_rate": 1.887981100502018e-05,
+      "loss": 0.7046,
+      "step": 37300
+    },
+    {
+      "epoch": 2.7274616688847972,
+      "grad_norm": 0.8761453628540039,
+      "learning_rate": 1.8387636578403385e-05,
+      "loss": 0.7068,
+      "step": 37400
+    },
+    {
+      "epoch": 2.7347541521576635,
+      "grad_norm": 0.9231957793235779,
+      "learning_rate": 1.7895462151786594e-05,
+      "loss": 0.6983,
+      "step": 37500
+    },
+    {
+      "epoch": 2.7420466354305297,
+      "grad_norm": 0.8630309104919434,
+      "learning_rate": 1.74032877251698e-05,
+      "loss": 0.6984,
+      "step": 37600
+    },
+    {
+      "epoch": 2.7493391187033964,
+      "grad_norm": 0.9077728986740112,
+      "learning_rate": 1.691111329855301e-05,
+      "loss": 0.7097,
+      "step": 37700
+    },
+    {
+      "epoch": 2.756631601976263,
+      "grad_norm": 0.9849316477775574,
+      "learning_rate": 1.6418938871936214e-05,
+      "loss": 0.7025,
+      "step": 37800
+    },
+    {
+      "epoch": 2.7639240852491294,
+      "grad_norm": 0.9101927280426025,
+      "learning_rate": 1.5926764445319423e-05,
+      "loss": 0.7127,
+      "step": 37900
+    },
+    {
+      "epoch": 2.771216568521996,
+      "grad_norm": 0.9624613523483276,
+      "learning_rate": 1.543459001870263e-05,
+      "loss": 0.7038,
+      "step": 38000
+    },
+    {
+      "epoch": 2.771216568521996,
+      "eval_loss": 0.7042670845985413,
+      "eval_runtime": 60.6288,
+      "eval_samples_per_second": 147.702,
+      "eval_steps_per_second": 18.473,
+      "step": 38000
+    },
+    {
+      "epoch": 2.7785090517948623,
+      "grad_norm": 0.8926946520805359,
+      "learning_rate": 1.4942415592085838e-05,
+      "loss": 0.6955,
+      "step": 38100
+    },
+    {
+      "epoch": 2.785801535067729,
+      "grad_norm": 0.9353916645050049,
+      "learning_rate": 1.4450241165469041e-05,
+      "loss": 0.7003,
+      "step": 38200
+    },
+    {
+      "epoch": 2.7930940183405957,
+      "grad_norm": 0.9394625425338745,
+      "learning_rate": 1.3958066738852249e-05,
+      "loss": 0.6963,
+      "step": 38300
+    },
+    {
+      "epoch": 2.800386501613462,
+      "grad_norm": 0.8811284303665161,
+      "learning_rate": 1.3465892312235456e-05,
+      "loss": 0.7057,
+      "step": 38400
+    },
+    {
+      "epoch": 2.807678984886328,
+      "grad_norm": 0.9111167788505554,
+      "learning_rate": 1.2973717885618663e-05,
+      "loss": 0.6905,
+      "step": 38500
+    },
+    {
+      "epoch": 2.814971468159195,
+      "grad_norm": 0.9061198830604553,
+      "learning_rate": 1.248154345900187e-05,
+      "loss": 0.6966,
+      "step": 38600
+    },
+    {
+      "epoch": 2.8222639514320615,
+      "grad_norm": 0.917921781539917,
+      "learning_rate": 1.1989369032385078e-05,
+      "loss": 0.7055,
+      "step": 38700
+    },
+    {
+      "epoch": 2.829556434704928,
+      "grad_norm": 0.9210913777351379,
+      "learning_rate": 1.1497194605768285e-05,
+      "loss": 0.7004,
+      "step": 38800
+    },
+    {
+      "epoch": 2.8368489179777945,
+      "grad_norm": 0.9152899384498596,
+      "learning_rate": 1.1005020179151492e-05,
+      "loss": 0.7065,
+      "step": 38900
+    },
+    {
+      "epoch": 2.8441414012506607,
+      "grad_norm": 0.9237668514251709,
+      "learning_rate": 1.05128457525347e-05,
+      "loss": 0.7027,
+      "step": 39000
+    },
+    {
+      "epoch": 2.8441414012506607,
+      "eval_loss": 0.7034493088722229,
+      "eval_runtime": 60.6775,
+      "eval_samples_per_second": 147.583,
+      "eval_steps_per_second": 18.458,
+      "step": 39000
+    },
+    {
+      "epoch": 2.8514338845235274,
+      "grad_norm": 0.9577778577804565,
+      "learning_rate": 1.0020671325917906e-05,
+      "loss": 0.7064,
+      "step": 39100
+    },
+    {
+      "epoch": 2.858726367796394,
+      "grad_norm": 0.9955913424491882,
+      "learning_rate": 9.528496899301114e-06,
+      "loss": 0.7017,
+      "step": 39200
+    },
+    {
+      "epoch": 2.8660188510692604,
+      "grad_norm": 0.9187660217285156,
+      "learning_rate": 9.03632247268432e-06,
+      "loss": 0.6998,
+      "step": 39300
+    },
+    {
+      "epoch": 2.8733113343421266,
+      "grad_norm": 0.9275550842285156,
+      "learning_rate": 8.544148046067526e-06,
+      "loss": 0.7002,
+      "step": 39400
+    },
+    {
+      "epoch": 2.8806038176149933,
+      "grad_norm": 0.9114721417427063,
+      "learning_rate": 8.051973619450734e-06,
+      "loss": 0.7027,
+      "step": 39500
+    },
+    {
+      "epoch": 2.88789630088786,
+      "grad_norm": 0.9408327341079712,
+      "learning_rate": 7.559799192833941e-06,
+      "loss": 0.7034,
+      "step": 39600
+    },
+    {
+      "epoch": 2.8951887841607262,
+      "grad_norm": 0.9538366198539734,
+      "learning_rate": 7.067624766217147e-06,
+      "loss": 0.7007,
+      "step": 39700
+    },
+    {
+      "epoch": 2.902481267433593,
+      "grad_norm": 0.923864483833313,
+      "learning_rate": 6.5754503396003544e-06,
+      "loss": 0.6972,
+      "step": 39800
+    },
+    {
+      "epoch": 2.909773750706459,
+      "grad_norm": 0.9156636595726013,
+      "learning_rate": 6.083275912983562e-06,
+      "loss": 0.7064,
+      "step": 39900
+    },
+    {
+      "epoch": 2.917066233979326,
+      "grad_norm": 0.9568312168121338,
+      "learning_rate": 5.591101486366768e-06,
+      "loss": 0.6969,
+      "step": 40000
+    },
+    {
+      "epoch": 2.917066233979326,
+      "eval_loss": 0.7027888894081116,
+      "eval_runtime": 61.1155,
+      "eval_samples_per_second": 146.526,
+      "eval_steps_per_second": 18.326,
+      "step": 40000
+    },
+    {
+      "epoch": 2.9243587172521925,
+      "grad_norm": 0.9376012086868286,
+      "learning_rate": 5.098927059749975e-06,
+      "loss": 0.7,
+      "step": 40100
+    },
+    {
+      "epoch": 2.931651200525059,
+      "grad_norm": 0.9648913145065308,
+      "learning_rate": 4.6067526331331825e-06,
+      "loss": 0.7042,
+      "step": 40200
+    },
+    {
+      "epoch": 2.938943683797925,
+      "grad_norm": 0.9452090263366699,
+      "learning_rate": 4.11457820651639e-06,
+      "loss": 0.7041,
+      "step": 40300
+    },
+    {
+      "epoch": 2.9462361670707917,
+      "grad_norm": 0.9553784728050232,
+      "learning_rate": 3.622403779899597e-06,
+      "loss": 0.7005,
+      "step": 40400
+    },
+    {
+      "epoch": 2.9535286503436584,
+      "grad_norm": 0.8788447380065918,
+      "learning_rate": 3.1302293532828033e-06,
+      "loss": 0.6974,
+      "step": 40500
+    },
+    {
+      "epoch": 2.9608211336165247,
+      "grad_norm": 0.9146846532821655,
+      "learning_rate": 2.6380549266660105e-06,
+      "loss": 0.7004,
+      "step": 40600
+    },
+    {
+      "epoch": 2.9681136168893913,
+      "grad_norm": 0.9674293398857117,
+      "learning_rate": 2.1458805000492173e-06,
+      "loss": 0.7028,
+      "step": 40700
+    },
+    {
+      "epoch": 2.9754061001622576,
+      "grad_norm": 0.9374125599861145,
+      "learning_rate": 1.6537060734324243e-06,
+      "loss": 0.7008,
+      "step": 40800
+    },
+    {
+      "epoch": 2.9826985834351243,
+      "grad_norm": 0.9554013013839722,
+      "learning_rate": 1.1615316468156316e-06,
+      "loss": 0.7011,
+      "step": 40900
+    },
+    {
+      "epoch": 2.989991066707991,
+      "grad_norm": 0.8910831212997437,
+      "learning_rate": 6.693572201988385e-07,
+      "loss": 0.6992,
+      "step": 41000
+    },
+    {
+      "epoch": 2.989991066707991,
+      "eval_loss": 0.7023043632507324,
+      "eval_runtime": 61.2519,
+      "eval_samples_per_second": 146.2,
+      "eval_steps_per_second": 18.285,
+      "step": 41000
+    },
+    {
+      "epoch": 2.997283549980857,
+      "grad_norm": 0.9466680288314819,
+      "learning_rate": 1.771827935820455e-07,
+      "loss": 0.6961,
+      "step": 41100
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 41136,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.280129344536576e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-41136/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ffa18fa243cccfbf729510f7d83fcb184f78dfbd7718a3073ec148d996a46094
+size 5713

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:047cc1002ac6795c1352776b646cdcd785be6cba5fd35ccec8909d0672eae7e5
+size 11418541

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 32768,
+  "pad_token": "<|im_end|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ffa18fa243cccfbf729510f7d83fcb184f78dfbd7718a3073ec148d996a46094
+size 5713