Shion1124 committed on Mar 19

Commit

327abab

verified ·

1 Parent(s): 4c366f1

Upload VLM + LoRA + Agentic RAG model with comprehensive docs

Browse files

Files changed (22) hide show

adapter_config.json +37 -1
adapter_model.safetensors +3 -0
checkpoint-2950/README.md +207 -0
checkpoint-2950/adapter_config.json +40 -0
checkpoint-2950/adapter_model.safetensors +3 -0
checkpoint-2950/optimizer.pt +3 -0
checkpoint-2950/rng_state.pth +3 -0
checkpoint-2950/scheduler.pt +3 -0
checkpoint-2950/tokenizer.json +0 -0
checkpoint-2950/tokenizer_config.json +12 -0
checkpoint-2950/trainer_state.json +2099 -0
checkpoint-2950/training_args.bin +3 -0
checkpoint-3000/README.md +207 -0
checkpoint-3000/adapter_config.json +40 -0
checkpoint-3000/adapter_model.safetensors +3 -0
checkpoint-3000/optimizer.pt +3 -0
checkpoint-3000/rng_state.pth +3 -0
checkpoint-3000/scheduler.pt +3 -0
checkpoint-3000/tokenizer.json +0 -0
checkpoint-3000/tokenizer_config.json +12 -0
checkpoint-3000/trainer_state.json +2134 -0
checkpoint-3000/training_args.bin +3 -0

adapter_config.json CHANGED Viewed

@@ -1,4 +1,40 @@
 {
   "r": 64,
-  "lora_alpha": 128
 }

 {
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "gpt2",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": true,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
   "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "c_attn"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
 }

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:daf96ecad4fb2c407b82f85d56cef90e2882f3ad4e8696d095a21353f66fa596
+size 9440280

checkpoint-2950/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: gpt2
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:gpt2
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section covers using the model directly, without fine-tuning it or plugging it into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section covers using the model after it has been fine-tuned for a task, or when it is plugged into a larger ecosystem/app. -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed to make further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoint-2950/adapter_config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "gpt2",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": true,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "c_attn"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-2950/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02a4e2f8d367d441cbc3ab081924cfbca9ee955a0978a62e9950353910fa6e7c
+size 9440280

checkpoint-2950/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9afb55e0f724d77e019d9bece3035817d197dc3cf3748242dd2bde537db07957
+size 4824013

checkpoint-2950/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a6e9c393552c2c47dfa2bcd8bd21d1fff3350161c69c2330a5e43e7a0ac17e7
+size 14645

checkpoint-2950/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe1ba50f341015e6963ecf5d9ab547ed57a294d162504c90954d845f834dd865
+size 1465

checkpoint-2950/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2950/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "is_local": false,
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

checkpoint-2950/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2099 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9833333333333333,
+  "eval_steps": 500,
+  "global_step": 2950,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0033333333333333335,
+      "grad_norm": 6.809678077697754,
+      "learning_rate": 4.9850000000000006e-05,
+      "loss": 7.233219146728516,
+      "step": 10
+    },
+    {
+      "epoch": 0.006666666666666667,
+      "grad_norm": 8.315083503723145,
+      "learning_rate": 4.968333333333334e-05,
+      "loss": 5.7482860565185545,
+      "step": 20
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 6.454437255859375,
+      "learning_rate": 4.9516666666666666e-05,
+      "loss": 3.6689868927001954,
+      "step": 30
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 3.284022092819214,
+      "learning_rate": 4.935e-05,
+      "loss": 2.273424530029297,
+      "step": 40
+    },
+    {
+      "epoch": 0.016666666666666666,
+      "grad_norm": 0.9964243173599243,
+      "learning_rate": 4.9183333333333334e-05,
+      "loss": 1.6669052124023438,
+      "step": 50
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.6209345459938049,
+      "learning_rate": 4.901666666666667e-05,
+      "loss": 1.347894287109375,
+      "step": 60
+    },
+    {
+      "epoch": 0.023333333333333334,
+      "grad_norm": 0.7051726579666138,
+      "learning_rate": 4.885e-05,
+      "loss": 1.3426547050476074,
+      "step": 70
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.5769520998001099,
+      "learning_rate": 4.8683333333333335e-05,
+      "loss": 1.031618881225586,
+      "step": 80
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 1.3832005262374878,
+      "learning_rate": 4.851666666666667e-05,
+      "loss": 1.0876376152038574,
+      "step": 90
+    },
+    {
+      "epoch": 0.03333333333333333,
+      "grad_norm": 0.423240602016449,
+      "learning_rate": 4.835e-05,
+      "loss": 0.967049503326416,
+      "step": 100
+    },
+    {
+      "epoch": 0.03666666666666667,
+      "grad_norm": 1.0171490907669067,
+      "learning_rate": 4.818333333333334e-05,
+      "loss": 1.0151338577270508,
+      "step": 110
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.42430293560028076,
+      "learning_rate": 4.801666666666667e-05,
+      "loss": 1.0364399909973145,
+      "step": 120
+    },
+    {
+      "epoch": 0.043333333333333335,
+      "grad_norm": 1.113786220550537,
+      "learning_rate": 4.785e-05,
+      "loss": 1.2358755111694335,
+      "step": 130
+    },
+    {
+      "epoch": 0.04666666666666667,
+      "grad_norm": 0.4222952425479889,
+      "learning_rate": 4.768333333333334e-05,
+      "loss": 0.9435253143310547,
+      "step": 140
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.4197883903980255,
+      "learning_rate": 4.751666666666667e-05,
+      "loss": 0.9088167190551758,
+      "step": 150
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.4791605770587921,
+      "learning_rate": 4.735e-05,
+      "loss": 1.0109647750854491,
+      "step": 160
+    },
+    {
+      "epoch": 0.056666666666666664,
+      "grad_norm": 0.4866371154785156,
+      "learning_rate": 4.718333333333333e-05,
+      "loss": 1.0510098457336425,
+      "step": 170
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.369156152009964,
+      "learning_rate": 4.701666666666667e-05,
+      "loss": 0.9953393936157227,
+      "step": 180
+    },
+    {
+      "epoch": 0.06333333333333334,
+      "grad_norm": 0.8538883924484253,
+      "learning_rate": 4.685000000000001e-05,
+      "loss": 1.0944287300109863,
+      "step": 190
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.4323364496231079,
+      "learning_rate": 4.6683333333333334e-05,
+      "loss": 0.9442741394042968,
+      "step": 200
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5124598145484924,
+      "learning_rate": 4.651666666666667e-05,
+      "loss": 0.8857268333435059,
+      "step": 210
+    },
+    {
+      "epoch": 0.07333333333333333,
+      "grad_norm": 0.5820016264915466,
+      "learning_rate": 4.635e-05,
+      "loss": 1.1653939247131349,
+      "step": 220
+    },
+    {
+      "epoch": 0.07666666666666666,
+      "grad_norm": 0.3909580111503601,
+      "learning_rate": 4.6183333333333336e-05,
+      "loss": 1.0361078262329102,
+      "step": 230
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.9888875484466553,
+      "learning_rate": 4.601666666666667e-05,
+      "loss": 1.1712039947509765,
+      "step": 240
+    },
+    {
+      "epoch": 0.08333333333333333,
+      "grad_norm": 0.5135601758956909,
+      "learning_rate": 4.585e-05,
+      "loss": 0.9313525199890137,
+      "step": 250
+    },
+    {
+      "epoch": 0.08666666666666667,
+      "grad_norm": 0.5219751000404358,
+      "learning_rate": 4.568333333333333e-05,
+      "loss": 1.0194238662719726,
+      "step": 260
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5085497498512268,
+      "learning_rate": 4.551666666666667e-05,
+      "loss": 0.9100503921508789,
+      "step": 270
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.39039579033851624,
+      "learning_rate": 4.5350000000000005e-05,
+      "loss": 0.9138201713562012,
+      "step": 280
+    },
+    {
+      "epoch": 0.09666666666666666,
+      "grad_norm": 0.4246252179145813,
+      "learning_rate": 4.518333333333333e-05,
+      "loss": 1.0014129638671876,
+      "step": 290
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.46258535981178284,
+      "learning_rate": 4.5016666666666665e-05,
+      "loss": 1.10882568359375,
+      "step": 300
+    },
+    {
+      "epoch": 0.10333333333333333,
+      "grad_norm": 0.3984704613685608,
+      "learning_rate": 4.4850000000000006e-05,
+      "loss": 0.9269520759582519,
+      "step": 310
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.4456084966659546,
+      "learning_rate": 4.468333333333334e-05,
+      "loss": 0.9888761520385743,
+      "step": 320
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.8263048529624939,
+      "learning_rate": 4.451666666666667e-05,
+      "loss": 0.8790253639221192,
+      "step": 330
+    },
+    {
+      "epoch": 0.11333333333333333,
+      "grad_norm": 0.35195598006248474,
+      "learning_rate": 4.435e-05,
+      "loss": 0.8765983581542969,
+      "step": 340
+    },
+    {
+      "epoch": 0.11666666666666667,
+      "grad_norm": 0.447350412607193,
+      "learning_rate": 4.4183333333333334e-05,
+      "loss": 1.1207200050354005,
+      "step": 350
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.7923038601875305,
+      "learning_rate": 4.401666666666667e-05,
+      "loss": 0.8299055099487305,
+      "step": 360
+    },
+    {
+      "epoch": 0.12333333333333334,
+      "grad_norm": 0.3724622428417206,
+      "learning_rate": 4.385e-05,
+      "loss": 1.1368635177612305,
+      "step": 370
+    },
+    {
+      "epoch": 0.12666666666666668,
+      "grad_norm": 0.49659571051597595,
+      "learning_rate": 4.3683333333333336e-05,
+      "loss": 0.9668448448181153,
+      "step": 380
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.3801608681678772,
+      "learning_rate": 4.351666666666667e-05,
+      "loss": 1.0527292251586915,
+      "step": 390
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.450956791639328,
+      "learning_rate": 4.335e-05,
+      "loss": 0.9145011901855469,
+      "step": 400
+    },
+    {
+      "epoch": 0.13666666666666666,
+      "grad_norm": 0.5096069574356079,
+      "learning_rate": 4.318333333333334e-05,
+      "loss": 0.9489949226379395,
+      "step": 410
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5106140971183777,
+      "learning_rate": 4.3016666666666664e-05,
+      "loss": 0.9242402076721191,
+      "step": 420
+    },
+    {
+      "epoch": 0.14333333333333334,
+      "grad_norm": 0.45267972350120544,
+      "learning_rate": 4.285e-05,
+      "loss": 0.9822881698608399,
+      "step": 430
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.6330339312553406,
+      "learning_rate": 4.268333333333334e-05,
+      "loss": 0.9491618156433106,
+      "step": 440
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.39571714401245117,
+      "learning_rate": 4.251666666666667e-05,
+      "loss": 0.8421293258666992,
+      "step": 450
+    },
+    {
+      "epoch": 0.15333333333333332,
+      "grad_norm": 0.45525646209716797,
+      "learning_rate": 4.235e-05,
+      "loss": 0.8939360618591309,
+      "step": 460
+    },
+    {
+      "epoch": 0.15666666666666668,
+      "grad_norm": 0.4628102779388428,
+      "learning_rate": 4.218333333333333e-05,
+      "loss": 0.8926850318908691,
+      "step": 470
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.8717295527458191,
+      "learning_rate": 4.2016666666666674e-05,
+      "loss": 1.0624030113220215,
+      "step": 480
+    },
+    {
+      "epoch": 0.16333333333333333,
+      "grad_norm": 0.4680945873260498,
+      "learning_rate": 4.185e-05,
+      "loss": 1.0478023529052733,
+      "step": 490
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 0.6128882169723511,
+      "learning_rate": 4.1683333333333335e-05,
+      "loss": 0.8906542778015136,
+      "step": 500
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.4829300343990326,
+      "learning_rate": 4.151666666666667e-05,
+      "loss": 0.9830364227294922,
+      "step": 510
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.38999485969543457,
+      "learning_rate": 4.135e-05,
+      "loss": 0.8852478981018066,
+      "step": 520
+    },
+    {
+      "epoch": 0.17666666666666667,
+      "grad_norm": 0.4839600622653961,
+      "learning_rate": 4.1183333333333336e-05,
+      "loss": 0.7948226451873779,
+      "step": 530
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.6786544919013977,
+      "learning_rate": 4.101666666666667e-05,
+      "loss": 0.8042360305786133,
+      "step": 540
+    },
+    {
+      "epoch": 0.18333333333333332,
+      "grad_norm": 0.563444972038269,
+      "learning_rate": 4.085e-05,
+      "loss": 0.9007970809936523,
+      "step": 550
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.49744439125061035,
+      "learning_rate": 4.068333333333334e-05,
+      "loss": 0.9553793907165528,
+      "step": 560
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.6419633030891418,
+      "learning_rate": 4.051666666666667e-05,
+      "loss": 0.9643045425415039,
+      "step": 570
+    },
+    {
+      "epoch": 0.19333333333333333,
+      "grad_norm": 0.4224979281425476,
+      "learning_rate": 4.0350000000000005e-05,
+      "loss": 0.9608310699462891,
+      "step": 580
+    },
+    {
+      "epoch": 0.19666666666666666,
+      "grad_norm": 0.47567546367645264,
+      "learning_rate": 4.018333333333333e-05,
+      "loss": 0.9702249526977539,
+      "step": 590
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.4709877669811249,
+      "learning_rate": 4.0016666666666666e-05,
+      "loss": 1.0520934104919433,
+      "step": 600
+    },
+    {
+      "epoch": 0.20333333333333334,
+      "grad_norm": 0.49998438358306885,
+      "learning_rate": 3.9850000000000006e-05,
+      "loss": 1.0648069381713867,
+      "step": 610
+    },
+    {
+      "epoch": 0.20666666666666667,
+      "grad_norm": 0.5507012009620667,
+      "learning_rate": 3.9683333333333333e-05,
+      "loss": 0.9851966857910156,
+      "step": 620
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.38482487201690674,
+      "learning_rate": 3.951666666666667e-05,
+      "loss": 1.1561612129211425,
+      "step": 630
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.3904467225074768,
+      "learning_rate": 3.935e-05,
+      "loss": 0.8037273406982421,
+      "step": 640
+    },
+    {
+      "epoch": 0.21666666666666667,
+      "grad_norm": 0.38054290413856506,
+      "learning_rate": 3.9183333333333335e-05,
+      "loss": 0.9846150398254394,
+      "step": 650
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5266609191894531,
+      "learning_rate": 3.901666666666667e-05,
+      "loss": 0.8768625259399414,
+      "step": 660
+    },
+    {
+      "epoch": 0.22333333333333333,
+      "grad_norm": 0.4669780135154724,
+      "learning_rate": 3.885e-05,
+      "loss": 1.0029444694519043,
+      "step": 670
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.8248076438903809,
+      "learning_rate": 3.868333333333333e-05,
+      "loss": 0.9029686927795411,
+      "step": 680
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.4936239421367645,
+      "learning_rate": 3.851666666666667e-05,
+      "loss": 0.8907859802246094,
+      "step": 690
+    },
+    {
+      "epoch": 0.23333333333333334,
+      "grad_norm": 0.4638374447822571,
+      "learning_rate": 3.8350000000000004e-05,
+      "loss": 0.8049991607666016,
+      "step": 700
+    },
+    {
+      "epoch": 0.23666666666666666,
+      "grad_norm": 0.389417827129364,
+      "learning_rate": 3.818333333333334e-05,
+      "loss": 0.9385946273803711,
+      "step": 710
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5004547834396362,
+      "learning_rate": 3.8016666666666665e-05,
+      "loss": 0.9536812782287598,
+      "step": 720
+    },
+    {
+      "epoch": 0.24333333333333335,
+      "grad_norm": 0.5362987518310547,
+      "learning_rate": 3.7850000000000005e-05,
+      "loss": 0.9482944488525391,
+      "step": 730
+    },
+    {
+      "epoch": 0.24666666666666667,
+      "grad_norm": 0.6165914535522461,
+      "learning_rate": 3.768333333333334e-05,
+      "loss": 0.8894567489624023,
+      "step": 740
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5372249484062195,
+      "learning_rate": 3.7516666666666666e-05,
+      "loss": 0.8759191513061524,
+      "step": 750
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.5786657929420471,
+      "learning_rate": 3.735e-05,
+      "loss": 1.0180109024047852,
+      "step": 760
+    },
+    {
+      "epoch": 0.25666666666666665,
+      "grad_norm": 0.6640530228614807,
+      "learning_rate": 3.7183333333333334e-05,
+      "loss": 0.9838876724243164,
+      "step": 770
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.7388576865196228,
+      "learning_rate": 3.701666666666667e-05,
+      "loss": 1.06797456741333,
+      "step": 780
+    },
+    {
+      "epoch": 0.2633333333333333,
+      "grad_norm": 0.6332851648330688,
+      "learning_rate": 3.685e-05,
+      "loss": 0.9420416831970215,
+      "step": 790
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.6151734590530396,
+      "learning_rate": 3.6683333333333335e-05,
+      "loss": 1.014689064025879,
+      "step": 800
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.540749192237854,
+      "learning_rate": 3.651666666666667e-05,
+      "loss": 0.9264237403869628,
+      "step": 810
+    },
+    {
+      "epoch": 0.2733333333333333,
+      "grad_norm": 0.6882827877998352,
+      "learning_rate": 3.635e-05,
+      "loss": 1.0425030708312988,
+      "step": 820
+    },
+    {
+      "epoch": 0.27666666666666667,
+      "grad_norm": 0.4624575078487396,
+      "learning_rate": 3.6183333333333336e-05,
+      "loss": 0.8155969619750977,
+      "step": 830
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4114076495170593,
+      "learning_rate": 3.601666666666667e-05,
+      "loss": 1.0072894096374512,
+      "step": 840
+    },
+    {
+      "epoch": 0.2833333333333333,
+      "grad_norm": 1.0943950414657593,
+      "learning_rate": 3.585e-05,
+      "loss": 0.802765941619873,
+      "step": 850
+    },
+    {
+      "epoch": 0.2866666666666667,
+      "grad_norm": 0.5488337874412537,
+      "learning_rate": 3.568333333333334e-05,
+      "loss": 1.0031457901000977,
+      "step": 860
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.43701034784317017,
+      "learning_rate": 3.551666666666667e-05,
+      "loss": 0.8668848037719726,
+      "step": 870
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.7260711193084717,
+      "learning_rate": 3.535e-05,
+      "loss": 1.1286174774169921,
+      "step": 880
+    },
+    {
+      "epoch": 0.2966666666666667,
+      "grad_norm": 0.44215908646583557,
+      "learning_rate": 3.518333333333333e-05,
+      "loss": 0.8784223556518554,
+      "step": 890
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.6829396486282349,
+      "learning_rate": 3.501666666666667e-05,
+      "loss": 0.9527795791625977,
+      "step": 900
+    },
+    {
+      "epoch": 0.30333333333333334,
+      "grad_norm": 0.5683781504631042,
+      "learning_rate": 3.485e-05,
+      "loss": 0.9348941802978515,
+      "step": 910
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.43940940499305725,
+      "learning_rate": 3.4683333333333334e-05,
+      "loss": 0.7635839939117431,
+      "step": 920
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.6151530146598816,
+      "learning_rate": 3.451666666666667e-05,
+      "loss": 0.9328359603881836,
+      "step": 930
+    },
+    {
+      "epoch": 0.31333333333333335,
+      "grad_norm": 0.4174748957157135,
+      "learning_rate": 3.435e-05,
+      "loss": 0.8129542350769043,
+      "step": 940
+    },
+    {
+      "epoch": 0.31666666666666665,
+      "grad_norm": 0.4708555340766907,
+      "learning_rate": 3.4183333333333335e-05,
+      "loss": 0.8687195777893066,
+      "step": 950
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.6549626588821411,
+      "learning_rate": 3.401666666666667e-05,
+      "loss": 0.8674649238586426,
+      "step": 960
+    },
+    {
+      "epoch": 0.3233333333333333,
+      "grad_norm": 0.4076550602912903,
+      "learning_rate": 3.385e-05,
+      "loss": 0.9264376640319825,
+      "step": 970
+    },
+    {
+      "epoch": 0.32666666666666666,
+      "grad_norm": 0.7705219984054565,
+      "learning_rate": 3.368333333333334e-05,
+      "loss": 1.1649690628051759,
+      "step": 980
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.46062591671943665,
+      "learning_rate": 3.351666666666667e-05,
+      "loss": 0.7402542114257813,
+      "step": 990
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.8681764006614685,
+      "learning_rate": 3.3350000000000004e-05,
+      "loss": 1.0566041946411133,
+      "step": 1000
+    },
+    {
+      "epoch": 0.33666666666666667,
+      "grad_norm": 0.3948025405406952,
+      "learning_rate": 3.318333333333333e-05,
+      "loss": 0.8839986801147461,
+      "step": 1010
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5591018795967102,
+      "learning_rate": 3.3016666666666665e-05,
+      "loss": 0.9753120422363282,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3433333333333333,
+      "grad_norm": 0.40525591373443604,
+      "learning_rate": 3.2850000000000006e-05,
+      "loss": 0.7800012588500976,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.40624549984931946,
+      "learning_rate": 3.268333333333333e-05,
+      "loss": 0.9785367012023926,
+      "step": 1040
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.6612586379051208,
+      "learning_rate": 3.2516666666666666e-05,
+      "loss": 0.835319995880127,
+      "step": 1050
+    },
+    {
+      "epoch": 0.35333333333333333,
+      "grad_norm": 0.6167810559272766,
+      "learning_rate": 3.235e-05,
+      "loss": 0.7953156948089599,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3566666666666667,
+      "grad_norm": 0.5056782960891724,
+      "learning_rate": 3.218333333333334e-05,
+      "loss": 0.9941174507141113,
+      "step": 1070
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5687284469604492,
+      "learning_rate": 3.201666666666667e-05,
+      "loss": 0.9126724243164063,
+      "step": 1080
+    },
+    {
+      "epoch": 0.36333333333333334,
+      "grad_norm": 0.446404367685318,
+      "learning_rate": 3.185e-05,
+      "loss": 0.9723684310913085,
+      "step": 1090
+    },
+    {
+      "epoch": 0.36666666666666664,
+      "grad_norm": 1.1806390285491943,
+      "learning_rate": 3.1683333333333335e-05,
+      "loss": 1.0675930976867676,
+      "step": 1100
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.47010448575019836,
+      "learning_rate": 3.151666666666667e-05,
+      "loss": 0.8236958503723144,
+      "step": 1110
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.7065098881721497,
+      "learning_rate": 3.135e-05,
+      "loss": 0.8782394409179688,
+      "step": 1120
+    },
+    {
+      "epoch": 0.37666666666666665,
+      "grad_norm": 0.7163971066474915,
+      "learning_rate": 3.118333333333334e-05,
+      "loss": 1.0409460067749023,
+      "step": 1130
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.6803985834121704,
+      "learning_rate": 3.1016666666666664e-05,
+      "loss": 0.8765417098999023,
+      "step": 1140
+    },
+    {
+      "epoch": 0.38333333333333336,
+      "grad_norm": 0.6493039131164551,
+      "learning_rate": 3.0850000000000004e-05,
+      "loss": 0.9212311744689942,
+      "step": 1150
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.6384336352348328,
+      "learning_rate": 3.068333333333334e-05,
+      "loss": 0.9496315002441407,
+      "step": 1160
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.5862751603126526,
+      "learning_rate": 3.0516666666666665e-05,
+      "loss": 0.749457836151123,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3933333333333333,
+      "grad_norm": 0.5451819896697998,
+      "learning_rate": 3.035e-05,
+      "loss": 0.9255198478698731,
+      "step": 1180
+    },
+    {
+      "epoch": 0.39666666666666667,
+      "grad_norm": 0.4223293960094452,
+      "learning_rate": 3.0183333333333336e-05,
+      "loss": 0.8668063163757325,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4768493175506592,
+      "learning_rate": 3.001666666666667e-05,
+      "loss": 0.8216365814208985,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4033333333333333,
+      "grad_norm": 0.7394158840179443,
+      "learning_rate": 2.985e-05,
+      "loss": 1.0520621299743653,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4066666666666667,
+      "grad_norm": 0.5049188733100891,
+      "learning_rate": 2.9683333333333334e-05,
+      "loss": 0.9788308143615723,
+      "step": 1220
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.6294832825660706,
+      "learning_rate": 2.951666666666667e-05,
+      "loss": 1.0364269256591796,
+      "step": 1230
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.6358350515365601,
+      "learning_rate": 2.935e-05,
+      "loss": 0.9813390731811523,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.512421190738678,
+      "learning_rate": 2.9183333333333336e-05,
+      "loss": 0.7307010173797608,
+      "step": 1250
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.4594457447528839,
+      "learning_rate": 2.901666666666667e-05,
+      "loss": 0.8606552124023438,
+      "step": 1260
+    },
+    {
+      "epoch": 0.42333333333333334,
+      "grad_norm": 0.4652048647403717,
+      "learning_rate": 2.885e-05,
+      "loss": 0.8638803482055664,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.497286319732666,
+      "learning_rate": 2.8683333333333334e-05,
+      "loss": 0.8527148246765137,
+      "step": 1280
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.5041627883911133,
+      "learning_rate": 2.851666666666667e-05,
+      "loss": 0.7727686882019043,
+      "step": 1290
+    },
+    {
+      "epoch": 0.43333333333333335,
+      "grad_norm": 0.6805508136749268,
+      "learning_rate": 2.8349999999999998e-05,
+      "loss": 0.9916484832763672,
+      "step": 1300
+    },
+    {
+      "epoch": 0.43666666666666665,
+      "grad_norm": 0.4343254864215851,
+      "learning_rate": 2.8183333333333335e-05,
+      "loss": 0.8907909393310547,
+      "step": 1310
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.4182907044887543,
+      "learning_rate": 2.801666666666667e-05,
+      "loss": 0.8243522644042969,
+      "step": 1320
+    },
+    {
+      "epoch": 0.44333333333333336,
+      "grad_norm": 0.5033489465713501,
+      "learning_rate": 2.7850000000000003e-05,
+      "loss": 1.0239628791809081,
+      "step": 1330
+    },
+    {
+      "epoch": 0.44666666666666666,
+      "grad_norm": 0.5837738513946533,
+      "learning_rate": 2.7683333333333333e-05,
+      "loss": 0.9909868240356445,
+      "step": 1340
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.5943430662155151,
+      "learning_rate": 2.7516666666666667e-05,
+      "loss": 0.9027081489562988,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.3687169551849365,
+      "learning_rate": 2.7350000000000004e-05,
+      "loss": 0.8732491493225097,
+      "step": 1360
+    },
+    {
+      "epoch": 0.45666666666666667,
+      "grad_norm": 0.44183358550071716,
+      "learning_rate": 2.7183333333333335e-05,
+      "loss": 0.7898604869842529,
+      "step": 1370
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.525290846824646,
+      "learning_rate": 2.701666666666667e-05,
+      "loss": 0.9487957000732422,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4633333333333333,
+      "grad_norm": 0.49439457058906555,
+      "learning_rate": 2.6850000000000002e-05,
+      "loss": 0.868436050415039,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.6398065090179443,
+      "learning_rate": 2.6683333333333333e-05,
+      "loss": 0.9199989318847657,
+      "step": 1400
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.5837881565093994,
+      "learning_rate": 2.6516666666666666e-05,
+      "loss": 0.8474544525146485,
+      "step": 1410
+    },
+    {
+      "epoch": 0.47333333333333333,
+      "grad_norm": 0.7297168374061584,
+      "learning_rate": 2.6350000000000004e-05,
+      "loss": 1.1009994506835938,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4766666666666667,
+      "grad_norm": 0.5449320077896118,
+      "learning_rate": 2.618333333333333e-05,
+      "loss": 0.8889406204223633,
+      "step": 1430
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.7387002110481262,
+      "learning_rate": 2.6016666666666668e-05,
+      "loss": 0.8909475326538085,
+      "step": 1440
+    },
+    {
+      "epoch": 0.48333333333333334,
+      "grad_norm": 0.5363000631332397,
+      "learning_rate": 2.585e-05,
+      "loss": 0.8837484359741211,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4866666666666667,
+      "grad_norm": 0.6663982272148132,
+      "learning_rate": 2.5683333333333335e-05,
+      "loss": 0.7804791927337646,
+      "step": 1460
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.5026193261146545,
+      "learning_rate": 2.5516666666666666e-05,
+      "loss": 0.9184001922607422,
+      "step": 1470
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.6870279908180237,
+      "learning_rate": 2.5350000000000003e-05,
+      "loss": 0.8195085525512695,
+      "step": 1480
+    },
+    {
+      "epoch": 0.49666666666666665,
+      "grad_norm": 0.5255699157714844,
+      "learning_rate": 2.5183333333333337e-05,
+      "loss": 0.9975809097290039,
+      "step": 1490
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.45492592453956604,
+      "learning_rate": 2.5016666666666667e-05,
+      "loss": 0.8944621086120605,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5033333333333333,
+      "grad_norm": 0.44872063398361206,
+      "learning_rate": 2.485e-05,
+      "loss": 0.821660041809082,
+      "step": 1510
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.5064987540245056,
+      "learning_rate": 2.4683333333333335e-05,
+      "loss": 1.0228797912597656,
+      "step": 1520
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.434779554605484,
+      "learning_rate": 2.451666666666667e-05,
+      "loss": 0.9781878471374512,
+      "step": 1530
+    },
+    {
+      "epoch": 0.5133333333333333,
+      "grad_norm": 0.48390141129493713,
+      "learning_rate": 2.435e-05,
+      "loss": 0.909939193725586,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5166666666666667,
+      "grad_norm": 0.41258955001831055,
+      "learning_rate": 2.4183333333333336e-05,
+      "loss": 0.8889488220214844,
+      "step": 1550
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.48882147669792175,
+      "learning_rate": 2.4016666666666667e-05,
+      "loss": 0.8874250411987304,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5233333333333333,
+      "grad_norm": 0.4496597647666931,
+      "learning_rate": 2.385e-05,
+      "loss": 0.866064453125,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5266666666666666,
+      "grad_norm": 0.5498498678207397,
+      "learning_rate": 2.3683333333333334e-05,
+      "loss": 1.0485063552856446,
+      "step": 1580
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.5302222967147827,
+      "learning_rate": 2.3516666666666668e-05,
+      "loss": 0.79019775390625,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.6240465641021729,
+      "learning_rate": 2.3350000000000002e-05,
+      "loss": 0.8068696975708007,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5366666666666666,
+      "grad_norm": 0.49114975333213806,
+      "learning_rate": 2.3183333333333336e-05,
+      "loss": 0.9069293975830078,
+      "step": 1610
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.558907687664032,
+      "learning_rate": 2.3016666666666666e-05,
+      "loss": 0.7732550144195557,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5433333333333333,
+      "grad_norm": 0.7307827472686768,
+      "learning_rate": 2.2850000000000003e-05,
+      "loss": 0.8363723754882812,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.5290479063987732,
+      "learning_rate": 2.2683333333333334e-05,
+      "loss": 0.8145934104919433,
+      "step": 1640
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.5289633870124817,
+      "learning_rate": 2.2516666666666667e-05,
+      "loss": 0.8467626571655273,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5533333333333333,
+      "grad_norm": 0.42270180583000183,
+      "learning_rate": 2.235e-05,
+      "loss": 1.0054572105407715,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5566666666666666,
+      "grad_norm": 0.46330273151397705,
+      "learning_rate": 2.2183333333333335e-05,
+      "loss": 0.8821262359619141,
+      "step": 1670
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.54402756690979,
+      "learning_rate": 2.201666666666667e-05,
+      "loss": 0.8570803642272949,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5633333333333334,
+      "grad_norm": 0.4660607576370239,
+      "learning_rate": 2.1850000000000003e-05,
+      "loss": 0.923713493347168,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5666666666666667,
+      "grad_norm": 0.43630075454711914,
+      "learning_rate": 2.1683333333333333e-05,
+      "loss": 0.8405223846435547,
+      "step": 1700
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.48713231086730957,
+      "learning_rate": 2.1516666666666667e-05,
+      "loss": 1.0100667953491211,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.5924938321113586,
+      "learning_rate": 2.135e-05,
+      "loss": 0.9960016250610352,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5766666666666667,
+      "grad_norm": 0.5111542344093323,
+      "learning_rate": 2.1183333333333334e-05,
+      "loss": 0.89755220413208,
+      "step": 1730
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.4552167057991028,
+      "learning_rate": 2.1016666666666668e-05,
+      "loss": 0.8854806900024415,
+      "step": 1740
+    },
+    {
+      "epoch": 0.5833333333333334,
+      "grad_norm": 0.554473876953125,
+      "learning_rate": 2.085e-05,
+      "loss": 0.7076638698577881,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.5026177167892456,
+      "learning_rate": 2.0683333333333336e-05,
+      "loss": 0.9850486755371094,
+      "step": 1760
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.5753727555274963,
+      "learning_rate": 2.0516666666666666e-05,
+      "loss": 0.9340484619140625,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5933333333333334,
+      "grad_norm": 0.5122212171554565,
+      "learning_rate": 2.035e-05,
+      "loss": 0.9032992362976074,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5966666666666667,
+      "grad_norm": 0.5792819857597351,
+      "learning_rate": 2.0183333333333334e-05,
+      "loss": 0.8477163314819336,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5964590907096863,
+      "learning_rate": 2.0016666666666668e-05,
+      "loss": 0.9166988372802735,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6033333333333334,
+      "grad_norm": 0.7182376980781555,
+      "learning_rate": 1.985e-05,
+      "loss": 0.8029914855957031,
+      "step": 1810
+    },
+    {
+      "epoch": 0.6066666666666667,
+      "grad_norm": 0.3775170147418976,
+      "learning_rate": 1.9683333333333335e-05,
+      "loss": 0.8039090156555175,
+      "step": 1820
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.3779233694076538,
+      "learning_rate": 1.9516666666666666e-05,
+      "loss": 0.7587248802185058,
+      "step": 1830
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.529349684715271,
+      "learning_rate": 1.9350000000000003e-05,
+      "loss": 0.9525286674499511,
+      "step": 1840
+    },
+    {
+      "epoch": 0.6166666666666667,
+      "grad_norm": 0.8576200604438782,
+      "learning_rate": 1.9183333333333333e-05,
+      "loss": 0.7803131103515625,
+      "step": 1850
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.5265026092529297,
+      "learning_rate": 1.901666666666667e-05,
+      "loss": 0.9432580947875977,
+      "step": 1860
+    },
+    {
+      "epoch": 0.6233333333333333,
+      "grad_norm": 0.43818992376327515,
+      "learning_rate": 1.885e-05,
+      "loss": 0.8160367012023926,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.5307653546333313,
+      "learning_rate": 1.8683333333333335e-05,
+      "loss": 0.7481701850891114,
+      "step": 1880
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.4350138008594513,
+      "learning_rate": 1.851666666666667e-05,
+      "loss": 0.9646284103393554,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6333333333333333,
+      "grad_norm": 0.7320852279663086,
+      "learning_rate": 1.8350000000000002e-05,
+      "loss": 0.964715576171875,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6366666666666667,
+      "grad_norm": 0.39258873462677,
+      "learning_rate": 1.8183333333333336e-05,
+      "loss": 0.8884981155395508,
+      "step": 1910
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3995574116706848,
+      "learning_rate": 1.8016666666666666e-05,
+      "loss": 0.8017918586730957,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6433333333333333,
+      "grad_norm": 0.5143277049064636,
+      "learning_rate": 1.785e-05,
+      "loss": 0.9140171051025391,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6466666666666666,
+      "grad_norm": 0.4470940828323364,
+      "learning_rate": 1.7683333333333334e-05,
+      "loss": 0.9137473106384277,
+      "step": 1940
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.4530799388885498,
+      "learning_rate": 1.7516666666666668e-05,
+      "loss": 0.9416312217712403,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.5486093163490295,
+      "learning_rate": 1.7349999999999998e-05,
+      "loss": 0.9309564590454101,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6566666666666666,
+      "grad_norm": 0.7241241931915283,
+      "learning_rate": 1.7183333333333335e-05,
+      "loss": 0.8793378829956054,
+      "step": 1970
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.48900172114372253,
+      "learning_rate": 1.7016666666666666e-05,
+      "loss": 0.9930448532104492,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6633333333333333,
+      "grad_norm": 0.5354626178741455,
+      "learning_rate": 1.6850000000000003e-05,
+      "loss": 0.8400119781494141,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.5346295237541199,
+      "learning_rate": 1.6683333333333333e-05,
+      "loss": 0.7424459457397461,
+      "step": 2000
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.7362031936645508,
+      "learning_rate": 1.6516666666666667e-05,
+      "loss": 0.836764907836914,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6733333333333333,
+      "grad_norm": 0.8799192905426025,
+      "learning_rate": 1.635e-05,
+      "loss": 0.9453885078430175,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6766666666666666,
+      "grad_norm": 0.5263342261314392,
+      "learning_rate": 1.6183333333333335e-05,
+      "loss": 0.7978546142578125,
+      "step": 2030
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.6986008286476135,
+      "learning_rate": 1.601666666666667e-05,
+      "loss": 0.8797599792480468,
+      "step": 2040
+    },
+    {
+      "epoch": 0.6833333333333333,
+      "grad_norm": 0.7081782221794128,
+      "learning_rate": 1.5850000000000002e-05,
+      "loss": 1.0752653121948241,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6866666666666666,
+      "grad_norm": 0.5002477765083313,
+      "learning_rate": 1.5683333333333333e-05,
+      "loss": 0.7971479892730713,
+      "step": 2060
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.4529975354671478,
+      "learning_rate": 1.5516666666666667e-05,
+      "loss": 0.912747859954834,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.48189014196395874,
+      "learning_rate": 1.535e-05,
+      "loss": 1.0259061813354493,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6966666666666667,
+      "grad_norm": 0.6560697555541992,
+      "learning_rate": 1.5183333333333333e-05,
+      "loss": 1.0107606887817382,
+      "step": 2090
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.722689151763916,
+      "learning_rate": 1.5016666666666668e-05,
+      "loss": 0.962984848022461,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7033333333333334,
+      "grad_norm": 1.0071346759796143,
+      "learning_rate": 1.485e-05,
+      "loss": 0.9162399291992187,
+      "step": 2110
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.5007173418998718,
+      "learning_rate": 1.4683333333333336e-05,
+      "loss": 0.8683804512023926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.4455113708972931,
+      "learning_rate": 1.4516666666666668e-05,
+      "loss": 0.929558277130127,
+      "step": 2130
+    },
+    {
+      "epoch": 0.7133333333333334,
+      "grad_norm": 0.5244899392127991,
+      "learning_rate": 1.435e-05,
+      "loss": 0.7725494861602783,
+      "step": 2140
+    },
+    {
+      "epoch": 0.7166666666666667,
+      "grad_norm": 0.5691429376602173,
+      "learning_rate": 1.4183333333333335e-05,
+      "loss": 0.9669612884521485,
+      "step": 2150
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5419687032699585,
+      "learning_rate": 1.4016666666666667e-05,
+      "loss": 0.9267525672912598,
+      "step": 2160
+    },
+    {
+      "epoch": 0.7233333333333334,
+      "grad_norm": 0.9937120079994202,
+      "learning_rate": 1.3850000000000001e-05,
+      "loss": 0.8306878089904786,
+      "step": 2170
+    },
+    {
+      "epoch": 0.7266666666666667,
+      "grad_norm": 0.4639163017272949,
+      "learning_rate": 1.3683333333333333e-05,
+      "loss": 0.8372581481933594,
+      "step": 2180
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.45630142092704773,
+      "learning_rate": 1.3516666666666667e-05,
+      "loss": 0.8593014717102051,
+      "step": 2190
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.4326620399951935,
+      "learning_rate": 1.3350000000000001e-05,
+      "loss": 0.8103547096252441,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7366666666666667,
+      "grad_norm": 0.48703309893608093,
+      "learning_rate": 1.3183333333333333e-05,
+      "loss": 0.8958615303039551,
+      "step": 2210
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.5286509394645691,
+      "learning_rate": 1.3016666666666669e-05,
+      "loss": 0.9600817680358886,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7433333333333333,
+      "grad_norm": 0.6584081649780273,
+      "learning_rate": 1.285e-05,
+      "loss": 1.0164281845092773,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.5143536925315857,
+      "learning_rate": 1.2683333333333333e-05,
+      "loss": 1.0931424140930175,
+      "step": 2240
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.9453914165496826,
+      "learning_rate": 1.2516666666666668e-05,
+      "loss": 0.7825816154479981,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7533333333333333,
+      "grad_norm": 0.48963093757629395,
+      "learning_rate": 1.235e-05,
+      "loss": 0.9285711288452149,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7566666666666667,
+      "grad_norm": 0.5854438543319702,
+      "learning_rate": 1.2183333333333334e-05,
+      "loss": 0.8443680763244629,
+      "step": 2270
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.49953049421310425,
+      "learning_rate": 1.2016666666666668e-05,
+      "loss": 0.8316192626953125,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7633333333333333,
+      "grad_norm": 0.6657202839851379,
+      "learning_rate": 1.185e-05,
+      "loss": 0.910405158996582,
+      "step": 2290
+    },
+    {
+      "epoch": 0.7666666666666667,
+      "grad_norm": 0.6646652221679688,
+      "learning_rate": 1.1683333333333334e-05,
+      "loss": 0.8283540725708007,
+      "step": 2300
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.49376705288887024,
+      "learning_rate": 1.1516666666666668e-05,
+      "loss": 0.8653836250305176,
+      "step": 2310
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.5468245148658752,
+      "learning_rate": 1.1350000000000001e-05,
+      "loss": 0.8328197479248047,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7766666666666666,
+      "grad_norm": 0.8111145496368408,
+      "learning_rate": 1.1183333333333335e-05,
+      "loss": 0.9046418190002441,
+      "step": 2330
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.5041958093643188,
+      "learning_rate": 1.1016666666666667e-05,
+      "loss": 0.8311249732971191,
+      "step": 2340
+    },
+    {
+      "epoch": 0.7833333333333333,
+      "grad_norm": 0.4898006021976471,
+      "learning_rate": 1.0850000000000001e-05,
+      "loss": 0.9336203575134278,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.4351266324520111,
+      "learning_rate": 1.0683333333333333e-05,
+      "loss": 0.9776251792907715,
+      "step": 2360
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.579655647277832,
+      "learning_rate": 1.0516666666666667e-05,
+      "loss": 0.7065846443176269,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7933333333333333,
+      "grad_norm": 0.4177819788455963,
+      "learning_rate": 1.035e-05,
+      "loss": 0.7918330669403076,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7966666666666666,
+      "grad_norm": 0.4987991750240326,
+      "learning_rate": 1.0183333333333333e-05,
+      "loss": 0.8270879745483398,
+      "step": 2390
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.6702345013618469,
+      "learning_rate": 1.0016666666666667e-05,
+      "loss": 0.7966389656066895,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8033333333333333,
+      "grad_norm": 0.684005618095398,
+      "learning_rate": 9.85e-06,
+      "loss": 0.8869472503662109,
+      "step": 2410
+    },
+    {
+      "epoch": 0.8066666666666666,
+      "grad_norm": 0.4468795657157898,
+      "learning_rate": 9.683333333333333e-06,
+      "loss": 0.9556525230407715,
+      "step": 2420
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.5046238303184509,
+      "learning_rate": 9.516666666666666e-06,
+      "loss": 0.8056395530700684,
+      "step": 2430
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.6720165610313416,
+      "learning_rate": 9.35e-06,
+      "loss": 0.8834376335144043,
+      "step": 2440
+    },
+    {
+      "epoch": 0.8166666666666667,
+      "grad_norm": 0.4289397597312927,
+      "learning_rate": 9.183333333333334e-06,
+      "loss": 0.7789588928222656,
+      "step": 2450
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.6209238171577454,
+      "learning_rate": 9.016666666666668e-06,
+      "loss": 0.8701201438903808,
+      "step": 2460
+    },
+    {
+      "epoch": 0.8233333333333334,
+      "grad_norm": 0.44446897506713867,
+      "learning_rate": 8.85e-06,
+      "loss": 0.7950375080108643,
+      "step": 2470
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.44629836082458496,
+      "learning_rate": 8.683333333333334e-06,
+      "loss": 0.7967105865478515,
+      "step": 2480
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.37678250670433044,
+      "learning_rate": 8.516666666666668e-06,
+      "loss": 0.771687650680542,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.494236558675766,
+      "learning_rate": 8.350000000000001e-06,
+      "loss": 0.8968353271484375,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8366666666666667,
+      "grad_norm": 0.6953932046890259,
+      "learning_rate": 8.183333333333333e-06,
+      "loss": 1.0087746620178222,
+      "step": 2510
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.488052099943161,
+      "learning_rate": 8.016666666666667e-06,
+      "loss": 0.9487748146057129,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8433333333333334,
+      "grad_norm": 0.7786927223205566,
+      "learning_rate": 7.850000000000001e-06,
+      "loss": 0.9541014671325684,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8466666666666667,
+      "grad_norm": 0.5057625770568848,
+      "learning_rate": 7.683333333333335e-06,
+      "loss": 0.905206298828125,
+      "step": 2540
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.7162487506866455,
+      "learning_rate": 7.516666666666668e-06,
+      "loss": 0.9245425224304199,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.5543293952941895,
+      "learning_rate": 7.35e-06,
+      "loss": 0.8062684059143066,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8566666666666667,
+      "grad_norm": 0.7101190686225891,
+      "learning_rate": 7.183333333333334e-06,
+      "loss": 0.9243124961853028,
+      "step": 2570
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.726828932762146,
+      "learning_rate": 7.0166666666666675e-06,
+      "loss": 0.7988007068634033,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8633333333333333,
+      "grad_norm": 0.533573567867279,
+      "learning_rate": 6.8500000000000005e-06,
+      "loss": 0.8835041046142578,
+      "step": 2590
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.5849156975746155,
+      "learning_rate": 6.6833333333333334e-06,
+      "loss": 0.8487396240234375,
+      "step": 2600
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.8949032425880432,
+      "learning_rate": 6.516666666666666e-06,
+      "loss": 0.8319039344787598,
+      "step": 2610
+    },
+    {
+      "epoch": 0.8733333333333333,
+      "grad_norm": 0.6677828431129456,
+      "learning_rate": 6.35e-06,
+      "loss": 0.9017569541931152,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8766666666666667,
+      "grad_norm": 0.5268790125846863,
+      "learning_rate": 6.183333333333333e-06,
+      "loss": 0.7499767780303955,
+      "step": 2630
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.6157758235931396,
+      "learning_rate": 6.016666666666667e-06,
+      "loss": 0.8814908027648926,
+      "step": 2640
+    },
+    {
+      "epoch": 0.8833333333333333,
+      "grad_norm": 0.5197092890739441,
+      "learning_rate": 5.850000000000001e-06,
+      "loss": 0.8590426445007324,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8866666666666667,
+      "grad_norm": 0.7915957570075989,
+      "learning_rate": 5.683333333333334e-06,
+      "loss": 0.8466612815856933,
+      "step": 2660
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.48902902007102966,
+      "learning_rate": 5.5166666666666675e-06,
+      "loss": 0.9879349708557129,
+      "step": 2670
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.5069965124130249,
+      "learning_rate": 5.3500000000000004e-06,
+      "loss": 0.8216916084289551,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8966666666666666,
+      "grad_norm": 0.6190909743309021,
+      "learning_rate": 5.183333333333333e-06,
+      "loss": 0.8995295524597168,
+      "step": 2690
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.45372089743614197,
+      "learning_rate": 5.016666666666666e-06,
+      "loss": 0.8368668556213379,
+      "step": 2700
+    },
+    {
+      "epoch": 0.9033333333333333,
+      "grad_norm": 0.5643200278282166,
+      "learning_rate": 4.85e-06,
+      "loss": 0.7782045364379883,
+      "step": 2710
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.5345107913017273,
+      "learning_rate": 4.683333333333333e-06,
+      "loss": 0.8649769783020019,
+      "step": 2720
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.6174563765525818,
+      "learning_rate": 4.516666666666667e-06,
+      "loss": 0.890013313293457,
+      "step": 2730
+    },
+    {
+      "epoch": 0.9133333333333333,
+      "grad_norm": 0.6632450222969055,
+      "learning_rate": 4.35e-06,
+      "loss": 0.8954425811767578,
+      "step": 2740
+    },
+    {
+      "epoch": 0.9166666666666666,
+      "grad_norm": 0.5862364768981934,
+      "learning_rate": 4.183333333333334e-06,
+      "loss": 0.9733158111572265,
+      "step": 2750
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.625480592250824,
+      "learning_rate": 4.0166666666666675e-06,
+      "loss": 0.940821647644043,
+      "step": 2760
+    },
+    {
+      "epoch": 0.9233333333333333,
+      "grad_norm": 0.5125066041946411,
+      "learning_rate": 3.85e-06,
+      "loss": 0.8799821853637695,
+      "step": 2770
+    },
+    {
+      "epoch": 0.9266666666666666,
+      "grad_norm": 0.501059353351593,
+      "learning_rate": 3.6833333333333338e-06,
+      "loss": 0.8071253776550293,
+      "step": 2780
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.480802446603775,
+      "learning_rate": 3.5166666666666667e-06,
+      "loss": 0.7463678359985352,
+      "step": 2790
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.5736676454544067,
+      "learning_rate": 3.3500000000000005e-06,
+      "loss": 0.9327493667602539,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9366666666666666,
+      "grad_norm": 0.4975570738315582,
+      "learning_rate": 3.1833333333333335e-06,
+      "loss": 0.8904853820800781,
+      "step": 2810
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.480354368686676,
+      "learning_rate": 3.016666666666667e-06,
+      "loss": 1.0292061805725097,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9433333333333334,
+      "grad_norm": 0.6452360153198242,
+      "learning_rate": 2.8500000000000002e-06,
+      "loss": 1.0092354774475099,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.5015031099319458,
+      "learning_rate": 2.6833333333333336e-06,
+      "loss": 1.0683047294616699,
+      "step": 2840
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.4239721894264221,
+      "learning_rate": 2.516666666666667e-06,
+      "loss": 0.8794116973876953,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9533333333333334,
+      "grad_norm": 0.5949695110321045,
+      "learning_rate": 2.35e-06,
+      "loss": 1.0180916786193848,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9566666666666667,
+      "grad_norm": 0.551426112651825,
+      "learning_rate": 2.1833333333333333e-06,
+      "loss": 0.9177707672119141,
+      "step": 2870
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.6483604907989502,
+      "learning_rate": 2.0166666666666667e-06,
+      "loss": 0.9020861625671387,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9633333333333334,
+      "grad_norm": 0.60732501745224,
+      "learning_rate": 1.85e-06,
+      "loss": 1.0001092910766602,
+      "step": 2890
+    },
+    {
+      "epoch": 0.9666666666666667,
+      "grad_norm": 0.46931129693984985,
+      "learning_rate": 1.6833333333333332e-06,
+      "loss": 0.8657818794250488,
+      "step": 2900
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.5216684937477112,
+      "learning_rate": 1.5166666666666668e-06,
+      "loss": 0.9077080726623535,
+      "step": 2910
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.547545313835144,
+      "learning_rate": 1.35e-06,
+      "loss": 0.8793766021728515,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9766666666666667,
+      "grad_norm": 0.4495963752269745,
+      "learning_rate": 1.1833333333333334e-06,
+      "loss": 0.8557974815368652,
+      "step": 2930
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.6442372798919678,
+      "learning_rate": 1.0166666666666665e-06,
+      "loss": 0.8119054794311523,
+      "step": 2940
+    },
+    {
+      "epoch": 0.9833333333333333,
+      "grad_norm": 0.7756669521331787,
+      "learning_rate": 8.500000000000001e-07,
+      "loss": 0.8289030075073243,
+      "step": 2950
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 792192378470400.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2950/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b8cdeeb02b683888c58950ba20634617e28bd81b336c3f037116cc9305a2043
+size 5137

checkpoint-3000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: gpt2
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:gpt2
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoint-3000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "gpt2",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": true,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "c_attn"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-3000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:daf96ecad4fb2c407b82f85d56cef90e2882f3ad4e8696d095a21353f66fa596
+size 9440280

checkpoint-3000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:deff9b9dd7ef550fba4b599fd0cd38fbb926067da66a651ae86dcc827221dea0
+size 4824013

checkpoint-3000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f8b94bd241ca409ffd83d810e42f2395e89e79fad149c049cbb96ddedee131a7
+size 14645

checkpoint-3000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7801c757c2fe5906123b9cd33a124fb76c352678272072d20002c052fe674b7d
+size 1465

checkpoint-3000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-3000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "is_local": false,
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

checkpoint-3000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2134 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0033333333333333335,
+      "grad_norm": 6.809678077697754,
+      "learning_rate": 4.9850000000000006e-05,
+      "loss": 7.233219146728516,
+      "step": 10
+    },
+    {
+      "epoch": 0.006666666666666667,
+      "grad_norm": 8.315083503723145,
+      "learning_rate": 4.968333333333334e-05,
+      "loss": 5.7482860565185545,
+      "step": 20
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 6.454437255859375,
+      "learning_rate": 4.9516666666666666e-05,
+      "loss": 3.6689868927001954,
+      "step": 30
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 3.284022092819214,
+      "learning_rate": 4.935e-05,
+      "loss": 2.273424530029297,
+      "step": 40
+    },
+    {
+      "epoch": 0.016666666666666666,
+      "grad_norm": 0.9964243173599243,
+      "learning_rate": 4.9183333333333334e-05,
+      "loss": 1.6669052124023438,
+      "step": 50
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.6209345459938049,
+      "learning_rate": 4.901666666666667e-05,
+      "loss": 1.347894287109375,
+      "step": 60
+    },
+    {
+      "epoch": 0.023333333333333334,
+      "grad_norm": 0.7051726579666138,
+      "learning_rate": 4.885e-05,
+      "loss": 1.3426547050476074,
+      "step": 70
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.5769520998001099,
+      "learning_rate": 4.8683333333333335e-05,
+      "loss": 1.031618881225586,
+      "step": 80
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 1.3832005262374878,
+      "learning_rate": 4.851666666666667e-05,
+      "loss": 1.0876376152038574,
+      "step": 90
+    },
+    {
+      "epoch": 0.03333333333333333,
+      "grad_norm": 0.423240602016449,
+      "learning_rate": 4.835e-05,
+      "loss": 0.967049503326416,
+      "step": 100
+    },
+    {
+      "epoch": 0.03666666666666667,
+      "grad_norm": 1.0171490907669067,
+      "learning_rate": 4.818333333333334e-05,
+      "loss": 1.0151338577270508,
+      "step": 110
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.42430293560028076,
+      "learning_rate": 4.801666666666667e-05,
+      "loss": 1.0364399909973145,
+      "step": 120
+    },
+    {
+      "epoch": 0.043333333333333335,
+      "grad_norm": 1.113786220550537,
+      "learning_rate": 4.785e-05,
+      "loss": 1.2358755111694335,
+      "step": 130
+    },
+    {
+      "epoch": 0.04666666666666667,
+      "grad_norm": 0.4222952425479889,
+      "learning_rate": 4.768333333333334e-05,
+      "loss": 0.9435253143310547,
+      "step": 140
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.4197883903980255,
+      "learning_rate": 4.751666666666667e-05,
+      "loss": 0.9088167190551758,
+      "step": 150
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.4791605770587921,
+      "learning_rate": 4.735e-05,
+      "loss": 1.0109647750854491,
+      "step": 160
+    },
+    {
+      "epoch": 0.056666666666666664,
+      "grad_norm": 0.4866371154785156,
+      "learning_rate": 4.718333333333333e-05,
+      "loss": 1.0510098457336425,
+      "step": 170
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.369156152009964,
+      "learning_rate": 4.701666666666667e-05,
+      "loss": 0.9953393936157227,
+      "step": 180
+    },
+    {
+      "epoch": 0.06333333333333334,
+      "grad_norm": 0.8538883924484253,
+      "learning_rate": 4.685000000000001e-05,
+      "loss": 1.0944287300109863,
+      "step": 190
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.4323364496231079,
+      "learning_rate": 4.6683333333333334e-05,
+      "loss": 0.9442741394042968,
+      "step": 200
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5124598145484924,
+      "learning_rate": 4.651666666666667e-05,
+      "loss": 0.8857268333435059,
+      "step": 210
+    },
+    {
+      "epoch": 0.07333333333333333,
+      "grad_norm": 0.5820016264915466,
+      "learning_rate": 4.635e-05,
+      "loss": 1.1653939247131349,
+      "step": 220
+    },
+    {
+      "epoch": 0.07666666666666666,
+      "grad_norm": 0.3909580111503601,
+      "learning_rate": 4.6183333333333336e-05,
+      "loss": 1.0361078262329102,
+      "step": 230
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.9888875484466553,
+      "learning_rate": 4.601666666666667e-05,
+      "loss": 1.1712039947509765,
+      "step": 240
+    },
+    {
+      "epoch": 0.08333333333333333,
+      "grad_norm": 0.5135601758956909,
+      "learning_rate": 4.585e-05,
+      "loss": 0.9313525199890137,
+      "step": 250
+    },
+    {
+      "epoch": 0.08666666666666667,
+      "grad_norm": 0.5219751000404358,
+      "learning_rate": 4.568333333333333e-05,
+      "loss": 1.0194238662719726,
+      "step": 260
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5085497498512268,
+      "learning_rate": 4.551666666666667e-05,
+      "loss": 0.9100503921508789,
+      "step": 270
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.39039579033851624,
+      "learning_rate": 4.5350000000000005e-05,
+      "loss": 0.9138201713562012,
+      "step": 280
+    },
+    {
+      "epoch": 0.09666666666666666,
+      "grad_norm": 0.4246252179145813,
+      "learning_rate": 4.518333333333333e-05,
+      "loss": 1.0014129638671876,
+      "step": 290
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.46258535981178284,
+      "learning_rate": 4.5016666666666665e-05,
+      "loss": 1.10882568359375,
+      "step": 300
+    },
+    {
+      "epoch": 0.10333333333333333,
+      "grad_norm": 0.3984704613685608,
+      "learning_rate": 4.4850000000000006e-05,
+      "loss": 0.9269520759582519,
+      "step": 310
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.4456084966659546,
+      "learning_rate": 4.468333333333334e-05,
+      "loss": 0.9888761520385743,
+      "step": 320
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.8263048529624939,
+      "learning_rate": 4.451666666666667e-05,
+      "loss": 0.8790253639221192,
+      "step": 330
+    },
+    {
+      "epoch": 0.11333333333333333,
+      "grad_norm": 0.35195598006248474,
+      "learning_rate": 4.435e-05,
+      "loss": 0.8765983581542969,
+      "step": 340
+    },
+    {
+      "epoch": 0.11666666666666667,
+      "grad_norm": 0.447350412607193,
+      "learning_rate": 4.4183333333333334e-05,
+      "loss": 1.1207200050354005,
+      "step": 350
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.7923038601875305,
+      "learning_rate": 4.401666666666667e-05,
+      "loss": 0.8299055099487305,
+      "step": 360
+    },
+    {
+      "epoch": 0.12333333333333334,
+      "grad_norm": 0.3724622428417206,
+      "learning_rate": 4.385e-05,
+      "loss": 1.1368635177612305,
+      "step": 370
+    },
+    {
+      "epoch": 0.12666666666666668,
+      "grad_norm": 0.49659571051597595,
+      "learning_rate": 4.3683333333333336e-05,
+      "loss": 0.9668448448181153,
+      "step": 380
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.3801608681678772,
+      "learning_rate": 4.351666666666667e-05,
+      "loss": 1.0527292251586915,
+      "step": 390
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.450956791639328,
+      "learning_rate": 4.335e-05,
+      "loss": 0.9145011901855469,
+      "step": 400
+    },
+    {
+      "epoch": 0.13666666666666666,
+      "grad_norm": 0.5096069574356079,
+      "learning_rate": 4.318333333333334e-05,
+      "loss": 0.9489949226379395,
+      "step": 410
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5106140971183777,
+      "learning_rate": 4.3016666666666664e-05,
+      "loss": 0.9242402076721191,
+      "step": 420
+    },
+    {
+      "epoch": 0.14333333333333334,
+      "grad_norm": 0.45267972350120544,
+      "learning_rate": 4.285e-05,
+      "loss": 0.9822881698608399,
+      "step": 430
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.6330339312553406,
+      "learning_rate": 4.268333333333334e-05,
+      "loss": 0.9491618156433106,
+      "step": 440
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.39571714401245117,
+      "learning_rate": 4.251666666666667e-05,
+      "loss": 0.8421293258666992,
+      "step": 450
+    },
+    {
+      "epoch": 0.15333333333333332,
+      "grad_norm": 0.45525646209716797,
+      "learning_rate": 4.235e-05,
+      "loss": 0.8939360618591309,
+      "step": 460
+    },
+    {
+      "epoch": 0.15666666666666668,
+      "grad_norm": 0.4628102779388428,
+      "learning_rate": 4.218333333333333e-05,
+      "loss": 0.8926850318908691,
+      "step": 470
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.8717295527458191,
+      "learning_rate": 4.2016666666666674e-05,
+      "loss": 1.0624030113220215,
+      "step": 480
+    },
+    {
+      "epoch": 0.16333333333333333,
+      "grad_norm": 0.4680945873260498,
+      "learning_rate": 4.185e-05,
+      "loss": 1.0478023529052733,
+      "step": 490
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 0.6128882169723511,
+      "learning_rate": 4.1683333333333335e-05,
+      "loss": 0.8906542778015136,
+      "step": 500
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.4829300343990326,
+      "learning_rate": 4.151666666666667e-05,
+      "loss": 0.9830364227294922,
+      "step": 510
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.38999485969543457,
+      "learning_rate": 4.135e-05,
+      "loss": 0.8852478981018066,
+      "step": 520
+    },
+    {
+      "epoch": 0.17666666666666667,
+      "grad_norm": 0.4839600622653961,
+      "learning_rate": 4.1183333333333336e-05,
+      "loss": 0.7948226451873779,
+      "step": 530
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.6786544919013977,
+      "learning_rate": 4.101666666666667e-05,
+      "loss": 0.8042360305786133,
+      "step": 540
+    },
+    {
+      "epoch": 0.18333333333333332,
+      "grad_norm": 0.563444972038269,
+      "learning_rate": 4.085e-05,
+      "loss": 0.9007970809936523,
+      "step": 550
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.49744439125061035,
+      "learning_rate": 4.068333333333334e-05,
+      "loss": 0.9553793907165528,
+      "step": 560
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.6419633030891418,
+      "learning_rate": 4.051666666666667e-05,
+      "loss": 0.9643045425415039,
+      "step": 570
+    },
+    {
+      "epoch": 0.19333333333333333,
+      "grad_norm": 0.4224979281425476,
+      "learning_rate": 4.0350000000000005e-05,
+      "loss": 0.9608310699462891,
+      "step": 580
+    },
+    {
+      "epoch": 0.19666666666666666,
+      "grad_norm": 0.47567546367645264,
+      "learning_rate": 4.018333333333333e-05,
+      "loss": 0.9702249526977539,
+      "step": 590
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.4709877669811249,
+      "learning_rate": 4.0016666666666666e-05,
+      "loss": 1.0520934104919433,
+      "step": 600
+    },
+    {
+      "epoch": 0.20333333333333334,
+      "grad_norm": 0.49998438358306885,
+      "learning_rate": 3.9850000000000006e-05,
+      "loss": 1.0648069381713867,
+      "step": 610
+    },
+    {
+      "epoch": 0.20666666666666667,
+      "grad_norm": 0.5507012009620667,
+      "learning_rate": 3.9683333333333333e-05,
+      "loss": 0.9851966857910156,
+      "step": 620
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.38482487201690674,
+      "learning_rate": 3.951666666666667e-05,
+      "loss": 1.1561612129211425,
+      "step": 630
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.3904467225074768,
+      "learning_rate": 3.935e-05,
+      "loss": 0.8037273406982421,
+      "step": 640
+    },
+    {
+      "epoch": 0.21666666666666667,
+      "grad_norm": 0.38054290413856506,
+      "learning_rate": 3.9183333333333335e-05,
+      "loss": 0.9846150398254394,
+      "step": 650
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5266609191894531,
+      "learning_rate": 3.901666666666667e-05,
+      "loss": 0.8768625259399414,
+      "step": 660
+    },
+    {
+      "epoch": 0.22333333333333333,
+      "grad_norm": 0.4669780135154724,
+      "learning_rate": 3.885e-05,
+      "loss": 1.0029444694519043,
+      "step": 670
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.8248076438903809,
+      "learning_rate": 3.868333333333333e-05,
+      "loss": 0.9029686927795411,
+      "step": 680
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.4936239421367645,
+      "learning_rate": 3.851666666666667e-05,
+      "loss": 0.8907859802246094,
+      "step": 690
+    },
+    {
+      "epoch": 0.23333333333333334,
+      "grad_norm": 0.4638374447822571,
+      "learning_rate": 3.8350000000000004e-05,
+      "loss": 0.8049991607666016,
+      "step": 700
+    },
+    {
+      "epoch": 0.23666666666666666,
+      "grad_norm": 0.389417827129364,
+      "learning_rate": 3.818333333333334e-05,
+      "loss": 0.9385946273803711,
+      "step": 710
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5004547834396362,
+      "learning_rate": 3.8016666666666665e-05,
+      "loss": 0.9536812782287598,
+      "step": 720
+    },
+    {
+      "epoch": 0.24333333333333335,
+      "grad_norm": 0.5362987518310547,
+      "learning_rate": 3.7850000000000005e-05,
+      "loss": 0.9482944488525391,
+      "step": 730
+    },
+    {
+      "epoch": 0.24666666666666667,
+      "grad_norm": 0.6165914535522461,
+      "learning_rate": 3.768333333333334e-05,
+      "loss": 0.8894567489624023,
+      "step": 740
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5372249484062195,
+      "learning_rate": 3.7516666666666666e-05,
+      "loss": 0.8759191513061524,
+      "step": 750
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.5786657929420471,
+      "learning_rate": 3.735e-05,
+      "loss": 1.0180109024047852,
+      "step": 760
+    },
+    {
+      "epoch": 0.25666666666666665,
+      "grad_norm": 0.6640530228614807,
+      "learning_rate": 3.7183333333333334e-05,
+      "loss": 0.9838876724243164,
+      "step": 770
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.7388576865196228,
+      "learning_rate": 3.701666666666667e-05,
+      "loss": 1.06797456741333,
+      "step": 780
+    },
+    {
+      "epoch": 0.2633333333333333,
+      "grad_norm": 0.6332851648330688,
+      "learning_rate": 3.685e-05,
+      "loss": 0.9420416831970215,
+      "step": 790
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.6151734590530396,
+      "learning_rate": 3.6683333333333335e-05,
+      "loss": 1.014689064025879,
+      "step": 800
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.540749192237854,
+      "learning_rate": 3.651666666666667e-05,
+      "loss": 0.9264237403869628,
+      "step": 810
+    },
+    {
+      "epoch": 0.2733333333333333,
+      "grad_norm": 0.6882827877998352,
+      "learning_rate": 3.635e-05,
+      "loss": 1.0425030708312988,
+      "step": 820
+    },
+    {
+      "epoch": 0.27666666666666667,
+      "grad_norm": 0.4624575078487396,
+      "learning_rate": 3.6183333333333336e-05,
+      "loss": 0.8155969619750977,
+      "step": 830
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4114076495170593,
+      "learning_rate": 3.601666666666667e-05,
+      "loss": 1.0072894096374512,
+      "step": 840
+    },
+    {
+      "epoch": 0.2833333333333333,
+      "grad_norm": 1.0943950414657593,
+      "learning_rate": 3.585e-05,
+      "loss": 0.802765941619873,
+      "step": 850
+    },
+    {
+      "epoch": 0.2866666666666667,
+      "grad_norm": 0.5488337874412537,
+      "learning_rate": 3.568333333333334e-05,
+      "loss": 1.0031457901000977,
+      "step": 860
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.43701034784317017,
+      "learning_rate": 3.551666666666667e-05,
+      "loss": 0.8668848037719726,
+      "step": 870
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.7260711193084717,
+      "learning_rate": 3.535e-05,
+      "loss": 1.1286174774169921,
+      "step": 880
+    },
+    {
+      "epoch": 0.2966666666666667,
+      "grad_norm": 0.44215908646583557,
+      "learning_rate": 3.518333333333333e-05,
+      "loss": 0.8784223556518554,
+      "step": 890
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.6829396486282349,
+      "learning_rate": 3.501666666666667e-05,
+      "loss": 0.9527795791625977,
+      "step": 900
+    },
+    {
+      "epoch": 0.30333333333333334,
+      "grad_norm": 0.5683781504631042,
+      "learning_rate": 3.485e-05,
+      "loss": 0.9348941802978515,
+      "step": 910
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.43940940499305725,
+      "learning_rate": 3.4683333333333334e-05,
+      "loss": 0.7635839939117431,
+      "step": 920
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.6151530146598816,
+      "learning_rate": 3.451666666666667e-05,
+      "loss": 0.9328359603881836,
+      "step": 930
+    },
+    {
+      "epoch": 0.31333333333333335,
+      "grad_norm": 0.4174748957157135,
+      "learning_rate": 3.435e-05,
+      "loss": 0.8129542350769043,
+      "step": 940
+    },
+    {
+      "epoch": 0.31666666666666665,
+      "grad_norm": 0.4708555340766907,
+      "learning_rate": 3.4183333333333335e-05,
+      "loss": 0.8687195777893066,
+      "step": 950
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.6549626588821411,
+      "learning_rate": 3.401666666666667e-05,
+      "loss": 0.8674649238586426,
+      "step": 960
+    },
+    {
+      "epoch": 0.3233333333333333,
+      "grad_norm": 0.4076550602912903,
+      "learning_rate": 3.385e-05,
+      "loss": 0.9264376640319825,
+      "step": 970
+    },
+    {
+      "epoch": 0.32666666666666666,
+      "grad_norm": 0.7705219984054565,
+      "learning_rate": 3.368333333333334e-05,
+      "loss": 1.1649690628051759,
+      "step": 980
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.46062591671943665,
+      "learning_rate": 3.351666666666667e-05,
+      "loss": 0.7402542114257813,
+      "step": 990
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.8681764006614685,
+      "learning_rate": 3.3350000000000004e-05,
+      "loss": 1.0566041946411133,
+      "step": 1000
+    },
+    {
+      "epoch": 0.33666666666666667,
+      "grad_norm": 0.3948025405406952,
+      "learning_rate": 3.318333333333333e-05,
+      "loss": 0.8839986801147461,
+      "step": 1010
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5591018795967102,
+      "learning_rate": 3.3016666666666665e-05,
+      "loss": 0.9753120422363282,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3433333333333333,
+      "grad_norm": 0.40525591373443604,
+      "learning_rate": 3.2850000000000006e-05,
+      "loss": 0.7800012588500976,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.40624549984931946,
+      "learning_rate": 3.268333333333333e-05,
+      "loss": 0.9785367012023926,
+      "step": 1040
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.6612586379051208,
+      "learning_rate": 3.2516666666666666e-05,
+      "loss": 0.835319995880127,
+      "step": 1050
+    },
+    {
+      "epoch": 0.35333333333333333,
+      "grad_norm": 0.6167810559272766,
+      "learning_rate": 3.235e-05,
+      "loss": 0.7953156948089599,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3566666666666667,
+      "grad_norm": 0.5056782960891724,
+      "learning_rate": 3.218333333333334e-05,
+      "loss": 0.9941174507141113,
+      "step": 1070
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5687284469604492,
+      "learning_rate": 3.201666666666667e-05,
+      "loss": 0.9126724243164063,
+      "step": 1080
+    },
+    {
+      "epoch": 0.36333333333333334,
+      "grad_norm": 0.446404367685318,
+      "learning_rate": 3.185e-05,
+      "loss": 0.9723684310913085,
+      "step": 1090
+    },
+    {
+      "epoch": 0.36666666666666664,
+      "grad_norm": 1.1806390285491943,
+      "learning_rate": 3.1683333333333335e-05,
+      "loss": 1.0675930976867676,
+      "step": 1100
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.47010448575019836,
+      "learning_rate": 3.151666666666667e-05,
+      "loss": 0.8236958503723144,
+      "step": 1110
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.7065098881721497,
+      "learning_rate": 3.135e-05,
+      "loss": 0.8782394409179688,
+      "step": 1120
+    },
+    {
+      "epoch": 0.37666666666666665,
+      "grad_norm": 0.7163971066474915,
+      "learning_rate": 3.118333333333334e-05,
+      "loss": 1.0409460067749023,
+      "step": 1130
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.6803985834121704,
+      "learning_rate": 3.1016666666666664e-05,
+      "loss": 0.8765417098999023,
+      "step": 1140
+    },
+    {
+      "epoch": 0.38333333333333336,
+      "grad_norm": 0.6493039131164551,
+      "learning_rate": 3.0850000000000004e-05,
+      "loss": 0.9212311744689942,
+      "step": 1150
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.6384336352348328,
+      "learning_rate": 3.068333333333334e-05,
+      "loss": 0.9496315002441407,
+      "step": 1160
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.5862751603126526,
+      "learning_rate": 3.0516666666666665e-05,
+      "loss": 0.749457836151123,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3933333333333333,
+      "grad_norm": 0.5451819896697998,
+      "learning_rate": 3.035e-05,
+      "loss": 0.9255198478698731,
+      "step": 1180
+    },
+    {
+      "epoch": 0.39666666666666667,
+      "grad_norm": 0.4223293960094452,
+      "learning_rate": 3.0183333333333336e-05,
+      "loss": 0.8668063163757325,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4768493175506592,
+      "learning_rate": 3.001666666666667e-05,
+      "loss": 0.8216365814208985,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4033333333333333,
+      "grad_norm": 0.7394158840179443,
+      "learning_rate": 2.985e-05,
+      "loss": 1.0520621299743653,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4066666666666667,
+      "grad_norm": 0.5049188733100891,
+      "learning_rate": 2.9683333333333334e-05,
+      "loss": 0.9788308143615723,
+      "step": 1220
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.6294832825660706,
+      "learning_rate": 2.951666666666667e-05,
+      "loss": 1.0364269256591796,
+      "step": 1230
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.6358350515365601,
+      "learning_rate": 2.935e-05,
+      "loss": 0.9813390731811523,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.512421190738678,
+      "learning_rate": 2.9183333333333336e-05,
+      "loss": 0.7307010173797608,
+      "step": 1250
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.4594457447528839,
+      "learning_rate": 2.901666666666667e-05,
+      "loss": 0.8606552124023438,
+      "step": 1260
+    },
+    {
+      "epoch": 0.42333333333333334,
+      "grad_norm": 0.4652048647403717,
+      "learning_rate": 2.885e-05,
+      "loss": 0.8638803482055664,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.497286319732666,
+      "learning_rate": 2.8683333333333334e-05,
+      "loss": 0.8527148246765137,
+      "step": 1280
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.5041627883911133,
+      "learning_rate": 2.851666666666667e-05,
+      "loss": 0.7727686882019043,
+      "step": 1290
+    },
+    {
+      "epoch": 0.43333333333333335,
+      "grad_norm": 0.6805508136749268,
+      "learning_rate": 2.8349999999999998e-05,
+      "loss": 0.9916484832763672,
+      "step": 1300
+    },
+    {
+      "epoch": 0.43666666666666665,
+      "grad_norm": 0.4343254864215851,
+      "learning_rate": 2.8183333333333335e-05,
+      "loss": 0.8907909393310547,
+      "step": 1310
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.4182907044887543,
+      "learning_rate": 2.801666666666667e-05,
+      "loss": 0.8243522644042969,
+      "step": 1320
+    },
+    {
+      "epoch": 0.44333333333333336,
+      "grad_norm": 0.5033489465713501,
+      "learning_rate": 2.7850000000000003e-05,
+      "loss": 1.0239628791809081,
+      "step": 1330
+    },
+    {
+      "epoch": 0.44666666666666666,
+      "grad_norm": 0.5837738513946533,
+      "learning_rate": 2.7683333333333333e-05,
+      "loss": 0.9909868240356445,
+      "step": 1340
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.5943430662155151,
+      "learning_rate": 2.7516666666666667e-05,
+      "loss": 0.9027081489562988,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.3687169551849365,
+      "learning_rate": 2.7350000000000004e-05,
+      "loss": 0.8732491493225097,
+      "step": 1360
+    },
+    {
+      "epoch": 0.45666666666666667,
+      "grad_norm": 0.44183358550071716,
+      "learning_rate": 2.7183333333333335e-05,
+      "loss": 0.7898604869842529,
+      "step": 1370
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.525290846824646,
+      "learning_rate": 2.701666666666667e-05,
+      "loss": 0.9487957000732422,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4633333333333333,
+      "grad_norm": 0.49439457058906555,
+      "learning_rate": 2.6850000000000002e-05,
+      "loss": 0.868436050415039,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.6398065090179443,
+      "learning_rate": 2.6683333333333333e-05,
+      "loss": 0.9199989318847657,
+      "step": 1400
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.5837881565093994,
+      "learning_rate": 2.6516666666666666e-05,
+      "loss": 0.8474544525146485,
+      "step": 1410
+    },
+    {
+      "epoch": 0.47333333333333333,
+      "grad_norm": 0.7297168374061584,
+      "learning_rate": 2.6350000000000004e-05,
+      "loss": 1.1009994506835938,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4766666666666667,
+      "grad_norm": 0.5449320077896118,
+      "learning_rate": 2.618333333333333e-05,
+      "loss": 0.8889406204223633,
+      "step": 1430
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.7387002110481262,
+      "learning_rate": 2.6016666666666668e-05,
+      "loss": 0.8909475326538085,
+      "step": 1440
+    },
+    {
+      "epoch": 0.48333333333333334,
+      "grad_norm": 0.5363000631332397,
+      "learning_rate": 2.585e-05,
+      "loss": 0.8837484359741211,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4866666666666667,
+      "grad_norm": 0.6663982272148132,
+      "learning_rate": 2.5683333333333335e-05,
+      "loss": 0.7804791927337646,
+      "step": 1460
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.5026193261146545,
+      "learning_rate": 2.5516666666666666e-05,
+      "loss": 0.9184001922607422,
+      "step": 1470
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.6870279908180237,
+      "learning_rate": 2.5350000000000003e-05,
+      "loss": 0.8195085525512695,
+      "step": 1480
+    },
+    {
+      "epoch": 0.49666666666666665,
+      "grad_norm": 0.5255699157714844,
+      "learning_rate": 2.5183333333333337e-05,
+      "loss": 0.9975809097290039,
+      "step": 1490
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.45492592453956604,
+      "learning_rate": 2.5016666666666667e-05,
+      "loss": 0.8944621086120605,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5033333333333333,
+      "grad_norm": 0.44872063398361206,
+      "learning_rate": 2.485e-05,
+      "loss": 0.821660041809082,
+      "step": 1510
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.5064987540245056,
+      "learning_rate": 2.4683333333333335e-05,
+      "loss": 1.0228797912597656,
+      "step": 1520
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.434779554605484,
+      "learning_rate": 2.451666666666667e-05,
+      "loss": 0.9781878471374512,
+      "step": 1530
+    },
+    {
+      "epoch": 0.5133333333333333,
+      "grad_norm": 0.48390141129493713,
+      "learning_rate": 2.435e-05,
+      "loss": 0.909939193725586,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5166666666666667,
+      "grad_norm": 0.41258955001831055,
+      "learning_rate": 2.4183333333333336e-05,
+      "loss": 0.8889488220214844,
+      "step": 1550
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.48882147669792175,
+      "learning_rate": 2.4016666666666667e-05,
+      "loss": 0.8874250411987304,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5233333333333333,
+      "grad_norm": 0.4496597647666931,
+      "learning_rate": 2.385e-05,
+      "loss": 0.866064453125,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5266666666666666,
+      "grad_norm": 0.5498498678207397,
+      "learning_rate": 2.3683333333333334e-05,
+      "loss": 1.0485063552856446,
+      "step": 1580
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.5302222967147827,
+      "learning_rate": 2.3516666666666668e-05,
+      "loss": 0.79019775390625,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.6240465641021729,
+      "learning_rate": 2.3350000000000002e-05,
+      "loss": 0.8068696975708007,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5366666666666666,
+      "grad_norm": 0.49114975333213806,
+      "learning_rate": 2.3183333333333336e-05,
+      "loss": 0.9069293975830078,
+      "step": 1610
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.558907687664032,
+      "learning_rate": 2.3016666666666666e-05,
+      "loss": 0.7732550144195557,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5433333333333333,
+      "grad_norm": 0.7307827472686768,
+      "learning_rate": 2.2850000000000003e-05,
+      "loss": 0.8363723754882812,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.5290479063987732,
+      "learning_rate": 2.2683333333333334e-05,
+      "loss": 0.8145934104919433,
+      "step": 1640
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.5289633870124817,
+      "learning_rate": 2.2516666666666667e-05,
+      "loss": 0.8467626571655273,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5533333333333333,
+      "grad_norm": 0.42270180583000183,
+      "learning_rate": 2.235e-05,
+      "loss": 1.0054572105407715,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5566666666666666,
+      "grad_norm": 0.46330273151397705,
+      "learning_rate": 2.2183333333333335e-05,
+      "loss": 0.8821262359619141,
+      "step": 1670
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.54402756690979,
+      "learning_rate": 2.201666666666667e-05,
+      "loss": 0.8570803642272949,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5633333333333334,
+      "grad_norm": 0.4660607576370239,
+      "learning_rate": 2.1850000000000003e-05,
+      "loss": 0.923713493347168,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5666666666666667,
+      "grad_norm": 0.43630075454711914,
+      "learning_rate": 2.1683333333333333e-05,
+      "loss": 0.8405223846435547,
+      "step": 1700
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.48713231086730957,
+      "learning_rate": 2.1516666666666667e-05,
+      "loss": 1.0100667953491211,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.5924938321113586,
+      "learning_rate": 2.135e-05,
+      "loss": 0.9960016250610352,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5766666666666667,
+      "grad_norm": 0.5111542344093323,
+      "learning_rate": 2.1183333333333334e-05,
+      "loss": 0.89755220413208,
+      "step": 1730
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.4552167057991028,
+      "learning_rate": 2.1016666666666668e-05,
+      "loss": 0.8854806900024415,
+      "step": 1740
+    },
+    {
+      "epoch": 0.5833333333333334,
+      "grad_norm": 0.554473876953125,
+      "learning_rate": 2.085e-05,
+      "loss": 0.7076638698577881,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.5026177167892456,
+      "learning_rate": 2.0683333333333336e-05,
+      "loss": 0.9850486755371094,
+      "step": 1760
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.5753727555274963,
+      "learning_rate": 2.0516666666666666e-05,
+      "loss": 0.9340484619140625,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5933333333333334,
+      "grad_norm": 0.5122212171554565,
+      "learning_rate": 2.035e-05,
+      "loss": 0.9032992362976074,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5966666666666667,
+      "grad_norm": 0.5792819857597351,
+      "learning_rate": 2.0183333333333334e-05,
+      "loss": 0.8477163314819336,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5964590907096863,
+      "learning_rate": 2.0016666666666668e-05,
+      "loss": 0.9166988372802735,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6033333333333334,
+      "grad_norm": 0.7182376980781555,
+      "learning_rate": 1.985e-05,
+      "loss": 0.8029914855957031,
+      "step": 1810
+    },
+    {
+      "epoch": 0.6066666666666667,
+      "grad_norm": 0.3775170147418976,
+      "learning_rate": 1.9683333333333335e-05,
+      "loss": 0.8039090156555175,
+      "step": 1820
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.3779233694076538,
+      "learning_rate": 1.9516666666666666e-05,
+      "loss": 0.7587248802185058,
+      "step": 1830
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.529349684715271,
+      "learning_rate": 1.9350000000000003e-05,
+      "loss": 0.9525286674499511,
+      "step": 1840
+    },
+    {
+      "epoch": 0.6166666666666667,
+      "grad_norm": 0.8576200604438782,
+      "learning_rate": 1.9183333333333333e-05,
+      "loss": 0.7803131103515625,
+      "step": 1850
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.5265026092529297,
+      "learning_rate": 1.901666666666667e-05,
+      "loss": 0.9432580947875977,
+      "step": 1860
+    },
+    {
+      "epoch": 0.6233333333333333,
+      "grad_norm": 0.43818992376327515,
+      "learning_rate": 1.885e-05,
+      "loss": 0.8160367012023926,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.5307653546333313,
+      "learning_rate": 1.8683333333333335e-05,
+      "loss": 0.7481701850891114,
+      "step": 1880
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.4350138008594513,
+      "learning_rate": 1.851666666666667e-05,
+      "loss": 0.9646284103393554,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6333333333333333,
+      "grad_norm": 0.7320852279663086,
+      "learning_rate": 1.8350000000000002e-05,
+      "loss": 0.964715576171875,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6366666666666667,
+      "grad_norm": 0.39258873462677,
+      "learning_rate": 1.8183333333333336e-05,
+      "loss": 0.8884981155395508,
+      "step": 1910
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3995574116706848,
+      "learning_rate": 1.8016666666666666e-05,
+      "loss": 0.8017918586730957,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6433333333333333,
+      "grad_norm": 0.5143277049064636,
+      "learning_rate": 1.785e-05,
+      "loss": 0.9140171051025391,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6466666666666666,
+      "grad_norm": 0.4470940828323364,
+      "learning_rate": 1.7683333333333334e-05,
+      "loss": 0.9137473106384277,
+      "step": 1940
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.4530799388885498,
+      "learning_rate": 1.7516666666666668e-05,
+      "loss": 0.9416312217712403,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.5486093163490295,
+      "learning_rate": 1.7349999999999998e-05,
+      "loss": 0.9309564590454101,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6566666666666666,
+      "grad_norm": 0.7241241931915283,
+      "learning_rate": 1.7183333333333335e-05,
+      "loss": 0.8793378829956054,
+      "step": 1970
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.48900172114372253,
+      "learning_rate": 1.7016666666666666e-05,
+      "loss": 0.9930448532104492,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6633333333333333,
+      "grad_norm": 0.5354626178741455,
+      "learning_rate": 1.6850000000000003e-05,
+      "loss": 0.8400119781494141,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.5346295237541199,
+      "learning_rate": 1.6683333333333333e-05,
+      "loss": 0.7424459457397461,
+      "step": 2000
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.7362031936645508,
+      "learning_rate": 1.6516666666666667e-05,
+      "loss": 0.836764907836914,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6733333333333333,
+      "grad_norm": 0.8799192905426025,
+      "learning_rate": 1.635e-05,
+      "loss": 0.9453885078430175,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6766666666666666,
+      "grad_norm": 0.5263342261314392,
+      "learning_rate": 1.6183333333333335e-05,
+      "loss": 0.7978546142578125,
+      "step": 2030
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.6986008286476135,
+      "learning_rate": 1.601666666666667e-05,
+      "loss": 0.8797599792480468,
+      "step": 2040
+    },
+    {
+      "epoch": 0.6833333333333333,
+      "grad_norm": 0.7081782221794128,
+      "learning_rate": 1.5850000000000002e-05,
+      "loss": 1.0752653121948241,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6866666666666666,
+      "grad_norm": 0.5002477765083313,
+      "learning_rate": 1.5683333333333333e-05,
+      "loss": 0.7971479892730713,
+      "step": 2060
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.4529975354671478,
+      "learning_rate": 1.5516666666666667e-05,
+      "loss": 0.912747859954834,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.48189014196395874,
+      "learning_rate": 1.535e-05,
+      "loss": 1.0259061813354493,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6966666666666667,
+      "grad_norm": 0.6560697555541992,
+      "learning_rate": 1.5183333333333333e-05,
+      "loss": 1.0107606887817382,
+      "step": 2090
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.722689151763916,
+      "learning_rate": 1.5016666666666668e-05,
+      "loss": 0.962984848022461,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7033333333333334,
+      "grad_norm": 1.0071346759796143,
+      "learning_rate": 1.485e-05,
+      "loss": 0.9162399291992187,
+      "step": 2110
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.5007173418998718,
+      "learning_rate": 1.4683333333333336e-05,
+      "loss": 0.8683804512023926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.4455113708972931,
+      "learning_rate": 1.4516666666666668e-05,
+      "loss": 0.929558277130127,
+      "step": 2130
+    },
+    {
+      "epoch": 0.7133333333333334,
+      "grad_norm": 0.5244899392127991,
+      "learning_rate": 1.435e-05,
+      "loss": 0.7725494861602783,
+      "step": 2140
+    },
+    {
+      "epoch": 0.7166666666666667,
+      "grad_norm": 0.5691429376602173,
+      "learning_rate": 1.4183333333333335e-05,
+      "loss": 0.9669612884521485,
+      "step": 2150
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5419687032699585,
+      "learning_rate": 1.4016666666666667e-05,
+      "loss": 0.9267525672912598,
+      "step": 2160
+    },
+    {
+      "epoch": 0.7233333333333334,
+      "grad_norm": 0.9937120079994202,
+      "learning_rate": 1.3850000000000001e-05,
+      "loss": 0.8306878089904786,
+      "step": 2170
+    },
+    {
+      "epoch": 0.7266666666666667,
+      "grad_norm": 0.4639163017272949,
+      "learning_rate": 1.3683333333333333e-05,
+      "loss": 0.8372581481933594,
+      "step": 2180
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.45630142092704773,
+      "learning_rate": 1.3516666666666667e-05,
+      "loss": 0.8593014717102051,
+      "step": 2190
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.4326620399951935,
+      "learning_rate": 1.3350000000000001e-05,
+      "loss": 0.8103547096252441,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7366666666666667,
+      "grad_norm": 0.48703309893608093,
+      "learning_rate": 1.3183333333333333e-05,
+      "loss": 0.8958615303039551,
+      "step": 2210
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.5286509394645691,
+      "learning_rate": 1.3016666666666669e-05,
+      "loss": 0.9600817680358886,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7433333333333333,
+      "grad_norm": 0.6584081649780273,
+      "learning_rate": 1.285e-05,
+      "loss": 1.0164281845092773,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.5143536925315857,
+      "learning_rate": 1.2683333333333333e-05,
+      "loss": 1.0931424140930175,
+      "step": 2240
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.9453914165496826,
+      "learning_rate": 1.2516666666666668e-05,
+      "loss": 0.7825816154479981,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7533333333333333,
+      "grad_norm": 0.48963093757629395,
+      "learning_rate": 1.235e-05,
+      "loss": 0.9285711288452149,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7566666666666667,
+      "grad_norm": 0.5854438543319702,
+      "learning_rate": 1.2183333333333334e-05,
+      "loss": 0.8443680763244629,
+      "step": 2270
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.49953049421310425,
+      "learning_rate": 1.2016666666666668e-05,
+      "loss": 0.8316192626953125,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7633333333333333,
+      "grad_norm": 0.6657202839851379,
+      "learning_rate": 1.185e-05,
+      "loss": 0.910405158996582,
+      "step": 2290
+    },
+    {
+      "epoch": 0.7666666666666667,
+      "grad_norm": 0.6646652221679688,
+      "learning_rate": 1.1683333333333334e-05,
+      "loss": 0.8283540725708007,
+      "step": 2300
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.49376705288887024,
+      "learning_rate": 1.1516666666666668e-05,
+      "loss": 0.8653836250305176,
+      "step": 2310
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.5468245148658752,
+      "learning_rate": 1.1350000000000001e-05,
+      "loss": 0.8328197479248047,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7766666666666666,
+      "grad_norm": 0.8111145496368408,
+      "learning_rate": 1.1183333333333335e-05,
+      "loss": 0.9046418190002441,
+      "step": 2330
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.5041958093643188,
+      "learning_rate": 1.1016666666666667e-05,
+      "loss": 0.8311249732971191,
+      "step": 2340
+    },
+    {
+      "epoch": 0.7833333333333333,
+      "grad_norm": 0.4898006021976471,
+      "learning_rate": 1.0850000000000001e-05,
+      "loss": 0.9336203575134278,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.4351266324520111,
+      "learning_rate": 1.0683333333333333e-05,
+      "loss": 0.9776251792907715,
+      "step": 2360
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.579655647277832,
+      "learning_rate": 1.0516666666666667e-05,
+      "loss": 0.7065846443176269,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7933333333333333,
+      "grad_norm": 0.4177819788455963,
+      "learning_rate": 1.035e-05,
+      "loss": 0.7918330669403076,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7966666666666666,
+      "grad_norm": 0.4987991750240326,
+      "learning_rate": 1.0183333333333333e-05,
+      "loss": 0.8270879745483398,
+      "step": 2390
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.6702345013618469,
+      "learning_rate": 1.0016666666666667e-05,
+      "loss": 0.7966389656066895,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8033333333333333,
+      "grad_norm": 0.684005618095398,
+      "learning_rate": 9.85e-06,
+      "loss": 0.8869472503662109,
+      "step": 2410
+    },
+    {
+      "epoch": 0.8066666666666666,
+      "grad_norm": 0.4468795657157898,
+      "learning_rate": 9.683333333333333e-06,
+      "loss": 0.9556525230407715,
+      "step": 2420
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.5046238303184509,
+      "learning_rate": 9.516666666666666e-06,
+      "loss": 0.8056395530700684,
+      "step": 2430
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.6720165610313416,
+      "learning_rate": 9.35e-06,
+      "loss": 0.8834376335144043,
+      "step": 2440
+    },
+    {
+      "epoch": 0.8166666666666667,
+      "grad_norm": 0.4289397597312927,
+      "learning_rate": 9.183333333333334e-06,
+      "loss": 0.7789588928222656,
+      "step": 2450
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.6209238171577454,
+      "learning_rate": 9.016666666666668e-06,
+      "loss": 0.8701201438903808,
+      "step": 2460
+    },
+    {
+      "epoch": 0.8233333333333334,
+      "grad_norm": 0.44446897506713867,
+      "learning_rate": 8.85e-06,
+      "loss": 0.7950375080108643,
+      "step": 2470
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.44629836082458496,
+      "learning_rate": 8.683333333333334e-06,
+      "loss": 0.7967105865478515,
+      "step": 2480
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.37678250670433044,
+      "learning_rate": 8.516666666666668e-06,
+      "loss": 0.771687650680542,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.494236558675766,
+      "learning_rate": 8.350000000000001e-06,
+      "loss": 0.8968353271484375,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8366666666666667,
+      "grad_norm": 0.6953932046890259,
+      "learning_rate": 8.183333333333333e-06,
+      "loss": 1.0087746620178222,
+      "step": 2510
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.488052099943161,
+      "learning_rate": 8.016666666666667e-06,
+      "loss": 0.9487748146057129,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8433333333333334,
+      "grad_norm": 0.7786927223205566,
+      "learning_rate": 7.850000000000001e-06,
+      "loss": 0.9541014671325684,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8466666666666667,
+      "grad_norm": 0.5057625770568848,
+      "learning_rate": 7.683333333333335e-06,
+      "loss": 0.905206298828125,
+      "step": 2540
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.7162487506866455,
+      "learning_rate": 7.516666666666668e-06,
+      "loss": 0.9245425224304199,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.5543293952941895,
+      "learning_rate": 7.35e-06,
+      "loss": 0.8062684059143066,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8566666666666667,
+      "grad_norm": 0.7101190686225891,
+      "learning_rate": 7.183333333333334e-06,
+      "loss": 0.9243124961853028,
+      "step": 2570
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.726828932762146,
+      "learning_rate": 7.0166666666666675e-06,
+      "loss": 0.7988007068634033,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8633333333333333,
+      "grad_norm": 0.533573567867279,
+      "learning_rate": 6.8500000000000005e-06,
+      "loss": 0.8835041046142578,
+      "step": 2590
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.5849156975746155,
+      "learning_rate": 6.6833333333333334e-06,
+      "loss": 0.8487396240234375,
+      "step": 2600
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.8949032425880432,
+      "learning_rate": 6.516666666666666e-06,
+      "loss": 0.8319039344787598,
+      "step": 2610
+    },
+    {
+      "epoch": 0.8733333333333333,
+      "grad_norm": 0.6677828431129456,
+      "learning_rate": 6.35e-06,
+      "loss": 0.9017569541931152,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8766666666666667,
+      "grad_norm": 0.5268790125846863,
+      "learning_rate": 6.183333333333333e-06,
+      "loss": 0.7499767780303955,
+      "step": 2630
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.6157758235931396,
+      "learning_rate": 6.016666666666667e-06,
+      "loss": 0.8814908027648926,
+      "step": 2640
+    },
+    {
+      "epoch": 0.8833333333333333,
+      "grad_norm": 0.5197092890739441,
+      "learning_rate": 5.850000000000001e-06,
+      "loss": 0.8590426445007324,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8866666666666667,
+      "grad_norm": 0.7915957570075989,
+      "learning_rate": 5.683333333333334e-06,
+      "loss": 0.8466612815856933,
+      "step": 2660
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.48902902007102966,
+      "learning_rate": 5.5166666666666675e-06,
+      "loss": 0.9879349708557129,
+      "step": 2670
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.5069965124130249,
+      "learning_rate": 5.3500000000000004e-06,
+      "loss": 0.8216916084289551,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8966666666666666,
+      "grad_norm": 0.6190909743309021,
+      "learning_rate": 5.183333333333333e-06,
+      "loss": 0.8995295524597168,
+      "step": 2690
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.45372089743614197,
+      "learning_rate": 5.016666666666666e-06,
+      "loss": 0.8368668556213379,
+      "step": 2700
+    },
+    {
+      "epoch": 0.9033333333333333,
+      "grad_norm": 0.5643200278282166,
+      "learning_rate": 4.85e-06,
+      "loss": 0.7782045364379883,
+      "step": 2710
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.5345107913017273,
+      "learning_rate": 4.683333333333333e-06,
+      "loss": 0.8649769783020019,
+      "step": 2720
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.6174563765525818,
+      "learning_rate": 4.516666666666667e-06,
+      "loss": 0.890013313293457,
+      "step": 2730
+    },
+    {
+      "epoch": 0.9133333333333333,
+      "grad_norm": 0.6632450222969055,
+      "learning_rate": 4.35e-06,
+      "loss": 0.8954425811767578,
+      "step": 2740
+    },
+    {
+      "epoch": 0.9166666666666666,
+      "grad_norm": 0.5862364768981934,
+      "learning_rate": 4.183333333333334e-06,
+      "loss": 0.9733158111572265,
+      "step": 2750
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.625480592250824,
+      "learning_rate": 4.0166666666666675e-06,
+      "loss": 0.940821647644043,
+      "step": 2760
+    },
+    {
+      "epoch": 0.9233333333333333,
+      "grad_norm": 0.5125066041946411,
+      "learning_rate": 3.85e-06,
+      "loss": 0.8799821853637695,
+      "step": 2770
+    },
+    {
+      "epoch": 0.9266666666666666,
+      "grad_norm": 0.501059353351593,
+      "learning_rate": 3.6833333333333338e-06,
+      "loss": 0.8071253776550293,
+      "step": 2780
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.480802446603775,
+      "learning_rate": 3.5166666666666667e-06,
+      "loss": 0.7463678359985352,
+      "step": 2790
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.5736676454544067,
+      "learning_rate": 3.3500000000000005e-06,
+      "loss": 0.9327493667602539,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9366666666666666,
+      "grad_norm": 0.4975570738315582,
+      "learning_rate": 3.1833333333333335e-06,
+      "loss": 0.8904853820800781,
+      "step": 2810
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.480354368686676,
+      "learning_rate": 3.016666666666667e-06,
+      "loss": 1.0292061805725097,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9433333333333334,
+      "grad_norm": 0.6452360153198242,
+      "learning_rate": 2.8500000000000002e-06,
+      "loss": 1.0092354774475099,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.5015031099319458,
+      "learning_rate": 2.6833333333333336e-06,
+      "loss": 1.0683047294616699,
+      "step": 2840
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.4239721894264221,
+      "learning_rate": 2.516666666666667e-06,
+      "loss": 0.8794116973876953,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9533333333333334,
+      "grad_norm": 0.5949695110321045,
+      "learning_rate": 2.35e-06,
+      "loss": 1.0180916786193848,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9566666666666667,
+      "grad_norm": 0.551426112651825,
+      "learning_rate": 2.1833333333333333e-06,
+      "loss": 0.9177707672119141,
+      "step": 2870
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.6483604907989502,
+      "learning_rate": 2.0166666666666667e-06,
+      "loss": 0.9020861625671387,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9633333333333334,
+      "grad_norm": 0.60732501745224,
+      "learning_rate": 1.85e-06,
+      "loss": 1.0001092910766602,
+      "step": 2890
+    },
+    {
+      "epoch": 0.9666666666666667,
+      "grad_norm": 0.46931129693984985,
+      "learning_rate": 1.6833333333333332e-06,
+      "loss": 0.8657818794250488,
+      "step": 2900
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.5216684937477112,
+      "learning_rate": 1.5166666666666668e-06,
+      "loss": 0.9077080726623535,
+      "step": 2910
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.547545313835144,
+      "learning_rate": 1.35e-06,
+      "loss": 0.8793766021728515,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9766666666666667,
+      "grad_norm": 0.4495963752269745,
+      "learning_rate": 1.1833333333333334e-06,
+      "loss": 0.8557974815368652,
+      "step": 2930
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.6442372798919678,
+      "learning_rate": 1.0166666666666665e-06,
+      "loss": 0.8119054794311523,
+      "step": 2940
+    },
+    {
+      "epoch": 0.9833333333333333,
+      "grad_norm": 0.7756669521331787,
+      "learning_rate": 8.500000000000001e-07,
+      "loss": 0.8289030075073243,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.4409935176372528,
+      "learning_rate": 6.833333333333334e-07,
+      "loss": 0.8499897003173829,
+      "step": 2960
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.8582751750946045,
+      "learning_rate": 5.166666666666667e-07,
+      "loss": 0.9656048774719238,
+      "step": 2970
+    },
+    {
+      "epoch": 0.9933333333333333,
+      "grad_norm": 0.47707653045654297,
+      "learning_rate": 3.5000000000000004e-07,
+      "loss": 0.8309163093566895,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9966666666666667,
+      "grad_norm": 0.45731914043426514,
+      "learning_rate": 1.8333333333333333e-07,
+      "loss": 0.9769588470458984,
+      "step": 2990
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.622088611125946,
+      "learning_rate": 1.6666666666666667e-08,
+      "loss": 0.8226492881774903,
+      "step": 3000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 805619367936000.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-3000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b8cdeeb02b683888c58950ba20634617e28bd81b336c3f037116cc9305a2043
+size 5137