diff --git a/.gitattributes b/.gitattributes index 8f8dd6d145f5c1752bb92b8431115cf5432440b1..5004930cb5655e4ae59473d80bd07245d91f85b6 100644 --- a/.gitattributes +++ b/.gitattributes @@ -866,3 +866,12 @@ gemma-2b-it_int4_winogrande-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d- gemma-2b-it_int4_winogrande-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-798-sd-4/checkpoint-49/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_winogrande-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-798-sd-4/checkpoint-98/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_winogrande-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-798-sd-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9a25de9298ae67ee0c6a132a503d40bed8ca8c54 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0e46139fbdbe7c9dba8da0dc7642049d7fa284650682279a059b4fd5f0e13ba +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7bd77e87dedf76c787f4902bc2dd2c5d416b5bfe --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82cb608022eb1e5aa4f5ec3ee9388bb9bea9fb6045e2196e29dd6bbb62b0a9e7 +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..524eb41dbb8f8fe2db2b0f0bedb5a885ace85621 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b30105b3780bd608eb8fdffd04d7d5b7c9e01bb1f83899e5684f65b26608103f +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ca5d1b71495d675490231c16589dcd481f0db2e --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40504680aa3f8504409b92ec31449b9e1b0a70e5047c0cc85fdc9743e92922cc +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a00200e9ee54972a74baebcb9bb5376cec0bb86 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7e3656ae7e8e59bbccc6990a75d6003a70342a36e1351fe42c68581b546391f +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..824ce29fe6338fd8c98d60591ec6be99311b1796 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/trainer_state.json @@ -0,0 +1,7586 @@ +{ + "best_metric": 1.4113320112228394, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120", + "epoch": 7.0, + "eval_steps": 10, + "global_step": 10710, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006535947712418301, + "grad_norm": 1.5105072259902954, + "learning_rate": 0.0002, + "loss": 4.7451, + "step": 10 + }, + { + "epoch": 0.013071895424836602, + "grad_norm": 2.1156165599823, + "learning_rate": 0.0002, + "loss": 3.3158, + "step": 20 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 1.0578808784484863, + "learning_rate": 0.0002, + "loss": 2.643, + "step": 30 + }, + { + "epoch": 0.026143790849673203, + "grad_norm": 2.725064516067505, + "learning_rate": 0.0002, + "loss": 2.3948, + "step": 40 + }, + { + "epoch": 0.032679738562091505, + "grad_norm": 2.9575750827789307, + "learning_rate": 0.0002, + "loss": 2.3134, + "step": 50 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 1.2158117294311523, + "learning_rate": 0.0002, + "loss": 2.2778, + "step": 60 + }, + { + "epoch": 0.0457516339869281, + "grad_norm": 1.0850954055786133, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05228758169934641, + "grad_norm": 1.299196720123291, + "learning_rate": 0.0002, + "loss": 1.8872, + "step": 80 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 0.8310191035270691, + "learning_rate": 0.0002, + "loss": 1.947, + "step": 90 + }, + { + "epoch": 0.06535947712418301, + "grad_norm": 0.9854435920715332, + "learning_rate": 0.0002, + "loss": 1.9098, + "step": 100 + }, + { + "epoch": 0.0718954248366013, + "grad_norm": 0.7951157689094543, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 110 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 0.7593062520027161, + "learning_rate": 0.0002, + "loss": 1.9035, + "step": 120 + }, + { + "epoch": 0.08496732026143791, + "grad_norm": 0.6783032417297363, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 130 + }, + { + "epoch": 0.0915032679738562, + "grad_norm": 0.8350756764411926, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 140 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 1.0203173160552979, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 150 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 0.8820539712905884, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 160 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.7286128997802734, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 170 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.7874041795730591, + "learning_rate": 0.0002, + "loss": 1.8841, + "step": 180 + }, + { + "epoch": 0.12418300653594772, + "grad_norm": 0.6630475521087646, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 190 + }, + { + "epoch": 0.13071895424836602, + "grad_norm": 0.686413586139679, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 200 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 0.7793629765510559, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 210 + }, + { + "epoch": 0.1437908496732026, + "grad_norm": 0.6893141865730286, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 220 + }, + { + "epoch": 0.1503267973856209, + "grad_norm": 0.5804724097251892, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 230 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 0.6053574085235596, + "learning_rate": 0.0002, + "loss": 1.5578, + "step": 240 + }, + { + "epoch": 0.16339869281045752, + "grad_norm": 0.7566025853157043, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 250 + }, + { + "epoch": 0.16993464052287582, + "grad_norm": 0.6112990975379944, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 260 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.6839066743850708, + "learning_rate": 0.0002, + "loss": 1.5564, + "step": 270 + }, + { + "epoch": 0.1830065359477124, + "grad_norm": 0.6368117928504944, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 280 + }, + { + "epoch": 0.1895424836601307, + "grad_norm": 0.6144475936889648, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 290 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 0.6743767261505127, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 300 + }, + { + "epoch": 0.20261437908496732, + "grad_norm": 0.6807955503463745, + "learning_rate": 0.0002, + "loss": 1.421, + "step": 310 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 0.6717963814735413, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 320 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 0.5917780995368958, + "learning_rate": 0.0002, + "loss": 1.6842, + "step": 330 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.6783658862113953, + "learning_rate": 0.0002, + "loss": 1.6264, + "step": 340 + }, + { + "epoch": 0.22875816993464052, + "grad_norm": 0.5820256471633911, + "learning_rate": 0.0002, + "loss": 1.4635, + "step": 350 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.5345938801765442, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 360 + }, + { + "epoch": 0.24183006535947713, + "grad_norm": 0.755929172039032, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 370 + }, + { + "epoch": 0.24836601307189543, + "grad_norm": 0.6183189749717712, + "learning_rate": 0.0002, + "loss": 1.5177, + "step": 380 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 0.7277782559394836, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 390 + }, + { + "epoch": 0.26143790849673204, + "grad_norm": 0.9998756051063538, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 400 + }, + { + "epoch": 0.2679738562091503, + "grad_norm": 0.7523853778839111, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 410 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 0.6548714637756348, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 420 + }, + { + "epoch": 0.28104575163398693, + "grad_norm": 0.6979796290397644, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 430 + }, + { + "epoch": 0.2875816993464052, + "grad_norm": 0.840915322303772, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 440 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.6142978072166443, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 450 + }, + { + "epoch": 0.3006535947712418, + "grad_norm": 0.9482691884040833, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 460 + }, + { + "epoch": 0.30718954248366015, + "grad_norm": 0.7001156806945801, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 470 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.6665455102920532, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 480 + }, + { + "epoch": 0.3202614379084967, + "grad_norm": 0.6012697815895081, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 490 + }, + { + "epoch": 0.32679738562091504, + "grad_norm": 0.8770062327384949, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 500 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.7029962539672852, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 510 + }, + { + "epoch": 0.33986928104575165, + "grad_norm": 0.6682832837104797, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 520 + }, + { + "epoch": 0.3464052287581699, + "grad_norm": 0.5548969507217407, + "learning_rate": 0.0002, + "loss": 1.4242, + "step": 530 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.6640702486038208, + "learning_rate": 0.0002, + "loss": 1.5081, + "step": 540 + }, + { + "epoch": 0.35947712418300654, + "grad_norm": 0.656292200088501, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 550 + }, + { + "epoch": 0.3660130718954248, + "grad_norm": 0.618910551071167, + "learning_rate": 0.0002, + "loss": 1.5415, + "step": 560 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 0.644859790802002, + "learning_rate": 0.0002, + "loss": 1.5178, + "step": 570 + }, + { + "epoch": 0.3790849673202614, + "grad_norm": 0.679042398929596, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 580 + }, + { + "epoch": 0.38562091503267976, + "grad_norm": 0.980681836605072, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 590 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.632219672203064, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 600 + }, + { + "epoch": 0.39869281045751637, + "grad_norm": 0.7003744840621948, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 610 + }, + { + "epoch": 0.40522875816993464, + "grad_norm": 0.7090577483177185, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 620 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 0.657819926738739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 630 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 0.7034208178520203, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 640 + }, + { + "epoch": 0.42483660130718953, + "grad_norm": 0.7274866104125977, + "learning_rate": 0.0002, + "loss": 1.5357, + "step": 650 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 0.5876233577728271, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 660 + }, + { + "epoch": 0.43790849673202614, + "grad_norm": 0.595494270324707, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 670 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.8253804445266724, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 680 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 0.652225911617279, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 690 + }, + { + "epoch": 0.45751633986928103, + "grad_norm": 0.6242014169692993, + "learning_rate": 0.0002, + "loss": 1.5419, + "step": 700 + }, + { + "epoch": 0.46405228758169936, + "grad_norm": 0.7283986210823059, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 710 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.7016081213951111, + "learning_rate": 0.0002, + "loss": 1.43, + "step": 720 + }, + { + "epoch": 0.477124183006536, + "grad_norm": 0.5211893916130066, + "learning_rate": 0.0002, + "loss": 1.4626, + "step": 730 + }, + { + "epoch": 0.48366013071895425, + "grad_norm": 0.6221150159835815, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 740 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 0.76594477891922, + "learning_rate": 0.0002, + "loss": 1.5677, + "step": 750 + }, + { + "epoch": 0.49673202614379086, + "grad_norm": 0.5777859091758728, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 760 + }, + { + "epoch": 0.5032679738562091, + "grad_norm": 0.5793519616127014, + "learning_rate": 0.0002, + "loss": 1.5253, + "step": 770 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 0.5425786375999451, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 780 + }, + { + "epoch": 0.5163398692810458, + "grad_norm": 0.6004197001457214, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 790 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 0.7167016863822937, + "learning_rate": 0.0002, + "loss": 1.5346, + "step": 800 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.710218071937561, + "learning_rate": 0.0002, + "loss": 1.48, + "step": 810 + }, + { + "epoch": 0.5359477124183006, + "grad_norm": 0.699528694152832, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 820 + }, + { + "epoch": 0.5424836601307189, + "grad_norm": 0.579629123210907, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 830 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 0.595407247543335, + "learning_rate": 0.0002, + "loss": 1.3894, + "step": 840 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.544563889503479, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 850 + }, + { + "epoch": 0.5620915032679739, + "grad_norm": 0.553166389465332, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 860 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 0.5645018815994263, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 870 + }, + { + "epoch": 0.5751633986928104, + "grad_norm": 0.6576932668685913, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 880 + }, + { + "epoch": 0.5816993464052288, + "grad_norm": 0.6684197187423706, + "learning_rate": 0.0002, + "loss": 1.5891, + "step": 890 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.6706975698471069, + "learning_rate": 0.0002, + "loss": 1.5348, + "step": 900 + }, + { + "epoch": 0.5947712418300654, + "grad_norm": 0.6762327551841736, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 910 + }, + { + "epoch": 0.6013071895424836, + "grad_norm": 0.764032244682312, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 920 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 0.6996400952339172, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 930 + }, + { + "epoch": 0.6143790849673203, + "grad_norm": 0.686735987663269, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 940 + }, + { + "epoch": 0.6209150326797386, + "grad_norm": 0.6086131930351257, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 950 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.5627856850624084, + "learning_rate": 0.0002, + "loss": 1.4457, + "step": 960 + }, + { + "epoch": 0.6339869281045751, + "grad_norm": 0.5781503319740295, + "learning_rate": 0.0002, + "loss": 1.506, + "step": 970 + }, + { + "epoch": 0.6405228758169934, + "grad_norm": 0.6347246766090393, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 980 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.6581300497055054, + "learning_rate": 0.0002, + "loss": 1.3819, + "step": 990 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 0.8343676924705505, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1000 + }, + { + "epoch": 0.6601307189542484, + "grad_norm": 0.5708910226821899, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 1010 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.6832585334777832, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 1020 + }, + { + "epoch": 0.673202614379085, + "grad_norm": 0.5767837166786194, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1030 + }, + { + "epoch": 0.6797385620915033, + "grad_norm": 0.5637745261192322, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 1040 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 0.8193050026893616, + "learning_rate": 0.0002, + "loss": 1.4325, + "step": 1050 + }, + { + "epoch": 0.6928104575163399, + "grad_norm": 0.6157439351081848, + "learning_rate": 0.0002, + "loss": 1.4196, + "step": 1060 + }, + { + "epoch": 0.6993464052287581, + "grad_norm": 0.7476664781570435, + "learning_rate": 0.0002, + "loss": 1.5547, + "step": 1070 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.8569361567497253, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 1080 + }, + { + "epoch": 0.7124183006535948, + "grad_norm": 0.5671911835670471, + "learning_rate": 0.0002, + "loss": 1.482, + "step": 1090 + }, + { + "epoch": 0.7189542483660131, + "grad_norm": 0.5151128768920898, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 1100 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 0.568037211894989, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 1110 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 0.6756396889686584, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 1120 + }, + { + "epoch": 0.738562091503268, + "grad_norm": 0.638975977897644, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 1130 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 0.7103341221809387, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 1140 + }, + { + "epoch": 0.7516339869281046, + "grad_norm": 0.7403952479362488, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 1150 + }, + { + "epoch": 0.7581699346405228, + "grad_norm": 0.6266511082649231, + "learning_rate": 0.0002, + "loss": 1.4757, + "step": 1160 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.5939070582389832, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1170 + }, + { + "epoch": 0.7712418300653595, + "grad_norm": 0.5735430717468262, + "learning_rate": 0.0002, + "loss": 1.4145, + "step": 1180 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.5155234932899475, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 1190 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.5115423202514648, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 1200 + }, + { + "epoch": 0.7908496732026143, + "grad_norm": 0.693588137626648, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1210 + }, + { + "epoch": 0.7973856209150327, + "grad_norm": 0.5504693984985352, + "learning_rate": 0.0002, + "loss": 1.308, + "step": 1220 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 0.5555992126464844, + "learning_rate": 0.0002, + "loss": 1.5412, + "step": 1230 + }, + { + "epoch": 0.8104575163398693, + "grad_norm": 0.7211785316467285, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1240 + }, + { + "epoch": 0.8169934640522876, + "grad_norm": 0.735003650188446, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 1250 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.5245152711868286, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 1260 + }, + { + "epoch": 0.8300653594771242, + "grad_norm": 0.5883445739746094, + "learning_rate": 0.0002, + "loss": 1.4505, + "step": 1270 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 0.6835859417915344, + "learning_rate": 0.0002, + "loss": 1.3642, + "step": 1280 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 0.6592142581939697, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 1290 + }, + { + "epoch": 0.8496732026143791, + "grad_norm": 0.6087474226951599, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 1300 + }, + { + "epoch": 0.8562091503267973, + "grad_norm": 0.565387487411499, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 1310 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 0.7363151907920837, + "learning_rate": 0.0002, + "loss": 1.4809, + "step": 1320 + }, + { + "epoch": 0.869281045751634, + "grad_norm": 0.5964524149894714, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 1330 + }, + { + "epoch": 0.8758169934640523, + "grad_norm": 0.5169979929924011, + "learning_rate": 0.0002, + "loss": 1.3284, + "step": 1340 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.7063422799110413, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 1350 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.7261926531791687, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 1360 + }, + { + "epoch": 0.8954248366013072, + "grad_norm": 0.6759744882583618, + "learning_rate": 0.0002, + "loss": 1.3619, + "step": 1370 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 0.675051212310791, + "learning_rate": 0.0002, + "loss": 1.4079, + "step": 1380 + }, + { + "epoch": 0.9084967320261438, + "grad_norm": 0.5613595843315125, + "learning_rate": 0.0002, + "loss": 1.6606, + "step": 1390 + }, + { + "epoch": 0.9150326797385621, + "grad_norm": 0.611732006072998, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 1400 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 0.6365187168121338, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 1410 + }, + { + "epoch": 0.9281045751633987, + "grad_norm": 0.7810426354408264, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1420 + }, + { + "epoch": 0.934640522875817, + "grad_norm": 0.593891441822052, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 1430 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.761585533618927, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1440 + }, + { + "epoch": 0.9477124183006536, + "grad_norm": 0.6114464998245239, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1450 + }, + { + "epoch": 0.954248366013072, + "grad_norm": 0.601044774055481, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1460 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 0.5484876036643982, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 1470 + }, + { + "epoch": 0.9673202614379085, + "grad_norm": 0.5383428335189819, + "learning_rate": 0.0002, + "loss": 1.4825, + "step": 1480 + }, + { + "epoch": 0.9738562091503268, + "grad_norm": 0.648106575012207, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 1490 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.6847249865531921, + "learning_rate": 0.0002, + "loss": 1.3638, + "step": 1500 + }, + { + "epoch": 0.9869281045751634, + "grad_norm": 0.6361058354377747, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 1510 + }, + { + "epoch": 0.9934640522875817, + "grad_norm": 0.646392285823822, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 1520 + }, + { + "epoch": 1.0, + "grad_norm": 0.5391159057617188, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1530 + }, + { + "epoch": 1.0, + "eval_loss": 1.4715123176574707, + "eval_runtime": 30.5701, + "eval_samples_per_second": 14.262, + "eval_steps_per_second": 1.799, + "step": 1530 + }, + { + "epoch": 1.0065359477124183, + "grad_norm": 0.5468988418579102, + "learning_rate": 0.0002, + "loss": 1.4827, + "step": 1540 + }, + { + "epoch": 1.0130718954248366, + "grad_norm": 0.629940927028656, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 1550 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 0.6411303281784058, + "learning_rate": 0.0002, + "loss": 1.4259, + "step": 1560 + }, + { + "epoch": 1.026143790849673, + "grad_norm": 0.5619024038314819, + "learning_rate": 0.0002, + "loss": 1.3924, + "step": 1570 + }, + { + "epoch": 1.0326797385620916, + "grad_norm": 0.6093462705612183, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1580 + }, + { + "epoch": 1.0392156862745099, + "grad_norm": 0.5543286204338074, + "learning_rate": 0.0002, + "loss": 1.4547, + "step": 1590 + }, + { + "epoch": 1.0457516339869282, + "grad_norm": 0.6079006195068359, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1600 + }, + { + "epoch": 1.0522875816993464, + "grad_norm": 0.6240813136100769, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 1610 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.6141977310180664, + "learning_rate": 0.0002, + "loss": 1.3504, + "step": 1620 + }, + { + "epoch": 1.065359477124183, + "grad_norm": 0.5920178294181824, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 1630 + }, + { + "epoch": 1.0718954248366013, + "grad_norm": 0.47620782256126404, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 1640 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 0.6826292872428894, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 1650 + }, + { + "epoch": 1.0849673202614378, + "grad_norm": 0.6182006597518921, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 1660 + }, + { + "epoch": 1.091503267973856, + "grad_norm": 0.57639479637146, + "learning_rate": 0.0002, + "loss": 1.2907, + "step": 1670 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 0.6696860194206238, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 1680 + }, + { + "epoch": 1.1045751633986929, + "grad_norm": 0.699221670627594, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 1690 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.7138059139251709, + "learning_rate": 0.0002, + "loss": 1.3667, + "step": 1700 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 0.6930422186851501, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 1710 + }, + { + "epoch": 1.1241830065359477, + "grad_norm": 0.7484048008918762, + "learning_rate": 0.0002, + "loss": 1.5033, + "step": 1720 + }, + { + "epoch": 1.130718954248366, + "grad_norm": 0.5820090174674988, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 1730 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 0.7143406867980957, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1740 + }, + { + "epoch": 1.1437908496732025, + "grad_norm": 0.5597584247589111, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 1750 + }, + { + "epoch": 1.1503267973856208, + "grad_norm": 0.5171173214912415, + "learning_rate": 0.0002, + "loss": 1.5403, + "step": 1760 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 0.5951920747756958, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1770 + }, + { + "epoch": 1.1633986928104576, + "grad_norm": 0.7506247758865356, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 1780 + }, + { + "epoch": 1.1699346405228759, + "grad_norm": 0.5936487913131714, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 1790 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.688450038433075, + "learning_rate": 0.0002, + "loss": 1.3567, + "step": 1800 + }, + { + "epoch": 1.1830065359477124, + "grad_norm": 0.671623170375824, + "learning_rate": 0.0002, + "loss": 1.314, + "step": 1810 + }, + { + "epoch": 1.1895424836601307, + "grad_norm": 0.6911860704421997, + "learning_rate": 0.0002, + "loss": 1.3803, + "step": 1820 + }, + { + "epoch": 1.196078431372549, + "grad_norm": 0.60726398229599, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 1830 + }, + { + "epoch": 1.2026143790849673, + "grad_norm": 0.7542088627815247, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 1840 + }, + { + "epoch": 1.2091503267973855, + "grad_norm": 0.6810969710350037, + "learning_rate": 0.0002, + "loss": 1.4343, + "step": 1850 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 0.579741895198822, + "learning_rate": 0.0002, + "loss": 1.446, + "step": 1860 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.9925695657730103, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 1870 + }, + { + "epoch": 1.2287581699346406, + "grad_norm": 0.5919767618179321, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 1880 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.7377090454101562, + "learning_rate": 0.0002, + "loss": 1.5015, + "step": 1890 + }, + { + "epoch": 1.2418300653594772, + "grad_norm": 0.5753688812255859, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 1900 + }, + { + "epoch": 1.2483660130718954, + "grad_norm": 0.6362486481666565, + "learning_rate": 0.0002, + "loss": 1.3543, + "step": 1910 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 0.5747467875480652, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1920 + }, + { + "epoch": 1.261437908496732, + "grad_norm": 0.6831939220428467, + "learning_rate": 0.0002, + "loss": 1.5082, + "step": 1930 + }, + { + "epoch": 1.2679738562091503, + "grad_norm": 0.6414040327072144, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 1940 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 0.5613330006599426, + "learning_rate": 0.0002, + "loss": 1.5099, + "step": 1950 + }, + { + "epoch": 1.2810457516339868, + "grad_norm": 0.5838454961776733, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 1960 + }, + { + "epoch": 1.287581699346405, + "grad_norm": 0.5367192029953003, + "learning_rate": 0.0002, + "loss": 1.3548, + "step": 1970 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 0.5829346776008606, + "learning_rate": 0.0002, + "loss": 1.4602, + "step": 1980 + }, + { + "epoch": 1.3006535947712419, + "grad_norm": 0.756534218788147, + "learning_rate": 0.0002, + "loss": 1.3821, + "step": 1990 + }, + { + "epoch": 1.3071895424836601, + "grad_norm": 0.48002561926841736, + "learning_rate": 0.0002, + "loss": 1.389, + "step": 2000 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 0.5461082458496094, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 2010 + }, + { + "epoch": 1.3202614379084967, + "grad_norm": 0.570399284362793, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 2020 + }, + { + "epoch": 1.326797385620915, + "grad_norm": 0.5130975842475891, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2030 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.6290071606636047, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 2040 + }, + { + "epoch": 1.3398692810457518, + "grad_norm": 0.6165726184844971, + "learning_rate": 0.0002, + "loss": 1.3873, + "step": 2050 + }, + { + "epoch": 1.34640522875817, + "grad_norm": 0.5302083492279053, + "learning_rate": 0.0002, + "loss": 1.4376, + "step": 2060 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 0.6531406044960022, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 2070 + }, + { + "epoch": 1.3594771241830066, + "grad_norm": 0.5981236100196838, + "learning_rate": 0.0002, + "loss": 1.3632, + "step": 2080 + }, + { + "epoch": 1.3660130718954249, + "grad_norm": 0.8534150123596191, + "learning_rate": 0.0002, + "loss": 1.4846, + "step": 2090 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 0.695918083190918, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 2100 + }, + { + "epoch": 1.3790849673202614, + "grad_norm": 0.5830431580543518, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2110 + }, + { + "epoch": 1.3856209150326797, + "grad_norm": 0.5641306638717651, + "learning_rate": 0.0002, + "loss": 1.5009, + "step": 2120 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 0.6354436874389648, + "learning_rate": 0.0002, + "loss": 1.3985, + "step": 2130 + }, + { + "epoch": 1.3986928104575163, + "grad_norm": 0.5707540512084961, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 2140 + }, + { + "epoch": 1.4052287581699345, + "grad_norm": 0.7308434844017029, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 2150 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.5879750847816467, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2160 + }, + { + "epoch": 1.4183006535947713, + "grad_norm": 0.627909243106842, + "learning_rate": 0.0002, + "loss": 1.3729, + "step": 2170 + }, + { + "epoch": 1.4248366013071896, + "grad_norm": 0.5228193998336792, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 2180 + }, + { + "epoch": 1.4313725490196079, + "grad_norm": 0.6162880659103394, + "learning_rate": 0.0002, + "loss": 1.457, + "step": 2190 + }, + { + "epoch": 1.4379084967320261, + "grad_norm": 0.751610517501831, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 2200 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.5623487234115601, + "learning_rate": 0.0002, + "loss": 1.4105, + "step": 2210 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 0.5293187499046326, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 2220 + }, + { + "epoch": 1.457516339869281, + "grad_norm": 0.5903629660606384, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 2230 + }, + { + "epoch": 1.4640522875816995, + "grad_norm": 0.6084659099578857, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 2240 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 0.5289803147315979, + "learning_rate": 0.0002, + "loss": 1.319, + "step": 2250 + }, + { + "epoch": 1.477124183006536, + "grad_norm": 0.49499568343162537, + "learning_rate": 0.0002, + "loss": 1.3106, + "step": 2260 + }, + { + "epoch": 1.4836601307189543, + "grad_norm": 0.7774190306663513, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 2270 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 0.5932538509368896, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 2280 + }, + { + "epoch": 1.4967320261437909, + "grad_norm": 0.6009492874145508, + "learning_rate": 0.0002, + "loss": 1.3241, + "step": 2290 + }, + { + "epoch": 1.5032679738562091, + "grad_norm": 0.5559343099594116, + "learning_rate": 0.0002, + "loss": 1.3728, + "step": 2300 + }, + { + "epoch": 1.5098039215686274, + "grad_norm": 0.5956196188926697, + "learning_rate": 0.0002, + "loss": 1.2379, + "step": 2310 + }, + { + "epoch": 1.5163398692810457, + "grad_norm": 0.5624083876609802, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 2320 + }, + { + "epoch": 1.522875816993464, + "grad_norm": 0.7195250391960144, + "learning_rate": 0.0002, + "loss": 1.4779, + "step": 2330 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 0.6010490655899048, + "learning_rate": 0.0002, + "loss": 1.2938, + "step": 2340 + }, + { + "epoch": 1.5359477124183005, + "grad_norm": 0.664929211139679, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 2350 + }, + { + "epoch": 1.5424836601307188, + "grad_norm": 0.5158776640892029, + "learning_rate": 0.0002, + "loss": 1.4362, + "step": 2360 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 0.5147154927253723, + "learning_rate": 0.0002, + "loss": 1.2157, + "step": 2370 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.6507977843284607, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 2380 + }, + { + "epoch": 1.5620915032679739, + "grad_norm": 0.5193192362785339, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 2390 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.5982314944267273, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 2400 + }, + { + "epoch": 1.5751633986928104, + "grad_norm": 0.49106258153915405, + "learning_rate": 0.0002, + "loss": 1.3585, + "step": 2410 + }, + { + "epoch": 1.581699346405229, + "grad_norm": 0.6459611654281616, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 2420 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 0.7038363218307495, + "learning_rate": 0.0002, + "loss": 1.3305, + "step": 2430 + }, + { + "epoch": 1.5947712418300655, + "grad_norm": 0.5245680212974548, + "learning_rate": 0.0002, + "loss": 1.3198, + "step": 2440 + }, + { + "epoch": 1.6013071895424837, + "grad_norm": 0.6562076210975647, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 2450 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 0.6491968035697937, + "learning_rate": 0.0002, + "loss": 1.5635, + "step": 2460 + }, + { + "epoch": 1.6143790849673203, + "grad_norm": 0.604034960269928, + "learning_rate": 0.0002, + "loss": 1.3657, + "step": 2470 + }, + { + "epoch": 1.6209150326797386, + "grad_norm": 0.5759671330451965, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 2480 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 0.6157698631286621, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2490 + }, + { + "epoch": 1.6339869281045751, + "grad_norm": 0.6513794660568237, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 2500 + }, + { + "epoch": 1.6405228758169934, + "grad_norm": 0.71990966796875, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 2510 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 0.7316617369651794, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2520 + }, + { + "epoch": 1.65359477124183, + "grad_norm": 0.5475177764892578, + "learning_rate": 0.0002, + "loss": 1.3119, + "step": 2530 + }, + { + "epoch": 1.6601307189542482, + "grad_norm": 0.4911293089389801, + "learning_rate": 0.0002, + "loss": 1.2998, + "step": 2540 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.6122882962226868, + "learning_rate": 0.0002, + "loss": 1.4198, + "step": 2550 + }, + { + "epoch": 1.673202614379085, + "grad_norm": 0.5735281705856323, + "learning_rate": 0.0002, + "loss": 1.3099, + "step": 2560 + }, + { + "epoch": 1.6797385620915033, + "grad_norm": 0.5046352744102478, + "learning_rate": 0.0002, + "loss": 1.2205, + "step": 2570 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 0.6043242812156677, + "learning_rate": 0.0002, + "loss": 1.3191, + "step": 2580 + }, + { + "epoch": 1.6928104575163399, + "grad_norm": 0.5397698283195496, + "learning_rate": 0.0002, + "loss": 1.3079, + "step": 2590 + }, + { + "epoch": 1.6993464052287581, + "grad_norm": 0.8066475987434387, + "learning_rate": 0.0002, + "loss": 1.4916, + "step": 2600 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 0.52901691198349, + "learning_rate": 0.0002, + "loss": 1.3703, + "step": 2610 + }, + { + "epoch": 1.712418300653595, + "grad_norm": 0.7588503956794739, + "learning_rate": 0.0002, + "loss": 1.409, + "step": 2620 + }, + { + "epoch": 1.7189542483660132, + "grad_norm": 0.6012966632843018, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 2630 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 0.5927302837371826, + "learning_rate": 0.0002, + "loss": 1.2583, + "step": 2640 + }, + { + "epoch": 1.7320261437908497, + "grad_norm": 0.5086990594863892, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 2650 + }, + { + "epoch": 1.738562091503268, + "grad_norm": 0.6000628471374512, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2660 + }, + { + "epoch": 1.7450980392156863, + "grad_norm": 0.6560431718826294, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 2670 + }, + { + "epoch": 1.7516339869281046, + "grad_norm": 0.5738165378570557, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 2680 + }, + { + "epoch": 1.7581699346405228, + "grad_norm": 0.5576106905937195, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 2690 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.7298802137374878, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 2700 + }, + { + "epoch": 1.7712418300653594, + "grad_norm": 0.5751826167106628, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 2710 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.6069957613945007, + "learning_rate": 0.0002, + "loss": 1.35, + "step": 2720 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 0.7513017654418945, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 2730 + }, + { + "epoch": 1.7908496732026142, + "grad_norm": 0.6058869957923889, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 2740 + }, + { + "epoch": 1.7973856209150327, + "grad_norm": 0.6805883049964905, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 2750 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 0.6864324808120728, + "learning_rate": 0.0002, + "loss": 1.4062, + "step": 2760 + }, + { + "epoch": 1.8104575163398693, + "grad_norm": 0.6261002421379089, + "learning_rate": 0.0002, + "loss": 1.355, + "step": 2770 + }, + { + "epoch": 1.8169934640522876, + "grad_norm": 0.532684862613678, + "learning_rate": 0.0002, + "loss": 1.5145, + "step": 2780 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 0.6209020018577576, + "learning_rate": 0.0002, + "loss": 1.3248, + "step": 2790 + }, + { + "epoch": 1.8300653594771243, + "grad_norm": 0.67111736536026, + "learning_rate": 0.0002, + "loss": 1.3908, + "step": 2800 + }, + { + "epoch": 1.8366013071895426, + "grad_norm": 0.700467586517334, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 2810 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 0.6968029141426086, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 2820 + }, + { + "epoch": 1.8496732026143792, + "grad_norm": 0.6405863761901855, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 2830 + }, + { + "epoch": 1.8562091503267975, + "grad_norm": 0.5192584991455078, + "learning_rate": 0.0002, + "loss": 1.4035, + "step": 2840 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 0.4888569414615631, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 2850 + }, + { + "epoch": 1.869281045751634, + "grad_norm": 0.7625455856323242, + "learning_rate": 0.0002, + "loss": 1.4324, + "step": 2860 + }, + { + "epoch": 1.8758169934640523, + "grad_norm": 0.9162808656692505, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2870 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.5472783446311951, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2880 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.5221137404441833, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 2890 + }, + { + "epoch": 1.8954248366013071, + "grad_norm": 0.49258849024772644, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 2900 + }, + { + "epoch": 1.9019607843137254, + "grad_norm": 0.5260750651359558, + "learning_rate": 0.0002, + "loss": 1.3503, + "step": 2910 + }, + { + "epoch": 1.9084967320261437, + "grad_norm": 0.6583314538002014, + "learning_rate": 0.0002, + "loss": 1.3381, + "step": 2920 + }, + { + "epoch": 1.915032679738562, + "grad_norm": 0.5728915929794312, + "learning_rate": 0.0002, + "loss": 1.356, + "step": 2930 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 0.7661453485488892, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2940 + }, + { + "epoch": 1.9281045751633987, + "grad_norm": 0.7193911075592041, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2950 + }, + { + "epoch": 1.934640522875817, + "grad_norm": 0.5007768869400024, + "learning_rate": 0.0002, + "loss": 1.287, + "step": 2960 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.626681923866272, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2970 + }, + { + "epoch": 1.9477124183006536, + "grad_norm": 0.8692840933799744, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 2980 + }, + { + "epoch": 1.954248366013072, + "grad_norm": 0.6388291120529175, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 2990 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.7710477113723755, + "learning_rate": 0.0002, + "loss": 1.4593, + "step": 3000 + }, + { + "epoch": 1.9673202614379086, + "grad_norm": 0.641704261302948, + "learning_rate": 0.0002, + "loss": 1.5228, + "step": 3010 + }, + { + "epoch": 1.973856209150327, + "grad_norm": 0.621148943901062, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 3020 + }, + { + "epoch": 1.9803921568627452, + "grad_norm": 0.5119547247886658, + "learning_rate": 0.0002, + "loss": 1.3017, + "step": 3030 + }, + { + "epoch": 1.9869281045751634, + "grad_norm": 0.8104137778282166, + "learning_rate": 0.0002, + "loss": 1.4923, + "step": 3040 + }, + { + "epoch": 1.9934640522875817, + "grad_norm": 0.5856240391731262, + "learning_rate": 0.0002, + "loss": 1.3331, + "step": 3050 + }, + { + "epoch": 2.0, + "grad_norm": 0.5263566374778748, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3060 + }, + { + "epoch": 2.0, + "eval_loss": 1.4276371002197266, + "eval_runtime": 30.5759, + "eval_samples_per_second": 14.26, + "eval_steps_per_second": 1.799, + "step": 3060 + }, + { + "epoch": 2.0065359477124183, + "grad_norm": 0.5143898725509644, + "learning_rate": 0.0002, + "loss": 1.1636, + "step": 3070 + }, + { + "epoch": 2.0130718954248366, + "grad_norm": 0.5749367475509644, + "learning_rate": 0.0002, + "loss": 1.3335, + "step": 3080 + }, + { + "epoch": 2.019607843137255, + "grad_norm": 0.5784284472465515, + "learning_rate": 0.0002, + "loss": 1.2784, + "step": 3090 + }, + { + "epoch": 2.026143790849673, + "grad_norm": 0.5933429598808289, + "learning_rate": 0.0002, + "loss": 1.2463, + "step": 3100 + }, + { + "epoch": 2.0326797385620914, + "grad_norm": 0.6748974919319153, + "learning_rate": 0.0002, + "loss": 1.2984, + "step": 3110 + }, + { + "epoch": 2.0392156862745097, + "grad_norm": 0.626399576663971, + "learning_rate": 0.0002, + "loss": 1.2307, + "step": 3120 + }, + { + "epoch": 2.045751633986928, + "grad_norm": 0.6173238754272461, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 3130 + }, + { + "epoch": 2.052287581699346, + "grad_norm": 0.807790219783783, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3140 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 0.6222215890884399, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 3150 + }, + { + "epoch": 2.065359477124183, + "grad_norm": 0.5859580636024475, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 3160 + }, + { + "epoch": 2.0718954248366015, + "grad_norm": 0.581304132938385, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 3170 + }, + { + "epoch": 2.0784313725490198, + "grad_norm": 0.9814971089363098, + "learning_rate": 0.0002, + "loss": 1.1212, + "step": 3180 + }, + { + "epoch": 2.084967320261438, + "grad_norm": 0.6491848230361938, + "learning_rate": 0.0002, + "loss": 1.1962, + "step": 3190 + }, + { + "epoch": 2.0915032679738563, + "grad_norm": 0.613680362701416, + "learning_rate": 0.0002, + "loss": 1.3711, + "step": 3200 + }, + { + "epoch": 2.0980392156862746, + "grad_norm": 0.7318086624145508, + "learning_rate": 0.0002, + "loss": 1.2994, + "step": 3210 + }, + { + "epoch": 2.104575163398693, + "grad_norm": 0.6025661826133728, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 3220 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.6744484305381775, + "learning_rate": 0.0002, + "loss": 1.1374, + "step": 3230 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 0.6062554121017456, + "learning_rate": 0.0002, + "loss": 1.3273, + "step": 3240 + }, + { + "epoch": 2.1241830065359477, + "grad_norm": 0.6801803112030029, + "learning_rate": 0.0002, + "loss": 1.3404, + "step": 3250 + }, + { + "epoch": 2.130718954248366, + "grad_norm": 0.5218925476074219, + "learning_rate": 0.0002, + "loss": 1.4084, + "step": 3260 + }, + { + "epoch": 2.1372549019607843, + "grad_norm": 0.7494263648986816, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 3270 + }, + { + "epoch": 2.1437908496732025, + "grad_norm": 0.7858565449714661, + "learning_rate": 0.0002, + "loss": 1.3059, + "step": 3280 + }, + { + "epoch": 2.150326797385621, + "grad_norm": 0.6836692690849304, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3290 + }, + { + "epoch": 2.156862745098039, + "grad_norm": 0.619848370552063, + "learning_rate": 0.0002, + "loss": 1.1605, + "step": 3300 + }, + { + "epoch": 2.1633986928104574, + "grad_norm": 0.5761294364929199, + "learning_rate": 0.0002, + "loss": 1.3095, + "step": 3310 + }, + { + "epoch": 2.1699346405228757, + "grad_norm": 0.4713786542415619, + "learning_rate": 0.0002, + "loss": 1.2883, + "step": 3320 + }, + { + "epoch": 2.176470588235294, + "grad_norm": 0.7613773345947266, + "learning_rate": 0.0002, + "loss": 1.3817, + "step": 3330 + }, + { + "epoch": 2.183006535947712, + "grad_norm": 0.6642718315124512, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 3340 + }, + { + "epoch": 2.189542483660131, + "grad_norm": 0.7162188291549683, + "learning_rate": 0.0002, + "loss": 1.2048, + "step": 3350 + }, + { + "epoch": 2.196078431372549, + "grad_norm": 0.6916783452033997, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 3360 + }, + { + "epoch": 2.2026143790849675, + "grad_norm": 0.7205567955970764, + "learning_rate": 0.0002, + "loss": 1.3788, + "step": 3370 + }, + { + "epoch": 2.2091503267973858, + "grad_norm": 0.6038199067115784, + "learning_rate": 0.0002, + "loss": 1.2528, + "step": 3380 + }, + { + "epoch": 2.215686274509804, + "grad_norm": 0.6284233927726746, + "learning_rate": 0.0002, + "loss": 1.2079, + "step": 3390 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.7450672388076782, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 3400 + }, + { + "epoch": 2.2287581699346406, + "grad_norm": 0.7755052447319031, + "learning_rate": 0.0002, + "loss": 1.3034, + "step": 3410 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 0.9066099524497986, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 3420 + }, + { + "epoch": 2.241830065359477, + "grad_norm": 0.8578207492828369, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 3430 + }, + { + "epoch": 2.2483660130718954, + "grad_norm": 0.5900213718414307, + "learning_rate": 0.0002, + "loss": 1.3278, + "step": 3440 + }, + { + "epoch": 2.2549019607843137, + "grad_norm": 0.7821717262268066, + "learning_rate": 0.0002, + "loss": 1.3645, + "step": 3450 + }, + { + "epoch": 2.261437908496732, + "grad_norm": 0.6263150572776794, + "learning_rate": 0.0002, + "loss": 1.183, + "step": 3460 + }, + { + "epoch": 2.2679738562091503, + "grad_norm": 0.591799259185791, + "learning_rate": 0.0002, + "loss": 1.178, + "step": 3470 + }, + { + "epoch": 2.2745098039215685, + "grad_norm": 0.5999799966812134, + "learning_rate": 0.0002, + "loss": 1.2198, + "step": 3480 + }, + { + "epoch": 2.281045751633987, + "grad_norm": 0.6227319240570068, + "learning_rate": 0.0002, + "loss": 1.2724, + "step": 3490 + }, + { + "epoch": 2.287581699346405, + "grad_norm": 0.719412624835968, + "learning_rate": 0.0002, + "loss": 1.3865, + "step": 3500 + }, + { + "epoch": 2.2941176470588234, + "grad_norm": 1.0361769199371338, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 3510 + }, + { + "epoch": 2.3006535947712417, + "grad_norm": 0.5506668090820312, + "learning_rate": 0.0002, + "loss": 1.4834, + "step": 3520 + }, + { + "epoch": 2.30718954248366, + "grad_norm": 0.6886829733848572, + "learning_rate": 0.0002, + "loss": 1.2273, + "step": 3530 + }, + { + "epoch": 2.313725490196078, + "grad_norm": 0.6226346492767334, + "learning_rate": 0.0002, + "loss": 1.2296, + "step": 3540 + }, + { + "epoch": 2.3202614379084965, + "grad_norm": 0.8109908103942871, + "learning_rate": 0.0002, + "loss": 1.3087, + "step": 3550 + }, + { + "epoch": 2.326797385620915, + "grad_norm": 0.8505511283874512, + "learning_rate": 0.0002, + "loss": 1.3311, + "step": 3560 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.5763760209083557, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 3570 + }, + { + "epoch": 2.3398692810457518, + "grad_norm": 0.6460059881210327, + "learning_rate": 0.0002, + "loss": 1.4135, + "step": 3580 + }, + { + "epoch": 2.34640522875817, + "grad_norm": 0.7175343036651611, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 3590 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.6012630462646484, + "learning_rate": 0.0002, + "loss": 1.2645, + "step": 3600 + }, + { + "epoch": 2.3594771241830066, + "grad_norm": 0.6513685584068298, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3610 + }, + { + "epoch": 2.366013071895425, + "grad_norm": 0.7465183734893799, + "learning_rate": 0.0002, + "loss": 1.3271, + "step": 3620 + }, + { + "epoch": 2.372549019607843, + "grad_norm": 0.6413124203681946, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3630 + }, + { + "epoch": 2.3790849673202614, + "grad_norm": 0.7209562063217163, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 3640 + }, + { + "epoch": 2.3856209150326797, + "grad_norm": 0.6427558660507202, + "learning_rate": 0.0002, + "loss": 1.1616, + "step": 3650 + }, + { + "epoch": 2.392156862745098, + "grad_norm": 0.593958854675293, + "learning_rate": 0.0002, + "loss": 1.313, + "step": 3660 + }, + { + "epoch": 2.3986928104575163, + "grad_norm": 0.5944608449935913, + "learning_rate": 0.0002, + "loss": 1.2802, + "step": 3670 + }, + { + "epoch": 2.4052287581699345, + "grad_norm": 0.6606248617172241, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3680 + }, + { + "epoch": 2.411764705882353, + "grad_norm": 0.5632851719856262, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 3690 + }, + { + "epoch": 2.418300653594771, + "grad_norm": 0.4976513385772705, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3700 + }, + { + "epoch": 2.4248366013071894, + "grad_norm": 0.6318528056144714, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 3710 + }, + { + "epoch": 2.431372549019608, + "grad_norm": 0.6306707859039307, + "learning_rate": 0.0002, + "loss": 1.1705, + "step": 3720 + }, + { + "epoch": 2.4379084967320264, + "grad_norm": 0.6362553238868713, + "learning_rate": 0.0002, + "loss": 1.3524, + "step": 3730 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.634368896484375, + "learning_rate": 0.0002, + "loss": 1.2345, + "step": 3740 + }, + { + "epoch": 2.450980392156863, + "grad_norm": 0.6623591184616089, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 3750 + }, + { + "epoch": 2.457516339869281, + "grad_norm": 0.6150440573692322, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 3760 + }, + { + "epoch": 2.4640522875816995, + "grad_norm": 0.588935911655426, + "learning_rate": 0.0002, + "loss": 1.2666, + "step": 3770 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 0.7388206124305725, + "learning_rate": 0.0002, + "loss": 1.3918, + "step": 3780 + }, + { + "epoch": 2.477124183006536, + "grad_norm": 0.621825098991394, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 3790 + }, + { + "epoch": 2.4836601307189543, + "grad_norm": 0.7691677212715149, + "learning_rate": 0.0002, + "loss": 1.359, + "step": 3800 + }, + { + "epoch": 2.4901960784313726, + "grad_norm": 1.1661969423294067, + "learning_rate": 0.0002, + "loss": 1.3399, + "step": 3810 + }, + { + "epoch": 2.496732026143791, + "grad_norm": 0.6837884187698364, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 3820 + }, + { + "epoch": 2.503267973856209, + "grad_norm": 0.6978904008865356, + "learning_rate": 0.0002, + "loss": 1.2823, + "step": 3830 + }, + { + "epoch": 2.5098039215686274, + "grad_norm": 0.6121411323547363, + "learning_rate": 0.0002, + "loss": 1.3688, + "step": 3840 + }, + { + "epoch": 2.5163398692810457, + "grad_norm": 0.7813326120376587, + "learning_rate": 0.0002, + "loss": 1.2587, + "step": 3850 + }, + { + "epoch": 2.522875816993464, + "grad_norm": 0.5390260219573975, + "learning_rate": 0.0002, + "loss": 1.1543, + "step": 3860 + }, + { + "epoch": 2.5294117647058822, + "grad_norm": 0.8283252716064453, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3870 + }, + { + "epoch": 2.5359477124183005, + "grad_norm": 0.8527186512947083, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 3880 + }, + { + "epoch": 2.542483660130719, + "grad_norm": 0.8405382633209229, + "learning_rate": 0.0002, + "loss": 1.3469, + "step": 3890 + }, + { + "epoch": 2.549019607843137, + "grad_norm": 0.5650738477706909, + "learning_rate": 0.0002, + "loss": 1.1801, + "step": 3900 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.620121955871582, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 3910 + }, + { + "epoch": 2.5620915032679736, + "grad_norm": 0.5983527898788452, + "learning_rate": 0.0002, + "loss": 1.2524, + "step": 3920 + }, + { + "epoch": 2.568627450980392, + "grad_norm": 0.686623215675354, + "learning_rate": 0.0002, + "loss": 1.4408, + "step": 3930 + }, + { + "epoch": 2.57516339869281, + "grad_norm": 0.6805831789970398, + "learning_rate": 0.0002, + "loss": 1.186, + "step": 3940 + }, + { + "epoch": 2.581699346405229, + "grad_norm": 0.6994825601577759, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 3950 + }, + { + "epoch": 2.588235294117647, + "grad_norm": 0.728549599647522, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 3960 + }, + { + "epoch": 2.5947712418300655, + "grad_norm": 0.775236964225769, + "learning_rate": 0.0002, + "loss": 1.4039, + "step": 3970 + }, + { + "epoch": 2.6013071895424837, + "grad_norm": 0.5057447552680969, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 3980 + }, + { + "epoch": 2.607843137254902, + "grad_norm": 0.6564450263977051, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 3990 + }, + { + "epoch": 2.6143790849673203, + "grad_norm": 0.5342249870300293, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 4000 + }, + { + "epoch": 2.6209150326797386, + "grad_norm": 0.5508961081504822, + "learning_rate": 0.0002, + "loss": 1.3102, + "step": 4010 + }, + { + "epoch": 2.627450980392157, + "grad_norm": 0.5716235637664795, + "learning_rate": 0.0002, + "loss": 1.3636, + "step": 4020 + }, + { + "epoch": 2.633986928104575, + "grad_norm": 0.8049232363700867, + "learning_rate": 0.0002, + "loss": 1.3465, + "step": 4030 + }, + { + "epoch": 2.6405228758169934, + "grad_norm": 0.5574354529380798, + "learning_rate": 0.0002, + "loss": 1.2342, + "step": 4040 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 0.6302093863487244, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 4050 + }, + { + "epoch": 2.65359477124183, + "grad_norm": 1.1868736743927002, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 4060 + }, + { + "epoch": 2.6601307189542482, + "grad_norm": 0.6738120317459106, + "learning_rate": 0.0002, + "loss": 1.1382, + "step": 4070 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.6614423990249634, + "learning_rate": 0.0002, + "loss": 1.2456, + "step": 4080 + }, + { + "epoch": 2.6732026143790852, + "grad_norm": 0.7297604084014893, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 4090 + }, + { + "epoch": 2.6797385620915035, + "grad_norm": 0.9421682357788086, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 4100 + }, + { + "epoch": 2.686274509803922, + "grad_norm": 0.5286222696304321, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 4110 + }, + { + "epoch": 2.69281045751634, + "grad_norm": 0.6849271655082703, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 4120 + }, + { + "epoch": 2.6993464052287583, + "grad_norm": 0.6811320185661316, + "learning_rate": 0.0002, + "loss": 1.2721, + "step": 4130 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 0.4968419373035431, + "learning_rate": 0.0002, + "loss": 1.2897, + "step": 4140 + }, + { + "epoch": 2.712418300653595, + "grad_norm": 0.8074267506599426, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 4150 + }, + { + "epoch": 2.718954248366013, + "grad_norm": 0.6756376028060913, + "learning_rate": 0.0002, + "loss": 1.1759, + "step": 4160 + }, + { + "epoch": 2.7254901960784315, + "grad_norm": 0.6921583414077759, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 4170 + }, + { + "epoch": 2.7320261437908497, + "grad_norm": 0.7049834132194519, + "learning_rate": 0.0002, + "loss": 1.3413, + "step": 4180 + }, + { + "epoch": 2.738562091503268, + "grad_norm": 0.7011390328407288, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 4190 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.6977843642234802, + "learning_rate": 0.0002, + "loss": 1.2364, + "step": 4200 + }, + { + "epoch": 2.7516339869281046, + "grad_norm": 0.6717000603675842, + "learning_rate": 0.0002, + "loss": 1.2533, + "step": 4210 + }, + { + "epoch": 2.758169934640523, + "grad_norm": 1.0223724842071533, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 4220 + }, + { + "epoch": 2.764705882352941, + "grad_norm": 0.6573330760002136, + "learning_rate": 0.0002, + "loss": 1.2451, + "step": 4230 + }, + { + "epoch": 2.7712418300653594, + "grad_norm": 0.6684938073158264, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 4240 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.7426793575286865, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 4250 + }, + { + "epoch": 2.784313725490196, + "grad_norm": 0.557826578617096, + "learning_rate": 0.0002, + "loss": 1.2904, + "step": 4260 + }, + { + "epoch": 2.7908496732026142, + "grad_norm": 0.6669870018959045, + "learning_rate": 0.0002, + "loss": 1.3262, + "step": 4270 + }, + { + "epoch": 2.7973856209150325, + "grad_norm": 0.5349969267845154, + "learning_rate": 0.0002, + "loss": 1.2369, + "step": 4280 + }, + { + "epoch": 2.803921568627451, + "grad_norm": 0.7262802124023438, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 4290 + }, + { + "epoch": 2.810457516339869, + "grad_norm": 0.768211841583252, + "learning_rate": 0.0002, + "loss": 1.3373, + "step": 4300 + }, + { + "epoch": 2.8169934640522873, + "grad_norm": 0.5958252549171448, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 4310 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.8451310396194458, + "learning_rate": 0.0002, + "loss": 1.4113, + "step": 4320 + }, + { + "epoch": 2.8300653594771243, + "grad_norm": 0.6544435024261475, + "learning_rate": 0.0002, + "loss": 1.2454, + "step": 4330 + }, + { + "epoch": 2.8366013071895426, + "grad_norm": 0.6177433133125305, + "learning_rate": 0.0002, + "loss": 1.2777, + "step": 4340 + }, + { + "epoch": 2.843137254901961, + "grad_norm": 0.6324988007545471, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 4350 + }, + { + "epoch": 2.849673202614379, + "grad_norm": 0.6884300708770752, + "learning_rate": 0.0002, + "loss": 1.4117, + "step": 4360 + }, + { + "epoch": 2.8562091503267975, + "grad_norm": 0.8952897191047668, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 4370 + }, + { + "epoch": 2.8627450980392157, + "grad_norm": 1.0260103940963745, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 4380 + }, + { + "epoch": 2.869281045751634, + "grad_norm": 0.9134647250175476, + "learning_rate": 0.0002, + "loss": 1.2893, + "step": 4390 + }, + { + "epoch": 2.8758169934640523, + "grad_norm": 0.5637717843055725, + "learning_rate": 0.0002, + "loss": 1.171, + "step": 4400 + }, + { + "epoch": 2.8823529411764706, + "grad_norm": 0.7530393004417419, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 4410 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.7202680706977844, + "learning_rate": 0.0002, + "loss": 1.29, + "step": 4420 + }, + { + "epoch": 2.895424836601307, + "grad_norm": 0.7177144885063171, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 4430 + }, + { + "epoch": 2.9019607843137254, + "grad_norm": 0.5996816754341125, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 4440 + }, + { + "epoch": 2.9084967320261437, + "grad_norm": 0.6542447209358215, + "learning_rate": 0.0002, + "loss": 1.4816, + "step": 4450 + }, + { + "epoch": 2.915032679738562, + "grad_norm": 1.0753740072250366, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 4460 + }, + { + "epoch": 2.9215686274509802, + "grad_norm": 0.6956136226654053, + "learning_rate": 0.0002, + "loss": 1.3193, + "step": 4470 + }, + { + "epoch": 2.928104575163399, + "grad_norm": 0.7702530026435852, + "learning_rate": 0.0002, + "loss": 1.2486, + "step": 4480 + }, + { + "epoch": 2.9346405228758172, + "grad_norm": 0.7763232588768005, + "learning_rate": 0.0002, + "loss": 1.3371, + "step": 4490 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 0.6393085718154907, + "learning_rate": 0.0002, + "loss": 1.1647, + "step": 4500 + }, + { + "epoch": 2.947712418300654, + "grad_norm": 0.987770676612854, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 4510 + }, + { + "epoch": 2.954248366013072, + "grad_norm": 0.5995016098022461, + "learning_rate": 0.0002, + "loss": 1.1529, + "step": 4520 + }, + { + "epoch": 2.9607843137254903, + "grad_norm": 0.745650053024292, + "learning_rate": 0.0002, + "loss": 1.2358, + "step": 4530 + }, + { + "epoch": 2.9673202614379086, + "grad_norm": 0.7429282069206238, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 4540 + }, + { + "epoch": 2.973856209150327, + "grad_norm": 0.5927486419677734, + "learning_rate": 0.0002, + "loss": 1.2262, + "step": 4550 + }, + { + "epoch": 2.980392156862745, + "grad_norm": 0.6775153875350952, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 4560 + }, + { + "epoch": 2.9869281045751634, + "grad_norm": 0.7128435373306274, + "learning_rate": 0.0002, + "loss": 1.279, + "step": 4570 + }, + { + "epoch": 2.9934640522875817, + "grad_norm": 0.7470937967300415, + "learning_rate": 0.0002, + "loss": 1.2451, + "step": 4580 + }, + { + "epoch": 3.0, + "grad_norm": 0.9295375943183899, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 4590 + }, + { + "epoch": 3.0, + "eval_loss": 1.4131312370300293, + "eval_runtime": 31.8967, + "eval_samples_per_second": 13.669, + "eval_steps_per_second": 1.724, + "step": 4590 + }, + { + "epoch": 3.0065359477124183, + "grad_norm": 0.6926420331001282, + "learning_rate": 0.0002, + "loss": 1.1283, + "step": 4600 + }, + { + "epoch": 3.0130718954248366, + "grad_norm": 0.6656355857849121, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 4610 + }, + { + "epoch": 3.019607843137255, + "grad_norm": 0.9901936650276184, + "learning_rate": 0.0002, + "loss": 1.308, + "step": 4620 + }, + { + "epoch": 3.026143790849673, + "grad_norm": 0.6713474988937378, + "learning_rate": 0.0002, + "loss": 1.22, + "step": 4630 + }, + { + "epoch": 3.0326797385620914, + "grad_norm": 0.6199324131011963, + "learning_rate": 0.0002, + "loss": 1.2249, + "step": 4640 + }, + { + "epoch": 3.0392156862745097, + "grad_norm": 0.7180785536766052, + "learning_rate": 0.0002, + "loss": 1.242, + "step": 4650 + }, + { + "epoch": 3.045751633986928, + "grad_norm": 0.8256588578224182, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 4660 + }, + { + "epoch": 3.052287581699346, + "grad_norm": 0.6637389063835144, + "learning_rate": 0.0002, + "loss": 1.1431, + "step": 4670 + }, + { + "epoch": 3.0588235294117645, + "grad_norm": 0.6980698108673096, + "learning_rate": 0.0002, + "loss": 1.1096, + "step": 4680 + }, + { + "epoch": 3.065359477124183, + "grad_norm": 0.8091534972190857, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 4690 + }, + { + "epoch": 3.0718954248366015, + "grad_norm": 0.5715174078941345, + "learning_rate": 0.0002, + "loss": 1.1652, + "step": 4700 + }, + { + "epoch": 3.0784313725490198, + "grad_norm": 0.735639750957489, + "learning_rate": 0.0002, + "loss": 1.1427, + "step": 4710 + }, + { + "epoch": 3.084967320261438, + "grad_norm": 0.7619708180427551, + "learning_rate": 0.0002, + "loss": 1.1522, + "step": 4720 + }, + { + "epoch": 3.0915032679738563, + "grad_norm": 1.263566017150879, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 4730 + }, + { + "epoch": 3.0980392156862746, + "grad_norm": 0.6600871682167053, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 4740 + }, + { + "epoch": 3.104575163398693, + "grad_norm": 0.717792809009552, + "learning_rate": 0.0002, + "loss": 1.1766, + "step": 4750 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.853714644908905, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 4760 + }, + { + "epoch": 3.1176470588235294, + "grad_norm": 1.1004153490066528, + "learning_rate": 0.0002, + "loss": 1.2031, + "step": 4770 + }, + { + "epoch": 3.1241830065359477, + "grad_norm": 0.8566235899925232, + "learning_rate": 0.0002, + "loss": 1.3295, + "step": 4780 + }, + { + "epoch": 3.130718954248366, + "grad_norm": 0.8315296173095703, + "learning_rate": 0.0002, + "loss": 1.2436, + "step": 4790 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.8020524978637695, + "learning_rate": 0.0002, + "loss": 1.32, + "step": 4800 + }, + { + "epoch": 3.1437908496732025, + "grad_norm": 0.7564275860786438, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 4810 + }, + { + "epoch": 3.150326797385621, + "grad_norm": 0.9077776670455933, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 4820 + }, + { + "epoch": 3.156862745098039, + "grad_norm": 0.6323099732398987, + "learning_rate": 0.0002, + "loss": 1.1399, + "step": 4830 + }, + { + "epoch": 3.1633986928104574, + "grad_norm": 0.6625368595123291, + "learning_rate": 0.0002, + "loss": 1.1983, + "step": 4840 + }, + { + "epoch": 3.1699346405228757, + "grad_norm": 0.8119261860847473, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 4850 + }, + { + "epoch": 3.176470588235294, + "grad_norm": 0.6399450898170471, + "learning_rate": 0.0002, + "loss": 1.0224, + "step": 4860 + }, + { + "epoch": 3.183006535947712, + "grad_norm": 1.0659016370773315, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 4870 + }, + { + "epoch": 3.189542483660131, + "grad_norm": 0.8040369749069214, + "learning_rate": 0.0002, + "loss": 1.2914, + "step": 4880 + }, + { + "epoch": 3.196078431372549, + "grad_norm": 0.7784733176231384, + "learning_rate": 0.0002, + "loss": 1.1996, + "step": 4890 + }, + { + "epoch": 3.2026143790849675, + "grad_norm": 0.9660294651985168, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 4900 + }, + { + "epoch": 3.2091503267973858, + "grad_norm": 1.0676977634429932, + "learning_rate": 0.0002, + "loss": 1.0419, + "step": 4910 + }, + { + "epoch": 3.215686274509804, + "grad_norm": 0.5877565741539001, + "learning_rate": 0.0002, + "loss": 1.0083, + "step": 4920 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.6164032816886902, + "learning_rate": 0.0002, + "loss": 1.1046, + "step": 4930 + }, + { + "epoch": 3.2287581699346406, + "grad_norm": 0.7627606987953186, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 4940 + }, + { + "epoch": 3.235294117647059, + "grad_norm": 0.7442803978919983, + "learning_rate": 0.0002, + "loss": 1.2453, + "step": 4950 + }, + { + "epoch": 3.241830065359477, + "grad_norm": 0.7277812361717224, + "learning_rate": 0.0002, + "loss": 1.1087, + "step": 4960 + }, + { + "epoch": 3.2483660130718954, + "grad_norm": 1.0301902294158936, + "learning_rate": 0.0002, + "loss": 1.2237, + "step": 4970 + }, + { + "epoch": 3.2549019607843137, + "grad_norm": 0.7798232436180115, + "learning_rate": 0.0002, + "loss": 1.1466, + "step": 4980 + }, + { + "epoch": 3.261437908496732, + "grad_norm": 1.210265874862671, + "learning_rate": 0.0002, + "loss": 1.2142, + "step": 4990 + }, + { + "epoch": 3.2679738562091503, + "grad_norm": 0.6677713990211487, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 5000 + }, + { + "epoch": 3.2745098039215685, + "grad_norm": 1.0524500608444214, + "learning_rate": 0.0002, + "loss": 1.3294, + "step": 5010 + }, + { + "epoch": 3.281045751633987, + "grad_norm": 0.7091745734214783, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 5020 + }, + { + "epoch": 3.287581699346405, + "grad_norm": 0.8523224592208862, + "learning_rate": 0.0002, + "loss": 1.1891, + "step": 5030 + }, + { + "epoch": 3.2941176470588234, + "grad_norm": 0.6120608448982239, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 5040 + }, + { + "epoch": 3.3006535947712417, + "grad_norm": 0.7437472939491272, + "learning_rate": 0.0002, + "loss": 1.0603, + "step": 5050 + }, + { + "epoch": 3.30718954248366, + "grad_norm": 0.7611715197563171, + "learning_rate": 0.0002, + "loss": 1.1295, + "step": 5060 + }, + { + "epoch": 3.313725490196078, + "grad_norm": 0.7249704003334045, + "learning_rate": 0.0002, + "loss": 1.0531, + "step": 5070 + }, + { + "epoch": 3.3202614379084965, + "grad_norm": 0.7316247820854187, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 5080 + }, + { + "epoch": 3.326797385620915, + "grad_norm": 0.562412440776825, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 5090 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.7052176594734192, + "learning_rate": 0.0002, + "loss": 1.0736, + "step": 5100 + }, + { + "epoch": 3.3398692810457518, + "grad_norm": 0.7714211344718933, + "learning_rate": 0.0002, + "loss": 1.122, + "step": 5110 + }, + { + "epoch": 3.34640522875817, + "grad_norm": 1.0436055660247803, + "learning_rate": 0.0002, + "loss": 1.1684, + "step": 5120 + }, + { + "epoch": 3.3529411764705883, + "grad_norm": 0.8867271542549133, + "learning_rate": 0.0002, + "loss": 1.0945, + "step": 5130 + }, + { + "epoch": 3.3594771241830066, + "grad_norm": 0.8371267914772034, + "learning_rate": 0.0002, + "loss": 1.159, + "step": 5140 + }, + { + "epoch": 3.366013071895425, + "grad_norm": 0.7257837057113647, + "learning_rate": 0.0002, + "loss": 1.1073, + "step": 5150 + }, + { + "epoch": 3.372549019607843, + "grad_norm": 0.7102002501487732, + "learning_rate": 0.0002, + "loss": 1.1162, + "step": 5160 + }, + { + "epoch": 3.3790849673202614, + "grad_norm": 0.7636350393295288, + "learning_rate": 0.0002, + "loss": 1.2056, + "step": 5170 + }, + { + "epoch": 3.3856209150326797, + "grad_norm": 0.6887359619140625, + "learning_rate": 0.0002, + "loss": 1.0708, + "step": 5180 + }, + { + "epoch": 3.392156862745098, + "grad_norm": 0.8141424655914307, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 5190 + }, + { + "epoch": 3.3986928104575163, + "grad_norm": 0.694423496723175, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 5200 + }, + { + "epoch": 3.4052287581699345, + "grad_norm": 0.914013683795929, + "learning_rate": 0.0002, + "loss": 1.2945, + "step": 5210 + }, + { + "epoch": 3.411764705882353, + "grad_norm": 0.8503239750862122, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 5220 + }, + { + "epoch": 3.418300653594771, + "grad_norm": 0.6196836233139038, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 5230 + }, + { + "epoch": 3.4248366013071894, + "grad_norm": 1.0760811567306519, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 5240 + }, + { + "epoch": 3.431372549019608, + "grad_norm": 0.6524698138237, + "learning_rate": 0.0002, + "loss": 1.1686, + "step": 5250 + }, + { + "epoch": 3.4379084967320264, + "grad_norm": 0.674467921257019, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 5260 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.7690372467041016, + "learning_rate": 0.0002, + "loss": 1.1015, + "step": 5270 + }, + { + "epoch": 3.450980392156863, + "grad_norm": 0.8751813769340515, + "learning_rate": 0.0002, + "loss": 1.2511, + "step": 5280 + }, + { + "epoch": 3.457516339869281, + "grad_norm": 0.750407874584198, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 5290 + }, + { + "epoch": 3.4640522875816995, + "grad_norm": 0.5991823077201843, + "learning_rate": 0.0002, + "loss": 1.0605, + "step": 5300 + }, + { + "epoch": 3.4705882352941178, + "grad_norm": 1.0164772272109985, + "learning_rate": 0.0002, + "loss": 1.2347, + "step": 5310 + }, + { + "epoch": 3.477124183006536, + "grad_norm": 0.8704105019569397, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 5320 + }, + { + "epoch": 3.4836601307189543, + "grad_norm": 0.709102213382721, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 5330 + }, + { + "epoch": 3.4901960784313726, + "grad_norm": 0.6273632049560547, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 5340 + }, + { + "epoch": 3.496732026143791, + "grad_norm": 0.6807359457015991, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 5350 + }, + { + "epoch": 3.503267973856209, + "grad_norm": 0.7085188627243042, + "learning_rate": 0.0002, + "loss": 1.131, + "step": 5360 + }, + { + "epoch": 3.5098039215686274, + "grad_norm": 0.6938307881355286, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 5370 + }, + { + "epoch": 3.5163398692810457, + "grad_norm": 0.8544146418571472, + "learning_rate": 0.0002, + "loss": 1.1397, + "step": 5380 + }, + { + "epoch": 3.522875816993464, + "grad_norm": 0.7889642119407654, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 5390 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.7858421206474304, + "learning_rate": 0.0002, + "loss": 1.1691, + "step": 5400 + }, + { + "epoch": 3.5359477124183005, + "grad_norm": 0.8547123074531555, + "learning_rate": 0.0002, + "loss": 1.2374, + "step": 5410 + }, + { + "epoch": 3.542483660130719, + "grad_norm": 0.8218181133270264, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 5420 + }, + { + "epoch": 3.549019607843137, + "grad_norm": 1.153623342514038, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 5430 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.1321099996566772, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 5440 + }, + { + "epoch": 3.5620915032679736, + "grad_norm": 0.9495334029197693, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 5450 + }, + { + "epoch": 3.568627450980392, + "grad_norm": 0.8743821978569031, + "learning_rate": 0.0002, + "loss": 1.2869, + "step": 5460 + }, + { + "epoch": 3.57516339869281, + "grad_norm": 0.7513086795806885, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 5470 + }, + { + "epoch": 3.581699346405229, + "grad_norm": 1.0139480829238892, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 5480 + }, + { + "epoch": 3.588235294117647, + "grad_norm": 0.6615135073661804, + "learning_rate": 0.0002, + "loss": 1.1706, + "step": 5490 + }, + { + "epoch": 3.5947712418300655, + "grad_norm": 1.180798888206482, + "learning_rate": 0.0002, + "loss": 1.3906, + "step": 5500 + }, + { + "epoch": 3.6013071895424837, + "grad_norm": 0.7085279226303101, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 5510 + }, + { + "epoch": 3.607843137254902, + "grad_norm": 0.540268063545227, + "learning_rate": 0.0002, + "loss": 1.1623, + "step": 5520 + }, + { + "epoch": 3.6143790849673203, + "grad_norm": 0.7905671000480652, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 5530 + }, + { + "epoch": 3.6209150326797386, + "grad_norm": 0.8457717299461365, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 5540 + }, + { + "epoch": 3.627450980392157, + "grad_norm": 0.7102677822113037, + "learning_rate": 0.0002, + "loss": 1.1799, + "step": 5550 + }, + { + "epoch": 3.633986928104575, + "grad_norm": 0.7179514765739441, + "learning_rate": 0.0002, + "loss": 1.2394, + "step": 5560 + }, + { + "epoch": 3.6405228758169934, + "grad_norm": 1.0854148864746094, + "learning_rate": 0.0002, + "loss": 1.2019, + "step": 5570 + }, + { + "epoch": 3.6470588235294117, + "grad_norm": 0.8209951519966125, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 5580 + }, + { + "epoch": 3.65359477124183, + "grad_norm": 0.6944138407707214, + "learning_rate": 0.0002, + "loss": 1.2289, + "step": 5590 + }, + { + "epoch": 3.6601307189542482, + "grad_norm": 0.7675473093986511, + "learning_rate": 0.0002, + "loss": 1.3226, + "step": 5600 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.6683364510536194, + "learning_rate": 0.0002, + "loss": 1.2866, + "step": 5610 + }, + { + "epoch": 3.6732026143790852, + "grad_norm": 0.7920727133750916, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 5620 + }, + { + "epoch": 3.6797385620915035, + "grad_norm": 0.9440218806266785, + "learning_rate": 0.0002, + "loss": 1.2287, + "step": 5630 + }, + { + "epoch": 3.686274509803922, + "grad_norm": 0.6600824594497681, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 5640 + }, + { + "epoch": 3.69281045751634, + "grad_norm": 0.6860619187355042, + "learning_rate": 0.0002, + "loss": 1.191, + "step": 5650 + }, + { + "epoch": 3.6993464052287583, + "grad_norm": 0.6579713225364685, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 5660 + }, + { + "epoch": 3.7058823529411766, + "grad_norm": 0.661081075668335, + "learning_rate": 0.0002, + "loss": 1.1464, + "step": 5670 + }, + { + "epoch": 3.712418300653595, + "grad_norm": 1.0968825817108154, + "learning_rate": 0.0002, + "loss": 1.289, + "step": 5680 + }, + { + "epoch": 3.718954248366013, + "grad_norm": 0.8066844940185547, + "learning_rate": 0.0002, + "loss": 1.192, + "step": 5690 + }, + { + "epoch": 3.7254901960784315, + "grad_norm": 0.8341682553291321, + "learning_rate": 0.0002, + "loss": 1.2322, + "step": 5700 + }, + { + "epoch": 3.7320261437908497, + "grad_norm": 0.6682852506637573, + "learning_rate": 0.0002, + "loss": 1.1473, + "step": 5710 + }, + { + "epoch": 3.738562091503268, + "grad_norm": 0.898595929145813, + "learning_rate": 0.0002, + "loss": 1.1566, + "step": 5720 + }, + { + "epoch": 3.7450980392156863, + "grad_norm": 0.6876054406166077, + "learning_rate": 0.0002, + "loss": 1.0919, + "step": 5730 + }, + { + "epoch": 3.7516339869281046, + "grad_norm": 0.7817103266716003, + "learning_rate": 0.0002, + "loss": 1.2302, + "step": 5740 + }, + { + "epoch": 3.758169934640523, + "grad_norm": 0.5840168595314026, + "learning_rate": 0.0002, + "loss": 1.2439, + "step": 5750 + }, + { + "epoch": 3.764705882352941, + "grad_norm": 0.6263918876647949, + "learning_rate": 0.0002, + "loss": 1.1279, + "step": 5760 + }, + { + "epoch": 3.7712418300653594, + "grad_norm": 0.7948952317237854, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 5770 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.6700998544692993, + "learning_rate": 0.0002, + "loss": 1.149, + "step": 5780 + }, + { + "epoch": 3.784313725490196, + "grad_norm": 1.1169519424438477, + "learning_rate": 0.0002, + "loss": 1.3207, + "step": 5790 + }, + { + "epoch": 3.7908496732026142, + "grad_norm": 0.8354471325874329, + "learning_rate": 0.0002, + "loss": 1.064, + "step": 5800 + }, + { + "epoch": 3.7973856209150325, + "grad_norm": 0.6304181814193726, + "learning_rate": 0.0002, + "loss": 1.2104, + "step": 5810 + }, + { + "epoch": 3.803921568627451, + "grad_norm": 0.6919655799865723, + "learning_rate": 0.0002, + "loss": 1.2059, + "step": 5820 + }, + { + "epoch": 3.810457516339869, + "grad_norm": 0.600385844707489, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 5830 + }, + { + "epoch": 3.8169934640522873, + "grad_norm": 0.8406319618225098, + "learning_rate": 0.0002, + "loss": 1.2324, + "step": 5840 + }, + { + "epoch": 3.8235294117647056, + "grad_norm": 0.7594282031059265, + "learning_rate": 0.0002, + "loss": 1.2418, + "step": 5850 + }, + { + "epoch": 3.8300653594771243, + "grad_norm": 0.8179879784584045, + "learning_rate": 0.0002, + "loss": 1.1903, + "step": 5860 + }, + { + "epoch": 3.8366013071895426, + "grad_norm": 1.141430377960205, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 5870 + }, + { + "epoch": 3.843137254901961, + "grad_norm": 0.6595550775527954, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 5880 + }, + { + "epoch": 3.849673202614379, + "grad_norm": 0.7499435544013977, + "learning_rate": 0.0002, + "loss": 1.2378, + "step": 5890 + }, + { + "epoch": 3.8562091503267975, + "grad_norm": 0.7851517200469971, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 5900 + }, + { + "epoch": 3.8627450980392157, + "grad_norm": 1.0533545017242432, + "learning_rate": 0.0002, + "loss": 1.162, + "step": 5910 + }, + { + "epoch": 3.869281045751634, + "grad_norm": 0.960086464881897, + "learning_rate": 0.0002, + "loss": 1.3576, + "step": 5920 + }, + { + "epoch": 3.8758169934640523, + "grad_norm": 0.9952049851417542, + "learning_rate": 0.0002, + "loss": 1.151, + "step": 5930 + }, + { + "epoch": 3.8823529411764706, + "grad_norm": 0.7884191274642944, + "learning_rate": 0.0002, + "loss": 1.2027, + "step": 5940 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.7461766600608826, + "learning_rate": 0.0002, + "loss": 1.1796, + "step": 5950 + }, + { + "epoch": 3.895424836601307, + "grad_norm": 0.9594355821609497, + "learning_rate": 0.0002, + "loss": 1.2251, + "step": 5960 + }, + { + "epoch": 3.9019607843137254, + "grad_norm": 0.8179471492767334, + "learning_rate": 0.0002, + "loss": 1.1164, + "step": 5970 + }, + { + "epoch": 3.9084967320261437, + "grad_norm": 0.8240267634391785, + "learning_rate": 0.0002, + "loss": 1.2421, + "step": 5980 + }, + { + "epoch": 3.915032679738562, + "grad_norm": 0.7462618350982666, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 5990 + }, + { + "epoch": 3.9215686274509802, + "grad_norm": 0.711207389831543, + "learning_rate": 0.0002, + "loss": 1.2124, + "step": 6000 + }, + { + "epoch": 3.928104575163399, + "grad_norm": 0.6910956501960754, + "learning_rate": 0.0002, + "loss": 1.2119, + "step": 6010 + }, + { + "epoch": 3.9346405228758172, + "grad_norm": 0.749093770980835, + "learning_rate": 0.0002, + "loss": 1.2127, + "step": 6020 + }, + { + "epoch": 3.9411764705882355, + "grad_norm": 1.3332762718200684, + "learning_rate": 0.0002, + "loss": 1.1542, + "step": 6030 + }, + { + "epoch": 3.947712418300654, + "grad_norm": 0.71457439661026, + "learning_rate": 0.0002, + "loss": 1.1442, + "step": 6040 + }, + { + "epoch": 3.954248366013072, + "grad_norm": 1.1205238103866577, + "learning_rate": 0.0002, + "loss": 1.339, + "step": 6050 + }, + { + "epoch": 3.9607843137254903, + "grad_norm": 0.6958928108215332, + "learning_rate": 0.0002, + "loss": 1.2962, + "step": 6060 + }, + { + "epoch": 3.9673202614379086, + "grad_norm": 0.7518056035041809, + "learning_rate": 0.0002, + "loss": 1.1802, + "step": 6070 + }, + { + "epoch": 3.973856209150327, + "grad_norm": 0.8010755777359009, + "learning_rate": 0.0002, + "loss": 1.1179, + "step": 6080 + }, + { + "epoch": 3.980392156862745, + "grad_norm": 0.7492658495903015, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 6090 + }, + { + "epoch": 3.9869281045751634, + "grad_norm": 0.900704562664032, + "learning_rate": 0.0002, + "loss": 1.2113, + "step": 6100 + }, + { + "epoch": 3.9934640522875817, + "grad_norm": 0.7997331619262695, + "learning_rate": 0.0002, + "loss": 1.1106, + "step": 6110 + }, + { + "epoch": 4.0, + "grad_norm": 0.7163209319114685, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 6120 + }, + { + "epoch": 4.0, + "eval_loss": 1.4113320112228394, + "eval_runtime": 33.7199, + "eval_samples_per_second": 12.93, + "eval_steps_per_second": 1.631, + "step": 6120 + }, + { + "epoch": 4.006535947712418, + "grad_norm": 0.9527022838592529, + "learning_rate": 0.0002, + "loss": 1.0423, + "step": 6130 + }, + { + "epoch": 4.0130718954248366, + "grad_norm": 0.7603210210800171, + "learning_rate": 0.0002, + "loss": 1.101, + "step": 6140 + }, + { + "epoch": 4.019607843137255, + "grad_norm": 1.127387523651123, + "learning_rate": 0.0002, + "loss": 1.1834, + "step": 6150 + }, + { + "epoch": 4.026143790849673, + "grad_norm": 0.8290133476257324, + "learning_rate": 0.0002, + "loss": 1.0734, + "step": 6160 + }, + { + "epoch": 4.032679738562091, + "grad_norm": 0.9912241101264954, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 6170 + }, + { + "epoch": 4.03921568627451, + "grad_norm": 0.947005033493042, + "learning_rate": 0.0002, + "loss": 1.0719, + "step": 6180 + }, + { + "epoch": 4.045751633986928, + "grad_norm": 0.707466185092926, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 6190 + }, + { + "epoch": 4.052287581699346, + "grad_norm": 1.0604327917099, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 6200 + }, + { + "epoch": 4.0588235294117645, + "grad_norm": 0.7848685383796692, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 6210 + }, + { + "epoch": 4.065359477124183, + "grad_norm": 0.8475256562232971, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 6220 + }, + { + "epoch": 4.071895424836601, + "grad_norm": 0.9759448766708374, + "learning_rate": 0.0002, + "loss": 1.1104, + "step": 6230 + }, + { + "epoch": 4.078431372549019, + "grad_norm": 0.9324519038200378, + "learning_rate": 0.0002, + "loss": 1.1538, + "step": 6240 + }, + { + "epoch": 4.084967320261438, + "grad_norm": 0.8723901510238647, + "learning_rate": 0.0002, + "loss": 1.0817, + "step": 6250 + }, + { + "epoch": 4.091503267973856, + "grad_norm": 0.8343415856361389, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 6260 + }, + { + "epoch": 4.098039215686274, + "grad_norm": 0.7490310072898865, + "learning_rate": 0.0002, + "loss": 0.9887, + "step": 6270 + }, + { + "epoch": 4.104575163398692, + "grad_norm": 0.8961182832717896, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 6280 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.7124854922294617, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 6290 + }, + { + "epoch": 4.117647058823529, + "grad_norm": 0.8338138461112976, + "learning_rate": 0.0002, + "loss": 1.0081, + "step": 6300 + }, + { + "epoch": 4.124183006535947, + "grad_norm": 0.8075833320617676, + "learning_rate": 0.0002, + "loss": 1.1091, + "step": 6310 + }, + { + "epoch": 4.130718954248366, + "grad_norm": 0.8069391846656799, + "learning_rate": 0.0002, + "loss": 1.0193, + "step": 6320 + }, + { + "epoch": 4.137254901960785, + "grad_norm": 0.9567893147468567, + "learning_rate": 0.0002, + "loss": 0.948, + "step": 6330 + }, + { + "epoch": 4.143790849673203, + "grad_norm": 1.2184662818908691, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 6340 + }, + { + "epoch": 4.150326797385621, + "grad_norm": 1.030976414680481, + "learning_rate": 0.0002, + "loss": 1.0756, + "step": 6350 + }, + { + "epoch": 4.1568627450980395, + "grad_norm": 0.9749957323074341, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 6360 + }, + { + "epoch": 4.163398692810458, + "grad_norm": 0.7089483141899109, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 6370 + }, + { + "epoch": 4.169934640522876, + "grad_norm": 1.1084946393966675, + "learning_rate": 0.0002, + "loss": 1.2175, + "step": 6380 + }, + { + "epoch": 4.176470588235294, + "grad_norm": 0.7998497486114502, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 6390 + }, + { + "epoch": 4.183006535947713, + "grad_norm": 0.8997811675071716, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 6400 + }, + { + "epoch": 4.189542483660131, + "grad_norm": 0.8359479904174805, + "learning_rate": 0.0002, + "loss": 1.0704, + "step": 6410 + }, + { + "epoch": 4.196078431372549, + "grad_norm": 0.9087472558021545, + "learning_rate": 0.0002, + "loss": 1.1056, + "step": 6420 + }, + { + "epoch": 4.2026143790849675, + "grad_norm": 1.1100451946258545, + "learning_rate": 0.0002, + "loss": 1.0657, + "step": 6430 + }, + { + "epoch": 4.209150326797386, + "grad_norm": 0.9376999735832214, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 6440 + }, + { + "epoch": 4.215686274509804, + "grad_norm": 0.8179266452789307, + "learning_rate": 0.0002, + "loss": 1.0862, + "step": 6450 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.9953271746635437, + "learning_rate": 0.0002, + "loss": 1.0679, + "step": 6460 + }, + { + "epoch": 4.228758169934641, + "grad_norm": 0.8476650714874268, + "learning_rate": 0.0002, + "loss": 1.1034, + "step": 6470 + }, + { + "epoch": 4.235294117647059, + "grad_norm": 0.8406323194503784, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 6480 + }, + { + "epoch": 4.241830065359477, + "grad_norm": 0.819134533405304, + "learning_rate": 0.0002, + "loss": 1.057, + "step": 6490 + }, + { + "epoch": 4.248366013071895, + "grad_norm": 0.7764983773231506, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 6500 + }, + { + "epoch": 4.254901960784314, + "grad_norm": 0.8252112865447998, + "learning_rate": 0.0002, + "loss": 1.1593, + "step": 6510 + }, + { + "epoch": 4.261437908496732, + "grad_norm": 0.7941019535064697, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 6520 + }, + { + "epoch": 4.26797385620915, + "grad_norm": 0.7673905491828918, + "learning_rate": 0.0002, + "loss": 1.0296, + "step": 6530 + }, + { + "epoch": 4.2745098039215685, + "grad_norm": 0.8749890327453613, + "learning_rate": 0.0002, + "loss": 1.1387, + "step": 6540 + }, + { + "epoch": 4.281045751633987, + "grad_norm": 0.7343207597732544, + "learning_rate": 0.0002, + "loss": 1.0595, + "step": 6550 + }, + { + "epoch": 4.287581699346405, + "grad_norm": 1.2786651849746704, + "learning_rate": 0.0002, + "loss": 1.1715, + "step": 6560 + }, + { + "epoch": 4.294117647058823, + "grad_norm": 1.316875696182251, + "learning_rate": 0.0002, + "loss": 1.0514, + "step": 6570 + }, + { + "epoch": 4.300653594771242, + "grad_norm": 0.8349189162254333, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 6580 + }, + { + "epoch": 4.30718954248366, + "grad_norm": 0.7510647177696228, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 6590 + }, + { + "epoch": 4.313725490196078, + "grad_norm": 0.932420551776886, + "learning_rate": 0.0002, + "loss": 1.1387, + "step": 6600 + }, + { + "epoch": 4.3202614379084965, + "grad_norm": 0.8510616421699524, + "learning_rate": 0.0002, + "loss": 1.1115, + "step": 6610 + }, + { + "epoch": 4.326797385620915, + "grad_norm": 0.7661547064781189, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 6620 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 1.0370930433273315, + "learning_rate": 0.0002, + "loss": 1.2064, + "step": 6630 + }, + { + "epoch": 4.339869281045751, + "grad_norm": 0.9302158951759338, + "learning_rate": 0.0002, + "loss": 1.1064, + "step": 6640 + }, + { + "epoch": 4.34640522875817, + "grad_norm": 0.9203811883926392, + "learning_rate": 0.0002, + "loss": 0.968, + "step": 6650 + }, + { + "epoch": 4.352941176470588, + "grad_norm": 0.9986332654953003, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 6660 + }, + { + "epoch": 4.359477124183006, + "grad_norm": 0.8001713156700134, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 6670 + }, + { + "epoch": 4.366013071895424, + "grad_norm": 0.829714298248291, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 6680 + }, + { + "epoch": 4.372549019607844, + "grad_norm": 0.8253079056739807, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 6690 + }, + { + "epoch": 4.379084967320262, + "grad_norm": 0.824666440486908, + "learning_rate": 0.0002, + "loss": 1.1087, + "step": 6700 + }, + { + "epoch": 4.38562091503268, + "grad_norm": 0.8872972130775452, + "learning_rate": 0.0002, + "loss": 1.1968, + "step": 6710 + }, + { + "epoch": 4.392156862745098, + "grad_norm": 0.8729761838912964, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 6720 + }, + { + "epoch": 4.398692810457517, + "grad_norm": 1.1367264986038208, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 6730 + }, + { + "epoch": 4.405228758169935, + "grad_norm": 0.9699058532714844, + "learning_rate": 0.0002, + "loss": 1.0184, + "step": 6740 + }, + { + "epoch": 4.411764705882353, + "grad_norm": 0.8266763687133789, + "learning_rate": 0.0002, + "loss": 1.006, + "step": 6750 + }, + { + "epoch": 4.4183006535947715, + "grad_norm": 1.0249767303466797, + "learning_rate": 0.0002, + "loss": 1.0735, + "step": 6760 + }, + { + "epoch": 4.42483660130719, + "grad_norm": 0.73606938123703, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 6770 + }, + { + "epoch": 4.431372549019608, + "grad_norm": 1.4050679206848145, + "learning_rate": 0.0002, + "loss": 1.1037, + "step": 6780 + }, + { + "epoch": 4.437908496732026, + "grad_norm": 1.1114081144332886, + "learning_rate": 0.0002, + "loss": 1.1418, + "step": 6790 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.8031067848205566, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 6800 + }, + { + "epoch": 4.450980392156863, + "grad_norm": 0.8513566851615906, + "learning_rate": 0.0002, + "loss": 1.0753, + "step": 6810 + }, + { + "epoch": 4.457516339869281, + "grad_norm": 1.332741379737854, + "learning_rate": 0.0002, + "loss": 1.1852, + "step": 6820 + }, + { + "epoch": 4.4640522875816995, + "grad_norm": 1.5032578706741333, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 6830 + }, + { + "epoch": 4.470588235294118, + "grad_norm": 0.7677283883094788, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 6840 + }, + { + "epoch": 4.477124183006536, + "grad_norm": 0.989148736000061, + "learning_rate": 0.0002, + "loss": 1.1501, + "step": 6850 + }, + { + "epoch": 4.483660130718954, + "grad_norm": 1.5316275358200073, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 6860 + }, + { + "epoch": 4.490196078431373, + "grad_norm": 0.9427124261856079, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 6870 + }, + { + "epoch": 4.496732026143791, + "grad_norm": 1.215287685394287, + "learning_rate": 0.0002, + "loss": 1.1314, + "step": 6880 + }, + { + "epoch": 4.503267973856209, + "grad_norm": 0.7286760210990906, + "learning_rate": 0.0002, + "loss": 1.0809, + "step": 6890 + }, + { + "epoch": 4.509803921568627, + "grad_norm": 0.874829888343811, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 6900 + }, + { + "epoch": 4.516339869281046, + "grad_norm": 0.8058359622955322, + "learning_rate": 0.0002, + "loss": 1.0233, + "step": 6910 + }, + { + "epoch": 4.522875816993464, + "grad_norm": 1.248195767402649, + "learning_rate": 0.0002, + "loss": 1.0463, + "step": 6920 + }, + { + "epoch": 4.529411764705882, + "grad_norm": 0.8033645749092102, + "learning_rate": 0.0002, + "loss": 1.0347, + "step": 6930 + }, + { + "epoch": 4.5359477124183005, + "grad_norm": 1.7361950874328613, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 6940 + }, + { + "epoch": 4.542483660130719, + "grad_norm": 0.8058095574378967, + "learning_rate": 0.0002, + "loss": 0.9856, + "step": 6950 + }, + { + "epoch": 4.549019607843137, + "grad_norm": 1.254089593887329, + "learning_rate": 0.0002, + "loss": 1.0057, + "step": 6960 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 0.9180455803871155, + "learning_rate": 0.0002, + "loss": 1.1723, + "step": 6970 + }, + { + "epoch": 4.562091503267974, + "grad_norm": 0.6677682399749756, + "learning_rate": 0.0002, + "loss": 1.0559, + "step": 6980 + }, + { + "epoch": 4.568627450980392, + "grad_norm": 0.8127354383468628, + "learning_rate": 0.0002, + "loss": 1.0453, + "step": 6990 + }, + { + "epoch": 4.57516339869281, + "grad_norm": 1.0263001918792725, + "learning_rate": 0.0002, + "loss": 1.0828, + "step": 7000 + }, + { + "epoch": 4.5816993464052285, + "grad_norm": 0.9641909003257751, + "learning_rate": 0.0002, + "loss": 1.0703, + "step": 7010 + }, + { + "epoch": 4.588235294117647, + "grad_norm": 0.9440861344337463, + "learning_rate": 0.0002, + "loss": 1.179, + "step": 7020 + }, + { + "epoch": 4.594771241830065, + "grad_norm": 0.9539011716842651, + "learning_rate": 0.0002, + "loss": 1.0931, + "step": 7030 + }, + { + "epoch": 4.601307189542483, + "grad_norm": 1.0449910163879395, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 7040 + }, + { + "epoch": 4.607843137254902, + "grad_norm": 0.8766893744468689, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 7050 + }, + { + "epoch": 4.61437908496732, + "grad_norm": 0.6983462572097778, + "learning_rate": 0.0002, + "loss": 1.0169, + "step": 7060 + }, + { + "epoch": 4.620915032679738, + "grad_norm": 0.9505505561828613, + "learning_rate": 0.0002, + "loss": 1.1778, + "step": 7070 + }, + { + "epoch": 4.627450980392156, + "grad_norm": 1.2506657838821411, + "learning_rate": 0.0002, + "loss": 1.121, + "step": 7080 + }, + { + "epoch": 4.633986928104575, + "grad_norm": 0.9602801203727722, + "learning_rate": 0.0002, + "loss": 1.1329, + "step": 7090 + }, + { + "epoch": 4.640522875816993, + "grad_norm": 0.7398977875709534, + "learning_rate": 0.0002, + "loss": 1.1499, + "step": 7100 + }, + { + "epoch": 4.647058823529412, + "grad_norm": 1.3862425088882446, + "learning_rate": 0.0002, + "loss": 1.0769, + "step": 7110 + }, + { + "epoch": 4.65359477124183, + "grad_norm": 1.1451990604400635, + "learning_rate": 0.0002, + "loss": 1.0571, + "step": 7120 + }, + { + "epoch": 4.660130718954249, + "grad_norm": 0.9010422229766846, + "learning_rate": 0.0002, + "loss": 1.1271, + "step": 7130 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.7102518081665039, + "learning_rate": 0.0002, + "loss": 1.0165, + "step": 7140 + }, + { + "epoch": 4.673202614379085, + "grad_norm": 0.7963796257972717, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7150 + }, + { + "epoch": 4.6797385620915035, + "grad_norm": 0.7726007699966431, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 7160 + }, + { + "epoch": 4.686274509803922, + "grad_norm": 0.8097564578056335, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 7170 + }, + { + "epoch": 4.69281045751634, + "grad_norm": 0.9070925116539001, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 7180 + }, + { + "epoch": 4.699346405228758, + "grad_norm": 0.7543528079986572, + "learning_rate": 0.0002, + "loss": 1.0315, + "step": 7190 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 0.9900904893875122, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 7200 + }, + { + "epoch": 4.712418300653595, + "grad_norm": 0.8033412098884583, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 7210 + }, + { + "epoch": 4.718954248366013, + "grad_norm": 0.8440839052200317, + "learning_rate": 0.0002, + "loss": 1.1773, + "step": 7220 + }, + { + "epoch": 4.7254901960784315, + "grad_norm": 0.9325555562973022, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 7230 + }, + { + "epoch": 4.73202614379085, + "grad_norm": 0.7881146669387817, + "learning_rate": 0.0002, + "loss": 1.1384, + "step": 7240 + }, + { + "epoch": 4.738562091503268, + "grad_norm": 0.884453296661377, + "learning_rate": 0.0002, + "loss": 1.1219, + "step": 7250 + }, + { + "epoch": 4.745098039215686, + "grad_norm": 0.9274539351463318, + "learning_rate": 0.0002, + "loss": 1.1036, + "step": 7260 + }, + { + "epoch": 4.751633986928105, + "grad_norm": 1.2367479801177979, + "learning_rate": 0.0002, + "loss": 1.0906, + "step": 7270 + }, + { + "epoch": 4.758169934640523, + "grad_norm": 0.9499821066856384, + "learning_rate": 0.0002, + "loss": 1.0741, + "step": 7280 + }, + { + "epoch": 4.764705882352941, + "grad_norm": 2.1918580532073975, + "learning_rate": 0.0002, + "loss": 1.1625, + "step": 7290 + }, + { + "epoch": 4.771241830065359, + "grad_norm": 0.8221880793571472, + "learning_rate": 0.0002, + "loss": 0.954, + "step": 7300 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.871972918510437, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 7310 + }, + { + "epoch": 4.784313725490196, + "grad_norm": 0.8034510612487793, + "learning_rate": 0.0002, + "loss": 1.0599, + "step": 7320 + }, + { + "epoch": 4.790849673202614, + "grad_norm": 0.8959605693817139, + "learning_rate": 0.0002, + "loss": 1.1059, + "step": 7330 + }, + { + "epoch": 4.7973856209150325, + "grad_norm": 1.2326215505599976, + "learning_rate": 0.0002, + "loss": 1.0176, + "step": 7340 + }, + { + "epoch": 4.803921568627451, + "grad_norm": 0.9725791811943054, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 7350 + }, + { + "epoch": 4.810457516339869, + "grad_norm": 0.7240816354751587, + "learning_rate": 0.0002, + "loss": 1.1229, + "step": 7360 + }, + { + "epoch": 4.816993464052287, + "grad_norm": 0.8265769481658936, + "learning_rate": 0.0002, + "loss": 1.0669, + "step": 7370 + }, + { + "epoch": 4.823529411764706, + "grad_norm": 0.8888696432113647, + "learning_rate": 0.0002, + "loss": 1.042, + "step": 7380 + }, + { + "epoch": 4.830065359477124, + "grad_norm": 0.7776556015014648, + "learning_rate": 0.0002, + "loss": 1.0981, + "step": 7390 + }, + { + "epoch": 4.836601307189542, + "grad_norm": 0.8772371411323547, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7400 + }, + { + "epoch": 4.8431372549019605, + "grad_norm": 0.9786531925201416, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7410 + }, + { + "epoch": 4.849673202614379, + "grad_norm": 0.9059745073318481, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 7420 + }, + { + "epoch": 4.856209150326797, + "grad_norm": 0.7422552108764648, + "learning_rate": 0.0002, + "loss": 1.0324, + "step": 7430 + }, + { + "epoch": 4.862745098039216, + "grad_norm": 1.3040380477905273, + "learning_rate": 0.0002, + "loss": 1.0423, + "step": 7440 + }, + { + "epoch": 4.8692810457516345, + "grad_norm": 1.3278473615646362, + "learning_rate": 0.0002, + "loss": 1.1161, + "step": 7450 + }, + { + "epoch": 4.875816993464053, + "grad_norm": 1.2705849409103394, + "learning_rate": 0.0002, + "loss": 1.0713, + "step": 7460 + }, + { + "epoch": 4.882352941176471, + "grad_norm": 0.8837892413139343, + "learning_rate": 0.0002, + "loss": 1.0034, + "step": 7470 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.8670691251754761, + "learning_rate": 0.0002, + "loss": 1.1716, + "step": 7480 + }, + { + "epoch": 4.895424836601308, + "grad_norm": 0.9662758111953735, + "learning_rate": 0.0002, + "loss": 1.1723, + "step": 7490 + }, + { + "epoch": 4.901960784313726, + "grad_norm": 0.8188302516937256, + "learning_rate": 0.0002, + "loss": 1.1056, + "step": 7500 + }, + { + "epoch": 4.908496732026144, + "grad_norm": 0.769442617893219, + "learning_rate": 0.0002, + "loss": 1.0419, + "step": 7510 + }, + { + "epoch": 4.915032679738562, + "grad_norm": 1.1465084552764893, + "learning_rate": 0.0002, + "loss": 1.1671, + "step": 7520 + }, + { + "epoch": 4.921568627450981, + "grad_norm": 1.253214955329895, + "learning_rate": 0.0002, + "loss": 1.0768, + "step": 7530 + }, + { + "epoch": 4.928104575163399, + "grad_norm": 0.7922375202178955, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 7540 + }, + { + "epoch": 4.934640522875817, + "grad_norm": 0.8306851387023926, + "learning_rate": 0.0002, + "loss": 1.1256, + "step": 7550 + }, + { + "epoch": 4.9411764705882355, + "grad_norm": 0.8486151099205017, + "learning_rate": 0.0002, + "loss": 1.206, + "step": 7560 + }, + { + "epoch": 4.947712418300654, + "grad_norm": 1.2601467370986938, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 7570 + }, + { + "epoch": 4.954248366013072, + "grad_norm": 0.7980747818946838, + "learning_rate": 0.0002, + "loss": 1.1078, + "step": 7580 + }, + { + "epoch": 4.96078431372549, + "grad_norm": 0.8653254508972168, + "learning_rate": 0.0002, + "loss": 1.0607, + "step": 7590 + }, + { + "epoch": 4.967320261437909, + "grad_norm": 0.9680571556091309, + "learning_rate": 0.0002, + "loss": 1.0292, + "step": 7600 + }, + { + "epoch": 4.973856209150327, + "grad_norm": 0.9554466605186462, + "learning_rate": 0.0002, + "loss": 1.1795, + "step": 7610 + }, + { + "epoch": 4.980392156862745, + "grad_norm": 1.3693897724151611, + "learning_rate": 0.0002, + "loss": 1.0935, + "step": 7620 + }, + { + "epoch": 4.9869281045751634, + "grad_norm": 0.7809282541275024, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 7630 + }, + { + "epoch": 4.993464052287582, + "grad_norm": 0.7528006434440613, + "learning_rate": 0.0002, + "loss": 1.0844, + "step": 7640 + }, + { + "epoch": 5.0, + "grad_norm": 1.7491309642791748, + "learning_rate": 0.0002, + "loss": 0.9951, + "step": 7650 + }, + { + "epoch": 5.0, + "eval_loss": 1.4197258949279785, + "eval_runtime": 33.6327, + "eval_samples_per_second": 12.964, + "eval_steps_per_second": 1.635, + "step": 7650 + }, + { + "epoch": 5.006535947712418, + "grad_norm": 0.8840063214302063, + "learning_rate": 0.0002, + "loss": 0.9744, + "step": 7660 + }, + { + "epoch": 5.0130718954248366, + "grad_norm": 1.0118401050567627, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 7670 + }, + { + "epoch": 5.019607843137255, + "grad_norm": 1.0040518045425415, + "learning_rate": 0.0002, + "loss": 1.1667, + "step": 7680 + }, + { + "epoch": 5.026143790849673, + "grad_norm": 0.7541199922561646, + "learning_rate": 0.0002, + "loss": 0.9426, + "step": 7690 + }, + { + "epoch": 5.032679738562091, + "grad_norm": 0.9106482863426208, + "learning_rate": 0.0002, + "loss": 1.0797, + "step": 7700 + }, + { + "epoch": 5.03921568627451, + "grad_norm": 1.3691469430923462, + "learning_rate": 0.0002, + "loss": 1.0096, + "step": 7710 + }, + { + "epoch": 5.045751633986928, + "grad_norm": 0.9449689388275146, + "learning_rate": 0.0002, + "loss": 0.9889, + "step": 7720 + }, + { + "epoch": 5.052287581699346, + "grad_norm": 1.1678508520126343, + "learning_rate": 0.0002, + "loss": 0.9087, + "step": 7730 + }, + { + "epoch": 5.0588235294117645, + "grad_norm": 1.1296145915985107, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 7740 + }, + { + "epoch": 5.065359477124183, + "grad_norm": 0.7863904237747192, + "learning_rate": 0.0002, + "loss": 0.9339, + "step": 7750 + }, + { + "epoch": 5.071895424836601, + "grad_norm": 0.8691433072090149, + "learning_rate": 0.0002, + "loss": 1.0135, + "step": 7760 + }, + { + "epoch": 5.078431372549019, + "grad_norm": 1.0722088813781738, + "learning_rate": 0.0002, + "loss": 0.9776, + "step": 7770 + }, + { + "epoch": 5.084967320261438, + "grad_norm": 0.9625038504600525, + "learning_rate": 0.0002, + "loss": 1.0595, + "step": 7780 + }, + { + "epoch": 5.091503267973856, + "grad_norm": 1.2618783712387085, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 7790 + }, + { + "epoch": 5.098039215686274, + "grad_norm": 0.9970650672912598, + "learning_rate": 0.0002, + "loss": 0.9396, + "step": 7800 + }, + { + "epoch": 5.104575163398692, + "grad_norm": 1.3946677446365356, + "learning_rate": 0.0002, + "loss": 0.9186, + "step": 7810 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 1.0260052680969238, + "learning_rate": 0.0002, + "loss": 0.9957, + "step": 7820 + }, + { + "epoch": 5.117647058823529, + "grad_norm": 1.105521559715271, + "learning_rate": 0.0002, + "loss": 0.9865, + "step": 7830 + }, + { + "epoch": 5.124183006535947, + "grad_norm": 1.003641128540039, + "learning_rate": 0.0002, + "loss": 0.9788, + "step": 7840 + }, + { + "epoch": 5.130718954248366, + "grad_norm": 1.0315021276474, + "learning_rate": 0.0002, + "loss": 0.9688, + "step": 7850 + }, + { + "epoch": 5.137254901960785, + "grad_norm": 0.9469530582427979, + "learning_rate": 0.0002, + "loss": 1.0001, + "step": 7860 + }, + { + "epoch": 5.143790849673203, + "grad_norm": 1.3244667053222656, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 7870 + }, + { + "epoch": 5.150326797385621, + "grad_norm": 1.1732033491134644, + "learning_rate": 0.0002, + "loss": 0.9657, + "step": 7880 + }, + { + "epoch": 5.1568627450980395, + "grad_norm": 1.3129149675369263, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 7890 + }, + { + "epoch": 5.163398692810458, + "grad_norm": 0.8589454293251038, + "learning_rate": 0.0002, + "loss": 0.9894, + "step": 7900 + }, + { + "epoch": 5.169934640522876, + "grad_norm": 0.8954233527183533, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 7910 + }, + { + "epoch": 5.176470588235294, + "grad_norm": 0.7426522970199585, + "learning_rate": 0.0002, + "loss": 0.8741, + "step": 7920 + }, + { + "epoch": 5.183006535947713, + "grad_norm": 1.1990121603012085, + "learning_rate": 0.0002, + "loss": 1.0106, + "step": 7930 + }, + { + "epoch": 5.189542483660131, + "grad_norm": 0.8867580890655518, + "learning_rate": 0.0002, + "loss": 0.9453, + "step": 7940 + }, + { + "epoch": 5.196078431372549, + "grad_norm": 1.016276478767395, + "learning_rate": 0.0002, + "loss": 0.9727, + "step": 7950 + }, + { + "epoch": 5.2026143790849675, + "grad_norm": 1.0210685729980469, + "learning_rate": 0.0002, + "loss": 0.9908, + "step": 7960 + }, + { + "epoch": 5.209150326797386, + "grad_norm": 1.0093122720718384, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 7970 + }, + { + "epoch": 5.215686274509804, + "grad_norm": 0.9746801853179932, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 7980 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 0.9113537073135376, + "learning_rate": 0.0002, + "loss": 1.0611, + "step": 7990 + }, + { + "epoch": 5.228758169934641, + "grad_norm": 1.2782206535339355, + "learning_rate": 0.0002, + "loss": 0.9167, + "step": 8000 + }, + { + "epoch": 5.235294117647059, + "grad_norm": 1.3223118782043457, + "learning_rate": 0.0002, + "loss": 1.0212, + "step": 8010 + }, + { + "epoch": 5.241830065359477, + "grad_norm": 0.7898629307746887, + "learning_rate": 0.0002, + "loss": 0.9244, + "step": 8020 + }, + { + "epoch": 5.248366013071895, + "grad_norm": 0.9822350740432739, + "learning_rate": 0.0002, + "loss": 1.0574, + "step": 8030 + }, + { + "epoch": 5.254901960784314, + "grad_norm": 1.5114340782165527, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 8040 + }, + { + "epoch": 5.261437908496732, + "grad_norm": 0.859006941318512, + "learning_rate": 0.0002, + "loss": 0.9816, + "step": 8050 + }, + { + "epoch": 5.26797385620915, + "grad_norm": 1.0495043992996216, + "learning_rate": 0.0002, + "loss": 0.9445, + "step": 8060 + }, + { + "epoch": 5.2745098039215685, + "grad_norm": 1.329483151435852, + "learning_rate": 0.0002, + "loss": 0.9724, + "step": 8070 + }, + { + "epoch": 5.281045751633987, + "grad_norm": 1.1333061456680298, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 8080 + }, + { + "epoch": 5.287581699346405, + "grad_norm": 0.8153108358383179, + "learning_rate": 0.0002, + "loss": 0.9577, + "step": 8090 + }, + { + "epoch": 5.294117647058823, + "grad_norm": 0.9395004510879517, + "learning_rate": 0.0002, + "loss": 0.9002, + "step": 8100 + }, + { + "epoch": 5.300653594771242, + "grad_norm": 0.8907593488693237, + "learning_rate": 0.0002, + "loss": 1.0371, + "step": 8110 + }, + { + "epoch": 5.30718954248366, + "grad_norm": 0.9808667898178101, + "learning_rate": 0.0002, + "loss": 0.9301, + "step": 8120 + }, + { + "epoch": 5.313725490196078, + "grad_norm": 0.984779417514801, + "learning_rate": 0.0002, + "loss": 1.0136, + "step": 8130 + }, + { + "epoch": 5.3202614379084965, + "grad_norm": 0.9787270426750183, + "learning_rate": 0.0002, + "loss": 0.9621, + "step": 8140 + }, + { + "epoch": 5.326797385620915, + "grad_norm": 0.9857710599899292, + "learning_rate": 0.0002, + "loss": 0.9336, + "step": 8150 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.9774303436279297, + "learning_rate": 0.0002, + "loss": 0.9884, + "step": 8160 + }, + { + "epoch": 5.339869281045751, + "grad_norm": 0.677925169467926, + "learning_rate": 0.0002, + "loss": 1.0561, + "step": 8170 + }, + { + "epoch": 5.34640522875817, + "grad_norm": 0.9576456546783447, + "learning_rate": 0.0002, + "loss": 1.1345, + "step": 8180 + }, + { + "epoch": 5.352941176470588, + "grad_norm": 1.8970937728881836, + "learning_rate": 0.0002, + "loss": 0.9554, + "step": 8190 + }, + { + "epoch": 5.359477124183006, + "grad_norm": 0.9458389282226562, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 8200 + }, + { + "epoch": 5.366013071895424, + "grad_norm": 1.761794924736023, + "learning_rate": 0.0002, + "loss": 1.0365, + "step": 8210 + }, + { + "epoch": 5.372549019607844, + "grad_norm": 1.0693724155426025, + "learning_rate": 0.0002, + "loss": 0.9426, + "step": 8220 + }, + { + "epoch": 5.379084967320262, + "grad_norm": 0.9025877714157104, + "learning_rate": 0.0002, + "loss": 1.0299, + "step": 8230 + }, + { + "epoch": 5.38562091503268, + "grad_norm": 1.258857250213623, + "learning_rate": 0.0002, + "loss": 0.9652, + "step": 8240 + }, + { + "epoch": 5.392156862745098, + "grad_norm": 1.084849238395691, + "learning_rate": 0.0002, + "loss": 0.9735, + "step": 8250 + }, + { + "epoch": 5.398692810457517, + "grad_norm": 0.9530340433120728, + "learning_rate": 0.0002, + "loss": 0.9999, + "step": 8260 + }, + { + "epoch": 5.405228758169935, + "grad_norm": 0.830240786075592, + "learning_rate": 0.0002, + "loss": 1.0268, + "step": 8270 + }, + { + "epoch": 5.411764705882353, + "grad_norm": 1.5807015895843506, + "learning_rate": 0.0002, + "loss": 1.0332, + "step": 8280 + }, + { + "epoch": 5.4183006535947715, + "grad_norm": 0.9486905336380005, + "learning_rate": 0.0002, + "loss": 0.9146, + "step": 8290 + }, + { + "epoch": 5.42483660130719, + "grad_norm": 1.0415093898773193, + "learning_rate": 0.0002, + "loss": 1.0336, + "step": 8300 + }, + { + "epoch": 5.431372549019608, + "grad_norm": 1.0501102209091187, + "learning_rate": 0.0002, + "loss": 0.8933, + "step": 8310 + }, + { + "epoch": 5.437908496732026, + "grad_norm": 0.9751836061477661, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 8320 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 1.5529173612594604, + "learning_rate": 0.0002, + "loss": 1.0755, + "step": 8330 + }, + { + "epoch": 5.450980392156863, + "grad_norm": 0.8314350247383118, + "learning_rate": 0.0002, + "loss": 0.9814, + "step": 8340 + }, + { + "epoch": 5.457516339869281, + "grad_norm": 1.2555103302001953, + "learning_rate": 0.0002, + "loss": 1.0596, + "step": 8350 + }, + { + "epoch": 5.4640522875816995, + "grad_norm": 0.9408367872238159, + "learning_rate": 0.0002, + "loss": 1.0127, + "step": 8360 + }, + { + "epoch": 5.470588235294118, + "grad_norm": 0.9483312964439392, + "learning_rate": 0.0002, + "loss": 0.9241, + "step": 8370 + }, + { + "epoch": 5.477124183006536, + "grad_norm": 0.957905650138855, + "learning_rate": 0.0002, + "loss": 0.9678, + "step": 8380 + }, + { + "epoch": 5.483660130718954, + "grad_norm": 1.4000147581100464, + "learning_rate": 0.0002, + "loss": 1.0985, + "step": 8390 + }, + { + "epoch": 5.490196078431373, + "grad_norm": 1.7032461166381836, + "learning_rate": 0.0002, + "loss": 0.9966, + "step": 8400 + }, + { + "epoch": 5.496732026143791, + "grad_norm": 0.8978716731071472, + "learning_rate": 0.0002, + "loss": 0.9539, + "step": 8410 + }, + { + "epoch": 5.503267973856209, + "grad_norm": 0.8659300804138184, + "learning_rate": 0.0002, + "loss": 0.9544, + "step": 8420 + }, + { + "epoch": 5.509803921568627, + "grad_norm": 1.3629727363586426, + "learning_rate": 0.0002, + "loss": 1.0526, + "step": 8430 + }, + { + "epoch": 5.516339869281046, + "grad_norm": 1.2741984128952026, + "learning_rate": 0.0002, + "loss": 0.9696, + "step": 8440 + }, + { + "epoch": 5.522875816993464, + "grad_norm": 1.3867180347442627, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 8450 + }, + { + "epoch": 5.529411764705882, + "grad_norm": 1.0662001371383667, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 8460 + }, + { + "epoch": 5.5359477124183005, + "grad_norm": 1.7005380392074585, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 8470 + }, + { + "epoch": 5.542483660130719, + "grad_norm": 1.3730385303497314, + "learning_rate": 0.0002, + "loss": 1.0221, + "step": 8480 + }, + { + "epoch": 5.549019607843137, + "grad_norm": 1.7737441062927246, + "learning_rate": 0.0002, + "loss": 0.9586, + "step": 8490 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.907487690448761, + "learning_rate": 0.0002, + "loss": 0.9729, + "step": 8500 + }, + { + "epoch": 5.562091503267974, + "grad_norm": 0.8882441520690918, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 8510 + }, + { + "epoch": 5.568627450980392, + "grad_norm": 0.8655388951301575, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 8520 + }, + { + "epoch": 5.57516339869281, + "grad_norm": 1.379992961883545, + "learning_rate": 0.0002, + "loss": 0.9523, + "step": 8530 + }, + { + "epoch": 5.5816993464052285, + "grad_norm": 1.0021201372146606, + "learning_rate": 0.0002, + "loss": 1.0174, + "step": 8540 + }, + { + "epoch": 5.588235294117647, + "grad_norm": 1.2636926174163818, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 8550 + }, + { + "epoch": 5.594771241830065, + "grad_norm": 1.279025912284851, + "learning_rate": 0.0002, + "loss": 1.0243, + "step": 8560 + }, + { + "epoch": 5.601307189542483, + "grad_norm": 0.8885834217071533, + "learning_rate": 0.0002, + "loss": 0.9917, + "step": 8570 + }, + { + "epoch": 5.607843137254902, + "grad_norm": 1.1975032091140747, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 8580 + }, + { + "epoch": 5.61437908496732, + "grad_norm": 1.005470871925354, + "learning_rate": 0.0002, + "loss": 1.0363, + "step": 8590 + }, + { + "epoch": 5.620915032679738, + "grad_norm": 1.104286551475525, + "learning_rate": 0.0002, + "loss": 0.9947, + "step": 8600 + }, + { + "epoch": 5.627450980392156, + "grad_norm": 1.435445785522461, + "learning_rate": 0.0002, + "loss": 1.0585, + "step": 8610 + }, + { + "epoch": 5.633986928104575, + "grad_norm": 1.0270172357559204, + "learning_rate": 0.0002, + "loss": 0.9156, + "step": 8620 + }, + { + "epoch": 5.640522875816993, + "grad_norm": 1.0929527282714844, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 8630 + }, + { + "epoch": 5.647058823529412, + "grad_norm": 1.1061221361160278, + "learning_rate": 0.0002, + "loss": 0.9694, + "step": 8640 + }, + { + "epoch": 5.65359477124183, + "grad_norm": 0.9563149213790894, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 8650 + }, + { + "epoch": 5.660130718954249, + "grad_norm": 1.0434954166412354, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 8660 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 1.3695117235183716, + "learning_rate": 0.0002, + "loss": 0.9463, + "step": 8670 + }, + { + "epoch": 5.673202614379085, + "grad_norm": 1.0540564060211182, + "learning_rate": 0.0002, + "loss": 0.9441, + "step": 8680 + }, + { + "epoch": 5.6797385620915035, + "grad_norm": 1.5942492485046387, + "learning_rate": 0.0002, + "loss": 0.9755, + "step": 8690 + }, + { + "epoch": 5.686274509803922, + "grad_norm": 0.9485495090484619, + "learning_rate": 0.0002, + "loss": 1.0071, + "step": 8700 + }, + { + "epoch": 5.69281045751634, + "grad_norm": 1.1483162641525269, + "learning_rate": 0.0002, + "loss": 0.9998, + "step": 8710 + }, + { + "epoch": 5.699346405228758, + "grad_norm": 0.9075471758842468, + "learning_rate": 0.0002, + "loss": 0.9578, + "step": 8720 + }, + { + "epoch": 5.705882352941177, + "grad_norm": 1.7908551692962646, + "learning_rate": 0.0002, + "loss": 0.9488, + "step": 8730 + }, + { + "epoch": 5.712418300653595, + "grad_norm": 0.8867162466049194, + "learning_rate": 0.0002, + "loss": 1.0163, + "step": 8740 + }, + { + "epoch": 5.718954248366013, + "grad_norm": 1.7165148258209229, + "learning_rate": 0.0002, + "loss": 1.0041, + "step": 8750 + }, + { + "epoch": 5.7254901960784315, + "grad_norm": 0.9529356956481934, + "learning_rate": 0.0002, + "loss": 1.1061, + "step": 8760 + }, + { + "epoch": 5.73202614379085, + "grad_norm": 1.01852548122406, + "learning_rate": 0.0002, + "loss": 1.1119, + "step": 8770 + }, + { + "epoch": 5.738562091503268, + "grad_norm": 0.9538423418998718, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 8780 + }, + { + "epoch": 5.745098039215686, + "grad_norm": 0.9007737636566162, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 8790 + }, + { + "epoch": 5.751633986928105, + "grad_norm": 0.9107874035835266, + "learning_rate": 0.0002, + "loss": 0.9766, + "step": 8800 + }, + { + "epoch": 5.758169934640523, + "grad_norm": 0.7379238605499268, + "learning_rate": 0.0002, + "loss": 0.9212, + "step": 8810 + }, + { + "epoch": 5.764705882352941, + "grad_norm": 1.072645902633667, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 8820 + }, + { + "epoch": 5.771241830065359, + "grad_norm": 1.002008080482483, + "learning_rate": 0.0002, + "loss": 1.0845, + "step": 8830 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 1.0435924530029297, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 8840 + }, + { + "epoch": 5.784313725490196, + "grad_norm": 0.9874551296234131, + "learning_rate": 0.0002, + "loss": 0.9458, + "step": 8850 + }, + { + "epoch": 5.790849673202614, + "grad_norm": 1.1729662418365479, + "learning_rate": 0.0002, + "loss": 1.1241, + "step": 8860 + }, + { + "epoch": 5.7973856209150325, + "grad_norm": 1.3300775289535522, + "learning_rate": 0.0002, + "loss": 1.0451, + "step": 8870 + }, + { + "epoch": 5.803921568627451, + "grad_norm": 1.612707257270813, + "learning_rate": 0.0002, + "loss": 1.0989, + "step": 8880 + }, + { + "epoch": 5.810457516339869, + "grad_norm": 0.9047797322273254, + "learning_rate": 0.0002, + "loss": 0.9119, + "step": 8890 + }, + { + "epoch": 5.816993464052287, + "grad_norm": 1.0958741903305054, + "learning_rate": 0.0002, + "loss": 0.989, + "step": 8900 + }, + { + "epoch": 5.823529411764706, + "grad_norm": 1.0099612474441528, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 8910 + }, + { + "epoch": 5.830065359477124, + "grad_norm": 0.8442328572273254, + "learning_rate": 0.0002, + "loss": 1.0623, + "step": 8920 + }, + { + "epoch": 5.836601307189542, + "grad_norm": 1.1388301849365234, + "learning_rate": 0.0002, + "loss": 0.9134, + "step": 8930 + }, + { + "epoch": 5.8431372549019605, + "grad_norm": 0.8296026587486267, + "learning_rate": 0.0002, + "loss": 1.0019, + "step": 8940 + }, + { + "epoch": 5.849673202614379, + "grad_norm": 1.0843533277511597, + "learning_rate": 0.0002, + "loss": 1.0363, + "step": 8950 + }, + { + "epoch": 5.856209150326797, + "grad_norm": 0.8496834635734558, + "learning_rate": 0.0002, + "loss": 1.0009, + "step": 8960 + }, + { + "epoch": 5.862745098039216, + "grad_norm": 1.6894690990447998, + "learning_rate": 0.0002, + "loss": 0.9927, + "step": 8970 + }, + { + "epoch": 5.8692810457516345, + "grad_norm": 1.0012282133102417, + "learning_rate": 0.0002, + "loss": 1.0939, + "step": 8980 + }, + { + "epoch": 5.875816993464053, + "grad_norm": 0.8521103262901306, + "learning_rate": 0.0002, + "loss": 0.9722, + "step": 8990 + }, + { + "epoch": 5.882352941176471, + "grad_norm": 1.246841311454773, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 9000 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.9941892027854919, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 9010 + }, + { + "epoch": 5.895424836601308, + "grad_norm": 1.067413568496704, + "learning_rate": 0.0002, + "loss": 0.8754, + "step": 9020 + }, + { + "epoch": 5.901960784313726, + "grad_norm": 1.0045088529586792, + "learning_rate": 0.0002, + "loss": 1.0153, + "step": 9030 + }, + { + "epoch": 5.908496732026144, + "grad_norm": 1.383063554763794, + "learning_rate": 0.0002, + "loss": 1.0134, + "step": 9040 + }, + { + "epoch": 5.915032679738562, + "grad_norm": 0.8754428625106812, + "learning_rate": 0.0002, + "loss": 1.0845, + "step": 9050 + }, + { + "epoch": 5.921568627450981, + "grad_norm": 0.8577388525009155, + "learning_rate": 0.0002, + "loss": 0.9571, + "step": 9060 + }, + { + "epoch": 5.928104575163399, + "grad_norm": 0.8718975186347961, + "learning_rate": 0.0002, + "loss": 1.0532, + "step": 9070 + }, + { + "epoch": 5.934640522875817, + "grad_norm": 1.1762131452560425, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 9080 + }, + { + "epoch": 5.9411764705882355, + "grad_norm": 1.1025866270065308, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 9090 + }, + { + "epoch": 5.947712418300654, + "grad_norm": 1.0439870357513428, + "learning_rate": 0.0002, + "loss": 0.9155, + "step": 9100 + }, + { + "epoch": 5.954248366013072, + "grad_norm": 1.2411525249481201, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 9110 + }, + { + "epoch": 5.96078431372549, + "grad_norm": 1.0317714214324951, + "learning_rate": 0.0002, + "loss": 0.9747, + "step": 9120 + }, + { + "epoch": 5.967320261437909, + "grad_norm": 0.9880492091178894, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 9130 + }, + { + "epoch": 5.973856209150327, + "grad_norm": 0.9039815664291382, + "learning_rate": 0.0002, + "loss": 1.0459, + "step": 9140 + }, + { + "epoch": 5.980392156862745, + "grad_norm": 0.9049116373062134, + "learning_rate": 0.0002, + "loss": 1.0413, + "step": 9150 + }, + { + "epoch": 5.9869281045751634, + "grad_norm": 0.996749222278595, + "learning_rate": 0.0002, + "loss": 0.9792, + "step": 9160 + }, + { + "epoch": 5.993464052287582, + "grad_norm": 0.8716062307357788, + "learning_rate": 0.0002, + "loss": 0.8857, + "step": 9170 + }, + { + "epoch": 6.0, + "grad_norm": 1.3081294298171997, + "learning_rate": 0.0002, + "loss": 1.019, + "step": 9180 + }, + { + "epoch": 6.0, + "eval_loss": 1.45111083984375, + "eval_runtime": 34.7121, + "eval_samples_per_second": 12.56, + "eval_steps_per_second": 1.584, + "step": 9180 + }, + { + "epoch": 6.006535947712418, + "grad_norm": 1.1378029584884644, + "learning_rate": 0.0002, + "loss": 0.9306, + "step": 9190 + }, + { + "epoch": 6.0130718954248366, + "grad_norm": 1.2921233177185059, + "learning_rate": 0.0002, + "loss": 0.8794, + "step": 9200 + }, + { + "epoch": 6.019607843137255, + "grad_norm": 1.039211630821228, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 9210 + }, + { + "epoch": 6.026143790849673, + "grad_norm": 0.9715196490287781, + "learning_rate": 0.0002, + "loss": 0.8524, + "step": 9220 + }, + { + "epoch": 6.032679738562091, + "grad_norm": 1.220642328262329, + "learning_rate": 0.0002, + "loss": 1.035, + "step": 9230 + }, + { + "epoch": 6.03921568627451, + "grad_norm": 0.854360044002533, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 9240 + }, + { + "epoch": 6.045751633986928, + "grad_norm": 0.8806933164596558, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 9250 + }, + { + "epoch": 6.052287581699346, + "grad_norm": 1.4315874576568604, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 9260 + }, + { + "epoch": 6.0588235294117645, + "grad_norm": 0.9382007122039795, + "learning_rate": 0.0002, + "loss": 0.8462, + "step": 9270 + }, + { + "epoch": 6.065359477124183, + "grad_norm": 1.2184561491012573, + "learning_rate": 0.0002, + "loss": 0.9653, + "step": 9280 + }, + { + "epoch": 6.071895424836601, + "grad_norm": 1.2331548929214478, + "learning_rate": 0.0002, + "loss": 0.8806, + "step": 9290 + }, + { + "epoch": 6.078431372549019, + "grad_norm": 1.1112796068191528, + "learning_rate": 0.0002, + "loss": 0.8354, + "step": 9300 + }, + { + "epoch": 6.084967320261438, + "grad_norm": 1.4753731489181519, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 9310 + }, + { + "epoch": 6.091503267973856, + "grad_norm": 1.2783401012420654, + "learning_rate": 0.0002, + "loss": 0.9198, + "step": 9320 + }, + { + "epoch": 6.098039215686274, + "grad_norm": 0.9916909337043762, + "learning_rate": 0.0002, + "loss": 0.8294, + "step": 9330 + }, + { + "epoch": 6.104575163398692, + "grad_norm": 0.9300099015235901, + "learning_rate": 0.0002, + "loss": 0.876, + "step": 9340 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 1.4985264539718628, + "learning_rate": 0.0002, + "loss": 0.9064, + "step": 9350 + }, + { + "epoch": 6.117647058823529, + "grad_norm": 1.276380181312561, + "learning_rate": 0.0002, + "loss": 1.0106, + "step": 9360 + }, + { + "epoch": 6.124183006535947, + "grad_norm": 1.181113600730896, + "learning_rate": 0.0002, + "loss": 0.9068, + "step": 9370 + }, + { + "epoch": 6.130718954248366, + "grad_norm": 1.698729395866394, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 9380 + }, + { + "epoch": 6.137254901960785, + "grad_norm": 0.9793189764022827, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 9390 + }, + { + "epoch": 6.143790849673203, + "grad_norm": 1.1942132711410522, + "learning_rate": 0.0002, + "loss": 0.9731, + "step": 9400 + }, + { + "epoch": 6.150326797385621, + "grad_norm": 1.2160184383392334, + "learning_rate": 0.0002, + "loss": 0.8762, + "step": 9410 + }, + { + "epoch": 6.1568627450980395, + "grad_norm": 1.0802825689315796, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 9420 + }, + { + "epoch": 6.163398692810458, + "grad_norm": 3.024529218673706, + "learning_rate": 0.0002, + "loss": 0.9055, + "step": 9430 + }, + { + "epoch": 6.169934640522876, + "grad_norm": 0.975062370300293, + "learning_rate": 0.0002, + "loss": 0.8739, + "step": 9440 + }, + { + "epoch": 6.176470588235294, + "grad_norm": 0.9243306517601013, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 9450 + }, + { + "epoch": 6.183006535947713, + "grad_norm": 0.8892099857330322, + "learning_rate": 0.0002, + "loss": 0.947, + "step": 9460 + }, + { + "epoch": 6.189542483660131, + "grad_norm": 1.4151731729507446, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 9470 + }, + { + "epoch": 6.196078431372549, + "grad_norm": 1.064701795578003, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 9480 + }, + { + "epoch": 6.2026143790849675, + "grad_norm": 1.1104519367218018, + "learning_rate": 0.0002, + "loss": 0.906, + "step": 9490 + }, + { + "epoch": 6.209150326797386, + "grad_norm": 1.4788947105407715, + "learning_rate": 0.0002, + "loss": 0.9572, + "step": 9500 + }, + { + "epoch": 6.215686274509804, + "grad_norm": 0.7976077795028687, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 9510 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 1.256864070892334, + "learning_rate": 0.0002, + "loss": 0.886, + "step": 9520 + }, + { + "epoch": 6.228758169934641, + "grad_norm": 1.3874554634094238, + "learning_rate": 0.0002, + "loss": 0.9104, + "step": 9530 + }, + { + "epoch": 6.235294117647059, + "grad_norm": 1.9012963771820068, + "learning_rate": 0.0002, + "loss": 0.8583, + "step": 9540 + }, + { + "epoch": 6.241830065359477, + "grad_norm": 1.275212287902832, + "learning_rate": 0.0002, + "loss": 0.9585, + "step": 9550 + }, + { + "epoch": 6.248366013071895, + "grad_norm": 1.1007417440414429, + "learning_rate": 0.0002, + "loss": 0.8416, + "step": 9560 + }, + { + "epoch": 6.254901960784314, + "grad_norm": 1.0602147579193115, + "learning_rate": 0.0002, + "loss": 0.9191, + "step": 9570 + }, + { + "epoch": 6.261437908496732, + "grad_norm": 1.2276418209075928, + "learning_rate": 0.0002, + "loss": 0.909, + "step": 9580 + }, + { + "epoch": 6.26797385620915, + "grad_norm": 1.0111924409866333, + "learning_rate": 0.0002, + "loss": 0.9363, + "step": 9590 + }, + { + "epoch": 6.2745098039215685, + "grad_norm": 0.9031485915184021, + "learning_rate": 0.0002, + "loss": 0.9941, + "step": 9600 + }, + { + "epoch": 6.281045751633987, + "grad_norm": 0.9893783926963806, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 9610 + }, + { + "epoch": 6.287581699346405, + "grad_norm": 1.1979725360870361, + "learning_rate": 0.0002, + "loss": 0.9114, + "step": 9620 + }, + { + "epoch": 6.294117647058823, + "grad_norm": 1.380516767501831, + "learning_rate": 0.0002, + "loss": 0.8858, + "step": 9630 + }, + { + "epoch": 6.300653594771242, + "grad_norm": 1.1370083093643188, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 9640 + }, + { + "epoch": 6.30718954248366, + "grad_norm": 1.4091558456420898, + "learning_rate": 0.0002, + "loss": 0.9073, + "step": 9650 + }, + { + "epoch": 6.313725490196078, + "grad_norm": 1.0670944452285767, + "learning_rate": 0.0002, + "loss": 0.9096, + "step": 9660 + }, + { + "epoch": 6.3202614379084965, + "grad_norm": 0.9150263667106628, + "learning_rate": 0.0002, + "loss": 0.9376, + "step": 9670 + }, + { + "epoch": 6.326797385620915, + "grad_norm": 1.1342853307724, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 9680 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 1.2733415365219116, + "learning_rate": 0.0002, + "loss": 1.002, + "step": 9690 + }, + { + "epoch": 6.339869281045751, + "grad_norm": 1.3647292852401733, + "learning_rate": 0.0002, + "loss": 0.9579, + "step": 9700 + }, + { + "epoch": 6.34640522875817, + "grad_norm": 1.0435094833374023, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 9710 + }, + { + "epoch": 6.352941176470588, + "grad_norm": 1.3641071319580078, + "learning_rate": 0.0002, + "loss": 0.8812, + "step": 9720 + }, + { + "epoch": 6.359477124183006, + "grad_norm": 1.2806159257888794, + "learning_rate": 0.0002, + "loss": 0.8888, + "step": 9730 + }, + { + "epoch": 6.366013071895424, + "grad_norm": 1.0193076133728027, + "learning_rate": 0.0002, + "loss": 0.9481, + "step": 9740 + }, + { + "epoch": 6.372549019607844, + "grad_norm": 1.2349408864974976, + "learning_rate": 0.0002, + "loss": 0.931, + "step": 9750 + }, + { + "epoch": 6.379084967320262, + "grad_norm": 1.2062549591064453, + "learning_rate": 0.0002, + "loss": 0.8837, + "step": 9760 + }, + { + "epoch": 6.38562091503268, + "grad_norm": 1.4402194023132324, + "learning_rate": 0.0002, + "loss": 0.8947, + "step": 9770 + }, + { + "epoch": 6.392156862745098, + "grad_norm": 1.1730891466140747, + "learning_rate": 0.0002, + "loss": 0.8724, + "step": 9780 + }, + { + "epoch": 6.398692810457517, + "grad_norm": 1.1481093168258667, + "learning_rate": 0.0002, + "loss": 0.9005, + "step": 9790 + }, + { + "epoch": 6.405228758169935, + "grad_norm": 1.0012723207473755, + "learning_rate": 0.0002, + "loss": 0.9431, + "step": 9800 + }, + { + "epoch": 6.411764705882353, + "grad_norm": 0.8839848041534424, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 9810 + }, + { + "epoch": 6.4183006535947715, + "grad_norm": 1.096693992614746, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 9820 + }, + { + "epoch": 6.42483660130719, + "grad_norm": 1.4713369607925415, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 9830 + }, + { + "epoch": 6.431372549019608, + "grad_norm": 1.2529761791229248, + "learning_rate": 0.0002, + "loss": 0.9563, + "step": 9840 + }, + { + "epoch": 6.437908496732026, + "grad_norm": 1.5575600862503052, + "learning_rate": 0.0002, + "loss": 0.8551, + "step": 9850 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 1.2188916206359863, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 9860 + }, + { + "epoch": 6.450980392156863, + "grad_norm": 1.1558794975280762, + "learning_rate": 0.0002, + "loss": 0.9132, + "step": 9870 + }, + { + "epoch": 6.457516339869281, + "grad_norm": 1.1506937742233276, + "learning_rate": 0.0002, + "loss": 0.8632, + "step": 9880 + }, + { + "epoch": 6.4640522875816995, + "grad_norm": 1.1168335676193237, + "learning_rate": 0.0002, + "loss": 1.0575, + "step": 9890 + }, + { + "epoch": 6.470588235294118, + "grad_norm": 1.192449688911438, + "learning_rate": 0.0002, + "loss": 0.99, + "step": 9900 + }, + { + "epoch": 6.477124183006536, + "grad_norm": 1.0451104640960693, + "learning_rate": 0.0002, + "loss": 0.9478, + "step": 9910 + }, + { + "epoch": 6.483660130718954, + "grad_norm": 1.1111775636672974, + "learning_rate": 0.0002, + "loss": 0.9034, + "step": 9920 + }, + { + "epoch": 6.490196078431373, + "grad_norm": 1.2094531059265137, + "learning_rate": 0.0002, + "loss": 0.8971, + "step": 9930 + }, + { + "epoch": 6.496732026143791, + "grad_norm": 1.0547380447387695, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 9940 + }, + { + "epoch": 6.503267973856209, + "grad_norm": 1.5547202825546265, + "learning_rate": 0.0002, + "loss": 1.0727, + "step": 9950 + }, + { + "epoch": 6.509803921568627, + "grad_norm": 1.1917903423309326, + "learning_rate": 0.0002, + "loss": 0.9109, + "step": 9960 + }, + { + "epoch": 6.516339869281046, + "grad_norm": 1.0918153524398804, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 9970 + }, + { + "epoch": 6.522875816993464, + "grad_norm": 1.146968960762024, + "learning_rate": 0.0002, + "loss": 0.8752, + "step": 9980 + }, + { + "epoch": 6.529411764705882, + "grad_norm": 0.9899234771728516, + "learning_rate": 0.0002, + "loss": 0.9593, + "step": 9990 + }, + { + "epoch": 6.5359477124183005, + "grad_norm": 2.160924196243286, + "learning_rate": 0.0002, + "loss": 0.91, + "step": 10000 + }, + { + "epoch": 6.542483660130719, + "grad_norm": 1.6366891860961914, + "learning_rate": 0.0002, + "loss": 0.9683, + "step": 10010 + }, + { + "epoch": 6.549019607843137, + "grad_norm": 0.9876762628555298, + "learning_rate": 0.0002, + "loss": 0.8582, + "step": 10020 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 1.5622549057006836, + "learning_rate": 0.0002, + "loss": 0.8385, + "step": 10030 + }, + { + "epoch": 6.562091503267974, + "grad_norm": 1.0108020305633545, + "learning_rate": 0.0002, + "loss": 0.8791, + "step": 10040 + }, + { + "epoch": 6.568627450980392, + "grad_norm": 1.0725725889205933, + "learning_rate": 0.0002, + "loss": 0.9574, + "step": 10050 + }, + { + "epoch": 6.57516339869281, + "grad_norm": 1.1551216840744019, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 10060 + }, + { + "epoch": 6.5816993464052285, + "grad_norm": 1.5174646377563477, + "learning_rate": 0.0002, + "loss": 0.8199, + "step": 10070 + }, + { + "epoch": 6.588235294117647, + "grad_norm": 1.041877031326294, + "learning_rate": 0.0002, + "loss": 0.8203, + "step": 10080 + }, + { + "epoch": 6.594771241830065, + "grad_norm": 0.9939621686935425, + "learning_rate": 0.0002, + "loss": 0.9684, + "step": 10090 + }, + { + "epoch": 6.601307189542483, + "grad_norm": 1.2706589698791504, + "learning_rate": 0.0002, + "loss": 0.9324, + "step": 10100 + }, + { + "epoch": 6.607843137254902, + "grad_norm": 1.1071467399597168, + "learning_rate": 0.0002, + "loss": 0.9614, + "step": 10110 + }, + { + "epoch": 6.61437908496732, + "grad_norm": 0.9449541568756104, + "learning_rate": 0.0002, + "loss": 0.9747, + "step": 10120 + }, + { + "epoch": 6.620915032679738, + "grad_norm": 1.0961830615997314, + "learning_rate": 0.0002, + "loss": 0.9557, + "step": 10130 + }, + { + "epoch": 6.627450980392156, + "grad_norm": 1.7726300954818726, + "learning_rate": 0.0002, + "loss": 0.9865, + "step": 10140 + }, + { + "epoch": 6.633986928104575, + "grad_norm": 1.2345516681671143, + "learning_rate": 0.0002, + "loss": 0.9657, + "step": 10150 + }, + { + "epoch": 6.640522875816993, + "grad_norm": 1.2062907218933105, + "learning_rate": 0.0002, + "loss": 0.9573, + "step": 10160 + }, + { + "epoch": 6.647058823529412, + "grad_norm": 1.029327154159546, + "learning_rate": 0.0002, + "loss": 0.918, + "step": 10170 + }, + { + "epoch": 6.65359477124183, + "grad_norm": 1.442307710647583, + "learning_rate": 0.0002, + "loss": 0.9211, + "step": 10180 + }, + { + "epoch": 6.660130718954249, + "grad_norm": 1.2579066753387451, + "learning_rate": 0.0002, + "loss": 0.8924, + "step": 10190 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.4563188552856445, + "learning_rate": 0.0002, + "loss": 0.9836, + "step": 10200 + }, + { + "epoch": 6.673202614379085, + "grad_norm": 0.9699450135231018, + "learning_rate": 0.0002, + "loss": 0.8876, + "step": 10210 + }, + { + "epoch": 6.6797385620915035, + "grad_norm": 1.812523603439331, + "learning_rate": 0.0002, + "loss": 0.9589, + "step": 10220 + }, + { + "epoch": 6.686274509803922, + "grad_norm": 1.124000906944275, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 10230 + }, + { + "epoch": 6.69281045751634, + "grad_norm": 1.0957475900650024, + "learning_rate": 0.0002, + "loss": 0.8924, + "step": 10240 + }, + { + "epoch": 6.699346405228758, + "grad_norm": 0.989689826965332, + "learning_rate": 0.0002, + "loss": 0.8891, + "step": 10250 + }, + { + "epoch": 6.705882352941177, + "grad_norm": 1.4353317022323608, + "learning_rate": 0.0002, + "loss": 0.9049, + "step": 10260 + }, + { + "epoch": 6.712418300653595, + "grad_norm": 1.0245451927185059, + "learning_rate": 0.0002, + "loss": 0.9311, + "step": 10270 + }, + { + "epoch": 6.718954248366013, + "grad_norm": 1.097334861755371, + "learning_rate": 0.0002, + "loss": 0.8814, + "step": 10280 + }, + { + "epoch": 6.7254901960784315, + "grad_norm": 0.982356071472168, + "learning_rate": 0.0002, + "loss": 0.9927, + "step": 10290 + }, + { + "epoch": 6.73202614379085, + "grad_norm": 1.8842819929122925, + "learning_rate": 0.0002, + "loss": 0.9909, + "step": 10300 + }, + { + "epoch": 6.738562091503268, + "grad_norm": 0.8648947477340698, + "learning_rate": 0.0002, + "loss": 0.9286, + "step": 10310 + }, + { + "epoch": 6.745098039215686, + "grad_norm": 1.1510577201843262, + "learning_rate": 0.0002, + "loss": 0.987, + "step": 10320 + }, + { + "epoch": 6.751633986928105, + "grad_norm": 1.874495506286621, + "learning_rate": 0.0002, + "loss": 0.9217, + "step": 10330 + }, + { + "epoch": 6.758169934640523, + "grad_norm": 1.1126408576965332, + "learning_rate": 0.0002, + "loss": 0.8914, + "step": 10340 + }, + { + "epoch": 6.764705882352941, + "grad_norm": 1.6654644012451172, + "learning_rate": 0.0002, + "loss": 0.8508, + "step": 10350 + }, + { + "epoch": 6.771241830065359, + "grad_norm": 1.0699580907821655, + "learning_rate": 0.0002, + "loss": 0.9653, + "step": 10360 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 0.9460757374763489, + "learning_rate": 0.0002, + "loss": 0.882, + "step": 10370 + }, + { + "epoch": 6.784313725490196, + "grad_norm": 1.2553058862686157, + "learning_rate": 0.0002, + "loss": 0.9589, + "step": 10380 + }, + { + "epoch": 6.790849673202614, + "grad_norm": 1.0939891338348389, + "learning_rate": 0.0002, + "loss": 0.8782, + "step": 10390 + }, + { + "epoch": 6.7973856209150325, + "grad_norm": 1.0647451877593994, + "learning_rate": 0.0002, + "loss": 0.9189, + "step": 10400 + }, + { + "epoch": 6.803921568627451, + "grad_norm": 1.0954521894454956, + "learning_rate": 0.0002, + "loss": 0.9478, + "step": 10410 + }, + { + "epoch": 6.810457516339869, + "grad_norm": 1.4371392726898193, + "learning_rate": 0.0002, + "loss": 1.0385, + "step": 10420 + }, + { + "epoch": 6.816993464052287, + "grad_norm": 1.0063464641571045, + "learning_rate": 0.0002, + "loss": 1.0024, + "step": 10430 + }, + { + "epoch": 6.823529411764706, + "grad_norm": 1.5189263820648193, + "learning_rate": 0.0002, + "loss": 0.8737, + "step": 10440 + }, + { + "epoch": 6.830065359477124, + "grad_norm": 0.9715501070022583, + "learning_rate": 0.0002, + "loss": 0.9246, + "step": 10450 + }, + { + "epoch": 6.836601307189542, + "grad_norm": 1.114586353302002, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 10460 + }, + { + "epoch": 6.8431372549019605, + "grad_norm": 1.2991431951522827, + "learning_rate": 0.0002, + "loss": 1.0081, + "step": 10470 + }, + { + "epoch": 6.849673202614379, + "grad_norm": 1.203114628791809, + "learning_rate": 0.0002, + "loss": 0.9323, + "step": 10480 + }, + { + "epoch": 6.856209150326797, + "grad_norm": 1.476167917251587, + "learning_rate": 0.0002, + "loss": 1.0032, + "step": 10490 + }, + { + "epoch": 6.862745098039216, + "grad_norm": 1.0933326482772827, + "learning_rate": 0.0002, + "loss": 1.0275, + "step": 10500 + }, + { + "epoch": 6.8692810457516345, + "grad_norm": 1.2831504344940186, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 10510 + }, + { + "epoch": 6.875816993464053, + "grad_norm": 1.1967637538909912, + "learning_rate": 0.0002, + "loss": 0.9973, + "step": 10520 + }, + { + "epoch": 6.882352941176471, + "grad_norm": 1.1276888847351074, + "learning_rate": 0.0002, + "loss": 0.9549, + "step": 10530 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 1.2680490016937256, + "learning_rate": 0.0002, + "loss": 0.9568, + "step": 10540 + }, + { + "epoch": 6.895424836601308, + "grad_norm": 1.5469038486480713, + "learning_rate": 0.0002, + "loss": 0.9177, + "step": 10550 + }, + { + "epoch": 6.901960784313726, + "grad_norm": 1.1731038093566895, + "learning_rate": 0.0002, + "loss": 0.8545, + "step": 10560 + }, + { + "epoch": 6.908496732026144, + "grad_norm": 0.968008816242218, + "learning_rate": 0.0002, + "loss": 0.9795, + "step": 10570 + }, + { + "epoch": 6.915032679738562, + "grad_norm": 0.9082416892051697, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 10580 + }, + { + "epoch": 6.921568627450981, + "grad_norm": 1.5816899538040161, + "learning_rate": 0.0002, + "loss": 0.9898, + "step": 10590 + }, + { + "epoch": 6.928104575163399, + "grad_norm": 0.9462234377861023, + "learning_rate": 0.0002, + "loss": 0.9692, + "step": 10600 + }, + { + "epoch": 6.934640522875817, + "grad_norm": 1.4950200319290161, + "learning_rate": 0.0002, + "loss": 1.0193, + "step": 10610 + }, + { + "epoch": 6.9411764705882355, + "grad_norm": 1.2929182052612305, + "learning_rate": 0.0002, + "loss": 0.8888, + "step": 10620 + }, + { + "epoch": 6.947712418300654, + "grad_norm": 1.2995754480361938, + "learning_rate": 0.0002, + "loss": 1.0141, + "step": 10630 + }, + { + "epoch": 6.954248366013072, + "grad_norm": 0.9407122135162354, + "learning_rate": 0.0002, + "loss": 0.9863, + "step": 10640 + }, + { + "epoch": 6.96078431372549, + "grad_norm": 1.1735378503799438, + "learning_rate": 0.0002, + "loss": 0.9041, + "step": 10650 + }, + { + "epoch": 6.967320261437909, + "grad_norm": 0.9937344193458557, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 10660 + }, + { + "epoch": 6.973856209150327, + "grad_norm": 1.2498728036880493, + "learning_rate": 0.0002, + "loss": 0.9577, + "step": 10670 + }, + { + "epoch": 6.980392156862745, + "grad_norm": 1.0513341426849365, + "learning_rate": 0.0002, + "loss": 1.0504, + "step": 10680 + }, + { + "epoch": 6.9869281045751634, + "grad_norm": 1.4611467123031616, + "learning_rate": 0.0002, + "loss": 0.9259, + "step": 10690 + }, + { + "epoch": 6.993464052287582, + "grad_norm": 1.2924799919128418, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 10700 + }, + { + "epoch": 7.0, + "grad_norm": 1.2024929523468018, + "learning_rate": 0.0002, + "loss": 0.8953, + "step": 10710 + }, + { + "epoch": 7.0, + "eval_loss": 1.4972445964813232, + "eval_runtime": 33.6225, + "eval_samples_per_second": 12.967, + "eval_steps_per_second": 1.636, + "step": 10710 + } + ], + "logging_steps": 10, + "max_steps": 12240, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3089758074896384e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..35e722282419bcef977427e4d3675fe3b94ec688 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-10710/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc25f531ee37172f22a819ab79094fe89aae41504e4c8b696743b5e23d9e7641 +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2f3483670b3b00ac0ed25619f0eb0c0b7112637a --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6b24de5485ae306b5ee4c7c4a1602e656d518d50ee3702da6aaec99610e4485 +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e54e9f92fac05bb989d1f470ae4f0352aef0a4b3 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bde8adc9844a4c4ade5d56781e28a8b72ff6c6ae4f98411f82fe68cdd6a6e416 +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ce4b2ba7fe1da609c7bfdda3f3ff828aa4fb8618 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01f5fa33db12a535a6a8d2cad85866729a0401e18eeecab9c6ee76c55d5da392 +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..093465e351511c3a482f0ece5eb88f5b5d01d841 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7f3f90db975d017768edcd4c27b014f37a6b88072ed40497cd25be5a074c7da +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8d96ba531cd696f91804f09d3c1d441a9a723cd8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/trainer_state.json @@ -0,0 +1,8665 @@ +{ + "best_metric": 1.4113320112228394, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120", + "epoch": 8.0, + "eval_steps": 10, + "global_step": 12240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006535947712418301, + "grad_norm": 1.5105072259902954, + "learning_rate": 0.0002, + "loss": 4.7451, + "step": 10 + }, + { + "epoch": 0.013071895424836602, + "grad_norm": 2.1156165599823, + "learning_rate": 0.0002, + "loss": 3.3158, + "step": 20 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 1.0578808784484863, + "learning_rate": 0.0002, + "loss": 2.643, + "step": 30 + }, + { + "epoch": 0.026143790849673203, + "grad_norm": 2.725064516067505, + "learning_rate": 0.0002, + "loss": 2.3948, + "step": 40 + }, + { + "epoch": 0.032679738562091505, + "grad_norm": 2.9575750827789307, + "learning_rate": 0.0002, + "loss": 2.3134, + "step": 50 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 1.2158117294311523, + "learning_rate": 0.0002, + "loss": 2.2778, + "step": 60 + }, + { + "epoch": 0.0457516339869281, + "grad_norm": 1.0850954055786133, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05228758169934641, + "grad_norm": 1.299196720123291, + "learning_rate": 0.0002, + "loss": 1.8872, + "step": 80 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 0.8310191035270691, + "learning_rate": 0.0002, + "loss": 1.947, + "step": 90 + }, + { + "epoch": 0.06535947712418301, + "grad_norm": 0.9854435920715332, + "learning_rate": 0.0002, + "loss": 1.9098, + "step": 100 + }, + { + "epoch": 0.0718954248366013, + "grad_norm": 0.7951157689094543, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 110 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 0.7593062520027161, + "learning_rate": 0.0002, + "loss": 1.9035, + "step": 120 + }, + { + "epoch": 0.08496732026143791, + "grad_norm": 0.6783032417297363, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 130 + }, + { + "epoch": 0.0915032679738562, + "grad_norm": 0.8350756764411926, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 140 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 1.0203173160552979, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 150 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 0.8820539712905884, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 160 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.7286128997802734, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 170 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.7874041795730591, + "learning_rate": 0.0002, + "loss": 1.8841, + "step": 180 + }, + { + "epoch": 0.12418300653594772, + "grad_norm": 0.6630475521087646, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 190 + }, + { + "epoch": 0.13071895424836602, + "grad_norm": 0.686413586139679, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 200 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 0.7793629765510559, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 210 + }, + { + "epoch": 0.1437908496732026, + "grad_norm": 0.6893141865730286, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 220 + }, + { + "epoch": 0.1503267973856209, + "grad_norm": 0.5804724097251892, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 230 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 0.6053574085235596, + "learning_rate": 0.0002, + "loss": 1.5578, + "step": 240 + }, + { + "epoch": 0.16339869281045752, + "grad_norm": 0.7566025853157043, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 250 + }, + { + "epoch": 0.16993464052287582, + "grad_norm": 0.6112990975379944, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 260 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.6839066743850708, + "learning_rate": 0.0002, + "loss": 1.5564, + "step": 270 + }, + { + "epoch": 0.1830065359477124, + "grad_norm": 0.6368117928504944, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 280 + }, + { + "epoch": 0.1895424836601307, + "grad_norm": 0.6144475936889648, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 290 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 0.6743767261505127, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 300 + }, + { + "epoch": 0.20261437908496732, + "grad_norm": 0.6807955503463745, + "learning_rate": 0.0002, + "loss": 1.421, + "step": 310 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 0.6717963814735413, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 320 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 0.5917780995368958, + "learning_rate": 0.0002, + "loss": 1.6842, + "step": 330 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.6783658862113953, + "learning_rate": 0.0002, + "loss": 1.6264, + "step": 340 + }, + { + "epoch": 0.22875816993464052, + "grad_norm": 0.5820256471633911, + "learning_rate": 0.0002, + "loss": 1.4635, + "step": 350 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.5345938801765442, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 360 + }, + { + "epoch": 0.24183006535947713, + "grad_norm": 0.755929172039032, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 370 + }, + { + "epoch": 0.24836601307189543, + "grad_norm": 0.6183189749717712, + "learning_rate": 0.0002, + "loss": 1.5177, + "step": 380 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 0.7277782559394836, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 390 + }, + { + "epoch": 0.26143790849673204, + "grad_norm": 0.9998756051063538, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 400 + }, + { + "epoch": 0.2679738562091503, + "grad_norm": 0.7523853778839111, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 410 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 0.6548714637756348, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 420 + }, + { + "epoch": 0.28104575163398693, + "grad_norm": 0.6979796290397644, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 430 + }, + { + "epoch": 0.2875816993464052, + "grad_norm": 0.840915322303772, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 440 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.6142978072166443, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 450 + }, + { + "epoch": 0.3006535947712418, + "grad_norm": 0.9482691884040833, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 460 + }, + { + "epoch": 0.30718954248366015, + "grad_norm": 0.7001156806945801, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 470 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.6665455102920532, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 480 + }, + { + "epoch": 0.3202614379084967, + "grad_norm": 0.6012697815895081, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 490 + }, + { + "epoch": 0.32679738562091504, + "grad_norm": 0.8770062327384949, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 500 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.7029962539672852, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 510 + }, + { + "epoch": 0.33986928104575165, + "grad_norm": 0.6682832837104797, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 520 + }, + { + "epoch": 0.3464052287581699, + "grad_norm": 0.5548969507217407, + "learning_rate": 0.0002, + "loss": 1.4242, + "step": 530 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.6640702486038208, + "learning_rate": 0.0002, + "loss": 1.5081, + "step": 540 + }, + { + "epoch": 0.35947712418300654, + "grad_norm": 0.656292200088501, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 550 + }, + { + "epoch": 0.3660130718954248, + "grad_norm": 0.618910551071167, + "learning_rate": 0.0002, + "loss": 1.5415, + "step": 560 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 0.644859790802002, + "learning_rate": 0.0002, + "loss": 1.5178, + "step": 570 + }, + { + "epoch": 0.3790849673202614, + "grad_norm": 0.679042398929596, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 580 + }, + { + "epoch": 0.38562091503267976, + "grad_norm": 0.980681836605072, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 590 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.632219672203064, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 600 + }, + { + "epoch": 0.39869281045751637, + "grad_norm": 0.7003744840621948, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 610 + }, + { + "epoch": 0.40522875816993464, + "grad_norm": 0.7090577483177185, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 620 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 0.657819926738739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 630 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 0.7034208178520203, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 640 + }, + { + "epoch": 0.42483660130718953, + "grad_norm": 0.7274866104125977, + "learning_rate": 0.0002, + "loss": 1.5357, + "step": 650 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 0.5876233577728271, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 660 + }, + { + "epoch": 0.43790849673202614, + "grad_norm": 0.595494270324707, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 670 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.8253804445266724, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 680 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 0.652225911617279, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 690 + }, + { + "epoch": 0.45751633986928103, + "grad_norm": 0.6242014169692993, + "learning_rate": 0.0002, + "loss": 1.5419, + "step": 700 + }, + { + "epoch": 0.46405228758169936, + "grad_norm": 0.7283986210823059, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 710 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.7016081213951111, + "learning_rate": 0.0002, + "loss": 1.43, + "step": 720 + }, + { + "epoch": 0.477124183006536, + "grad_norm": 0.5211893916130066, + "learning_rate": 0.0002, + "loss": 1.4626, + "step": 730 + }, + { + "epoch": 0.48366013071895425, + "grad_norm": 0.6221150159835815, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 740 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 0.76594477891922, + "learning_rate": 0.0002, + "loss": 1.5677, + "step": 750 + }, + { + "epoch": 0.49673202614379086, + "grad_norm": 0.5777859091758728, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 760 + }, + { + "epoch": 0.5032679738562091, + "grad_norm": 0.5793519616127014, + "learning_rate": 0.0002, + "loss": 1.5253, + "step": 770 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 0.5425786375999451, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 780 + }, + { + "epoch": 0.5163398692810458, + "grad_norm": 0.6004197001457214, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 790 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 0.7167016863822937, + "learning_rate": 0.0002, + "loss": 1.5346, + "step": 800 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.710218071937561, + "learning_rate": 0.0002, + "loss": 1.48, + "step": 810 + }, + { + "epoch": 0.5359477124183006, + "grad_norm": 0.699528694152832, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 820 + }, + { + "epoch": 0.5424836601307189, + "grad_norm": 0.579629123210907, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 830 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 0.595407247543335, + "learning_rate": 0.0002, + "loss": 1.3894, + "step": 840 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.544563889503479, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 850 + }, + { + "epoch": 0.5620915032679739, + "grad_norm": 0.553166389465332, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 860 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 0.5645018815994263, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 870 + }, + { + "epoch": 0.5751633986928104, + "grad_norm": 0.6576932668685913, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 880 + }, + { + "epoch": 0.5816993464052288, + "grad_norm": 0.6684197187423706, + "learning_rate": 0.0002, + "loss": 1.5891, + "step": 890 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.6706975698471069, + "learning_rate": 0.0002, + "loss": 1.5348, + "step": 900 + }, + { + "epoch": 0.5947712418300654, + "grad_norm": 0.6762327551841736, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 910 + }, + { + "epoch": 0.6013071895424836, + "grad_norm": 0.764032244682312, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 920 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 0.6996400952339172, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 930 + }, + { + "epoch": 0.6143790849673203, + "grad_norm": 0.686735987663269, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 940 + }, + { + "epoch": 0.6209150326797386, + "grad_norm": 0.6086131930351257, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 950 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.5627856850624084, + "learning_rate": 0.0002, + "loss": 1.4457, + "step": 960 + }, + { + "epoch": 0.6339869281045751, + "grad_norm": 0.5781503319740295, + "learning_rate": 0.0002, + "loss": 1.506, + "step": 970 + }, + { + "epoch": 0.6405228758169934, + "grad_norm": 0.6347246766090393, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 980 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.6581300497055054, + "learning_rate": 0.0002, + "loss": 1.3819, + "step": 990 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 0.8343676924705505, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1000 + }, + { + "epoch": 0.6601307189542484, + "grad_norm": 0.5708910226821899, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 1010 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.6832585334777832, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 1020 + }, + { + "epoch": 0.673202614379085, + "grad_norm": 0.5767837166786194, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1030 + }, + { + "epoch": 0.6797385620915033, + "grad_norm": 0.5637745261192322, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 1040 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 0.8193050026893616, + "learning_rate": 0.0002, + "loss": 1.4325, + "step": 1050 + }, + { + "epoch": 0.6928104575163399, + "grad_norm": 0.6157439351081848, + "learning_rate": 0.0002, + "loss": 1.4196, + "step": 1060 + }, + { + "epoch": 0.6993464052287581, + "grad_norm": 0.7476664781570435, + "learning_rate": 0.0002, + "loss": 1.5547, + "step": 1070 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.8569361567497253, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 1080 + }, + { + "epoch": 0.7124183006535948, + "grad_norm": 0.5671911835670471, + "learning_rate": 0.0002, + "loss": 1.482, + "step": 1090 + }, + { + "epoch": 0.7189542483660131, + "grad_norm": 0.5151128768920898, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 1100 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 0.568037211894989, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 1110 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 0.6756396889686584, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 1120 + }, + { + "epoch": 0.738562091503268, + "grad_norm": 0.638975977897644, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 1130 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 0.7103341221809387, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 1140 + }, + { + "epoch": 0.7516339869281046, + "grad_norm": 0.7403952479362488, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 1150 + }, + { + "epoch": 0.7581699346405228, + "grad_norm": 0.6266511082649231, + "learning_rate": 0.0002, + "loss": 1.4757, + "step": 1160 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.5939070582389832, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1170 + }, + { + "epoch": 0.7712418300653595, + "grad_norm": 0.5735430717468262, + "learning_rate": 0.0002, + "loss": 1.4145, + "step": 1180 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.5155234932899475, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 1190 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.5115423202514648, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 1200 + }, + { + "epoch": 0.7908496732026143, + "grad_norm": 0.693588137626648, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1210 + }, + { + "epoch": 0.7973856209150327, + "grad_norm": 0.5504693984985352, + "learning_rate": 0.0002, + "loss": 1.308, + "step": 1220 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 0.5555992126464844, + "learning_rate": 0.0002, + "loss": 1.5412, + "step": 1230 + }, + { + "epoch": 0.8104575163398693, + "grad_norm": 0.7211785316467285, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1240 + }, + { + "epoch": 0.8169934640522876, + "grad_norm": 0.735003650188446, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 1250 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.5245152711868286, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 1260 + }, + { + "epoch": 0.8300653594771242, + "grad_norm": 0.5883445739746094, + "learning_rate": 0.0002, + "loss": 1.4505, + "step": 1270 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 0.6835859417915344, + "learning_rate": 0.0002, + "loss": 1.3642, + "step": 1280 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 0.6592142581939697, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 1290 + }, + { + "epoch": 0.8496732026143791, + "grad_norm": 0.6087474226951599, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 1300 + }, + { + "epoch": 0.8562091503267973, + "grad_norm": 0.565387487411499, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 1310 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 0.7363151907920837, + "learning_rate": 0.0002, + "loss": 1.4809, + "step": 1320 + }, + { + "epoch": 0.869281045751634, + "grad_norm": 0.5964524149894714, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 1330 + }, + { + "epoch": 0.8758169934640523, + "grad_norm": 0.5169979929924011, + "learning_rate": 0.0002, + "loss": 1.3284, + "step": 1340 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.7063422799110413, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 1350 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.7261926531791687, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 1360 + }, + { + "epoch": 0.8954248366013072, + "grad_norm": 0.6759744882583618, + "learning_rate": 0.0002, + "loss": 1.3619, + "step": 1370 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 0.675051212310791, + "learning_rate": 0.0002, + "loss": 1.4079, + "step": 1380 + }, + { + "epoch": 0.9084967320261438, + "grad_norm": 0.5613595843315125, + "learning_rate": 0.0002, + "loss": 1.6606, + "step": 1390 + }, + { + "epoch": 0.9150326797385621, + "grad_norm": 0.611732006072998, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 1400 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 0.6365187168121338, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 1410 + }, + { + "epoch": 0.9281045751633987, + "grad_norm": 0.7810426354408264, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1420 + }, + { + "epoch": 0.934640522875817, + "grad_norm": 0.593891441822052, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 1430 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.761585533618927, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1440 + }, + { + "epoch": 0.9477124183006536, + "grad_norm": 0.6114464998245239, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1450 + }, + { + "epoch": 0.954248366013072, + "grad_norm": 0.601044774055481, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1460 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 0.5484876036643982, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 1470 + }, + { + "epoch": 0.9673202614379085, + "grad_norm": 0.5383428335189819, + "learning_rate": 0.0002, + "loss": 1.4825, + "step": 1480 + }, + { + "epoch": 0.9738562091503268, + "grad_norm": 0.648106575012207, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 1490 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.6847249865531921, + "learning_rate": 0.0002, + "loss": 1.3638, + "step": 1500 + }, + { + "epoch": 0.9869281045751634, + "grad_norm": 0.6361058354377747, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 1510 + }, + { + "epoch": 0.9934640522875817, + "grad_norm": 0.646392285823822, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 1520 + }, + { + "epoch": 1.0, + "grad_norm": 0.5391159057617188, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1530 + }, + { + "epoch": 1.0, + "eval_loss": 1.4715123176574707, + "eval_runtime": 30.5701, + "eval_samples_per_second": 14.262, + "eval_steps_per_second": 1.799, + "step": 1530 + }, + { + "epoch": 1.0065359477124183, + "grad_norm": 0.5468988418579102, + "learning_rate": 0.0002, + "loss": 1.4827, + "step": 1540 + }, + { + "epoch": 1.0130718954248366, + "grad_norm": 0.629940927028656, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 1550 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 0.6411303281784058, + "learning_rate": 0.0002, + "loss": 1.4259, + "step": 1560 + }, + { + "epoch": 1.026143790849673, + "grad_norm": 0.5619024038314819, + "learning_rate": 0.0002, + "loss": 1.3924, + "step": 1570 + }, + { + "epoch": 1.0326797385620916, + "grad_norm": 0.6093462705612183, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1580 + }, + { + "epoch": 1.0392156862745099, + "grad_norm": 0.5543286204338074, + "learning_rate": 0.0002, + "loss": 1.4547, + "step": 1590 + }, + { + "epoch": 1.0457516339869282, + "grad_norm": 0.6079006195068359, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1600 + }, + { + "epoch": 1.0522875816993464, + "grad_norm": 0.6240813136100769, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 1610 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.6141977310180664, + "learning_rate": 0.0002, + "loss": 1.3504, + "step": 1620 + }, + { + "epoch": 1.065359477124183, + "grad_norm": 0.5920178294181824, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 1630 + }, + { + "epoch": 1.0718954248366013, + "grad_norm": 0.47620782256126404, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 1640 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 0.6826292872428894, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 1650 + }, + { + "epoch": 1.0849673202614378, + "grad_norm": 0.6182006597518921, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 1660 + }, + { + "epoch": 1.091503267973856, + "grad_norm": 0.57639479637146, + "learning_rate": 0.0002, + "loss": 1.2907, + "step": 1670 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 0.6696860194206238, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 1680 + }, + { + "epoch": 1.1045751633986929, + "grad_norm": 0.699221670627594, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 1690 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.7138059139251709, + "learning_rate": 0.0002, + "loss": 1.3667, + "step": 1700 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 0.6930422186851501, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 1710 + }, + { + "epoch": 1.1241830065359477, + "grad_norm": 0.7484048008918762, + "learning_rate": 0.0002, + "loss": 1.5033, + "step": 1720 + }, + { + "epoch": 1.130718954248366, + "grad_norm": 0.5820090174674988, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 1730 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 0.7143406867980957, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1740 + }, + { + "epoch": 1.1437908496732025, + "grad_norm": 0.5597584247589111, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 1750 + }, + { + "epoch": 1.1503267973856208, + "grad_norm": 0.5171173214912415, + "learning_rate": 0.0002, + "loss": 1.5403, + "step": 1760 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 0.5951920747756958, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1770 + }, + { + "epoch": 1.1633986928104576, + "grad_norm": 0.7506247758865356, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 1780 + }, + { + "epoch": 1.1699346405228759, + "grad_norm": 0.5936487913131714, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 1790 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.688450038433075, + "learning_rate": 0.0002, + "loss": 1.3567, + "step": 1800 + }, + { + "epoch": 1.1830065359477124, + "grad_norm": 0.671623170375824, + "learning_rate": 0.0002, + "loss": 1.314, + "step": 1810 + }, + { + "epoch": 1.1895424836601307, + "grad_norm": 0.6911860704421997, + "learning_rate": 0.0002, + "loss": 1.3803, + "step": 1820 + }, + { + "epoch": 1.196078431372549, + "grad_norm": 0.60726398229599, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 1830 + }, + { + "epoch": 1.2026143790849673, + "grad_norm": 0.7542088627815247, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 1840 + }, + { + "epoch": 1.2091503267973855, + "grad_norm": 0.6810969710350037, + "learning_rate": 0.0002, + "loss": 1.4343, + "step": 1850 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 0.579741895198822, + "learning_rate": 0.0002, + "loss": 1.446, + "step": 1860 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.9925695657730103, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 1870 + }, + { + "epoch": 1.2287581699346406, + "grad_norm": 0.5919767618179321, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 1880 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.7377090454101562, + "learning_rate": 0.0002, + "loss": 1.5015, + "step": 1890 + }, + { + "epoch": 1.2418300653594772, + "grad_norm": 0.5753688812255859, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 1900 + }, + { + "epoch": 1.2483660130718954, + "grad_norm": 0.6362486481666565, + "learning_rate": 0.0002, + "loss": 1.3543, + "step": 1910 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 0.5747467875480652, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1920 + }, + { + "epoch": 1.261437908496732, + "grad_norm": 0.6831939220428467, + "learning_rate": 0.0002, + "loss": 1.5082, + "step": 1930 + }, + { + "epoch": 1.2679738562091503, + "grad_norm": 0.6414040327072144, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 1940 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 0.5613330006599426, + "learning_rate": 0.0002, + "loss": 1.5099, + "step": 1950 + }, + { + "epoch": 1.2810457516339868, + "grad_norm": 0.5838454961776733, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 1960 + }, + { + "epoch": 1.287581699346405, + "grad_norm": 0.5367192029953003, + "learning_rate": 0.0002, + "loss": 1.3548, + "step": 1970 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 0.5829346776008606, + "learning_rate": 0.0002, + "loss": 1.4602, + "step": 1980 + }, + { + "epoch": 1.3006535947712419, + "grad_norm": 0.756534218788147, + "learning_rate": 0.0002, + "loss": 1.3821, + "step": 1990 + }, + { + "epoch": 1.3071895424836601, + "grad_norm": 0.48002561926841736, + "learning_rate": 0.0002, + "loss": 1.389, + "step": 2000 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 0.5461082458496094, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 2010 + }, + { + "epoch": 1.3202614379084967, + "grad_norm": 0.570399284362793, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 2020 + }, + { + "epoch": 1.326797385620915, + "grad_norm": 0.5130975842475891, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2030 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.6290071606636047, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 2040 + }, + { + "epoch": 1.3398692810457518, + "grad_norm": 0.6165726184844971, + "learning_rate": 0.0002, + "loss": 1.3873, + "step": 2050 + }, + { + "epoch": 1.34640522875817, + "grad_norm": 0.5302083492279053, + "learning_rate": 0.0002, + "loss": 1.4376, + "step": 2060 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 0.6531406044960022, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 2070 + }, + { + "epoch": 1.3594771241830066, + "grad_norm": 0.5981236100196838, + "learning_rate": 0.0002, + "loss": 1.3632, + "step": 2080 + }, + { + "epoch": 1.3660130718954249, + "grad_norm": 0.8534150123596191, + "learning_rate": 0.0002, + "loss": 1.4846, + "step": 2090 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 0.695918083190918, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 2100 + }, + { + "epoch": 1.3790849673202614, + "grad_norm": 0.5830431580543518, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2110 + }, + { + "epoch": 1.3856209150326797, + "grad_norm": 0.5641306638717651, + "learning_rate": 0.0002, + "loss": 1.5009, + "step": 2120 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 0.6354436874389648, + "learning_rate": 0.0002, + "loss": 1.3985, + "step": 2130 + }, + { + "epoch": 1.3986928104575163, + "grad_norm": 0.5707540512084961, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 2140 + }, + { + "epoch": 1.4052287581699345, + "grad_norm": 0.7308434844017029, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 2150 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.5879750847816467, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2160 + }, + { + "epoch": 1.4183006535947713, + "grad_norm": 0.627909243106842, + "learning_rate": 0.0002, + "loss": 1.3729, + "step": 2170 + }, + { + "epoch": 1.4248366013071896, + "grad_norm": 0.5228193998336792, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 2180 + }, + { + "epoch": 1.4313725490196079, + "grad_norm": 0.6162880659103394, + "learning_rate": 0.0002, + "loss": 1.457, + "step": 2190 + }, + { + "epoch": 1.4379084967320261, + "grad_norm": 0.751610517501831, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 2200 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.5623487234115601, + "learning_rate": 0.0002, + "loss": 1.4105, + "step": 2210 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 0.5293187499046326, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 2220 + }, + { + "epoch": 1.457516339869281, + "grad_norm": 0.5903629660606384, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 2230 + }, + { + "epoch": 1.4640522875816995, + "grad_norm": 0.6084659099578857, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 2240 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 0.5289803147315979, + "learning_rate": 0.0002, + "loss": 1.319, + "step": 2250 + }, + { + "epoch": 1.477124183006536, + "grad_norm": 0.49499568343162537, + "learning_rate": 0.0002, + "loss": 1.3106, + "step": 2260 + }, + { + "epoch": 1.4836601307189543, + "grad_norm": 0.7774190306663513, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 2270 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 0.5932538509368896, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 2280 + }, + { + "epoch": 1.4967320261437909, + "grad_norm": 0.6009492874145508, + "learning_rate": 0.0002, + "loss": 1.3241, + "step": 2290 + }, + { + "epoch": 1.5032679738562091, + "grad_norm": 0.5559343099594116, + "learning_rate": 0.0002, + "loss": 1.3728, + "step": 2300 + }, + { + "epoch": 1.5098039215686274, + "grad_norm": 0.5956196188926697, + "learning_rate": 0.0002, + "loss": 1.2379, + "step": 2310 + }, + { + "epoch": 1.5163398692810457, + "grad_norm": 0.5624083876609802, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 2320 + }, + { + "epoch": 1.522875816993464, + "grad_norm": 0.7195250391960144, + "learning_rate": 0.0002, + "loss": 1.4779, + "step": 2330 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 0.6010490655899048, + "learning_rate": 0.0002, + "loss": 1.2938, + "step": 2340 + }, + { + "epoch": 1.5359477124183005, + "grad_norm": 0.664929211139679, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 2350 + }, + { + "epoch": 1.5424836601307188, + "grad_norm": 0.5158776640892029, + "learning_rate": 0.0002, + "loss": 1.4362, + "step": 2360 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 0.5147154927253723, + "learning_rate": 0.0002, + "loss": 1.2157, + "step": 2370 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.6507977843284607, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 2380 + }, + { + "epoch": 1.5620915032679739, + "grad_norm": 0.5193192362785339, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 2390 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.5982314944267273, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 2400 + }, + { + "epoch": 1.5751633986928104, + "grad_norm": 0.49106258153915405, + "learning_rate": 0.0002, + "loss": 1.3585, + "step": 2410 + }, + { + "epoch": 1.581699346405229, + "grad_norm": 0.6459611654281616, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 2420 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 0.7038363218307495, + "learning_rate": 0.0002, + "loss": 1.3305, + "step": 2430 + }, + { + "epoch": 1.5947712418300655, + "grad_norm": 0.5245680212974548, + "learning_rate": 0.0002, + "loss": 1.3198, + "step": 2440 + }, + { + "epoch": 1.6013071895424837, + "grad_norm": 0.6562076210975647, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 2450 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 0.6491968035697937, + "learning_rate": 0.0002, + "loss": 1.5635, + "step": 2460 + }, + { + "epoch": 1.6143790849673203, + "grad_norm": 0.604034960269928, + "learning_rate": 0.0002, + "loss": 1.3657, + "step": 2470 + }, + { + "epoch": 1.6209150326797386, + "grad_norm": 0.5759671330451965, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 2480 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 0.6157698631286621, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2490 + }, + { + "epoch": 1.6339869281045751, + "grad_norm": 0.6513794660568237, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 2500 + }, + { + "epoch": 1.6405228758169934, + "grad_norm": 0.71990966796875, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 2510 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 0.7316617369651794, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2520 + }, + { + "epoch": 1.65359477124183, + "grad_norm": 0.5475177764892578, + "learning_rate": 0.0002, + "loss": 1.3119, + "step": 2530 + }, + { + "epoch": 1.6601307189542482, + "grad_norm": 0.4911293089389801, + "learning_rate": 0.0002, + "loss": 1.2998, + "step": 2540 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.6122882962226868, + "learning_rate": 0.0002, + "loss": 1.4198, + "step": 2550 + }, + { + "epoch": 1.673202614379085, + "grad_norm": 0.5735281705856323, + "learning_rate": 0.0002, + "loss": 1.3099, + "step": 2560 + }, + { + "epoch": 1.6797385620915033, + "grad_norm": 0.5046352744102478, + "learning_rate": 0.0002, + "loss": 1.2205, + "step": 2570 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 0.6043242812156677, + "learning_rate": 0.0002, + "loss": 1.3191, + "step": 2580 + }, + { + "epoch": 1.6928104575163399, + "grad_norm": 0.5397698283195496, + "learning_rate": 0.0002, + "loss": 1.3079, + "step": 2590 + }, + { + "epoch": 1.6993464052287581, + "grad_norm": 0.8066475987434387, + "learning_rate": 0.0002, + "loss": 1.4916, + "step": 2600 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 0.52901691198349, + "learning_rate": 0.0002, + "loss": 1.3703, + "step": 2610 + }, + { + "epoch": 1.712418300653595, + "grad_norm": 0.7588503956794739, + "learning_rate": 0.0002, + "loss": 1.409, + "step": 2620 + }, + { + "epoch": 1.7189542483660132, + "grad_norm": 0.6012966632843018, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 2630 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 0.5927302837371826, + "learning_rate": 0.0002, + "loss": 1.2583, + "step": 2640 + }, + { + "epoch": 1.7320261437908497, + "grad_norm": 0.5086990594863892, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 2650 + }, + { + "epoch": 1.738562091503268, + "grad_norm": 0.6000628471374512, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2660 + }, + { + "epoch": 1.7450980392156863, + "grad_norm": 0.6560431718826294, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 2670 + }, + { + "epoch": 1.7516339869281046, + "grad_norm": 0.5738165378570557, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 2680 + }, + { + "epoch": 1.7581699346405228, + "grad_norm": 0.5576106905937195, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 2690 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.7298802137374878, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 2700 + }, + { + "epoch": 1.7712418300653594, + "grad_norm": 0.5751826167106628, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 2710 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.6069957613945007, + "learning_rate": 0.0002, + "loss": 1.35, + "step": 2720 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 0.7513017654418945, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 2730 + }, + { + "epoch": 1.7908496732026142, + "grad_norm": 0.6058869957923889, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 2740 + }, + { + "epoch": 1.7973856209150327, + "grad_norm": 0.6805883049964905, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 2750 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 0.6864324808120728, + "learning_rate": 0.0002, + "loss": 1.4062, + "step": 2760 + }, + { + "epoch": 1.8104575163398693, + "grad_norm": 0.6261002421379089, + "learning_rate": 0.0002, + "loss": 1.355, + "step": 2770 + }, + { + "epoch": 1.8169934640522876, + "grad_norm": 0.532684862613678, + "learning_rate": 0.0002, + "loss": 1.5145, + "step": 2780 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 0.6209020018577576, + "learning_rate": 0.0002, + "loss": 1.3248, + "step": 2790 + }, + { + "epoch": 1.8300653594771243, + "grad_norm": 0.67111736536026, + "learning_rate": 0.0002, + "loss": 1.3908, + "step": 2800 + }, + { + "epoch": 1.8366013071895426, + "grad_norm": 0.700467586517334, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 2810 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 0.6968029141426086, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 2820 + }, + { + "epoch": 1.8496732026143792, + "grad_norm": 0.6405863761901855, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 2830 + }, + { + "epoch": 1.8562091503267975, + "grad_norm": 0.5192584991455078, + "learning_rate": 0.0002, + "loss": 1.4035, + "step": 2840 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 0.4888569414615631, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 2850 + }, + { + "epoch": 1.869281045751634, + "grad_norm": 0.7625455856323242, + "learning_rate": 0.0002, + "loss": 1.4324, + "step": 2860 + }, + { + "epoch": 1.8758169934640523, + "grad_norm": 0.9162808656692505, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2870 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.5472783446311951, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2880 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.5221137404441833, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 2890 + }, + { + "epoch": 1.8954248366013071, + "grad_norm": 0.49258849024772644, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 2900 + }, + { + "epoch": 1.9019607843137254, + "grad_norm": 0.5260750651359558, + "learning_rate": 0.0002, + "loss": 1.3503, + "step": 2910 + }, + { + "epoch": 1.9084967320261437, + "grad_norm": 0.6583314538002014, + "learning_rate": 0.0002, + "loss": 1.3381, + "step": 2920 + }, + { + "epoch": 1.915032679738562, + "grad_norm": 0.5728915929794312, + "learning_rate": 0.0002, + "loss": 1.356, + "step": 2930 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 0.7661453485488892, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2940 + }, + { + "epoch": 1.9281045751633987, + "grad_norm": 0.7193911075592041, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2950 + }, + { + "epoch": 1.934640522875817, + "grad_norm": 0.5007768869400024, + "learning_rate": 0.0002, + "loss": 1.287, + "step": 2960 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.626681923866272, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2970 + }, + { + "epoch": 1.9477124183006536, + "grad_norm": 0.8692840933799744, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 2980 + }, + { + "epoch": 1.954248366013072, + "grad_norm": 0.6388291120529175, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 2990 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.7710477113723755, + "learning_rate": 0.0002, + "loss": 1.4593, + "step": 3000 + }, + { + "epoch": 1.9673202614379086, + "grad_norm": 0.641704261302948, + "learning_rate": 0.0002, + "loss": 1.5228, + "step": 3010 + }, + { + "epoch": 1.973856209150327, + "grad_norm": 0.621148943901062, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 3020 + }, + { + "epoch": 1.9803921568627452, + "grad_norm": 0.5119547247886658, + "learning_rate": 0.0002, + "loss": 1.3017, + "step": 3030 + }, + { + "epoch": 1.9869281045751634, + "grad_norm": 0.8104137778282166, + "learning_rate": 0.0002, + "loss": 1.4923, + "step": 3040 + }, + { + "epoch": 1.9934640522875817, + "grad_norm": 0.5856240391731262, + "learning_rate": 0.0002, + "loss": 1.3331, + "step": 3050 + }, + { + "epoch": 2.0, + "grad_norm": 0.5263566374778748, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3060 + }, + { + "epoch": 2.0, + "eval_loss": 1.4276371002197266, + "eval_runtime": 30.5759, + "eval_samples_per_second": 14.26, + "eval_steps_per_second": 1.799, + "step": 3060 + }, + { + "epoch": 2.0065359477124183, + "grad_norm": 0.5143898725509644, + "learning_rate": 0.0002, + "loss": 1.1636, + "step": 3070 + }, + { + "epoch": 2.0130718954248366, + "grad_norm": 0.5749367475509644, + "learning_rate": 0.0002, + "loss": 1.3335, + "step": 3080 + }, + { + "epoch": 2.019607843137255, + "grad_norm": 0.5784284472465515, + "learning_rate": 0.0002, + "loss": 1.2784, + "step": 3090 + }, + { + "epoch": 2.026143790849673, + "grad_norm": 0.5933429598808289, + "learning_rate": 0.0002, + "loss": 1.2463, + "step": 3100 + }, + { + "epoch": 2.0326797385620914, + "grad_norm": 0.6748974919319153, + "learning_rate": 0.0002, + "loss": 1.2984, + "step": 3110 + }, + { + "epoch": 2.0392156862745097, + "grad_norm": 0.626399576663971, + "learning_rate": 0.0002, + "loss": 1.2307, + "step": 3120 + }, + { + "epoch": 2.045751633986928, + "grad_norm": 0.6173238754272461, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 3130 + }, + { + "epoch": 2.052287581699346, + "grad_norm": 0.807790219783783, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3140 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 0.6222215890884399, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 3150 + }, + { + "epoch": 2.065359477124183, + "grad_norm": 0.5859580636024475, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 3160 + }, + { + "epoch": 2.0718954248366015, + "grad_norm": 0.581304132938385, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 3170 + }, + { + "epoch": 2.0784313725490198, + "grad_norm": 0.9814971089363098, + "learning_rate": 0.0002, + "loss": 1.1212, + "step": 3180 + }, + { + "epoch": 2.084967320261438, + "grad_norm": 0.6491848230361938, + "learning_rate": 0.0002, + "loss": 1.1962, + "step": 3190 + }, + { + "epoch": 2.0915032679738563, + "grad_norm": 0.613680362701416, + "learning_rate": 0.0002, + "loss": 1.3711, + "step": 3200 + }, + { + "epoch": 2.0980392156862746, + "grad_norm": 0.7318086624145508, + "learning_rate": 0.0002, + "loss": 1.2994, + "step": 3210 + }, + { + "epoch": 2.104575163398693, + "grad_norm": 0.6025661826133728, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 3220 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.6744484305381775, + "learning_rate": 0.0002, + "loss": 1.1374, + "step": 3230 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 0.6062554121017456, + "learning_rate": 0.0002, + "loss": 1.3273, + "step": 3240 + }, + { + "epoch": 2.1241830065359477, + "grad_norm": 0.6801803112030029, + "learning_rate": 0.0002, + "loss": 1.3404, + "step": 3250 + }, + { + "epoch": 2.130718954248366, + "grad_norm": 0.5218925476074219, + "learning_rate": 0.0002, + "loss": 1.4084, + "step": 3260 + }, + { + "epoch": 2.1372549019607843, + "grad_norm": 0.7494263648986816, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 3270 + }, + { + "epoch": 2.1437908496732025, + "grad_norm": 0.7858565449714661, + "learning_rate": 0.0002, + "loss": 1.3059, + "step": 3280 + }, + { + "epoch": 2.150326797385621, + "grad_norm": 0.6836692690849304, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3290 + }, + { + "epoch": 2.156862745098039, + "grad_norm": 0.619848370552063, + "learning_rate": 0.0002, + "loss": 1.1605, + "step": 3300 + }, + { + "epoch": 2.1633986928104574, + "grad_norm": 0.5761294364929199, + "learning_rate": 0.0002, + "loss": 1.3095, + "step": 3310 + }, + { + "epoch": 2.1699346405228757, + "grad_norm": 0.4713786542415619, + "learning_rate": 0.0002, + "loss": 1.2883, + "step": 3320 + }, + { + "epoch": 2.176470588235294, + "grad_norm": 0.7613773345947266, + "learning_rate": 0.0002, + "loss": 1.3817, + "step": 3330 + }, + { + "epoch": 2.183006535947712, + "grad_norm": 0.6642718315124512, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 3340 + }, + { + "epoch": 2.189542483660131, + "grad_norm": 0.7162188291549683, + "learning_rate": 0.0002, + "loss": 1.2048, + "step": 3350 + }, + { + "epoch": 2.196078431372549, + "grad_norm": 0.6916783452033997, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 3360 + }, + { + "epoch": 2.2026143790849675, + "grad_norm": 0.7205567955970764, + "learning_rate": 0.0002, + "loss": 1.3788, + "step": 3370 + }, + { + "epoch": 2.2091503267973858, + "grad_norm": 0.6038199067115784, + "learning_rate": 0.0002, + "loss": 1.2528, + "step": 3380 + }, + { + "epoch": 2.215686274509804, + "grad_norm": 0.6284233927726746, + "learning_rate": 0.0002, + "loss": 1.2079, + "step": 3390 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.7450672388076782, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 3400 + }, + { + "epoch": 2.2287581699346406, + "grad_norm": 0.7755052447319031, + "learning_rate": 0.0002, + "loss": 1.3034, + "step": 3410 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 0.9066099524497986, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 3420 + }, + { + "epoch": 2.241830065359477, + "grad_norm": 0.8578207492828369, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 3430 + }, + { + "epoch": 2.2483660130718954, + "grad_norm": 0.5900213718414307, + "learning_rate": 0.0002, + "loss": 1.3278, + "step": 3440 + }, + { + "epoch": 2.2549019607843137, + "grad_norm": 0.7821717262268066, + "learning_rate": 0.0002, + "loss": 1.3645, + "step": 3450 + }, + { + "epoch": 2.261437908496732, + "grad_norm": 0.6263150572776794, + "learning_rate": 0.0002, + "loss": 1.183, + "step": 3460 + }, + { + "epoch": 2.2679738562091503, + "grad_norm": 0.591799259185791, + "learning_rate": 0.0002, + "loss": 1.178, + "step": 3470 + }, + { + "epoch": 2.2745098039215685, + "grad_norm": 0.5999799966812134, + "learning_rate": 0.0002, + "loss": 1.2198, + "step": 3480 + }, + { + "epoch": 2.281045751633987, + "grad_norm": 0.6227319240570068, + "learning_rate": 0.0002, + "loss": 1.2724, + "step": 3490 + }, + { + "epoch": 2.287581699346405, + "grad_norm": 0.719412624835968, + "learning_rate": 0.0002, + "loss": 1.3865, + "step": 3500 + }, + { + "epoch": 2.2941176470588234, + "grad_norm": 1.0361769199371338, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 3510 + }, + { + "epoch": 2.3006535947712417, + "grad_norm": 0.5506668090820312, + "learning_rate": 0.0002, + "loss": 1.4834, + "step": 3520 + }, + { + "epoch": 2.30718954248366, + "grad_norm": 0.6886829733848572, + "learning_rate": 0.0002, + "loss": 1.2273, + "step": 3530 + }, + { + "epoch": 2.313725490196078, + "grad_norm": 0.6226346492767334, + "learning_rate": 0.0002, + "loss": 1.2296, + "step": 3540 + }, + { + "epoch": 2.3202614379084965, + "grad_norm": 0.8109908103942871, + "learning_rate": 0.0002, + "loss": 1.3087, + "step": 3550 + }, + { + "epoch": 2.326797385620915, + "grad_norm": 0.8505511283874512, + "learning_rate": 0.0002, + "loss": 1.3311, + "step": 3560 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.5763760209083557, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 3570 + }, + { + "epoch": 2.3398692810457518, + "grad_norm": 0.6460059881210327, + "learning_rate": 0.0002, + "loss": 1.4135, + "step": 3580 + }, + { + "epoch": 2.34640522875817, + "grad_norm": 0.7175343036651611, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 3590 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.6012630462646484, + "learning_rate": 0.0002, + "loss": 1.2645, + "step": 3600 + }, + { + "epoch": 2.3594771241830066, + "grad_norm": 0.6513685584068298, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3610 + }, + { + "epoch": 2.366013071895425, + "grad_norm": 0.7465183734893799, + "learning_rate": 0.0002, + "loss": 1.3271, + "step": 3620 + }, + { + "epoch": 2.372549019607843, + "grad_norm": 0.6413124203681946, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3630 + }, + { + "epoch": 2.3790849673202614, + "grad_norm": 0.7209562063217163, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 3640 + }, + { + "epoch": 2.3856209150326797, + "grad_norm": 0.6427558660507202, + "learning_rate": 0.0002, + "loss": 1.1616, + "step": 3650 + }, + { + "epoch": 2.392156862745098, + "grad_norm": 0.593958854675293, + "learning_rate": 0.0002, + "loss": 1.313, + "step": 3660 + }, + { + "epoch": 2.3986928104575163, + "grad_norm": 0.5944608449935913, + "learning_rate": 0.0002, + "loss": 1.2802, + "step": 3670 + }, + { + "epoch": 2.4052287581699345, + "grad_norm": 0.6606248617172241, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3680 + }, + { + "epoch": 2.411764705882353, + "grad_norm": 0.5632851719856262, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 3690 + }, + { + "epoch": 2.418300653594771, + "grad_norm": 0.4976513385772705, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3700 + }, + { + "epoch": 2.4248366013071894, + "grad_norm": 0.6318528056144714, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 3710 + }, + { + "epoch": 2.431372549019608, + "grad_norm": 0.6306707859039307, + "learning_rate": 0.0002, + "loss": 1.1705, + "step": 3720 + }, + { + "epoch": 2.4379084967320264, + "grad_norm": 0.6362553238868713, + "learning_rate": 0.0002, + "loss": 1.3524, + "step": 3730 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.634368896484375, + "learning_rate": 0.0002, + "loss": 1.2345, + "step": 3740 + }, + { + "epoch": 2.450980392156863, + "grad_norm": 0.6623591184616089, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 3750 + }, + { + "epoch": 2.457516339869281, + "grad_norm": 0.6150440573692322, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 3760 + }, + { + "epoch": 2.4640522875816995, + "grad_norm": 0.588935911655426, + "learning_rate": 0.0002, + "loss": 1.2666, + "step": 3770 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 0.7388206124305725, + "learning_rate": 0.0002, + "loss": 1.3918, + "step": 3780 + }, + { + "epoch": 2.477124183006536, + "grad_norm": 0.621825098991394, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 3790 + }, + { + "epoch": 2.4836601307189543, + "grad_norm": 0.7691677212715149, + "learning_rate": 0.0002, + "loss": 1.359, + "step": 3800 + }, + { + "epoch": 2.4901960784313726, + "grad_norm": 1.1661969423294067, + "learning_rate": 0.0002, + "loss": 1.3399, + "step": 3810 + }, + { + "epoch": 2.496732026143791, + "grad_norm": 0.6837884187698364, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 3820 + }, + { + "epoch": 2.503267973856209, + "grad_norm": 0.6978904008865356, + "learning_rate": 0.0002, + "loss": 1.2823, + "step": 3830 + }, + { + "epoch": 2.5098039215686274, + "grad_norm": 0.6121411323547363, + "learning_rate": 0.0002, + "loss": 1.3688, + "step": 3840 + }, + { + "epoch": 2.5163398692810457, + "grad_norm": 0.7813326120376587, + "learning_rate": 0.0002, + "loss": 1.2587, + "step": 3850 + }, + { + "epoch": 2.522875816993464, + "grad_norm": 0.5390260219573975, + "learning_rate": 0.0002, + "loss": 1.1543, + "step": 3860 + }, + { + "epoch": 2.5294117647058822, + "grad_norm": 0.8283252716064453, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3870 + }, + { + "epoch": 2.5359477124183005, + "grad_norm": 0.8527186512947083, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 3880 + }, + { + "epoch": 2.542483660130719, + "grad_norm": 0.8405382633209229, + "learning_rate": 0.0002, + "loss": 1.3469, + "step": 3890 + }, + { + "epoch": 2.549019607843137, + "grad_norm": 0.5650738477706909, + "learning_rate": 0.0002, + "loss": 1.1801, + "step": 3900 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.620121955871582, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 3910 + }, + { + "epoch": 2.5620915032679736, + "grad_norm": 0.5983527898788452, + "learning_rate": 0.0002, + "loss": 1.2524, + "step": 3920 + }, + { + "epoch": 2.568627450980392, + "grad_norm": 0.686623215675354, + "learning_rate": 0.0002, + "loss": 1.4408, + "step": 3930 + }, + { + "epoch": 2.57516339869281, + "grad_norm": 0.6805831789970398, + "learning_rate": 0.0002, + "loss": 1.186, + "step": 3940 + }, + { + "epoch": 2.581699346405229, + "grad_norm": 0.6994825601577759, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 3950 + }, + { + "epoch": 2.588235294117647, + "grad_norm": 0.728549599647522, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 3960 + }, + { + "epoch": 2.5947712418300655, + "grad_norm": 0.775236964225769, + "learning_rate": 0.0002, + "loss": 1.4039, + "step": 3970 + }, + { + "epoch": 2.6013071895424837, + "grad_norm": 0.5057447552680969, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 3980 + }, + { + "epoch": 2.607843137254902, + "grad_norm": 0.6564450263977051, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 3990 + }, + { + "epoch": 2.6143790849673203, + "grad_norm": 0.5342249870300293, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 4000 + }, + { + "epoch": 2.6209150326797386, + "grad_norm": 0.5508961081504822, + "learning_rate": 0.0002, + "loss": 1.3102, + "step": 4010 + }, + { + "epoch": 2.627450980392157, + "grad_norm": 0.5716235637664795, + "learning_rate": 0.0002, + "loss": 1.3636, + "step": 4020 + }, + { + "epoch": 2.633986928104575, + "grad_norm": 0.8049232363700867, + "learning_rate": 0.0002, + "loss": 1.3465, + "step": 4030 + }, + { + "epoch": 2.6405228758169934, + "grad_norm": 0.5574354529380798, + "learning_rate": 0.0002, + "loss": 1.2342, + "step": 4040 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 0.6302093863487244, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 4050 + }, + { + "epoch": 2.65359477124183, + "grad_norm": 1.1868736743927002, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 4060 + }, + { + "epoch": 2.6601307189542482, + "grad_norm": 0.6738120317459106, + "learning_rate": 0.0002, + "loss": 1.1382, + "step": 4070 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.6614423990249634, + "learning_rate": 0.0002, + "loss": 1.2456, + "step": 4080 + }, + { + "epoch": 2.6732026143790852, + "grad_norm": 0.7297604084014893, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 4090 + }, + { + "epoch": 2.6797385620915035, + "grad_norm": 0.9421682357788086, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 4100 + }, + { + "epoch": 2.686274509803922, + "grad_norm": 0.5286222696304321, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 4110 + }, + { + "epoch": 2.69281045751634, + "grad_norm": 0.6849271655082703, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 4120 + }, + { + "epoch": 2.6993464052287583, + "grad_norm": 0.6811320185661316, + "learning_rate": 0.0002, + "loss": 1.2721, + "step": 4130 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 0.4968419373035431, + "learning_rate": 0.0002, + "loss": 1.2897, + "step": 4140 + }, + { + "epoch": 2.712418300653595, + "grad_norm": 0.8074267506599426, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 4150 + }, + { + "epoch": 2.718954248366013, + "grad_norm": 0.6756376028060913, + "learning_rate": 0.0002, + "loss": 1.1759, + "step": 4160 + }, + { + "epoch": 2.7254901960784315, + "grad_norm": 0.6921583414077759, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 4170 + }, + { + "epoch": 2.7320261437908497, + "grad_norm": 0.7049834132194519, + "learning_rate": 0.0002, + "loss": 1.3413, + "step": 4180 + }, + { + "epoch": 2.738562091503268, + "grad_norm": 0.7011390328407288, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 4190 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.6977843642234802, + "learning_rate": 0.0002, + "loss": 1.2364, + "step": 4200 + }, + { + "epoch": 2.7516339869281046, + "grad_norm": 0.6717000603675842, + "learning_rate": 0.0002, + "loss": 1.2533, + "step": 4210 + }, + { + "epoch": 2.758169934640523, + "grad_norm": 1.0223724842071533, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 4220 + }, + { + "epoch": 2.764705882352941, + "grad_norm": 0.6573330760002136, + "learning_rate": 0.0002, + "loss": 1.2451, + "step": 4230 + }, + { + "epoch": 2.7712418300653594, + "grad_norm": 0.6684938073158264, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 4240 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.7426793575286865, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 4250 + }, + { + "epoch": 2.784313725490196, + "grad_norm": 0.557826578617096, + "learning_rate": 0.0002, + "loss": 1.2904, + "step": 4260 + }, + { + "epoch": 2.7908496732026142, + "grad_norm": 0.6669870018959045, + "learning_rate": 0.0002, + "loss": 1.3262, + "step": 4270 + }, + { + "epoch": 2.7973856209150325, + "grad_norm": 0.5349969267845154, + "learning_rate": 0.0002, + "loss": 1.2369, + "step": 4280 + }, + { + "epoch": 2.803921568627451, + "grad_norm": 0.7262802124023438, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 4290 + }, + { + "epoch": 2.810457516339869, + "grad_norm": 0.768211841583252, + "learning_rate": 0.0002, + "loss": 1.3373, + "step": 4300 + }, + { + "epoch": 2.8169934640522873, + "grad_norm": 0.5958252549171448, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 4310 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.8451310396194458, + "learning_rate": 0.0002, + "loss": 1.4113, + "step": 4320 + }, + { + "epoch": 2.8300653594771243, + "grad_norm": 0.6544435024261475, + "learning_rate": 0.0002, + "loss": 1.2454, + "step": 4330 + }, + { + "epoch": 2.8366013071895426, + "grad_norm": 0.6177433133125305, + "learning_rate": 0.0002, + "loss": 1.2777, + "step": 4340 + }, + { + "epoch": 2.843137254901961, + "grad_norm": 0.6324988007545471, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 4350 + }, + { + "epoch": 2.849673202614379, + "grad_norm": 0.6884300708770752, + "learning_rate": 0.0002, + "loss": 1.4117, + "step": 4360 + }, + { + "epoch": 2.8562091503267975, + "grad_norm": 0.8952897191047668, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 4370 + }, + { + "epoch": 2.8627450980392157, + "grad_norm": 1.0260103940963745, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 4380 + }, + { + "epoch": 2.869281045751634, + "grad_norm": 0.9134647250175476, + "learning_rate": 0.0002, + "loss": 1.2893, + "step": 4390 + }, + { + "epoch": 2.8758169934640523, + "grad_norm": 0.5637717843055725, + "learning_rate": 0.0002, + "loss": 1.171, + "step": 4400 + }, + { + "epoch": 2.8823529411764706, + "grad_norm": 0.7530393004417419, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 4410 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.7202680706977844, + "learning_rate": 0.0002, + "loss": 1.29, + "step": 4420 + }, + { + "epoch": 2.895424836601307, + "grad_norm": 0.7177144885063171, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 4430 + }, + { + "epoch": 2.9019607843137254, + "grad_norm": 0.5996816754341125, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 4440 + }, + { + "epoch": 2.9084967320261437, + "grad_norm": 0.6542447209358215, + "learning_rate": 0.0002, + "loss": 1.4816, + "step": 4450 + }, + { + "epoch": 2.915032679738562, + "grad_norm": 1.0753740072250366, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 4460 + }, + { + "epoch": 2.9215686274509802, + "grad_norm": 0.6956136226654053, + "learning_rate": 0.0002, + "loss": 1.3193, + "step": 4470 + }, + { + "epoch": 2.928104575163399, + "grad_norm": 0.7702530026435852, + "learning_rate": 0.0002, + "loss": 1.2486, + "step": 4480 + }, + { + "epoch": 2.9346405228758172, + "grad_norm": 0.7763232588768005, + "learning_rate": 0.0002, + "loss": 1.3371, + "step": 4490 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 0.6393085718154907, + "learning_rate": 0.0002, + "loss": 1.1647, + "step": 4500 + }, + { + "epoch": 2.947712418300654, + "grad_norm": 0.987770676612854, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 4510 + }, + { + "epoch": 2.954248366013072, + "grad_norm": 0.5995016098022461, + "learning_rate": 0.0002, + "loss": 1.1529, + "step": 4520 + }, + { + "epoch": 2.9607843137254903, + "grad_norm": 0.745650053024292, + "learning_rate": 0.0002, + "loss": 1.2358, + "step": 4530 + }, + { + "epoch": 2.9673202614379086, + "grad_norm": 0.7429282069206238, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 4540 + }, + { + "epoch": 2.973856209150327, + "grad_norm": 0.5927486419677734, + "learning_rate": 0.0002, + "loss": 1.2262, + "step": 4550 + }, + { + "epoch": 2.980392156862745, + "grad_norm": 0.6775153875350952, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 4560 + }, + { + "epoch": 2.9869281045751634, + "grad_norm": 0.7128435373306274, + "learning_rate": 0.0002, + "loss": 1.279, + "step": 4570 + }, + { + "epoch": 2.9934640522875817, + "grad_norm": 0.7470937967300415, + "learning_rate": 0.0002, + "loss": 1.2451, + "step": 4580 + }, + { + "epoch": 3.0, + "grad_norm": 0.9295375943183899, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 4590 + }, + { + "epoch": 3.0, + "eval_loss": 1.4131312370300293, + "eval_runtime": 31.8967, + "eval_samples_per_second": 13.669, + "eval_steps_per_second": 1.724, + "step": 4590 + }, + { + "epoch": 3.0065359477124183, + "grad_norm": 0.6926420331001282, + "learning_rate": 0.0002, + "loss": 1.1283, + "step": 4600 + }, + { + "epoch": 3.0130718954248366, + "grad_norm": 0.6656355857849121, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 4610 + }, + { + "epoch": 3.019607843137255, + "grad_norm": 0.9901936650276184, + "learning_rate": 0.0002, + "loss": 1.308, + "step": 4620 + }, + { + "epoch": 3.026143790849673, + "grad_norm": 0.6713474988937378, + "learning_rate": 0.0002, + "loss": 1.22, + "step": 4630 + }, + { + "epoch": 3.0326797385620914, + "grad_norm": 0.6199324131011963, + "learning_rate": 0.0002, + "loss": 1.2249, + "step": 4640 + }, + { + "epoch": 3.0392156862745097, + "grad_norm": 0.7180785536766052, + "learning_rate": 0.0002, + "loss": 1.242, + "step": 4650 + }, + { + "epoch": 3.045751633986928, + "grad_norm": 0.8256588578224182, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 4660 + }, + { + "epoch": 3.052287581699346, + "grad_norm": 0.6637389063835144, + "learning_rate": 0.0002, + "loss": 1.1431, + "step": 4670 + }, + { + "epoch": 3.0588235294117645, + "grad_norm": 0.6980698108673096, + "learning_rate": 0.0002, + "loss": 1.1096, + "step": 4680 + }, + { + "epoch": 3.065359477124183, + "grad_norm": 0.8091534972190857, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 4690 + }, + { + "epoch": 3.0718954248366015, + "grad_norm": 0.5715174078941345, + "learning_rate": 0.0002, + "loss": 1.1652, + "step": 4700 + }, + { + "epoch": 3.0784313725490198, + "grad_norm": 0.735639750957489, + "learning_rate": 0.0002, + "loss": 1.1427, + "step": 4710 + }, + { + "epoch": 3.084967320261438, + "grad_norm": 0.7619708180427551, + "learning_rate": 0.0002, + "loss": 1.1522, + "step": 4720 + }, + { + "epoch": 3.0915032679738563, + "grad_norm": 1.263566017150879, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 4730 + }, + { + "epoch": 3.0980392156862746, + "grad_norm": 0.6600871682167053, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 4740 + }, + { + "epoch": 3.104575163398693, + "grad_norm": 0.717792809009552, + "learning_rate": 0.0002, + "loss": 1.1766, + "step": 4750 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.853714644908905, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 4760 + }, + { + "epoch": 3.1176470588235294, + "grad_norm": 1.1004153490066528, + "learning_rate": 0.0002, + "loss": 1.2031, + "step": 4770 + }, + { + "epoch": 3.1241830065359477, + "grad_norm": 0.8566235899925232, + "learning_rate": 0.0002, + "loss": 1.3295, + "step": 4780 + }, + { + "epoch": 3.130718954248366, + "grad_norm": 0.8315296173095703, + "learning_rate": 0.0002, + "loss": 1.2436, + "step": 4790 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.8020524978637695, + "learning_rate": 0.0002, + "loss": 1.32, + "step": 4800 + }, + { + "epoch": 3.1437908496732025, + "grad_norm": 0.7564275860786438, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 4810 + }, + { + "epoch": 3.150326797385621, + "grad_norm": 0.9077776670455933, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 4820 + }, + { + "epoch": 3.156862745098039, + "grad_norm": 0.6323099732398987, + "learning_rate": 0.0002, + "loss": 1.1399, + "step": 4830 + }, + { + "epoch": 3.1633986928104574, + "grad_norm": 0.6625368595123291, + "learning_rate": 0.0002, + "loss": 1.1983, + "step": 4840 + }, + { + "epoch": 3.1699346405228757, + "grad_norm": 0.8119261860847473, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 4850 + }, + { + "epoch": 3.176470588235294, + "grad_norm": 0.6399450898170471, + "learning_rate": 0.0002, + "loss": 1.0224, + "step": 4860 + }, + { + "epoch": 3.183006535947712, + "grad_norm": 1.0659016370773315, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 4870 + }, + { + "epoch": 3.189542483660131, + "grad_norm": 0.8040369749069214, + "learning_rate": 0.0002, + "loss": 1.2914, + "step": 4880 + }, + { + "epoch": 3.196078431372549, + "grad_norm": 0.7784733176231384, + "learning_rate": 0.0002, + "loss": 1.1996, + "step": 4890 + }, + { + "epoch": 3.2026143790849675, + "grad_norm": 0.9660294651985168, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 4900 + }, + { + "epoch": 3.2091503267973858, + "grad_norm": 1.0676977634429932, + "learning_rate": 0.0002, + "loss": 1.0419, + "step": 4910 + }, + { + "epoch": 3.215686274509804, + "grad_norm": 0.5877565741539001, + "learning_rate": 0.0002, + "loss": 1.0083, + "step": 4920 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.6164032816886902, + "learning_rate": 0.0002, + "loss": 1.1046, + "step": 4930 + }, + { + "epoch": 3.2287581699346406, + "grad_norm": 0.7627606987953186, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 4940 + }, + { + "epoch": 3.235294117647059, + "grad_norm": 0.7442803978919983, + "learning_rate": 0.0002, + "loss": 1.2453, + "step": 4950 + }, + { + "epoch": 3.241830065359477, + "grad_norm": 0.7277812361717224, + "learning_rate": 0.0002, + "loss": 1.1087, + "step": 4960 + }, + { + "epoch": 3.2483660130718954, + "grad_norm": 1.0301902294158936, + "learning_rate": 0.0002, + "loss": 1.2237, + "step": 4970 + }, + { + "epoch": 3.2549019607843137, + "grad_norm": 0.7798232436180115, + "learning_rate": 0.0002, + "loss": 1.1466, + "step": 4980 + }, + { + "epoch": 3.261437908496732, + "grad_norm": 1.210265874862671, + "learning_rate": 0.0002, + "loss": 1.2142, + "step": 4990 + }, + { + "epoch": 3.2679738562091503, + "grad_norm": 0.6677713990211487, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 5000 + }, + { + "epoch": 3.2745098039215685, + "grad_norm": 1.0524500608444214, + "learning_rate": 0.0002, + "loss": 1.3294, + "step": 5010 + }, + { + "epoch": 3.281045751633987, + "grad_norm": 0.7091745734214783, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 5020 + }, + { + "epoch": 3.287581699346405, + "grad_norm": 0.8523224592208862, + "learning_rate": 0.0002, + "loss": 1.1891, + "step": 5030 + }, + { + "epoch": 3.2941176470588234, + "grad_norm": 0.6120608448982239, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 5040 + }, + { + "epoch": 3.3006535947712417, + "grad_norm": 0.7437472939491272, + "learning_rate": 0.0002, + "loss": 1.0603, + "step": 5050 + }, + { + "epoch": 3.30718954248366, + "grad_norm": 0.7611715197563171, + "learning_rate": 0.0002, + "loss": 1.1295, + "step": 5060 + }, + { + "epoch": 3.313725490196078, + "grad_norm": 0.7249704003334045, + "learning_rate": 0.0002, + "loss": 1.0531, + "step": 5070 + }, + { + "epoch": 3.3202614379084965, + "grad_norm": 0.7316247820854187, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 5080 + }, + { + "epoch": 3.326797385620915, + "grad_norm": 0.562412440776825, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 5090 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.7052176594734192, + "learning_rate": 0.0002, + "loss": 1.0736, + "step": 5100 + }, + { + "epoch": 3.3398692810457518, + "grad_norm": 0.7714211344718933, + "learning_rate": 0.0002, + "loss": 1.122, + "step": 5110 + }, + { + "epoch": 3.34640522875817, + "grad_norm": 1.0436055660247803, + "learning_rate": 0.0002, + "loss": 1.1684, + "step": 5120 + }, + { + "epoch": 3.3529411764705883, + "grad_norm": 0.8867271542549133, + "learning_rate": 0.0002, + "loss": 1.0945, + "step": 5130 + }, + { + "epoch": 3.3594771241830066, + "grad_norm": 0.8371267914772034, + "learning_rate": 0.0002, + "loss": 1.159, + "step": 5140 + }, + { + "epoch": 3.366013071895425, + "grad_norm": 0.7257837057113647, + "learning_rate": 0.0002, + "loss": 1.1073, + "step": 5150 + }, + { + "epoch": 3.372549019607843, + "grad_norm": 0.7102002501487732, + "learning_rate": 0.0002, + "loss": 1.1162, + "step": 5160 + }, + { + "epoch": 3.3790849673202614, + "grad_norm": 0.7636350393295288, + "learning_rate": 0.0002, + "loss": 1.2056, + "step": 5170 + }, + { + "epoch": 3.3856209150326797, + "grad_norm": 0.6887359619140625, + "learning_rate": 0.0002, + "loss": 1.0708, + "step": 5180 + }, + { + "epoch": 3.392156862745098, + "grad_norm": 0.8141424655914307, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 5190 + }, + { + "epoch": 3.3986928104575163, + "grad_norm": 0.694423496723175, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 5200 + }, + { + "epoch": 3.4052287581699345, + "grad_norm": 0.914013683795929, + "learning_rate": 0.0002, + "loss": 1.2945, + "step": 5210 + }, + { + "epoch": 3.411764705882353, + "grad_norm": 0.8503239750862122, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 5220 + }, + { + "epoch": 3.418300653594771, + "grad_norm": 0.6196836233139038, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 5230 + }, + { + "epoch": 3.4248366013071894, + "grad_norm": 1.0760811567306519, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 5240 + }, + { + "epoch": 3.431372549019608, + "grad_norm": 0.6524698138237, + "learning_rate": 0.0002, + "loss": 1.1686, + "step": 5250 + }, + { + "epoch": 3.4379084967320264, + "grad_norm": 0.674467921257019, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 5260 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.7690372467041016, + "learning_rate": 0.0002, + "loss": 1.1015, + "step": 5270 + }, + { + "epoch": 3.450980392156863, + "grad_norm": 0.8751813769340515, + "learning_rate": 0.0002, + "loss": 1.2511, + "step": 5280 + }, + { + "epoch": 3.457516339869281, + "grad_norm": 0.750407874584198, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 5290 + }, + { + "epoch": 3.4640522875816995, + "grad_norm": 0.5991823077201843, + "learning_rate": 0.0002, + "loss": 1.0605, + "step": 5300 + }, + { + "epoch": 3.4705882352941178, + "grad_norm": 1.0164772272109985, + "learning_rate": 0.0002, + "loss": 1.2347, + "step": 5310 + }, + { + "epoch": 3.477124183006536, + "grad_norm": 0.8704105019569397, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 5320 + }, + { + "epoch": 3.4836601307189543, + "grad_norm": 0.709102213382721, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 5330 + }, + { + "epoch": 3.4901960784313726, + "grad_norm": 0.6273632049560547, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 5340 + }, + { + "epoch": 3.496732026143791, + "grad_norm": 0.6807359457015991, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 5350 + }, + { + "epoch": 3.503267973856209, + "grad_norm": 0.7085188627243042, + "learning_rate": 0.0002, + "loss": 1.131, + "step": 5360 + }, + { + "epoch": 3.5098039215686274, + "grad_norm": 0.6938307881355286, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 5370 + }, + { + "epoch": 3.5163398692810457, + "grad_norm": 0.8544146418571472, + "learning_rate": 0.0002, + "loss": 1.1397, + "step": 5380 + }, + { + "epoch": 3.522875816993464, + "grad_norm": 0.7889642119407654, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 5390 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.7858421206474304, + "learning_rate": 0.0002, + "loss": 1.1691, + "step": 5400 + }, + { + "epoch": 3.5359477124183005, + "grad_norm": 0.8547123074531555, + "learning_rate": 0.0002, + "loss": 1.2374, + "step": 5410 + }, + { + "epoch": 3.542483660130719, + "grad_norm": 0.8218181133270264, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 5420 + }, + { + "epoch": 3.549019607843137, + "grad_norm": 1.153623342514038, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 5430 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.1321099996566772, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 5440 + }, + { + "epoch": 3.5620915032679736, + "grad_norm": 0.9495334029197693, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 5450 + }, + { + "epoch": 3.568627450980392, + "grad_norm": 0.8743821978569031, + "learning_rate": 0.0002, + "loss": 1.2869, + "step": 5460 + }, + { + "epoch": 3.57516339869281, + "grad_norm": 0.7513086795806885, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 5470 + }, + { + "epoch": 3.581699346405229, + "grad_norm": 1.0139480829238892, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 5480 + }, + { + "epoch": 3.588235294117647, + "grad_norm": 0.6615135073661804, + "learning_rate": 0.0002, + "loss": 1.1706, + "step": 5490 + }, + { + "epoch": 3.5947712418300655, + "grad_norm": 1.180798888206482, + "learning_rate": 0.0002, + "loss": 1.3906, + "step": 5500 + }, + { + "epoch": 3.6013071895424837, + "grad_norm": 0.7085279226303101, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 5510 + }, + { + "epoch": 3.607843137254902, + "grad_norm": 0.540268063545227, + "learning_rate": 0.0002, + "loss": 1.1623, + "step": 5520 + }, + { + "epoch": 3.6143790849673203, + "grad_norm": 0.7905671000480652, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 5530 + }, + { + "epoch": 3.6209150326797386, + "grad_norm": 0.8457717299461365, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 5540 + }, + { + "epoch": 3.627450980392157, + "grad_norm": 0.7102677822113037, + "learning_rate": 0.0002, + "loss": 1.1799, + "step": 5550 + }, + { + "epoch": 3.633986928104575, + "grad_norm": 0.7179514765739441, + "learning_rate": 0.0002, + "loss": 1.2394, + "step": 5560 + }, + { + "epoch": 3.6405228758169934, + "grad_norm": 1.0854148864746094, + "learning_rate": 0.0002, + "loss": 1.2019, + "step": 5570 + }, + { + "epoch": 3.6470588235294117, + "grad_norm": 0.8209951519966125, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 5580 + }, + { + "epoch": 3.65359477124183, + "grad_norm": 0.6944138407707214, + "learning_rate": 0.0002, + "loss": 1.2289, + "step": 5590 + }, + { + "epoch": 3.6601307189542482, + "grad_norm": 0.7675473093986511, + "learning_rate": 0.0002, + "loss": 1.3226, + "step": 5600 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.6683364510536194, + "learning_rate": 0.0002, + "loss": 1.2866, + "step": 5610 + }, + { + "epoch": 3.6732026143790852, + "grad_norm": 0.7920727133750916, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 5620 + }, + { + "epoch": 3.6797385620915035, + "grad_norm": 0.9440218806266785, + "learning_rate": 0.0002, + "loss": 1.2287, + "step": 5630 + }, + { + "epoch": 3.686274509803922, + "grad_norm": 0.6600824594497681, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 5640 + }, + { + "epoch": 3.69281045751634, + "grad_norm": 0.6860619187355042, + "learning_rate": 0.0002, + "loss": 1.191, + "step": 5650 + }, + { + "epoch": 3.6993464052287583, + "grad_norm": 0.6579713225364685, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 5660 + }, + { + "epoch": 3.7058823529411766, + "grad_norm": 0.661081075668335, + "learning_rate": 0.0002, + "loss": 1.1464, + "step": 5670 + }, + { + "epoch": 3.712418300653595, + "grad_norm": 1.0968825817108154, + "learning_rate": 0.0002, + "loss": 1.289, + "step": 5680 + }, + { + "epoch": 3.718954248366013, + "grad_norm": 0.8066844940185547, + "learning_rate": 0.0002, + "loss": 1.192, + "step": 5690 + }, + { + "epoch": 3.7254901960784315, + "grad_norm": 0.8341682553291321, + "learning_rate": 0.0002, + "loss": 1.2322, + "step": 5700 + }, + { + "epoch": 3.7320261437908497, + "grad_norm": 0.6682852506637573, + "learning_rate": 0.0002, + "loss": 1.1473, + "step": 5710 + }, + { + "epoch": 3.738562091503268, + "grad_norm": 0.898595929145813, + "learning_rate": 0.0002, + "loss": 1.1566, + "step": 5720 + }, + { + "epoch": 3.7450980392156863, + "grad_norm": 0.6876054406166077, + "learning_rate": 0.0002, + "loss": 1.0919, + "step": 5730 + }, + { + "epoch": 3.7516339869281046, + "grad_norm": 0.7817103266716003, + "learning_rate": 0.0002, + "loss": 1.2302, + "step": 5740 + }, + { + "epoch": 3.758169934640523, + "grad_norm": 0.5840168595314026, + "learning_rate": 0.0002, + "loss": 1.2439, + "step": 5750 + }, + { + "epoch": 3.764705882352941, + "grad_norm": 0.6263918876647949, + "learning_rate": 0.0002, + "loss": 1.1279, + "step": 5760 + }, + { + "epoch": 3.7712418300653594, + "grad_norm": 0.7948952317237854, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 5770 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.6700998544692993, + "learning_rate": 0.0002, + "loss": 1.149, + "step": 5780 + }, + { + "epoch": 3.784313725490196, + "grad_norm": 1.1169519424438477, + "learning_rate": 0.0002, + "loss": 1.3207, + "step": 5790 + }, + { + "epoch": 3.7908496732026142, + "grad_norm": 0.8354471325874329, + "learning_rate": 0.0002, + "loss": 1.064, + "step": 5800 + }, + { + "epoch": 3.7973856209150325, + "grad_norm": 0.6304181814193726, + "learning_rate": 0.0002, + "loss": 1.2104, + "step": 5810 + }, + { + "epoch": 3.803921568627451, + "grad_norm": 0.6919655799865723, + "learning_rate": 0.0002, + "loss": 1.2059, + "step": 5820 + }, + { + "epoch": 3.810457516339869, + "grad_norm": 0.600385844707489, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 5830 + }, + { + "epoch": 3.8169934640522873, + "grad_norm": 0.8406319618225098, + "learning_rate": 0.0002, + "loss": 1.2324, + "step": 5840 + }, + { + "epoch": 3.8235294117647056, + "grad_norm": 0.7594282031059265, + "learning_rate": 0.0002, + "loss": 1.2418, + "step": 5850 + }, + { + "epoch": 3.8300653594771243, + "grad_norm": 0.8179879784584045, + "learning_rate": 0.0002, + "loss": 1.1903, + "step": 5860 + }, + { + "epoch": 3.8366013071895426, + "grad_norm": 1.141430377960205, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 5870 + }, + { + "epoch": 3.843137254901961, + "grad_norm": 0.6595550775527954, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 5880 + }, + { + "epoch": 3.849673202614379, + "grad_norm": 0.7499435544013977, + "learning_rate": 0.0002, + "loss": 1.2378, + "step": 5890 + }, + { + "epoch": 3.8562091503267975, + "grad_norm": 0.7851517200469971, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 5900 + }, + { + "epoch": 3.8627450980392157, + "grad_norm": 1.0533545017242432, + "learning_rate": 0.0002, + "loss": 1.162, + "step": 5910 + }, + { + "epoch": 3.869281045751634, + "grad_norm": 0.960086464881897, + "learning_rate": 0.0002, + "loss": 1.3576, + "step": 5920 + }, + { + "epoch": 3.8758169934640523, + "grad_norm": 0.9952049851417542, + "learning_rate": 0.0002, + "loss": 1.151, + "step": 5930 + }, + { + "epoch": 3.8823529411764706, + "grad_norm": 0.7884191274642944, + "learning_rate": 0.0002, + "loss": 1.2027, + "step": 5940 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.7461766600608826, + "learning_rate": 0.0002, + "loss": 1.1796, + "step": 5950 + }, + { + "epoch": 3.895424836601307, + "grad_norm": 0.9594355821609497, + "learning_rate": 0.0002, + "loss": 1.2251, + "step": 5960 + }, + { + "epoch": 3.9019607843137254, + "grad_norm": 0.8179471492767334, + "learning_rate": 0.0002, + "loss": 1.1164, + "step": 5970 + }, + { + "epoch": 3.9084967320261437, + "grad_norm": 0.8240267634391785, + "learning_rate": 0.0002, + "loss": 1.2421, + "step": 5980 + }, + { + "epoch": 3.915032679738562, + "grad_norm": 0.7462618350982666, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 5990 + }, + { + "epoch": 3.9215686274509802, + "grad_norm": 0.711207389831543, + "learning_rate": 0.0002, + "loss": 1.2124, + "step": 6000 + }, + { + "epoch": 3.928104575163399, + "grad_norm": 0.6910956501960754, + "learning_rate": 0.0002, + "loss": 1.2119, + "step": 6010 + }, + { + "epoch": 3.9346405228758172, + "grad_norm": 0.749093770980835, + "learning_rate": 0.0002, + "loss": 1.2127, + "step": 6020 + }, + { + "epoch": 3.9411764705882355, + "grad_norm": 1.3332762718200684, + "learning_rate": 0.0002, + "loss": 1.1542, + "step": 6030 + }, + { + "epoch": 3.947712418300654, + "grad_norm": 0.71457439661026, + "learning_rate": 0.0002, + "loss": 1.1442, + "step": 6040 + }, + { + "epoch": 3.954248366013072, + "grad_norm": 1.1205238103866577, + "learning_rate": 0.0002, + "loss": 1.339, + "step": 6050 + }, + { + "epoch": 3.9607843137254903, + "grad_norm": 0.6958928108215332, + "learning_rate": 0.0002, + "loss": 1.2962, + "step": 6060 + }, + { + "epoch": 3.9673202614379086, + "grad_norm": 0.7518056035041809, + "learning_rate": 0.0002, + "loss": 1.1802, + "step": 6070 + }, + { + "epoch": 3.973856209150327, + "grad_norm": 0.8010755777359009, + "learning_rate": 0.0002, + "loss": 1.1179, + "step": 6080 + }, + { + "epoch": 3.980392156862745, + "grad_norm": 0.7492658495903015, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 6090 + }, + { + "epoch": 3.9869281045751634, + "grad_norm": 0.900704562664032, + "learning_rate": 0.0002, + "loss": 1.2113, + "step": 6100 + }, + { + "epoch": 3.9934640522875817, + "grad_norm": 0.7997331619262695, + "learning_rate": 0.0002, + "loss": 1.1106, + "step": 6110 + }, + { + "epoch": 4.0, + "grad_norm": 0.7163209319114685, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 6120 + }, + { + "epoch": 4.0, + "eval_loss": 1.4113320112228394, + "eval_runtime": 33.7199, + "eval_samples_per_second": 12.93, + "eval_steps_per_second": 1.631, + "step": 6120 + }, + { + "epoch": 4.006535947712418, + "grad_norm": 0.9527022838592529, + "learning_rate": 0.0002, + "loss": 1.0423, + "step": 6130 + }, + { + "epoch": 4.0130718954248366, + "grad_norm": 0.7603210210800171, + "learning_rate": 0.0002, + "loss": 1.101, + "step": 6140 + }, + { + "epoch": 4.019607843137255, + "grad_norm": 1.127387523651123, + "learning_rate": 0.0002, + "loss": 1.1834, + "step": 6150 + }, + { + "epoch": 4.026143790849673, + "grad_norm": 0.8290133476257324, + "learning_rate": 0.0002, + "loss": 1.0734, + "step": 6160 + }, + { + "epoch": 4.032679738562091, + "grad_norm": 0.9912241101264954, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 6170 + }, + { + "epoch": 4.03921568627451, + "grad_norm": 0.947005033493042, + "learning_rate": 0.0002, + "loss": 1.0719, + "step": 6180 + }, + { + "epoch": 4.045751633986928, + "grad_norm": 0.707466185092926, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 6190 + }, + { + "epoch": 4.052287581699346, + "grad_norm": 1.0604327917099, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 6200 + }, + { + "epoch": 4.0588235294117645, + "grad_norm": 0.7848685383796692, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 6210 + }, + { + "epoch": 4.065359477124183, + "grad_norm": 0.8475256562232971, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 6220 + }, + { + "epoch": 4.071895424836601, + "grad_norm": 0.9759448766708374, + "learning_rate": 0.0002, + "loss": 1.1104, + "step": 6230 + }, + { + "epoch": 4.078431372549019, + "grad_norm": 0.9324519038200378, + "learning_rate": 0.0002, + "loss": 1.1538, + "step": 6240 + }, + { + "epoch": 4.084967320261438, + "grad_norm": 0.8723901510238647, + "learning_rate": 0.0002, + "loss": 1.0817, + "step": 6250 + }, + { + "epoch": 4.091503267973856, + "grad_norm": 0.8343415856361389, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 6260 + }, + { + "epoch": 4.098039215686274, + "grad_norm": 0.7490310072898865, + "learning_rate": 0.0002, + "loss": 0.9887, + "step": 6270 + }, + { + "epoch": 4.104575163398692, + "grad_norm": 0.8961182832717896, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 6280 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.7124854922294617, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 6290 + }, + { + "epoch": 4.117647058823529, + "grad_norm": 0.8338138461112976, + "learning_rate": 0.0002, + "loss": 1.0081, + "step": 6300 + }, + { + "epoch": 4.124183006535947, + "grad_norm": 0.8075833320617676, + "learning_rate": 0.0002, + "loss": 1.1091, + "step": 6310 + }, + { + "epoch": 4.130718954248366, + "grad_norm": 0.8069391846656799, + "learning_rate": 0.0002, + "loss": 1.0193, + "step": 6320 + }, + { + "epoch": 4.137254901960785, + "grad_norm": 0.9567893147468567, + "learning_rate": 0.0002, + "loss": 0.948, + "step": 6330 + }, + { + "epoch": 4.143790849673203, + "grad_norm": 1.2184662818908691, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 6340 + }, + { + "epoch": 4.150326797385621, + "grad_norm": 1.030976414680481, + "learning_rate": 0.0002, + "loss": 1.0756, + "step": 6350 + }, + { + "epoch": 4.1568627450980395, + "grad_norm": 0.9749957323074341, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 6360 + }, + { + "epoch": 4.163398692810458, + "grad_norm": 0.7089483141899109, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 6370 + }, + { + "epoch": 4.169934640522876, + "grad_norm": 1.1084946393966675, + "learning_rate": 0.0002, + "loss": 1.2175, + "step": 6380 + }, + { + "epoch": 4.176470588235294, + "grad_norm": 0.7998497486114502, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 6390 + }, + { + "epoch": 4.183006535947713, + "grad_norm": 0.8997811675071716, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 6400 + }, + { + "epoch": 4.189542483660131, + "grad_norm": 0.8359479904174805, + "learning_rate": 0.0002, + "loss": 1.0704, + "step": 6410 + }, + { + "epoch": 4.196078431372549, + "grad_norm": 0.9087472558021545, + "learning_rate": 0.0002, + "loss": 1.1056, + "step": 6420 + }, + { + "epoch": 4.2026143790849675, + "grad_norm": 1.1100451946258545, + "learning_rate": 0.0002, + "loss": 1.0657, + "step": 6430 + }, + { + "epoch": 4.209150326797386, + "grad_norm": 0.9376999735832214, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 6440 + }, + { + "epoch": 4.215686274509804, + "grad_norm": 0.8179266452789307, + "learning_rate": 0.0002, + "loss": 1.0862, + "step": 6450 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.9953271746635437, + "learning_rate": 0.0002, + "loss": 1.0679, + "step": 6460 + }, + { + "epoch": 4.228758169934641, + "grad_norm": 0.8476650714874268, + "learning_rate": 0.0002, + "loss": 1.1034, + "step": 6470 + }, + { + "epoch": 4.235294117647059, + "grad_norm": 0.8406323194503784, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 6480 + }, + { + "epoch": 4.241830065359477, + "grad_norm": 0.819134533405304, + "learning_rate": 0.0002, + "loss": 1.057, + "step": 6490 + }, + { + "epoch": 4.248366013071895, + "grad_norm": 0.7764983773231506, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 6500 + }, + { + "epoch": 4.254901960784314, + "grad_norm": 0.8252112865447998, + "learning_rate": 0.0002, + "loss": 1.1593, + "step": 6510 + }, + { + "epoch": 4.261437908496732, + "grad_norm": 0.7941019535064697, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 6520 + }, + { + "epoch": 4.26797385620915, + "grad_norm": 0.7673905491828918, + "learning_rate": 0.0002, + "loss": 1.0296, + "step": 6530 + }, + { + "epoch": 4.2745098039215685, + "grad_norm": 0.8749890327453613, + "learning_rate": 0.0002, + "loss": 1.1387, + "step": 6540 + }, + { + "epoch": 4.281045751633987, + "grad_norm": 0.7343207597732544, + "learning_rate": 0.0002, + "loss": 1.0595, + "step": 6550 + }, + { + "epoch": 4.287581699346405, + "grad_norm": 1.2786651849746704, + "learning_rate": 0.0002, + "loss": 1.1715, + "step": 6560 + }, + { + "epoch": 4.294117647058823, + "grad_norm": 1.316875696182251, + "learning_rate": 0.0002, + "loss": 1.0514, + "step": 6570 + }, + { + "epoch": 4.300653594771242, + "grad_norm": 0.8349189162254333, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 6580 + }, + { + "epoch": 4.30718954248366, + "grad_norm": 0.7510647177696228, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 6590 + }, + { + "epoch": 4.313725490196078, + "grad_norm": 0.932420551776886, + "learning_rate": 0.0002, + "loss": 1.1387, + "step": 6600 + }, + { + "epoch": 4.3202614379084965, + "grad_norm": 0.8510616421699524, + "learning_rate": 0.0002, + "loss": 1.1115, + "step": 6610 + }, + { + "epoch": 4.326797385620915, + "grad_norm": 0.7661547064781189, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 6620 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 1.0370930433273315, + "learning_rate": 0.0002, + "loss": 1.2064, + "step": 6630 + }, + { + "epoch": 4.339869281045751, + "grad_norm": 0.9302158951759338, + "learning_rate": 0.0002, + "loss": 1.1064, + "step": 6640 + }, + { + "epoch": 4.34640522875817, + "grad_norm": 0.9203811883926392, + "learning_rate": 0.0002, + "loss": 0.968, + "step": 6650 + }, + { + "epoch": 4.352941176470588, + "grad_norm": 0.9986332654953003, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 6660 + }, + { + "epoch": 4.359477124183006, + "grad_norm": 0.8001713156700134, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 6670 + }, + { + "epoch": 4.366013071895424, + "grad_norm": 0.829714298248291, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 6680 + }, + { + "epoch": 4.372549019607844, + "grad_norm": 0.8253079056739807, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 6690 + }, + { + "epoch": 4.379084967320262, + "grad_norm": 0.824666440486908, + "learning_rate": 0.0002, + "loss": 1.1087, + "step": 6700 + }, + { + "epoch": 4.38562091503268, + "grad_norm": 0.8872972130775452, + "learning_rate": 0.0002, + "loss": 1.1968, + "step": 6710 + }, + { + "epoch": 4.392156862745098, + "grad_norm": 0.8729761838912964, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 6720 + }, + { + "epoch": 4.398692810457517, + "grad_norm": 1.1367264986038208, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 6730 + }, + { + "epoch": 4.405228758169935, + "grad_norm": 0.9699058532714844, + "learning_rate": 0.0002, + "loss": 1.0184, + "step": 6740 + }, + { + "epoch": 4.411764705882353, + "grad_norm": 0.8266763687133789, + "learning_rate": 0.0002, + "loss": 1.006, + "step": 6750 + }, + { + "epoch": 4.4183006535947715, + "grad_norm": 1.0249767303466797, + "learning_rate": 0.0002, + "loss": 1.0735, + "step": 6760 + }, + { + "epoch": 4.42483660130719, + "grad_norm": 0.73606938123703, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 6770 + }, + { + "epoch": 4.431372549019608, + "grad_norm": 1.4050679206848145, + "learning_rate": 0.0002, + "loss": 1.1037, + "step": 6780 + }, + { + "epoch": 4.437908496732026, + "grad_norm": 1.1114081144332886, + "learning_rate": 0.0002, + "loss": 1.1418, + "step": 6790 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.8031067848205566, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 6800 + }, + { + "epoch": 4.450980392156863, + "grad_norm": 0.8513566851615906, + "learning_rate": 0.0002, + "loss": 1.0753, + "step": 6810 + }, + { + "epoch": 4.457516339869281, + "grad_norm": 1.332741379737854, + "learning_rate": 0.0002, + "loss": 1.1852, + "step": 6820 + }, + { + "epoch": 4.4640522875816995, + "grad_norm": 1.5032578706741333, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 6830 + }, + { + "epoch": 4.470588235294118, + "grad_norm": 0.7677283883094788, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 6840 + }, + { + "epoch": 4.477124183006536, + "grad_norm": 0.989148736000061, + "learning_rate": 0.0002, + "loss": 1.1501, + "step": 6850 + }, + { + "epoch": 4.483660130718954, + "grad_norm": 1.5316275358200073, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 6860 + }, + { + "epoch": 4.490196078431373, + "grad_norm": 0.9427124261856079, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 6870 + }, + { + "epoch": 4.496732026143791, + "grad_norm": 1.215287685394287, + "learning_rate": 0.0002, + "loss": 1.1314, + "step": 6880 + }, + { + "epoch": 4.503267973856209, + "grad_norm": 0.7286760210990906, + "learning_rate": 0.0002, + "loss": 1.0809, + "step": 6890 + }, + { + "epoch": 4.509803921568627, + "grad_norm": 0.874829888343811, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 6900 + }, + { + "epoch": 4.516339869281046, + "grad_norm": 0.8058359622955322, + "learning_rate": 0.0002, + "loss": 1.0233, + "step": 6910 + }, + { + "epoch": 4.522875816993464, + "grad_norm": 1.248195767402649, + "learning_rate": 0.0002, + "loss": 1.0463, + "step": 6920 + }, + { + "epoch": 4.529411764705882, + "grad_norm": 0.8033645749092102, + "learning_rate": 0.0002, + "loss": 1.0347, + "step": 6930 + }, + { + "epoch": 4.5359477124183005, + "grad_norm": 1.7361950874328613, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 6940 + }, + { + "epoch": 4.542483660130719, + "grad_norm": 0.8058095574378967, + "learning_rate": 0.0002, + "loss": 0.9856, + "step": 6950 + }, + { + "epoch": 4.549019607843137, + "grad_norm": 1.254089593887329, + "learning_rate": 0.0002, + "loss": 1.0057, + "step": 6960 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 0.9180455803871155, + "learning_rate": 0.0002, + "loss": 1.1723, + "step": 6970 + }, + { + "epoch": 4.562091503267974, + "grad_norm": 0.6677682399749756, + "learning_rate": 0.0002, + "loss": 1.0559, + "step": 6980 + }, + { + "epoch": 4.568627450980392, + "grad_norm": 0.8127354383468628, + "learning_rate": 0.0002, + "loss": 1.0453, + "step": 6990 + }, + { + "epoch": 4.57516339869281, + "grad_norm": 1.0263001918792725, + "learning_rate": 0.0002, + "loss": 1.0828, + "step": 7000 + }, + { + "epoch": 4.5816993464052285, + "grad_norm": 0.9641909003257751, + "learning_rate": 0.0002, + "loss": 1.0703, + "step": 7010 + }, + { + "epoch": 4.588235294117647, + "grad_norm": 0.9440861344337463, + "learning_rate": 0.0002, + "loss": 1.179, + "step": 7020 + }, + { + "epoch": 4.594771241830065, + "grad_norm": 0.9539011716842651, + "learning_rate": 0.0002, + "loss": 1.0931, + "step": 7030 + }, + { + "epoch": 4.601307189542483, + "grad_norm": 1.0449910163879395, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 7040 + }, + { + "epoch": 4.607843137254902, + "grad_norm": 0.8766893744468689, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 7050 + }, + { + "epoch": 4.61437908496732, + "grad_norm": 0.6983462572097778, + "learning_rate": 0.0002, + "loss": 1.0169, + "step": 7060 + }, + { + "epoch": 4.620915032679738, + "grad_norm": 0.9505505561828613, + "learning_rate": 0.0002, + "loss": 1.1778, + "step": 7070 + }, + { + "epoch": 4.627450980392156, + "grad_norm": 1.2506657838821411, + "learning_rate": 0.0002, + "loss": 1.121, + "step": 7080 + }, + { + "epoch": 4.633986928104575, + "grad_norm": 0.9602801203727722, + "learning_rate": 0.0002, + "loss": 1.1329, + "step": 7090 + }, + { + "epoch": 4.640522875816993, + "grad_norm": 0.7398977875709534, + "learning_rate": 0.0002, + "loss": 1.1499, + "step": 7100 + }, + { + "epoch": 4.647058823529412, + "grad_norm": 1.3862425088882446, + "learning_rate": 0.0002, + "loss": 1.0769, + "step": 7110 + }, + { + "epoch": 4.65359477124183, + "grad_norm": 1.1451990604400635, + "learning_rate": 0.0002, + "loss": 1.0571, + "step": 7120 + }, + { + "epoch": 4.660130718954249, + "grad_norm": 0.9010422229766846, + "learning_rate": 0.0002, + "loss": 1.1271, + "step": 7130 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.7102518081665039, + "learning_rate": 0.0002, + "loss": 1.0165, + "step": 7140 + }, + { + "epoch": 4.673202614379085, + "grad_norm": 0.7963796257972717, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7150 + }, + { + "epoch": 4.6797385620915035, + "grad_norm": 0.7726007699966431, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 7160 + }, + { + "epoch": 4.686274509803922, + "grad_norm": 0.8097564578056335, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 7170 + }, + { + "epoch": 4.69281045751634, + "grad_norm": 0.9070925116539001, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 7180 + }, + { + "epoch": 4.699346405228758, + "grad_norm": 0.7543528079986572, + "learning_rate": 0.0002, + "loss": 1.0315, + "step": 7190 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 0.9900904893875122, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 7200 + }, + { + "epoch": 4.712418300653595, + "grad_norm": 0.8033412098884583, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 7210 + }, + { + "epoch": 4.718954248366013, + "grad_norm": 0.8440839052200317, + "learning_rate": 0.0002, + "loss": 1.1773, + "step": 7220 + }, + { + "epoch": 4.7254901960784315, + "grad_norm": 0.9325555562973022, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 7230 + }, + { + "epoch": 4.73202614379085, + "grad_norm": 0.7881146669387817, + "learning_rate": 0.0002, + "loss": 1.1384, + "step": 7240 + }, + { + "epoch": 4.738562091503268, + "grad_norm": 0.884453296661377, + "learning_rate": 0.0002, + "loss": 1.1219, + "step": 7250 + }, + { + "epoch": 4.745098039215686, + "grad_norm": 0.9274539351463318, + "learning_rate": 0.0002, + "loss": 1.1036, + "step": 7260 + }, + { + "epoch": 4.751633986928105, + "grad_norm": 1.2367479801177979, + "learning_rate": 0.0002, + "loss": 1.0906, + "step": 7270 + }, + { + "epoch": 4.758169934640523, + "grad_norm": 0.9499821066856384, + "learning_rate": 0.0002, + "loss": 1.0741, + "step": 7280 + }, + { + "epoch": 4.764705882352941, + "grad_norm": 2.1918580532073975, + "learning_rate": 0.0002, + "loss": 1.1625, + "step": 7290 + }, + { + "epoch": 4.771241830065359, + "grad_norm": 0.8221880793571472, + "learning_rate": 0.0002, + "loss": 0.954, + "step": 7300 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.871972918510437, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 7310 + }, + { + "epoch": 4.784313725490196, + "grad_norm": 0.8034510612487793, + "learning_rate": 0.0002, + "loss": 1.0599, + "step": 7320 + }, + { + "epoch": 4.790849673202614, + "grad_norm": 0.8959605693817139, + "learning_rate": 0.0002, + "loss": 1.1059, + "step": 7330 + }, + { + "epoch": 4.7973856209150325, + "grad_norm": 1.2326215505599976, + "learning_rate": 0.0002, + "loss": 1.0176, + "step": 7340 + }, + { + "epoch": 4.803921568627451, + "grad_norm": 0.9725791811943054, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 7350 + }, + { + "epoch": 4.810457516339869, + "grad_norm": 0.7240816354751587, + "learning_rate": 0.0002, + "loss": 1.1229, + "step": 7360 + }, + { + "epoch": 4.816993464052287, + "grad_norm": 0.8265769481658936, + "learning_rate": 0.0002, + "loss": 1.0669, + "step": 7370 + }, + { + "epoch": 4.823529411764706, + "grad_norm": 0.8888696432113647, + "learning_rate": 0.0002, + "loss": 1.042, + "step": 7380 + }, + { + "epoch": 4.830065359477124, + "grad_norm": 0.7776556015014648, + "learning_rate": 0.0002, + "loss": 1.0981, + "step": 7390 + }, + { + "epoch": 4.836601307189542, + "grad_norm": 0.8772371411323547, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7400 + }, + { + "epoch": 4.8431372549019605, + "grad_norm": 0.9786531925201416, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7410 + }, + { + "epoch": 4.849673202614379, + "grad_norm": 0.9059745073318481, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 7420 + }, + { + "epoch": 4.856209150326797, + "grad_norm": 0.7422552108764648, + "learning_rate": 0.0002, + "loss": 1.0324, + "step": 7430 + }, + { + "epoch": 4.862745098039216, + "grad_norm": 1.3040380477905273, + "learning_rate": 0.0002, + "loss": 1.0423, + "step": 7440 + }, + { + "epoch": 4.8692810457516345, + "grad_norm": 1.3278473615646362, + "learning_rate": 0.0002, + "loss": 1.1161, + "step": 7450 + }, + { + "epoch": 4.875816993464053, + "grad_norm": 1.2705849409103394, + "learning_rate": 0.0002, + "loss": 1.0713, + "step": 7460 + }, + { + "epoch": 4.882352941176471, + "grad_norm": 0.8837892413139343, + "learning_rate": 0.0002, + "loss": 1.0034, + "step": 7470 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.8670691251754761, + "learning_rate": 0.0002, + "loss": 1.1716, + "step": 7480 + }, + { + "epoch": 4.895424836601308, + "grad_norm": 0.9662758111953735, + "learning_rate": 0.0002, + "loss": 1.1723, + "step": 7490 + }, + { + "epoch": 4.901960784313726, + "grad_norm": 0.8188302516937256, + "learning_rate": 0.0002, + "loss": 1.1056, + "step": 7500 + }, + { + "epoch": 4.908496732026144, + "grad_norm": 0.769442617893219, + "learning_rate": 0.0002, + "loss": 1.0419, + "step": 7510 + }, + { + "epoch": 4.915032679738562, + "grad_norm": 1.1465084552764893, + "learning_rate": 0.0002, + "loss": 1.1671, + "step": 7520 + }, + { + "epoch": 4.921568627450981, + "grad_norm": 1.253214955329895, + "learning_rate": 0.0002, + "loss": 1.0768, + "step": 7530 + }, + { + "epoch": 4.928104575163399, + "grad_norm": 0.7922375202178955, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 7540 + }, + { + "epoch": 4.934640522875817, + "grad_norm": 0.8306851387023926, + "learning_rate": 0.0002, + "loss": 1.1256, + "step": 7550 + }, + { + "epoch": 4.9411764705882355, + "grad_norm": 0.8486151099205017, + "learning_rate": 0.0002, + "loss": 1.206, + "step": 7560 + }, + { + "epoch": 4.947712418300654, + "grad_norm": 1.2601467370986938, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 7570 + }, + { + "epoch": 4.954248366013072, + "grad_norm": 0.7980747818946838, + "learning_rate": 0.0002, + "loss": 1.1078, + "step": 7580 + }, + { + "epoch": 4.96078431372549, + "grad_norm": 0.8653254508972168, + "learning_rate": 0.0002, + "loss": 1.0607, + "step": 7590 + }, + { + "epoch": 4.967320261437909, + "grad_norm": 0.9680571556091309, + "learning_rate": 0.0002, + "loss": 1.0292, + "step": 7600 + }, + { + "epoch": 4.973856209150327, + "grad_norm": 0.9554466605186462, + "learning_rate": 0.0002, + "loss": 1.1795, + "step": 7610 + }, + { + "epoch": 4.980392156862745, + "grad_norm": 1.3693897724151611, + "learning_rate": 0.0002, + "loss": 1.0935, + "step": 7620 + }, + { + "epoch": 4.9869281045751634, + "grad_norm": 0.7809282541275024, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 7630 + }, + { + "epoch": 4.993464052287582, + "grad_norm": 0.7528006434440613, + "learning_rate": 0.0002, + "loss": 1.0844, + "step": 7640 + }, + { + "epoch": 5.0, + "grad_norm": 1.7491309642791748, + "learning_rate": 0.0002, + "loss": 0.9951, + "step": 7650 + }, + { + "epoch": 5.0, + "eval_loss": 1.4197258949279785, + "eval_runtime": 33.6327, + "eval_samples_per_second": 12.964, + "eval_steps_per_second": 1.635, + "step": 7650 + }, + { + "epoch": 5.006535947712418, + "grad_norm": 0.8840063214302063, + "learning_rate": 0.0002, + "loss": 0.9744, + "step": 7660 + }, + { + "epoch": 5.0130718954248366, + "grad_norm": 1.0118401050567627, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 7670 + }, + { + "epoch": 5.019607843137255, + "grad_norm": 1.0040518045425415, + "learning_rate": 0.0002, + "loss": 1.1667, + "step": 7680 + }, + { + "epoch": 5.026143790849673, + "grad_norm": 0.7541199922561646, + "learning_rate": 0.0002, + "loss": 0.9426, + "step": 7690 + }, + { + "epoch": 5.032679738562091, + "grad_norm": 0.9106482863426208, + "learning_rate": 0.0002, + "loss": 1.0797, + "step": 7700 + }, + { + "epoch": 5.03921568627451, + "grad_norm": 1.3691469430923462, + "learning_rate": 0.0002, + "loss": 1.0096, + "step": 7710 + }, + { + "epoch": 5.045751633986928, + "grad_norm": 0.9449689388275146, + "learning_rate": 0.0002, + "loss": 0.9889, + "step": 7720 + }, + { + "epoch": 5.052287581699346, + "grad_norm": 1.1678508520126343, + "learning_rate": 0.0002, + "loss": 0.9087, + "step": 7730 + }, + { + "epoch": 5.0588235294117645, + "grad_norm": 1.1296145915985107, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 7740 + }, + { + "epoch": 5.065359477124183, + "grad_norm": 0.7863904237747192, + "learning_rate": 0.0002, + "loss": 0.9339, + "step": 7750 + }, + { + "epoch": 5.071895424836601, + "grad_norm": 0.8691433072090149, + "learning_rate": 0.0002, + "loss": 1.0135, + "step": 7760 + }, + { + "epoch": 5.078431372549019, + "grad_norm": 1.0722088813781738, + "learning_rate": 0.0002, + "loss": 0.9776, + "step": 7770 + }, + { + "epoch": 5.084967320261438, + "grad_norm": 0.9625038504600525, + "learning_rate": 0.0002, + "loss": 1.0595, + "step": 7780 + }, + { + "epoch": 5.091503267973856, + "grad_norm": 1.2618783712387085, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 7790 + }, + { + "epoch": 5.098039215686274, + "grad_norm": 0.9970650672912598, + "learning_rate": 0.0002, + "loss": 0.9396, + "step": 7800 + }, + { + "epoch": 5.104575163398692, + "grad_norm": 1.3946677446365356, + "learning_rate": 0.0002, + "loss": 0.9186, + "step": 7810 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 1.0260052680969238, + "learning_rate": 0.0002, + "loss": 0.9957, + "step": 7820 + }, + { + "epoch": 5.117647058823529, + "grad_norm": 1.105521559715271, + "learning_rate": 0.0002, + "loss": 0.9865, + "step": 7830 + }, + { + "epoch": 5.124183006535947, + "grad_norm": 1.003641128540039, + "learning_rate": 0.0002, + "loss": 0.9788, + "step": 7840 + }, + { + "epoch": 5.130718954248366, + "grad_norm": 1.0315021276474, + "learning_rate": 0.0002, + "loss": 0.9688, + "step": 7850 + }, + { + "epoch": 5.137254901960785, + "grad_norm": 0.9469530582427979, + "learning_rate": 0.0002, + "loss": 1.0001, + "step": 7860 + }, + { + "epoch": 5.143790849673203, + "grad_norm": 1.3244667053222656, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 7870 + }, + { + "epoch": 5.150326797385621, + "grad_norm": 1.1732033491134644, + "learning_rate": 0.0002, + "loss": 0.9657, + "step": 7880 + }, + { + "epoch": 5.1568627450980395, + "grad_norm": 1.3129149675369263, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 7890 + }, + { + "epoch": 5.163398692810458, + "grad_norm": 0.8589454293251038, + "learning_rate": 0.0002, + "loss": 0.9894, + "step": 7900 + }, + { + "epoch": 5.169934640522876, + "grad_norm": 0.8954233527183533, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 7910 + }, + { + "epoch": 5.176470588235294, + "grad_norm": 0.7426522970199585, + "learning_rate": 0.0002, + "loss": 0.8741, + "step": 7920 + }, + { + "epoch": 5.183006535947713, + "grad_norm": 1.1990121603012085, + "learning_rate": 0.0002, + "loss": 1.0106, + "step": 7930 + }, + { + "epoch": 5.189542483660131, + "grad_norm": 0.8867580890655518, + "learning_rate": 0.0002, + "loss": 0.9453, + "step": 7940 + }, + { + "epoch": 5.196078431372549, + "grad_norm": 1.016276478767395, + "learning_rate": 0.0002, + "loss": 0.9727, + "step": 7950 + }, + { + "epoch": 5.2026143790849675, + "grad_norm": 1.0210685729980469, + "learning_rate": 0.0002, + "loss": 0.9908, + "step": 7960 + }, + { + "epoch": 5.209150326797386, + "grad_norm": 1.0093122720718384, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 7970 + }, + { + "epoch": 5.215686274509804, + "grad_norm": 0.9746801853179932, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 7980 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 0.9113537073135376, + "learning_rate": 0.0002, + "loss": 1.0611, + "step": 7990 + }, + { + "epoch": 5.228758169934641, + "grad_norm": 1.2782206535339355, + "learning_rate": 0.0002, + "loss": 0.9167, + "step": 8000 + }, + { + "epoch": 5.235294117647059, + "grad_norm": 1.3223118782043457, + "learning_rate": 0.0002, + "loss": 1.0212, + "step": 8010 + }, + { + "epoch": 5.241830065359477, + "grad_norm": 0.7898629307746887, + "learning_rate": 0.0002, + "loss": 0.9244, + "step": 8020 + }, + { + "epoch": 5.248366013071895, + "grad_norm": 0.9822350740432739, + "learning_rate": 0.0002, + "loss": 1.0574, + "step": 8030 + }, + { + "epoch": 5.254901960784314, + "grad_norm": 1.5114340782165527, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 8040 + }, + { + "epoch": 5.261437908496732, + "grad_norm": 0.859006941318512, + "learning_rate": 0.0002, + "loss": 0.9816, + "step": 8050 + }, + { + "epoch": 5.26797385620915, + "grad_norm": 1.0495043992996216, + "learning_rate": 0.0002, + "loss": 0.9445, + "step": 8060 + }, + { + "epoch": 5.2745098039215685, + "grad_norm": 1.329483151435852, + "learning_rate": 0.0002, + "loss": 0.9724, + "step": 8070 + }, + { + "epoch": 5.281045751633987, + "grad_norm": 1.1333061456680298, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 8080 + }, + { + "epoch": 5.287581699346405, + "grad_norm": 0.8153108358383179, + "learning_rate": 0.0002, + "loss": 0.9577, + "step": 8090 + }, + { + "epoch": 5.294117647058823, + "grad_norm": 0.9395004510879517, + "learning_rate": 0.0002, + "loss": 0.9002, + "step": 8100 + }, + { + "epoch": 5.300653594771242, + "grad_norm": 0.8907593488693237, + "learning_rate": 0.0002, + "loss": 1.0371, + "step": 8110 + }, + { + "epoch": 5.30718954248366, + "grad_norm": 0.9808667898178101, + "learning_rate": 0.0002, + "loss": 0.9301, + "step": 8120 + }, + { + "epoch": 5.313725490196078, + "grad_norm": 0.984779417514801, + "learning_rate": 0.0002, + "loss": 1.0136, + "step": 8130 + }, + { + "epoch": 5.3202614379084965, + "grad_norm": 0.9787270426750183, + "learning_rate": 0.0002, + "loss": 0.9621, + "step": 8140 + }, + { + "epoch": 5.326797385620915, + "grad_norm": 0.9857710599899292, + "learning_rate": 0.0002, + "loss": 0.9336, + "step": 8150 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.9774303436279297, + "learning_rate": 0.0002, + "loss": 0.9884, + "step": 8160 + }, + { + "epoch": 5.339869281045751, + "grad_norm": 0.677925169467926, + "learning_rate": 0.0002, + "loss": 1.0561, + "step": 8170 + }, + { + "epoch": 5.34640522875817, + "grad_norm": 0.9576456546783447, + "learning_rate": 0.0002, + "loss": 1.1345, + "step": 8180 + }, + { + "epoch": 5.352941176470588, + "grad_norm": 1.8970937728881836, + "learning_rate": 0.0002, + "loss": 0.9554, + "step": 8190 + }, + { + "epoch": 5.359477124183006, + "grad_norm": 0.9458389282226562, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 8200 + }, + { + "epoch": 5.366013071895424, + "grad_norm": 1.761794924736023, + "learning_rate": 0.0002, + "loss": 1.0365, + "step": 8210 + }, + { + "epoch": 5.372549019607844, + "grad_norm": 1.0693724155426025, + "learning_rate": 0.0002, + "loss": 0.9426, + "step": 8220 + }, + { + "epoch": 5.379084967320262, + "grad_norm": 0.9025877714157104, + "learning_rate": 0.0002, + "loss": 1.0299, + "step": 8230 + }, + { + "epoch": 5.38562091503268, + "grad_norm": 1.258857250213623, + "learning_rate": 0.0002, + "loss": 0.9652, + "step": 8240 + }, + { + "epoch": 5.392156862745098, + "grad_norm": 1.084849238395691, + "learning_rate": 0.0002, + "loss": 0.9735, + "step": 8250 + }, + { + "epoch": 5.398692810457517, + "grad_norm": 0.9530340433120728, + "learning_rate": 0.0002, + "loss": 0.9999, + "step": 8260 + }, + { + "epoch": 5.405228758169935, + "grad_norm": 0.830240786075592, + "learning_rate": 0.0002, + "loss": 1.0268, + "step": 8270 + }, + { + "epoch": 5.411764705882353, + "grad_norm": 1.5807015895843506, + "learning_rate": 0.0002, + "loss": 1.0332, + "step": 8280 + }, + { + "epoch": 5.4183006535947715, + "grad_norm": 0.9486905336380005, + "learning_rate": 0.0002, + "loss": 0.9146, + "step": 8290 + }, + { + "epoch": 5.42483660130719, + "grad_norm": 1.0415093898773193, + "learning_rate": 0.0002, + "loss": 1.0336, + "step": 8300 + }, + { + "epoch": 5.431372549019608, + "grad_norm": 1.0501102209091187, + "learning_rate": 0.0002, + "loss": 0.8933, + "step": 8310 + }, + { + "epoch": 5.437908496732026, + "grad_norm": 0.9751836061477661, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 8320 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 1.5529173612594604, + "learning_rate": 0.0002, + "loss": 1.0755, + "step": 8330 + }, + { + "epoch": 5.450980392156863, + "grad_norm": 0.8314350247383118, + "learning_rate": 0.0002, + "loss": 0.9814, + "step": 8340 + }, + { + "epoch": 5.457516339869281, + "grad_norm": 1.2555103302001953, + "learning_rate": 0.0002, + "loss": 1.0596, + "step": 8350 + }, + { + "epoch": 5.4640522875816995, + "grad_norm": 0.9408367872238159, + "learning_rate": 0.0002, + "loss": 1.0127, + "step": 8360 + }, + { + "epoch": 5.470588235294118, + "grad_norm": 0.9483312964439392, + "learning_rate": 0.0002, + "loss": 0.9241, + "step": 8370 + }, + { + "epoch": 5.477124183006536, + "grad_norm": 0.957905650138855, + "learning_rate": 0.0002, + "loss": 0.9678, + "step": 8380 + }, + { + "epoch": 5.483660130718954, + "grad_norm": 1.4000147581100464, + "learning_rate": 0.0002, + "loss": 1.0985, + "step": 8390 + }, + { + "epoch": 5.490196078431373, + "grad_norm": 1.7032461166381836, + "learning_rate": 0.0002, + "loss": 0.9966, + "step": 8400 + }, + { + "epoch": 5.496732026143791, + "grad_norm": 0.8978716731071472, + "learning_rate": 0.0002, + "loss": 0.9539, + "step": 8410 + }, + { + "epoch": 5.503267973856209, + "grad_norm": 0.8659300804138184, + "learning_rate": 0.0002, + "loss": 0.9544, + "step": 8420 + }, + { + "epoch": 5.509803921568627, + "grad_norm": 1.3629727363586426, + "learning_rate": 0.0002, + "loss": 1.0526, + "step": 8430 + }, + { + "epoch": 5.516339869281046, + "grad_norm": 1.2741984128952026, + "learning_rate": 0.0002, + "loss": 0.9696, + "step": 8440 + }, + { + "epoch": 5.522875816993464, + "grad_norm": 1.3867180347442627, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 8450 + }, + { + "epoch": 5.529411764705882, + "grad_norm": 1.0662001371383667, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 8460 + }, + { + "epoch": 5.5359477124183005, + "grad_norm": 1.7005380392074585, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 8470 + }, + { + "epoch": 5.542483660130719, + "grad_norm": 1.3730385303497314, + "learning_rate": 0.0002, + "loss": 1.0221, + "step": 8480 + }, + { + "epoch": 5.549019607843137, + "grad_norm": 1.7737441062927246, + "learning_rate": 0.0002, + "loss": 0.9586, + "step": 8490 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.907487690448761, + "learning_rate": 0.0002, + "loss": 0.9729, + "step": 8500 + }, + { + "epoch": 5.562091503267974, + "grad_norm": 0.8882441520690918, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 8510 + }, + { + "epoch": 5.568627450980392, + "grad_norm": 0.8655388951301575, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 8520 + }, + { + "epoch": 5.57516339869281, + "grad_norm": 1.379992961883545, + "learning_rate": 0.0002, + "loss": 0.9523, + "step": 8530 + }, + { + "epoch": 5.5816993464052285, + "grad_norm": 1.0021201372146606, + "learning_rate": 0.0002, + "loss": 1.0174, + "step": 8540 + }, + { + "epoch": 5.588235294117647, + "grad_norm": 1.2636926174163818, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 8550 + }, + { + "epoch": 5.594771241830065, + "grad_norm": 1.279025912284851, + "learning_rate": 0.0002, + "loss": 1.0243, + "step": 8560 + }, + { + "epoch": 5.601307189542483, + "grad_norm": 0.8885834217071533, + "learning_rate": 0.0002, + "loss": 0.9917, + "step": 8570 + }, + { + "epoch": 5.607843137254902, + "grad_norm": 1.1975032091140747, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 8580 + }, + { + "epoch": 5.61437908496732, + "grad_norm": 1.005470871925354, + "learning_rate": 0.0002, + "loss": 1.0363, + "step": 8590 + }, + { + "epoch": 5.620915032679738, + "grad_norm": 1.104286551475525, + "learning_rate": 0.0002, + "loss": 0.9947, + "step": 8600 + }, + { + "epoch": 5.627450980392156, + "grad_norm": 1.435445785522461, + "learning_rate": 0.0002, + "loss": 1.0585, + "step": 8610 + }, + { + "epoch": 5.633986928104575, + "grad_norm": 1.0270172357559204, + "learning_rate": 0.0002, + "loss": 0.9156, + "step": 8620 + }, + { + "epoch": 5.640522875816993, + "grad_norm": 1.0929527282714844, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 8630 + }, + { + "epoch": 5.647058823529412, + "grad_norm": 1.1061221361160278, + "learning_rate": 0.0002, + "loss": 0.9694, + "step": 8640 + }, + { + "epoch": 5.65359477124183, + "grad_norm": 0.9563149213790894, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 8650 + }, + { + "epoch": 5.660130718954249, + "grad_norm": 1.0434954166412354, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 8660 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 1.3695117235183716, + "learning_rate": 0.0002, + "loss": 0.9463, + "step": 8670 + }, + { + "epoch": 5.673202614379085, + "grad_norm": 1.0540564060211182, + "learning_rate": 0.0002, + "loss": 0.9441, + "step": 8680 + }, + { + "epoch": 5.6797385620915035, + "grad_norm": 1.5942492485046387, + "learning_rate": 0.0002, + "loss": 0.9755, + "step": 8690 + }, + { + "epoch": 5.686274509803922, + "grad_norm": 0.9485495090484619, + "learning_rate": 0.0002, + "loss": 1.0071, + "step": 8700 + }, + { + "epoch": 5.69281045751634, + "grad_norm": 1.1483162641525269, + "learning_rate": 0.0002, + "loss": 0.9998, + "step": 8710 + }, + { + "epoch": 5.699346405228758, + "grad_norm": 0.9075471758842468, + "learning_rate": 0.0002, + "loss": 0.9578, + "step": 8720 + }, + { + "epoch": 5.705882352941177, + "grad_norm": 1.7908551692962646, + "learning_rate": 0.0002, + "loss": 0.9488, + "step": 8730 + }, + { + "epoch": 5.712418300653595, + "grad_norm": 0.8867162466049194, + "learning_rate": 0.0002, + "loss": 1.0163, + "step": 8740 + }, + { + "epoch": 5.718954248366013, + "grad_norm": 1.7165148258209229, + "learning_rate": 0.0002, + "loss": 1.0041, + "step": 8750 + }, + { + "epoch": 5.7254901960784315, + "grad_norm": 0.9529356956481934, + "learning_rate": 0.0002, + "loss": 1.1061, + "step": 8760 + }, + { + "epoch": 5.73202614379085, + "grad_norm": 1.01852548122406, + "learning_rate": 0.0002, + "loss": 1.1119, + "step": 8770 + }, + { + "epoch": 5.738562091503268, + "grad_norm": 0.9538423418998718, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 8780 + }, + { + "epoch": 5.745098039215686, + "grad_norm": 0.9007737636566162, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 8790 + }, + { + "epoch": 5.751633986928105, + "grad_norm": 0.9107874035835266, + "learning_rate": 0.0002, + "loss": 0.9766, + "step": 8800 + }, + { + "epoch": 5.758169934640523, + "grad_norm": 0.7379238605499268, + "learning_rate": 0.0002, + "loss": 0.9212, + "step": 8810 + }, + { + "epoch": 5.764705882352941, + "grad_norm": 1.072645902633667, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 8820 + }, + { + "epoch": 5.771241830065359, + "grad_norm": 1.002008080482483, + "learning_rate": 0.0002, + "loss": 1.0845, + "step": 8830 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 1.0435924530029297, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 8840 + }, + { + "epoch": 5.784313725490196, + "grad_norm": 0.9874551296234131, + "learning_rate": 0.0002, + "loss": 0.9458, + "step": 8850 + }, + { + "epoch": 5.790849673202614, + "grad_norm": 1.1729662418365479, + "learning_rate": 0.0002, + "loss": 1.1241, + "step": 8860 + }, + { + "epoch": 5.7973856209150325, + "grad_norm": 1.3300775289535522, + "learning_rate": 0.0002, + "loss": 1.0451, + "step": 8870 + }, + { + "epoch": 5.803921568627451, + "grad_norm": 1.612707257270813, + "learning_rate": 0.0002, + "loss": 1.0989, + "step": 8880 + }, + { + "epoch": 5.810457516339869, + "grad_norm": 0.9047797322273254, + "learning_rate": 0.0002, + "loss": 0.9119, + "step": 8890 + }, + { + "epoch": 5.816993464052287, + "grad_norm": 1.0958741903305054, + "learning_rate": 0.0002, + "loss": 0.989, + "step": 8900 + }, + { + "epoch": 5.823529411764706, + "grad_norm": 1.0099612474441528, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 8910 + }, + { + "epoch": 5.830065359477124, + "grad_norm": 0.8442328572273254, + "learning_rate": 0.0002, + "loss": 1.0623, + "step": 8920 + }, + { + "epoch": 5.836601307189542, + "grad_norm": 1.1388301849365234, + "learning_rate": 0.0002, + "loss": 0.9134, + "step": 8930 + }, + { + "epoch": 5.8431372549019605, + "grad_norm": 0.8296026587486267, + "learning_rate": 0.0002, + "loss": 1.0019, + "step": 8940 + }, + { + "epoch": 5.849673202614379, + "grad_norm": 1.0843533277511597, + "learning_rate": 0.0002, + "loss": 1.0363, + "step": 8950 + }, + { + "epoch": 5.856209150326797, + "grad_norm": 0.8496834635734558, + "learning_rate": 0.0002, + "loss": 1.0009, + "step": 8960 + }, + { + "epoch": 5.862745098039216, + "grad_norm": 1.6894690990447998, + "learning_rate": 0.0002, + "loss": 0.9927, + "step": 8970 + }, + { + "epoch": 5.8692810457516345, + "grad_norm": 1.0012282133102417, + "learning_rate": 0.0002, + "loss": 1.0939, + "step": 8980 + }, + { + "epoch": 5.875816993464053, + "grad_norm": 0.8521103262901306, + "learning_rate": 0.0002, + "loss": 0.9722, + "step": 8990 + }, + { + "epoch": 5.882352941176471, + "grad_norm": 1.246841311454773, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 9000 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.9941892027854919, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 9010 + }, + { + "epoch": 5.895424836601308, + "grad_norm": 1.067413568496704, + "learning_rate": 0.0002, + "loss": 0.8754, + "step": 9020 + }, + { + "epoch": 5.901960784313726, + "grad_norm": 1.0045088529586792, + "learning_rate": 0.0002, + "loss": 1.0153, + "step": 9030 + }, + { + "epoch": 5.908496732026144, + "grad_norm": 1.383063554763794, + "learning_rate": 0.0002, + "loss": 1.0134, + "step": 9040 + }, + { + "epoch": 5.915032679738562, + "grad_norm": 0.8754428625106812, + "learning_rate": 0.0002, + "loss": 1.0845, + "step": 9050 + }, + { + "epoch": 5.921568627450981, + "grad_norm": 0.8577388525009155, + "learning_rate": 0.0002, + "loss": 0.9571, + "step": 9060 + }, + { + "epoch": 5.928104575163399, + "grad_norm": 0.8718975186347961, + "learning_rate": 0.0002, + "loss": 1.0532, + "step": 9070 + }, + { + "epoch": 5.934640522875817, + "grad_norm": 1.1762131452560425, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 9080 + }, + { + "epoch": 5.9411764705882355, + "grad_norm": 1.1025866270065308, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 9090 + }, + { + "epoch": 5.947712418300654, + "grad_norm": 1.0439870357513428, + "learning_rate": 0.0002, + "loss": 0.9155, + "step": 9100 + }, + { + "epoch": 5.954248366013072, + "grad_norm": 1.2411525249481201, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 9110 + }, + { + "epoch": 5.96078431372549, + "grad_norm": 1.0317714214324951, + "learning_rate": 0.0002, + "loss": 0.9747, + "step": 9120 + }, + { + "epoch": 5.967320261437909, + "grad_norm": 0.9880492091178894, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 9130 + }, + { + "epoch": 5.973856209150327, + "grad_norm": 0.9039815664291382, + "learning_rate": 0.0002, + "loss": 1.0459, + "step": 9140 + }, + { + "epoch": 5.980392156862745, + "grad_norm": 0.9049116373062134, + "learning_rate": 0.0002, + "loss": 1.0413, + "step": 9150 + }, + { + "epoch": 5.9869281045751634, + "grad_norm": 0.996749222278595, + "learning_rate": 0.0002, + "loss": 0.9792, + "step": 9160 + }, + { + "epoch": 5.993464052287582, + "grad_norm": 0.8716062307357788, + "learning_rate": 0.0002, + "loss": 0.8857, + "step": 9170 + }, + { + "epoch": 6.0, + "grad_norm": 1.3081294298171997, + "learning_rate": 0.0002, + "loss": 1.019, + "step": 9180 + }, + { + "epoch": 6.0, + "eval_loss": 1.45111083984375, + "eval_runtime": 34.7121, + "eval_samples_per_second": 12.56, + "eval_steps_per_second": 1.584, + "step": 9180 + }, + { + "epoch": 6.006535947712418, + "grad_norm": 1.1378029584884644, + "learning_rate": 0.0002, + "loss": 0.9306, + "step": 9190 + }, + { + "epoch": 6.0130718954248366, + "grad_norm": 1.2921233177185059, + "learning_rate": 0.0002, + "loss": 0.8794, + "step": 9200 + }, + { + "epoch": 6.019607843137255, + "grad_norm": 1.039211630821228, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 9210 + }, + { + "epoch": 6.026143790849673, + "grad_norm": 0.9715196490287781, + "learning_rate": 0.0002, + "loss": 0.8524, + "step": 9220 + }, + { + "epoch": 6.032679738562091, + "grad_norm": 1.220642328262329, + "learning_rate": 0.0002, + "loss": 1.035, + "step": 9230 + }, + { + "epoch": 6.03921568627451, + "grad_norm": 0.854360044002533, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 9240 + }, + { + "epoch": 6.045751633986928, + "grad_norm": 0.8806933164596558, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 9250 + }, + { + "epoch": 6.052287581699346, + "grad_norm": 1.4315874576568604, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 9260 + }, + { + "epoch": 6.0588235294117645, + "grad_norm": 0.9382007122039795, + "learning_rate": 0.0002, + "loss": 0.8462, + "step": 9270 + }, + { + "epoch": 6.065359477124183, + "grad_norm": 1.2184561491012573, + "learning_rate": 0.0002, + "loss": 0.9653, + "step": 9280 + }, + { + "epoch": 6.071895424836601, + "grad_norm": 1.2331548929214478, + "learning_rate": 0.0002, + "loss": 0.8806, + "step": 9290 + }, + { + "epoch": 6.078431372549019, + "grad_norm": 1.1112796068191528, + "learning_rate": 0.0002, + "loss": 0.8354, + "step": 9300 + }, + { + "epoch": 6.084967320261438, + "grad_norm": 1.4753731489181519, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 9310 + }, + { + "epoch": 6.091503267973856, + "grad_norm": 1.2783401012420654, + "learning_rate": 0.0002, + "loss": 0.9198, + "step": 9320 + }, + { + "epoch": 6.098039215686274, + "grad_norm": 0.9916909337043762, + "learning_rate": 0.0002, + "loss": 0.8294, + "step": 9330 + }, + { + "epoch": 6.104575163398692, + "grad_norm": 0.9300099015235901, + "learning_rate": 0.0002, + "loss": 0.876, + "step": 9340 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 1.4985264539718628, + "learning_rate": 0.0002, + "loss": 0.9064, + "step": 9350 + }, + { + "epoch": 6.117647058823529, + "grad_norm": 1.276380181312561, + "learning_rate": 0.0002, + "loss": 1.0106, + "step": 9360 + }, + { + "epoch": 6.124183006535947, + "grad_norm": 1.181113600730896, + "learning_rate": 0.0002, + "loss": 0.9068, + "step": 9370 + }, + { + "epoch": 6.130718954248366, + "grad_norm": 1.698729395866394, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 9380 + }, + { + "epoch": 6.137254901960785, + "grad_norm": 0.9793189764022827, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 9390 + }, + { + "epoch": 6.143790849673203, + "grad_norm": 1.1942132711410522, + "learning_rate": 0.0002, + "loss": 0.9731, + "step": 9400 + }, + { + "epoch": 6.150326797385621, + "grad_norm": 1.2160184383392334, + "learning_rate": 0.0002, + "loss": 0.8762, + "step": 9410 + }, + { + "epoch": 6.1568627450980395, + "grad_norm": 1.0802825689315796, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 9420 + }, + { + "epoch": 6.163398692810458, + "grad_norm": 3.024529218673706, + "learning_rate": 0.0002, + "loss": 0.9055, + "step": 9430 + }, + { + "epoch": 6.169934640522876, + "grad_norm": 0.975062370300293, + "learning_rate": 0.0002, + "loss": 0.8739, + "step": 9440 + }, + { + "epoch": 6.176470588235294, + "grad_norm": 0.9243306517601013, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 9450 + }, + { + "epoch": 6.183006535947713, + "grad_norm": 0.8892099857330322, + "learning_rate": 0.0002, + "loss": 0.947, + "step": 9460 + }, + { + "epoch": 6.189542483660131, + "grad_norm": 1.4151731729507446, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 9470 + }, + { + "epoch": 6.196078431372549, + "grad_norm": 1.064701795578003, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 9480 + }, + { + "epoch": 6.2026143790849675, + "grad_norm": 1.1104519367218018, + "learning_rate": 0.0002, + "loss": 0.906, + "step": 9490 + }, + { + "epoch": 6.209150326797386, + "grad_norm": 1.4788947105407715, + "learning_rate": 0.0002, + "loss": 0.9572, + "step": 9500 + }, + { + "epoch": 6.215686274509804, + "grad_norm": 0.7976077795028687, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 9510 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 1.256864070892334, + "learning_rate": 0.0002, + "loss": 0.886, + "step": 9520 + }, + { + "epoch": 6.228758169934641, + "grad_norm": 1.3874554634094238, + "learning_rate": 0.0002, + "loss": 0.9104, + "step": 9530 + }, + { + "epoch": 6.235294117647059, + "grad_norm": 1.9012963771820068, + "learning_rate": 0.0002, + "loss": 0.8583, + "step": 9540 + }, + { + "epoch": 6.241830065359477, + "grad_norm": 1.275212287902832, + "learning_rate": 0.0002, + "loss": 0.9585, + "step": 9550 + }, + { + "epoch": 6.248366013071895, + "grad_norm": 1.1007417440414429, + "learning_rate": 0.0002, + "loss": 0.8416, + "step": 9560 + }, + { + "epoch": 6.254901960784314, + "grad_norm": 1.0602147579193115, + "learning_rate": 0.0002, + "loss": 0.9191, + "step": 9570 + }, + { + "epoch": 6.261437908496732, + "grad_norm": 1.2276418209075928, + "learning_rate": 0.0002, + "loss": 0.909, + "step": 9580 + }, + { + "epoch": 6.26797385620915, + "grad_norm": 1.0111924409866333, + "learning_rate": 0.0002, + "loss": 0.9363, + "step": 9590 + }, + { + "epoch": 6.2745098039215685, + "grad_norm": 0.9031485915184021, + "learning_rate": 0.0002, + "loss": 0.9941, + "step": 9600 + }, + { + "epoch": 6.281045751633987, + "grad_norm": 0.9893783926963806, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 9610 + }, + { + "epoch": 6.287581699346405, + "grad_norm": 1.1979725360870361, + "learning_rate": 0.0002, + "loss": 0.9114, + "step": 9620 + }, + { + "epoch": 6.294117647058823, + "grad_norm": 1.380516767501831, + "learning_rate": 0.0002, + "loss": 0.8858, + "step": 9630 + }, + { + "epoch": 6.300653594771242, + "grad_norm": 1.1370083093643188, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 9640 + }, + { + "epoch": 6.30718954248366, + "grad_norm": 1.4091558456420898, + "learning_rate": 0.0002, + "loss": 0.9073, + "step": 9650 + }, + { + "epoch": 6.313725490196078, + "grad_norm": 1.0670944452285767, + "learning_rate": 0.0002, + "loss": 0.9096, + "step": 9660 + }, + { + "epoch": 6.3202614379084965, + "grad_norm": 0.9150263667106628, + "learning_rate": 0.0002, + "loss": 0.9376, + "step": 9670 + }, + { + "epoch": 6.326797385620915, + "grad_norm": 1.1342853307724, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 9680 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 1.2733415365219116, + "learning_rate": 0.0002, + "loss": 1.002, + "step": 9690 + }, + { + "epoch": 6.339869281045751, + "grad_norm": 1.3647292852401733, + "learning_rate": 0.0002, + "loss": 0.9579, + "step": 9700 + }, + { + "epoch": 6.34640522875817, + "grad_norm": 1.0435094833374023, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 9710 + }, + { + "epoch": 6.352941176470588, + "grad_norm": 1.3641071319580078, + "learning_rate": 0.0002, + "loss": 0.8812, + "step": 9720 + }, + { + "epoch": 6.359477124183006, + "grad_norm": 1.2806159257888794, + "learning_rate": 0.0002, + "loss": 0.8888, + "step": 9730 + }, + { + "epoch": 6.366013071895424, + "grad_norm": 1.0193076133728027, + "learning_rate": 0.0002, + "loss": 0.9481, + "step": 9740 + }, + { + "epoch": 6.372549019607844, + "grad_norm": 1.2349408864974976, + "learning_rate": 0.0002, + "loss": 0.931, + "step": 9750 + }, + { + "epoch": 6.379084967320262, + "grad_norm": 1.2062549591064453, + "learning_rate": 0.0002, + "loss": 0.8837, + "step": 9760 + }, + { + "epoch": 6.38562091503268, + "grad_norm": 1.4402194023132324, + "learning_rate": 0.0002, + "loss": 0.8947, + "step": 9770 + }, + { + "epoch": 6.392156862745098, + "grad_norm": 1.1730891466140747, + "learning_rate": 0.0002, + "loss": 0.8724, + "step": 9780 + }, + { + "epoch": 6.398692810457517, + "grad_norm": 1.1481093168258667, + "learning_rate": 0.0002, + "loss": 0.9005, + "step": 9790 + }, + { + "epoch": 6.405228758169935, + "grad_norm": 1.0012723207473755, + "learning_rate": 0.0002, + "loss": 0.9431, + "step": 9800 + }, + { + "epoch": 6.411764705882353, + "grad_norm": 0.8839848041534424, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 9810 + }, + { + "epoch": 6.4183006535947715, + "grad_norm": 1.096693992614746, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 9820 + }, + { + "epoch": 6.42483660130719, + "grad_norm": 1.4713369607925415, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 9830 + }, + { + "epoch": 6.431372549019608, + "grad_norm": 1.2529761791229248, + "learning_rate": 0.0002, + "loss": 0.9563, + "step": 9840 + }, + { + "epoch": 6.437908496732026, + "grad_norm": 1.5575600862503052, + "learning_rate": 0.0002, + "loss": 0.8551, + "step": 9850 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 1.2188916206359863, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 9860 + }, + { + "epoch": 6.450980392156863, + "grad_norm": 1.1558794975280762, + "learning_rate": 0.0002, + "loss": 0.9132, + "step": 9870 + }, + { + "epoch": 6.457516339869281, + "grad_norm": 1.1506937742233276, + "learning_rate": 0.0002, + "loss": 0.8632, + "step": 9880 + }, + { + "epoch": 6.4640522875816995, + "grad_norm": 1.1168335676193237, + "learning_rate": 0.0002, + "loss": 1.0575, + "step": 9890 + }, + { + "epoch": 6.470588235294118, + "grad_norm": 1.192449688911438, + "learning_rate": 0.0002, + "loss": 0.99, + "step": 9900 + }, + { + "epoch": 6.477124183006536, + "grad_norm": 1.0451104640960693, + "learning_rate": 0.0002, + "loss": 0.9478, + "step": 9910 + }, + { + "epoch": 6.483660130718954, + "grad_norm": 1.1111775636672974, + "learning_rate": 0.0002, + "loss": 0.9034, + "step": 9920 + }, + { + "epoch": 6.490196078431373, + "grad_norm": 1.2094531059265137, + "learning_rate": 0.0002, + "loss": 0.8971, + "step": 9930 + }, + { + "epoch": 6.496732026143791, + "grad_norm": 1.0547380447387695, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 9940 + }, + { + "epoch": 6.503267973856209, + "grad_norm": 1.5547202825546265, + "learning_rate": 0.0002, + "loss": 1.0727, + "step": 9950 + }, + { + "epoch": 6.509803921568627, + "grad_norm": 1.1917903423309326, + "learning_rate": 0.0002, + "loss": 0.9109, + "step": 9960 + }, + { + "epoch": 6.516339869281046, + "grad_norm": 1.0918153524398804, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 9970 + }, + { + "epoch": 6.522875816993464, + "grad_norm": 1.146968960762024, + "learning_rate": 0.0002, + "loss": 0.8752, + "step": 9980 + }, + { + "epoch": 6.529411764705882, + "grad_norm": 0.9899234771728516, + "learning_rate": 0.0002, + "loss": 0.9593, + "step": 9990 + }, + { + "epoch": 6.5359477124183005, + "grad_norm": 2.160924196243286, + "learning_rate": 0.0002, + "loss": 0.91, + "step": 10000 + }, + { + "epoch": 6.542483660130719, + "grad_norm": 1.6366891860961914, + "learning_rate": 0.0002, + "loss": 0.9683, + "step": 10010 + }, + { + "epoch": 6.549019607843137, + "grad_norm": 0.9876762628555298, + "learning_rate": 0.0002, + "loss": 0.8582, + "step": 10020 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 1.5622549057006836, + "learning_rate": 0.0002, + "loss": 0.8385, + "step": 10030 + }, + { + "epoch": 6.562091503267974, + "grad_norm": 1.0108020305633545, + "learning_rate": 0.0002, + "loss": 0.8791, + "step": 10040 + }, + { + "epoch": 6.568627450980392, + "grad_norm": 1.0725725889205933, + "learning_rate": 0.0002, + "loss": 0.9574, + "step": 10050 + }, + { + "epoch": 6.57516339869281, + "grad_norm": 1.1551216840744019, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 10060 + }, + { + "epoch": 6.5816993464052285, + "grad_norm": 1.5174646377563477, + "learning_rate": 0.0002, + "loss": 0.8199, + "step": 10070 + }, + { + "epoch": 6.588235294117647, + "grad_norm": 1.041877031326294, + "learning_rate": 0.0002, + "loss": 0.8203, + "step": 10080 + }, + { + "epoch": 6.594771241830065, + "grad_norm": 0.9939621686935425, + "learning_rate": 0.0002, + "loss": 0.9684, + "step": 10090 + }, + { + "epoch": 6.601307189542483, + "grad_norm": 1.2706589698791504, + "learning_rate": 0.0002, + "loss": 0.9324, + "step": 10100 + }, + { + "epoch": 6.607843137254902, + "grad_norm": 1.1071467399597168, + "learning_rate": 0.0002, + "loss": 0.9614, + "step": 10110 + }, + { + "epoch": 6.61437908496732, + "grad_norm": 0.9449541568756104, + "learning_rate": 0.0002, + "loss": 0.9747, + "step": 10120 + }, + { + "epoch": 6.620915032679738, + "grad_norm": 1.0961830615997314, + "learning_rate": 0.0002, + "loss": 0.9557, + "step": 10130 + }, + { + "epoch": 6.627450980392156, + "grad_norm": 1.7726300954818726, + "learning_rate": 0.0002, + "loss": 0.9865, + "step": 10140 + }, + { + "epoch": 6.633986928104575, + "grad_norm": 1.2345516681671143, + "learning_rate": 0.0002, + "loss": 0.9657, + "step": 10150 + }, + { + "epoch": 6.640522875816993, + "grad_norm": 1.2062907218933105, + "learning_rate": 0.0002, + "loss": 0.9573, + "step": 10160 + }, + { + "epoch": 6.647058823529412, + "grad_norm": 1.029327154159546, + "learning_rate": 0.0002, + "loss": 0.918, + "step": 10170 + }, + { + "epoch": 6.65359477124183, + "grad_norm": 1.442307710647583, + "learning_rate": 0.0002, + "loss": 0.9211, + "step": 10180 + }, + { + "epoch": 6.660130718954249, + "grad_norm": 1.2579066753387451, + "learning_rate": 0.0002, + "loss": 0.8924, + "step": 10190 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.4563188552856445, + "learning_rate": 0.0002, + "loss": 0.9836, + "step": 10200 + }, + { + "epoch": 6.673202614379085, + "grad_norm": 0.9699450135231018, + "learning_rate": 0.0002, + "loss": 0.8876, + "step": 10210 + }, + { + "epoch": 6.6797385620915035, + "grad_norm": 1.812523603439331, + "learning_rate": 0.0002, + "loss": 0.9589, + "step": 10220 + }, + { + "epoch": 6.686274509803922, + "grad_norm": 1.124000906944275, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 10230 + }, + { + "epoch": 6.69281045751634, + "grad_norm": 1.0957475900650024, + "learning_rate": 0.0002, + "loss": 0.8924, + "step": 10240 + }, + { + "epoch": 6.699346405228758, + "grad_norm": 0.989689826965332, + "learning_rate": 0.0002, + "loss": 0.8891, + "step": 10250 + }, + { + "epoch": 6.705882352941177, + "grad_norm": 1.4353317022323608, + "learning_rate": 0.0002, + "loss": 0.9049, + "step": 10260 + }, + { + "epoch": 6.712418300653595, + "grad_norm": 1.0245451927185059, + "learning_rate": 0.0002, + "loss": 0.9311, + "step": 10270 + }, + { + "epoch": 6.718954248366013, + "grad_norm": 1.097334861755371, + "learning_rate": 0.0002, + "loss": 0.8814, + "step": 10280 + }, + { + "epoch": 6.7254901960784315, + "grad_norm": 0.982356071472168, + "learning_rate": 0.0002, + "loss": 0.9927, + "step": 10290 + }, + { + "epoch": 6.73202614379085, + "grad_norm": 1.8842819929122925, + "learning_rate": 0.0002, + "loss": 0.9909, + "step": 10300 + }, + { + "epoch": 6.738562091503268, + "grad_norm": 0.8648947477340698, + "learning_rate": 0.0002, + "loss": 0.9286, + "step": 10310 + }, + { + "epoch": 6.745098039215686, + "grad_norm": 1.1510577201843262, + "learning_rate": 0.0002, + "loss": 0.987, + "step": 10320 + }, + { + "epoch": 6.751633986928105, + "grad_norm": 1.874495506286621, + "learning_rate": 0.0002, + "loss": 0.9217, + "step": 10330 + }, + { + "epoch": 6.758169934640523, + "grad_norm": 1.1126408576965332, + "learning_rate": 0.0002, + "loss": 0.8914, + "step": 10340 + }, + { + "epoch": 6.764705882352941, + "grad_norm": 1.6654644012451172, + "learning_rate": 0.0002, + "loss": 0.8508, + "step": 10350 + }, + { + "epoch": 6.771241830065359, + "grad_norm": 1.0699580907821655, + "learning_rate": 0.0002, + "loss": 0.9653, + "step": 10360 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 0.9460757374763489, + "learning_rate": 0.0002, + "loss": 0.882, + "step": 10370 + }, + { + "epoch": 6.784313725490196, + "grad_norm": 1.2553058862686157, + "learning_rate": 0.0002, + "loss": 0.9589, + "step": 10380 + }, + { + "epoch": 6.790849673202614, + "grad_norm": 1.0939891338348389, + "learning_rate": 0.0002, + "loss": 0.8782, + "step": 10390 + }, + { + "epoch": 6.7973856209150325, + "grad_norm": 1.0647451877593994, + "learning_rate": 0.0002, + "loss": 0.9189, + "step": 10400 + }, + { + "epoch": 6.803921568627451, + "grad_norm": 1.0954521894454956, + "learning_rate": 0.0002, + "loss": 0.9478, + "step": 10410 + }, + { + "epoch": 6.810457516339869, + "grad_norm": 1.4371392726898193, + "learning_rate": 0.0002, + "loss": 1.0385, + "step": 10420 + }, + { + "epoch": 6.816993464052287, + "grad_norm": 1.0063464641571045, + "learning_rate": 0.0002, + "loss": 1.0024, + "step": 10430 + }, + { + "epoch": 6.823529411764706, + "grad_norm": 1.5189263820648193, + "learning_rate": 0.0002, + "loss": 0.8737, + "step": 10440 + }, + { + "epoch": 6.830065359477124, + "grad_norm": 0.9715501070022583, + "learning_rate": 0.0002, + "loss": 0.9246, + "step": 10450 + }, + { + "epoch": 6.836601307189542, + "grad_norm": 1.114586353302002, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 10460 + }, + { + "epoch": 6.8431372549019605, + "grad_norm": 1.2991431951522827, + "learning_rate": 0.0002, + "loss": 1.0081, + "step": 10470 + }, + { + "epoch": 6.849673202614379, + "grad_norm": 1.203114628791809, + "learning_rate": 0.0002, + "loss": 0.9323, + "step": 10480 + }, + { + "epoch": 6.856209150326797, + "grad_norm": 1.476167917251587, + "learning_rate": 0.0002, + "loss": 1.0032, + "step": 10490 + }, + { + "epoch": 6.862745098039216, + "grad_norm": 1.0933326482772827, + "learning_rate": 0.0002, + "loss": 1.0275, + "step": 10500 + }, + { + "epoch": 6.8692810457516345, + "grad_norm": 1.2831504344940186, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 10510 + }, + { + "epoch": 6.875816993464053, + "grad_norm": 1.1967637538909912, + "learning_rate": 0.0002, + "loss": 0.9973, + "step": 10520 + }, + { + "epoch": 6.882352941176471, + "grad_norm": 1.1276888847351074, + "learning_rate": 0.0002, + "loss": 0.9549, + "step": 10530 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 1.2680490016937256, + "learning_rate": 0.0002, + "loss": 0.9568, + "step": 10540 + }, + { + "epoch": 6.895424836601308, + "grad_norm": 1.5469038486480713, + "learning_rate": 0.0002, + "loss": 0.9177, + "step": 10550 + }, + { + "epoch": 6.901960784313726, + "grad_norm": 1.1731038093566895, + "learning_rate": 0.0002, + "loss": 0.8545, + "step": 10560 + }, + { + "epoch": 6.908496732026144, + "grad_norm": 0.968008816242218, + "learning_rate": 0.0002, + "loss": 0.9795, + "step": 10570 + }, + { + "epoch": 6.915032679738562, + "grad_norm": 0.9082416892051697, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 10580 + }, + { + "epoch": 6.921568627450981, + "grad_norm": 1.5816899538040161, + "learning_rate": 0.0002, + "loss": 0.9898, + "step": 10590 + }, + { + "epoch": 6.928104575163399, + "grad_norm": 0.9462234377861023, + "learning_rate": 0.0002, + "loss": 0.9692, + "step": 10600 + }, + { + "epoch": 6.934640522875817, + "grad_norm": 1.4950200319290161, + "learning_rate": 0.0002, + "loss": 1.0193, + "step": 10610 + }, + { + "epoch": 6.9411764705882355, + "grad_norm": 1.2929182052612305, + "learning_rate": 0.0002, + "loss": 0.8888, + "step": 10620 + }, + { + "epoch": 6.947712418300654, + "grad_norm": 1.2995754480361938, + "learning_rate": 0.0002, + "loss": 1.0141, + "step": 10630 + }, + { + "epoch": 6.954248366013072, + "grad_norm": 0.9407122135162354, + "learning_rate": 0.0002, + "loss": 0.9863, + "step": 10640 + }, + { + "epoch": 6.96078431372549, + "grad_norm": 1.1735378503799438, + "learning_rate": 0.0002, + "loss": 0.9041, + "step": 10650 + }, + { + "epoch": 6.967320261437909, + "grad_norm": 0.9937344193458557, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 10660 + }, + { + "epoch": 6.973856209150327, + "grad_norm": 1.2498728036880493, + "learning_rate": 0.0002, + "loss": 0.9577, + "step": 10670 + }, + { + "epoch": 6.980392156862745, + "grad_norm": 1.0513341426849365, + "learning_rate": 0.0002, + "loss": 1.0504, + "step": 10680 + }, + { + "epoch": 6.9869281045751634, + "grad_norm": 1.4611467123031616, + "learning_rate": 0.0002, + "loss": 0.9259, + "step": 10690 + }, + { + "epoch": 6.993464052287582, + "grad_norm": 1.2924799919128418, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 10700 + }, + { + "epoch": 7.0, + "grad_norm": 1.2024929523468018, + "learning_rate": 0.0002, + "loss": 0.8953, + "step": 10710 + }, + { + "epoch": 7.0, + "eval_loss": 1.4972445964813232, + "eval_runtime": 33.6225, + "eval_samples_per_second": 12.967, + "eval_steps_per_second": 1.636, + "step": 10710 + }, + { + "epoch": 7.006535947712418, + "grad_norm": 1.1302162408828735, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 10720 + }, + { + "epoch": 7.0130718954248366, + "grad_norm": 1.2731552124023438, + "learning_rate": 0.0002, + "loss": 0.8584, + "step": 10730 + }, + { + "epoch": 7.019607843137255, + "grad_norm": 1.2694480419158936, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 10740 + }, + { + "epoch": 7.026143790849673, + "grad_norm": 1.1517360210418701, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 10750 + }, + { + "epoch": 7.032679738562091, + "grad_norm": 1.3649171590805054, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 10760 + }, + { + "epoch": 7.03921568627451, + "grad_norm": 1.1630656719207764, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 10770 + }, + { + "epoch": 7.045751633986928, + "grad_norm": 1.2658313512802124, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 10780 + }, + { + "epoch": 7.052287581699346, + "grad_norm": 1.5004769563674927, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 10790 + }, + { + "epoch": 7.0588235294117645, + "grad_norm": 1.052678108215332, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 10800 + }, + { + "epoch": 7.065359477124183, + "grad_norm": 1.3461277484893799, + "learning_rate": 0.0002, + "loss": 0.8613, + "step": 10810 + }, + { + "epoch": 7.071895424836601, + "grad_norm": 1.3074769973754883, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 10820 + }, + { + "epoch": 7.078431372549019, + "grad_norm": 1.8454785346984863, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 10830 + }, + { + "epoch": 7.084967320261438, + "grad_norm": 0.9786653518676758, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 10840 + }, + { + "epoch": 7.091503267973856, + "grad_norm": 1.2760838270187378, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 10850 + }, + { + "epoch": 7.098039215686274, + "grad_norm": 1.1340841054916382, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 10860 + }, + { + "epoch": 7.104575163398692, + "grad_norm": 1.3808159828186035, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 10870 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 1.147668719291687, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 10880 + }, + { + "epoch": 7.117647058823529, + "grad_norm": 1.3183035850524902, + "learning_rate": 0.0002, + "loss": 0.8408, + "step": 10890 + }, + { + "epoch": 7.124183006535947, + "grad_norm": 1.3882936239242554, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 10900 + }, + { + "epoch": 7.130718954248366, + "grad_norm": 0.9495398998260498, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 10910 + }, + { + "epoch": 7.137254901960785, + "grad_norm": 1.3810124397277832, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 10920 + }, + { + "epoch": 7.143790849673203, + "grad_norm": 1.563207745552063, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 10930 + }, + { + "epoch": 7.150326797385621, + "grad_norm": 1.2633056640625, + "learning_rate": 0.0002, + "loss": 0.7562, + "step": 10940 + }, + { + "epoch": 7.1568627450980395, + "grad_norm": 1.2398860454559326, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 10950 + }, + { + "epoch": 7.163398692810458, + "grad_norm": 1.166763186454773, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 10960 + }, + { + "epoch": 7.169934640522876, + "grad_norm": 1.5083234310150146, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 10970 + }, + { + "epoch": 7.176470588235294, + "grad_norm": 1.6927601099014282, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 10980 + }, + { + "epoch": 7.183006535947713, + "grad_norm": 1.090780258178711, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 10990 + }, + { + "epoch": 7.189542483660131, + "grad_norm": 1.0077793598175049, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 11000 + }, + { + "epoch": 7.196078431372549, + "grad_norm": 1.8293051719665527, + "learning_rate": 0.0002, + "loss": 0.831, + "step": 11010 + }, + { + "epoch": 7.2026143790849675, + "grad_norm": 1.0761457681655884, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 11020 + }, + { + "epoch": 7.209150326797386, + "grad_norm": 1.0681469440460205, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 11030 + }, + { + "epoch": 7.215686274509804, + "grad_norm": 1.961199402809143, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 11040 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 1.3750165700912476, + "learning_rate": 0.0002, + "loss": 0.8631, + "step": 11050 + }, + { + "epoch": 7.228758169934641, + "grad_norm": 1.647005319595337, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 11060 + }, + { + "epoch": 7.235294117647059, + "grad_norm": 1.1073668003082275, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 11070 + }, + { + "epoch": 7.241830065359477, + "grad_norm": 1.450289011001587, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 11080 + }, + { + "epoch": 7.248366013071895, + "grad_norm": 1.191163420677185, + "learning_rate": 0.0002, + "loss": 0.8505, + "step": 11090 + }, + { + "epoch": 7.254901960784314, + "grad_norm": 1.6975404024124146, + "learning_rate": 0.0002, + "loss": 0.8331, + "step": 11100 + }, + { + "epoch": 7.261437908496732, + "grad_norm": 1.159091830253601, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 11110 + }, + { + "epoch": 7.26797385620915, + "grad_norm": 0.9952927827835083, + "learning_rate": 0.0002, + "loss": 0.8612, + "step": 11120 + }, + { + "epoch": 7.2745098039215685, + "grad_norm": 1.4122034311294556, + "learning_rate": 0.0002, + "loss": 0.8299, + "step": 11130 + }, + { + "epoch": 7.281045751633987, + "grad_norm": 1.4299325942993164, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 11140 + }, + { + "epoch": 7.287581699346405, + "grad_norm": 1.26812744140625, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 11150 + }, + { + "epoch": 7.294117647058823, + "grad_norm": 1.0740736722946167, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 11160 + }, + { + "epoch": 7.300653594771242, + "grad_norm": 1.1293542385101318, + "learning_rate": 0.0002, + "loss": 0.9005, + "step": 11170 + }, + { + "epoch": 7.30718954248366, + "grad_norm": 1.3161042928695679, + "learning_rate": 0.0002, + "loss": 0.8775, + "step": 11180 + }, + { + "epoch": 7.313725490196078, + "grad_norm": 1.5637391805648804, + "learning_rate": 0.0002, + "loss": 0.9586, + "step": 11190 + }, + { + "epoch": 7.3202614379084965, + "grad_norm": 1.3164077997207642, + "learning_rate": 0.0002, + "loss": 0.8704, + "step": 11200 + }, + { + "epoch": 7.326797385620915, + "grad_norm": 0.9268870949745178, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 11210 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 1.164515733718872, + "learning_rate": 0.0002, + "loss": 0.8719, + "step": 11220 + }, + { + "epoch": 7.339869281045751, + "grad_norm": 1.6878753900527954, + "learning_rate": 0.0002, + "loss": 0.9011, + "step": 11230 + }, + { + "epoch": 7.34640522875817, + "grad_norm": 1.1870672702789307, + "learning_rate": 0.0002, + "loss": 0.8417, + "step": 11240 + }, + { + "epoch": 7.352941176470588, + "grad_norm": 1.2923716306686401, + "learning_rate": 0.0002, + "loss": 0.886, + "step": 11250 + }, + { + "epoch": 7.359477124183006, + "grad_norm": 1.2006791830062866, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 11260 + }, + { + "epoch": 7.366013071895424, + "grad_norm": 1.0424097776412964, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 11270 + }, + { + "epoch": 7.372549019607844, + "grad_norm": 1.349094033241272, + "learning_rate": 0.0002, + "loss": 0.9427, + "step": 11280 + }, + { + "epoch": 7.379084967320262, + "grad_norm": 1.4128005504608154, + "learning_rate": 0.0002, + "loss": 0.8384, + "step": 11290 + }, + { + "epoch": 7.38562091503268, + "grad_norm": 1.3647041320800781, + "learning_rate": 0.0002, + "loss": 0.9219, + "step": 11300 + }, + { + "epoch": 7.392156862745098, + "grad_norm": 1.0561704635620117, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 11310 + }, + { + "epoch": 7.398692810457517, + "grad_norm": 1.2405760288238525, + "learning_rate": 0.0002, + "loss": 0.9151, + "step": 11320 + }, + { + "epoch": 7.405228758169935, + "grad_norm": 1.0932328701019287, + "learning_rate": 0.0002, + "loss": 0.9108, + "step": 11330 + }, + { + "epoch": 7.411764705882353, + "grad_norm": 1.206778883934021, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 11340 + }, + { + "epoch": 7.4183006535947715, + "grad_norm": 1.5261255502700806, + "learning_rate": 0.0002, + "loss": 0.9062, + "step": 11350 + }, + { + "epoch": 7.42483660130719, + "grad_norm": 1.4928070306777954, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 11360 + }, + { + "epoch": 7.431372549019608, + "grad_norm": 1.331190586090088, + "learning_rate": 0.0002, + "loss": 0.9027, + "step": 11370 + }, + { + "epoch": 7.437908496732026, + "grad_norm": 1.0745981931686401, + "learning_rate": 0.0002, + "loss": 0.8547, + "step": 11380 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 1.3070036172866821, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 11390 + }, + { + "epoch": 7.450980392156863, + "grad_norm": 0.8743805885314941, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 11400 + }, + { + "epoch": 7.457516339869281, + "grad_norm": 1.2747994661331177, + "learning_rate": 0.0002, + "loss": 0.9331, + "step": 11410 + }, + { + "epoch": 7.4640522875816995, + "grad_norm": 1.3688995838165283, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 11420 + }, + { + "epoch": 7.470588235294118, + "grad_norm": 1.1788195371627808, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 11430 + }, + { + "epoch": 7.477124183006536, + "grad_norm": 2.0186705589294434, + "learning_rate": 0.0002, + "loss": 0.8841, + "step": 11440 + }, + { + "epoch": 7.483660130718954, + "grad_norm": 1.1707696914672852, + "learning_rate": 0.0002, + "loss": 0.9182, + "step": 11450 + }, + { + "epoch": 7.490196078431373, + "grad_norm": 1.26426100730896, + "learning_rate": 0.0002, + "loss": 0.9019, + "step": 11460 + }, + { + "epoch": 7.496732026143791, + "grad_norm": 1.2673691511154175, + "learning_rate": 0.0002, + "loss": 0.8114, + "step": 11470 + }, + { + "epoch": 7.503267973856209, + "grad_norm": 1.038956642150879, + "learning_rate": 0.0002, + "loss": 0.9716, + "step": 11480 + }, + { + "epoch": 7.509803921568627, + "grad_norm": 1.216252326965332, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11490 + }, + { + "epoch": 7.516339869281046, + "grad_norm": 1.1520167589187622, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 11500 + }, + { + "epoch": 7.522875816993464, + "grad_norm": 1.3962451219558716, + "learning_rate": 0.0002, + "loss": 0.8117, + "step": 11510 + }, + { + "epoch": 7.529411764705882, + "grad_norm": 1.2226953506469727, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 11520 + }, + { + "epoch": 7.5359477124183005, + "grad_norm": 1.2891474962234497, + "learning_rate": 0.0002, + "loss": 0.8943, + "step": 11530 + }, + { + "epoch": 7.542483660130719, + "grad_norm": 1.3372766971588135, + "learning_rate": 0.0002, + "loss": 0.8787, + "step": 11540 + }, + { + "epoch": 7.549019607843137, + "grad_norm": 1.4196370840072632, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 11550 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 1.0041396617889404, + "learning_rate": 0.0002, + "loss": 0.8722, + "step": 11560 + }, + { + "epoch": 7.562091503267974, + "grad_norm": 1.3470606803894043, + "learning_rate": 0.0002, + "loss": 0.8512, + "step": 11570 + }, + { + "epoch": 7.568627450980392, + "grad_norm": 1.1738601922988892, + "learning_rate": 0.0002, + "loss": 0.8472, + "step": 11580 + }, + { + "epoch": 7.57516339869281, + "grad_norm": 1.1629133224487305, + "learning_rate": 0.0002, + "loss": 0.8816, + "step": 11590 + }, + { + "epoch": 7.5816993464052285, + "grad_norm": 1.2859786748886108, + "learning_rate": 0.0002, + "loss": 0.8502, + "step": 11600 + }, + { + "epoch": 7.588235294117647, + "grad_norm": 1.429398775100708, + "learning_rate": 0.0002, + "loss": 0.8356, + "step": 11610 + }, + { + "epoch": 7.594771241830065, + "grad_norm": 1.3300801515579224, + "learning_rate": 0.0002, + "loss": 0.7914, + "step": 11620 + }, + { + "epoch": 7.601307189542483, + "grad_norm": 1.3261592388153076, + "learning_rate": 0.0002, + "loss": 0.8671, + "step": 11630 + }, + { + "epoch": 7.607843137254902, + "grad_norm": 1.8779925107955933, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 11640 + }, + { + "epoch": 7.61437908496732, + "grad_norm": 1.7839158773422241, + "learning_rate": 0.0002, + "loss": 0.7666, + "step": 11650 + }, + { + "epoch": 7.620915032679738, + "grad_norm": 1.6469435691833496, + "learning_rate": 0.0002, + "loss": 0.9163, + "step": 11660 + }, + { + "epoch": 7.627450980392156, + "grad_norm": 1.5416018962860107, + "learning_rate": 0.0002, + "loss": 0.8589, + "step": 11670 + }, + { + "epoch": 7.633986928104575, + "grad_norm": 1.5173335075378418, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 11680 + }, + { + "epoch": 7.640522875816993, + "grad_norm": 1.1372658014297485, + "learning_rate": 0.0002, + "loss": 0.8958, + "step": 11690 + }, + { + "epoch": 7.647058823529412, + "grad_norm": 1.233030915260315, + "learning_rate": 0.0002, + "loss": 0.8852, + "step": 11700 + }, + { + "epoch": 7.65359477124183, + "grad_norm": 1.3100069761276245, + "learning_rate": 0.0002, + "loss": 0.9205, + "step": 11710 + }, + { + "epoch": 7.660130718954249, + "grad_norm": 0.9770023226737976, + "learning_rate": 0.0002, + "loss": 0.8895, + "step": 11720 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 1.240946650505066, + "learning_rate": 0.0002, + "loss": 0.8658, + "step": 11730 + }, + { + "epoch": 7.673202614379085, + "grad_norm": 1.444226861000061, + "learning_rate": 0.0002, + "loss": 0.9527, + "step": 11740 + }, + { + "epoch": 7.6797385620915035, + "grad_norm": 1.2667231559753418, + "learning_rate": 0.0002, + "loss": 0.8448, + "step": 11750 + }, + { + "epoch": 7.686274509803922, + "grad_norm": 1.340754747390747, + "learning_rate": 0.0002, + "loss": 0.9195, + "step": 11760 + }, + { + "epoch": 7.69281045751634, + "grad_norm": 1.181988000869751, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 11770 + }, + { + "epoch": 7.699346405228758, + "grad_norm": 1.0623301267623901, + "learning_rate": 0.0002, + "loss": 0.8609, + "step": 11780 + }, + { + "epoch": 7.705882352941177, + "grad_norm": 1.1917353868484497, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 11790 + }, + { + "epoch": 7.712418300653595, + "grad_norm": 1.7202110290527344, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 11800 + }, + { + "epoch": 7.718954248366013, + "grad_norm": 1.1121439933776855, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 11810 + }, + { + "epoch": 7.7254901960784315, + "grad_norm": 0.956794261932373, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 11820 + }, + { + "epoch": 7.73202614379085, + "grad_norm": 1.2524380683898926, + "learning_rate": 0.0002, + "loss": 0.9056, + "step": 11830 + }, + { + "epoch": 7.738562091503268, + "grad_norm": 1.1095308065414429, + "learning_rate": 0.0002, + "loss": 0.8337, + "step": 11840 + }, + { + "epoch": 7.745098039215686, + "grad_norm": 1.631195068359375, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 11850 + }, + { + "epoch": 7.751633986928105, + "grad_norm": 1.2265965938568115, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 11860 + }, + { + "epoch": 7.758169934640523, + "grad_norm": 1.080328106880188, + "learning_rate": 0.0002, + "loss": 0.8875, + "step": 11870 + }, + { + "epoch": 7.764705882352941, + "grad_norm": 1.5570356845855713, + "learning_rate": 0.0002, + "loss": 0.8732, + "step": 11880 + }, + { + "epoch": 7.771241830065359, + "grad_norm": 1.3791661262512207, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 11890 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 1.1457891464233398, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 11900 + }, + { + "epoch": 7.784313725490196, + "grad_norm": 1.6357585191726685, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 11910 + }, + { + "epoch": 7.790849673202614, + "grad_norm": 1.1845953464508057, + "learning_rate": 0.0002, + "loss": 0.9041, + "step": 11920 + }, + { + "epoch": 7.7973856209150325, + "grad_norm": 1.2255016565322876, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 11930 + }, + { + "epoch": 7.803921568627451, + "grad_norm": 1.2113513946533203, + "learning_rate": 0.0002, + "loss": 0.913, + "step": 11940 + }, + { + "epoch": 7.810457516339869, + "grad_norm": 1.0834609270095825, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 11950 + }, + { + "epoch": 7.816993464052287, + "grad_norm": 1.0127689838409424, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 11960 + }, + { + "epoch": 7.823529411764706, + "grad_norm": 1.1124000549316406, + "learning_rate": 0.0002, + "loss": 0.9468, + "step": 11970 + }, + { + "epoch": 7.830065359477124, + "grad_norm": 1.3440804481506348, + "learning_rate": 0.0002, + "loss": 0.8345, + "step": 11980 + }, + { + "epoch": 7.836601307189542, + "grad_norm": 1.8478741645812988, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 11990 + }, + { + "epoch": 7.8431372549019605, + "grad_norm": 1.1202499866485596, + "learning_rate": 0.0002, + "loss": 0.9708, + "step": 12000 + }, + { + "epoch": 7.849673202614379, + "grad_norm": 1.735700249671936, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 12010 + }, + { + "epoch": 7.856209150326797, + "grad_norm": 1.2994014024734497, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 12020 + }, + { + "epoch": 7.862745098039216, + "grad_norm": 1.8655444383621216, + "learning_rate": 0.0002, + "loss": 0.8656, + "step": 12030 + }, + { + "epoch": 7.8692810457516345, + "grad_norm": 1.0460877418518066, + "learning_rate": 0.0002, + "loss": 0.8919, + "step": 12040 + }, + { + "epoch": 7.875816993464053, + "grad_norm": 1.5241339206695557, + "learning_rate": 0.0002, + "loss": 0.8603, + "step": 12050 + }, + { + "epoch": 7.882352941176471, + "grad_norm": 1.171849250793457, + "learning_rate": 0.0002, + "loss": 0.8179, + "step": 12060 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 1.0957022905349731, + "learning_rate": 0.0002, + "loss": 0.8577, + "step": 12070 + }, + { + "epoch": 7.895424836601308, + "grad_norm": 1.4121248722076416, + "learning_rate": 0.0002, + "loss": 0.9212, + "step": 12080 + }, + { + "epoch": 7.901960784313726, + "grad_norm": 1.3393208980560303, + "learning_rate": 0.0002, + "loss": 1.0002, + "step": 12090 + }, + { + "epoch": 7.908496732026144, + "grad_norm": 1.1252245903015137, + "learning_rate": 0.0002, + "loss": 0.8959, + "step": 12100 + }, + { + "epoch": 7.915032679738562, + "grad_norm": 1.4131813049316406, + "learning_rate": 0.0002, + "loss": 0.8494, + "step": 12110 + }, + { + "epoch": 7.921568627450981, + "grad_norm": 1.2392992973327637, + "learning_rate": 0.0002, + "loss": 0.9106, + "step": 12120 + }, + { + "epoch": 7.928104575163399, + "grad_norm": 1.3233672380447388, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 12130 + }, + { + "epoch": 7.934640522875817, + "grad_norm": 1.2547026872634888, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 12140 + }, + { + "epoch": 7.9411764705882355, + "grad_norm": 1.1143239736557007, + "learning_rate": 0.0002, + "loss": 0.9203, + "step": 12150 + }, + { + "epoch": 7.947712418300654, + "grad_norm": 1.030006766319275, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 12160 + }, + { + "epoch": 7.954248366013072, + "grad_norm": 1.1070104837417603, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 12170 + }, + { + "epoch": 7.96078431372549, + "grad_norm": 1.3011643886566162, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 12180 + }, + { + "epoch": 7.967320261437909, + "grad_norm": 1.134848713874817, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 12190 + }, + { + "epoch": 7.973856209150327, + "grad_norm": 1.7021794319152832, + "learning_rate": 0.0002, + "loss": 0.9318, + "step": 12200 + }, + { + "epoch": 7.980392156862745, + "grad_norm": 1.0190330743789673, + "learning_rate": 0.0002, + "loss": 0.9159, + "step": 12210 + }, + { + "epoch": 7.9869281045751634, + "grad_norm": 1.6083006858825684, + "learning_rate": 0.0002, + "loss": 0.9586, + "step": 12220 + }, + { + "epoch": 7.993464052287582, + "grad_norm": 0.8929536938667297, + "learning_rate": 0.0002, + "loss": 0.915, + "step": 12230 + }, + { + "epoch": 8.0, + "grad_norm": 0.9928004145622253, + "learning_rate": 0.0002, + "loss": 0.8706, + "step": 12240 + }, + { + "epoch": 8.0, + "eval_loss": 1.5263545513153076, + "eval_runtime": 33.6089, + "eval_samples_per_second": 12.973, + "eval_steps_per_second": 1.636, + "step": 12240 + } + ], + "logging_steps": 10, + "max_steps": 12240, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.4959723514167296e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..35e722282419bcef977427e4d3675fe3b94ec688 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-12240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc25f531ee37172f22a819ab79094fe89aae41504e4c8b696743b5e23d9e7641 +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4306725fab3f86257413add00072b452f5b8e372 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f778dcff85f98db7eeda468dace1f52104faf8349ee5c62b0084a871aeab7db5 +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4a22974d92a2f259ec999275da99537188677ed --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:055b61904899d8c245e186619c3ee84593b4e93b7250d5443736b1ea7ae816c8 +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fbc4dbfb6f1d07cc9580cb72fab1022550017edf --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f430b73c5984667b50d1408aabb054172543e00726abc905e82bd691bb3ed14 +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..12cf840f119983494dc57dcded908b0a8a439bd4 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70fd0e651657c68255bfbddaea06e66fc5308abd761799251251b4b129a8c90d +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b0e8f96a945315844fd7ee0f6d67a270e120d4c2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/trainer_state.json @@ -0,0 +1,1112 @@ +{ + "best_metric": 1.4715123176574707, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530", + "epoch": 1.0, + "eval_steps": 10, + "global_step": 1530, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006535947712418301, + "grad_norm": 1.5105072259902954, + "learning_rate": 0.0002, + "loss": 4.7451, + "step": 10 + }, + { + "epoch": 0.013071895424836602, + "grad_norm": 2.1156165599823, + "learning_rate": 0.0002, + "loss": 3.3158, + "step": 20 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 1.0578808784484863, + "learning_rate": 0.0002, + "loss": 2.643, + "step": 30 + }, + { + "epoch": 0.026143790849673203, + "grad_norm": 2.725064516067505, + "learning_rate": 0.0002, + "loss": 2.3948, + "step": 40 + }, + { + "epoch": 0.032679738562091505, + "grad_norm": 2.9575750827789307, + "learning_rate": 0.0002, + "loss": 2.3134, + "step": 50 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 1.2158117294311523, + "learning_rate": 0.0002, + "loss": 2.2778, + "step": 60 + }, + { + "epoch": 0.0457516339869281, + "grad_norm": 1.0850954055786133, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05228758169934641, + "grad_norm": 1.299196720123291, + "learning_rate": 0.0002, + "loss": 1.8872, + "step": 80 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 0.8310191035270691, + "learning_rate": 0.0002, + "loss": 1.947, + "step": 90 + }, + { + "epoch": 0.06535947712418301, + "grad_norm": 0.9854435920715332, + "learning_rate": 0.0002, + "loss": 1.9098, + "step": 100 + }, + { + "epoch": 0.0718954248366013, + "grad_norm": 0.7951157689094543, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 110 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 0.7593062520027161, + "learning_rate": 0.0002, + "loss": 1.9035, + "step": 120 + }, + { + "epoch": 0.08496732026143791, + "grad_norm": 0.6783032417297363, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 130 + }, + { + "epoch": 0.0915032679738562, + "grad_norm": 0.8350756764411926, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 140 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 1.0203173160552979, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 150 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 0.8820539712905884, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 160 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.7286128997802734, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 170 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.7874041795730591, + "learning_rate": 0.0002, + "loss": 1.8841, + "step": 180 + }, + { + "epoch": 0.12418300653594772, + "grad_norm": 0.6630475521087646, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 190 + }, + { + "epoch": 0.13071895424836602, + "grad_norm": 0.686413586139679, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 200 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 0.7793629765510559, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 210 + }, + { + "epoch": 0.1437908496732026, + "grad_norm": 0.6893141865730286, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 220 + }, + { + "epoch": 0.1503267973856209, + "grad_norm": 0.5804724097251892, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 230 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 0.6053574085235596, + "learning_rate": 0.0002, + "loss": 1.5578, + "step": 240 + }, + { + "epoch": 0.16339869281045752, + "grad_norm": 0.7566025853157043, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 250 + }, + { + "epoch": 0.16993464052287582, + "grad_norm": 0.6112990975379944, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 260 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.6839066743850708, + "learning_rate": 0.0002, + "loss": 1.5564, + "step": 270 + }, + { + "epoch": 0.1830065359477124, + "grad_norm": 0.6368117928504944, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 280 + }, + { + "epoch": 0.1895424836601307, + "grad_norm": 0.6144475936889648, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 290 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 0.6743767261505127, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 300 + }, + { + "epoch": 0.20261437908496732, + "grad_norm": 0.6807955503463745, + "learning_rate": 0.0002, + "loss": 1.421, + "step": 310 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 0.6717963814735413, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 320 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 0.5917780995368958, + "learning_rate": 0.0002, + "loss": 1.6842, + "step": 330 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.6783658862113953, + "learning_rate": 0.0002, + "loss": 1.6264, + "step": 340 + }, + { + "epoch": 0.22875816993464052, + "grad_norm": 0.5820256471633911, + "learning_rate": 0.0002, + "loss": 1.4635, + "step": 350 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.5345938801765442, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 360 + }, + { + "epoch": 0.24183006535947713, + "grad_norm": 0.755929172039032, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 370 + }, + { + "epoch": 0.24836601307189543, + "grad_norm": 0.6183189749717712, + "learning_rate": 0.0002, + "loss": 1.5177, + "step": 380 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 0.7277782559394836, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 390 + }, + { + "epoch": 0.26143790849673204, + "grad_norm": 0.9998756051063538, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 400 + }, + { + "epoch": 0.2679738562091503, + "grad_norm": 0.7523853778839111, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 410 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 0.6548714637756348, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 420 + }, + { + "epoch": 0.28104575163398693, + "grad_norm": 0.6979796290397644, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 430 + }, + { + "epoch": 0.2875816993464052, + "grad_norm": 0.840915322303772, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 440 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.6142978072166443, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 450 + }, + { + "epoch": 0.3006535947712418, + "grad_norm": 0.9482691884040833, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 460 + }, + { + "epoch": 0.30718954248366015, + "grad_norm": 0.7001156806945801, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 470 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.6665455102920532, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 480 + }, + { + "epoch": 0.3202614379084967, + "grad_norm": 0.6012697815895081, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 490 + }, + { + "epoch": 0.32679738562091504, + "grad_norm": 0.8770062327384949, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 500 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.7029962539672852, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 510 + }, + { + "epoch": 0.33986928104575165, + "grad_norm": 0.6682832837104797, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 520 + }, + { + "epoch": 0.3464052287581699, + "grad_norm": 0.5548969507217407, + "learning_rate": 0.0002, + "loss": 1.4242, + "step": 530 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.6640702486038208, + "learning_rate": 0.0002, + "loss": 1.5081, + "step": 540 + }, + { + "epoch": 0.35947712418300654, + "grad_norm": 0.656292200088501, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 550 + }, + { + "epoch": 0.3660130718954248, + "grad_norm": 0.618910551071167, + "learning_rate": 0.0002, + "loss": 1.5415, + "step": 560 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 0.644859790802002, + "learning_rate": 0.0002, + "loss": 1.5178, + "step": 570 + }, + { + "epoch": 0.3790849673202614, + "grad_norm": 0.679042398929596, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 580 + }, + { + "epoch": 0.38562091503267976, + "grad_norm": 0.980681836605072, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 590 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.632219672203064, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 600 + }, + { + "epoch": 0.39869281045751637, + "grad_norm": 0.7003744840621948, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 610 + }, + { + "epoch": 0.40522875816993464, + "grad_norm": 0.7090577483177185, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 620 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 0.657819926738739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 630 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 0.7034208178520203, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 640 + }, + { + "epoch": 0.42483660130718953, + "grad_norm": 0.7274866104125977, + "learning_rate": 0.0002, + "loss": 1.5357, + "step": 650 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 0.5876233577728271, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 660 + }, + { + "epoch": 0.43790849673202614, + "grad_norm": 0.595494270324707, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 670 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.8253804445266724, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 680 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 0.652225911617279, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 690 + }, + { + "epoch": 0.45751633986928103, + "grad_norm": 0.6242014169692993, + "learning_rate": 0.0002, + "loss": 1.5419, + "step": 700 + }, + { + "epoch": 0.46405228758169936, + "grad_norm": 0.7283986210823059, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 710 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.7016081213951111, + "learning_rate": 0.0002, + "loss": 1.43, + "step": 720 + }, + { + "epoch": 0.477124183006536, + "grad_norm": 0.5211893916130066, + "learning_rate": 0.0002, + "loss": 1.4626, + "step": 730 + }, + { + "epoch": 0.48366013071895425, + "grad_norm": 0.6221150159835815, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 740 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 0.76594477891922, + "learning_rate": 0.0002, + "loss": 1.5677, + "step": 750 + }, + { + "epoch": 0.49673202614379086, + "grad_norm": 0.5777859091758728, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 760 + }, + { + "epoch": 0.5032679738562091, + "grad_norm": 0.5793519616127014, + "learning_rate": 0.0002, + "loss": 1.5253, + "step": 770 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 0.5425786375999451, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 780 + }, + { + "epoch": 0.5163398692810458, + "grad_norm": 0.6004197001457214, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 790 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 0.7167016863822937, + "learning_rate": 0.0002, + "loss": 1.5346, + "step": 800 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.710218071937561, + "learning_rate": 0.0002, + "loss": 1.48, + "step": 810 + }, + { + "epoch": 0.5359477124183006, + "grad_norm": 0.699528694152832, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 820 + }, + { + "epoch": 0.5424836601307189, + "grad_norm": 0.579629123210907, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 830 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 0.595407247543335, + "learning_rate": 0.0002, + "loss": 1.3894, + "step": 840 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.544563889503479, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 850 + }, + { + "epoch": 0.5620915032679739, + "grad_norm": 0.553166389465332, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 860 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 0.5645018815994263, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 870 + }, + { + "epoch": 0.5751633986928104, + "grad_norm": 0.6576932668685913, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 880 + }, + { + "epoch": 0.5816993464052288, + "grad_norm": 0.6684197187423706, + "learning_rate": 0.0002, + "loss": 1.5891, + "step": 890 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.6706975698471069, + "learning_rate": 0.0002, + "loss": 1.5348, + "step": 900 + }, + { + "epoch": 0.5947712418300654, + "grad_norm": 0.6762327551841736, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 910 + }, + { + "epoch": 0.6013071895424836, + "grad_norm": 0.764032244682312, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 920 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 0.6996400952339172, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 930 + }, + { + "epoch": 0.6143790849673203, + "grad_norm": 0.686735987663269, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 940 + }, + { + "epoch": 0.6209150326797386, + "grad_norm": 0.6086131930351257, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 950 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.5627856850624084, + "learning_rate": 0.0002, + "loss": 1.4457, + "step": 960 + }, + { + "epoch": 0.6339869281045751, + "grad_norm": 0.5781503319740295, + "learning_rate": 0.0002, + "loss": 1.506, + "step": 970 + }, + { + "epoch": 0.6405228758169934, + "grad_norm": 0.6347246766090393, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 980 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.6581300497055054, + "learning_rate": 0.0002, + "loss": 1.3819, + "step": 990 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 0.8343676924705505, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1000 + }, + { + "epoch": 0.6601307189542484, + "grad_norm": 0.5708910226821899, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 1010 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.6832585334777832, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 1020 + }, + { + "epoch": 0.673202614379085, + "grad_norm": 0.5767837166786194, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1030 + }, + { + "epoch": 0.6797385620915033, + "grad_norm": 0.5637745261192322, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 1040 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 0.8193050026893616, + "learning_rate": 0.0002, + "loss": 1.4325, + "step": 1050 + }, + { + "epoch": 0.6928104575163399, + "grad_norm": 0.6157439351081848, + "learning_rate": 0.0002, + "loss": 1.4196, + "step": 1060 + }, + { + "epoch": 0.6993464052287581, + "grad_norm": 0.7476664781570435, + "learning_rate": 0.0002, + "loss": 1.5547, + "step": 1070 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.8569361567497253, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 1080 + }, + { + "epoch": 0.7124183006535948, + "grad_norm": 0.5671911835670471, + "learning_rate": 0.0002, + "loss": 1.482, + "step": 1090 + }, + { + "epoch": 0.7189542483660131, + "grad_norm": 0.5151128768920898, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 1100 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 0.568037211894989, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 1110 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 0.6756396889686584, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 1120 + }, + { + "epoch": 0.738562091503268, + "grad_norm": 0.638975977897644, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 1130 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 0.7103341221809387, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 1140 + }, + { + "epoch": 0.7516339869281046, + "grad_norm": 0.7403952479362488, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 1150 + }, + { + "epoch": 0.7581699346405228, + "grad_norm": 0.6266511082649231, + "learning_rate": 0.0002, + "loss": 1.4757, + "step": 1160 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.5939070582389832, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1170 + }, + { + "epoch": 0.7712418300653595, + "grad_norm": 0.5735430717468262, + "learning_rate": 0.0002, + "loss": 1.4145, + "step": 1180 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.5155234932899475, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 1190 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.5115423202514648, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 1200 + }, + { + "epoch": 0.7908496732026143, + "grad_norm": 0.693588137626648, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1210 + }, + { + "epoch": 0.7973856209150327, + "grad_norm": 0.5504693984985352, + "learning_rate": 0.0002, + "loss": 1.308, + "step": 1220 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 0.5555992126464844, + "learning_rate": 0.0002, + "loss": 1.5412, + "step": 1230 + }, + { + "epoch": 0.8104575163398693, + "grad_norm": 0.7211785316467285, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1240 + }, + { + "epoch": 0.8169934640522876, + "grad_norm": 0.735003650188446, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 1250 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.5245152711868286, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 1260 + }, + { + "epoch": 0.8300653594771242, + "grad_norm": 0.5883445739746094, + "learning_rate": 0.0002, + "loss": 1.4505, + "step": 1270 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 0.6835859417915344, + "learning_rate": 0.0002, + "loss": 1.3642, + "step": 1280 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 0.6592142581939697, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 1290 + }, + { + "epoch": 0.8496732026143791, + "grad_norm": 0.6087474226951599, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 1300 + }, + { + "epoch": 0.8562091503267973, + "grad_norm": 0.565387487411499, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 1310 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 0.7363151907920837, + "learning_rate": 0.0002, + "loss": 1.4809, + "step": 1320 + }, + { + "epoch": 0.869281045751634, + "grad_norm": 0.5964524149894714, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 1330 + }, + { + "epoch": 0.8758169934640523, + "grad_norm": 0.5169979929924011, + "learning_rate": 0.0002, + "loss": 1.3284, + "step": 1340 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.7063422799110413, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 1350 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.7261926531791687, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 1360 + }, + { + "epoch": 0.8954248366013072, + "grad_norm": 0.6759744882583618, + "learning_rate": 0.0002, + "loss": 1.3619, + "step": 1370 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 0.675051212310791, + "learning_rate": 0.0002, + "loss": 1.4079, + "step": 1380 + }, + { + "epoch": 0.9084967320261438, + "grad_norm": 0.5613595843315125, + "learning_rate": 0.0002, + "loss": 1.6606, + "step": 1390 + }, + { + "epoch": 0.9150326797385621, + "grad_norm": 0.611732006072998, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 1400 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 0.6365187168121338, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 1410 + }, + { + "epoch": 0.9281045751633987, + "grad_norm": 0.7810426354408264, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1420 + }, + { + "epoch": 0.934640522875817, + "grad_norm": 0.593891441822052, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 1430 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.761585533618927, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1440 + }, + { + "epoch": 0.9477124183006536, + "grad_norm": 0.6114464998245239, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1450 + }, + { + "epoch": 0.954248366013072, + "grad_norm": 0.601044774055481, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1460 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 0.5484876036643982, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 1470 + }, + { + "epoch": 0.9673202614379085, + "grad_norm": 0.5383428335189819, + "learning_rate": 0.0002, + "loss": 1.4825, + "step": 1480 + }, + { + "epoch": 0.9738562091503268, + "grad_norm": 0.648106575012207, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 1490 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.6847249865531921, + "learning_rate": 0.0002, + "loss": 1.3638, + "step": 1500 + }, + { + "epoch": 0.9869281045751634, + "grad_norm": 0.6361058354377747, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 1510 + }, + { + "epoch": 0.9934640522875817, + "grad_norm": 0.646392285823822, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 1520 + }, + { + "epoch": 1.0, + "grad_norm": 0.5391159057617188, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1530 + }, + { + "epoch": 1.0, + "eval_loss": 1.4715123176574707, + "eval_runtime": 30.5701, + "eval_samples_per_second": 14.262, + "eval_steps_per_second": 1.799, + "step": 1530 + } + ], + "logging_steps": 10, + "max_steps": 12240, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.869965439270912e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..35e722282419bcef977427e4d3675fe3b94ec688 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc25f531ee37172f22a819ab79094fe89aae41504e4c8b696743b5e23d9e7641 +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..542e836189e0394a10bf371313fbebcaf90101d9 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da6c2b53ce5155b34304e447a4ffda48feee6f707478ea05cbbc61bf87bee38d +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..303805891fb1481cdbbf73f8b79bbfe4b19ba6a3 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63a545166f9c84bea18e36eff1c076f0eb9f0cdecace8ff6bfde27df9751de2a +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d7876af9ab6f2d52f56d1b3311ccf18f023466fa --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ee770ffc0dc82e97c84094c424d42999df56b9c94b5bb2694bb76f9c86374b5 +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb86c1901ca58fffe630b9be02cf95d9158d1e63 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c601eee32e7508f647875ff3375d30f0d39da2c21bb1297b36d7da7ee8d423c0 +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8bf67207ff92467519d066c4cea922cc8ce70012 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/trainer_state.json @@ -0,0 +1,2191 @@ +{ + "best_metric": 1.4276371002197266, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 3060, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006535947712418301, + "grad_norm": 1.5105072259902954, + "learning_rate": 0.0002, + "loss": 4.7451, + "step": 10 + }, + { + "epoch": 0.013071895424836602, + "grad_norm": 2.1156165599823, + "learning_rate": 0.0002, + "loss": 3.3158, + "step": 20 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 1.0578808784484863, + "learning_rate": 0.0002, + "loss": 2.643, + "step": 30 + }, + { + "epoch": 0.026143790849673203, + "grad_norm": 2.725064516067505, + "learning_rate": 0.0002, + "loss": 2.3948, + "step": 40 + }, + { + "epoch": 0.032679738562091505, + "grad_norm": 2.9575750827789307, + "learning_rate": 0.0002, + "loss": 2.3134, + "step": 50 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 1.2158117294311523, + "learning_rate": 0.0002, + "loss": 2.2778, + "step": 60 + }, + { + "epoch": 0.0457516339869281, + "grad_norm": 1.0850954055786133, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05228758169934641, + "grad_norm": 1.299196720123291, + "learning_rate": 0.0002, + "loss": 1.8872, + "step": 80 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 0.8310191035270691, + "learning_rate": 0.0002, + "loss": 1.947, + "step": 90 + }, + { + "epoch": 0.06535947712418301, + "grad_norm": 0.9854435920715332, + "learning_rate": 0.0002, + "loss": 1.9098, + "step": 100 + }, + { + "epoch": 0.0718954248366013, + "grad_norm": 0.7951157689094543, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 110 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 0.7593062520027161, + "learning_rate": 0.0002, + "loss": 1.9035, + "step": 120 + }, + { + "epoch": 0.08496732026143791, + "grad_norm": 0.6783032417297363, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 130 + }, + { + "epoch": 0.0915032679738562, + "grad_norm": 0.8350756764411926, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 140 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 1.0203173160552979, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 150 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 0.8820539712905884, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 160 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.7286128997802734, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 170 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.7874041795730591, + "learning_rate": 0.0002, + "loss": 1.8841, + "step": 180 + }, + { + "epoch": 0.12418300653594772, + "grad_norm": 0.6630475521087646, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 190 + }, + { + "epoch": 0.13071895424836602, + "grad_norm": 0.686413586139679, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 200 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 0.7793629765510559, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 210 + }, + { + "epoch": 0.1437908496732026, + "grad_norm": 0.6893141865730286, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 220 + }, + { + "epoch": 0.1503267973856209, + "grad_norm": 0.5804724097251892, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 230 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 0.6053574085235596, + "learning_rate": 0.0002, + "loss": 1.5578, + "step": 240 + }, + { + "epoch": 0.16339869281045752, + "grad_norm": 0.7566025853157043, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 250 + }, + { + "epoch": 0.16993464052287582, + "grad_norm": 0.6112990975379944, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 260 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.6839066743850708, + "learning_rate": 0.0002, + "loss": 1.5564, + "step": 270 + }, + { + "epoch": 0.1830065359477124, + "grad_norm": 0.6368117928504944, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 280 + }, + { + "epoch": 0.1895424836601307, + "grad_norm": 0.6144475936889648, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 290 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 0.6743767261505127, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 300 + }, + { + "epoch": 0.20261437908496732, + "grad_norm": 0.6807955503463745, + "learning_rate": 0.0002, + "loss": 1.421, + "step": 310 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 0.6717963814735413, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 320 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 0.5917780995368958, + "learning_rate": 0.0002, + "loss": 1.6842, + "step": 330 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.6783658862113953, + "learning_rate": 0.0002, + "loss": 1.6264, + "step": 340 + }, + { + "epoch": 0.22875816993464052, + "grad_norm": 0.5820256471633911, + "learning_rate": 0.0002, + "loss": 1.4635, + "step": 350 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.5345938801765442, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 360 + }, + { + "epoch": 0.24183006535947713, + "grad_norm": 0.755929172039032, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 370 + }, + { + "epoch": 0.24836601307189543, + "grad_norm": 0.6183189749717712, + "learning_rate": 0.0002, + "loss": 1.5177, + "step": 380 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 0.7277782559394836, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 390 + }, + { + "epoch": 0.26143790849673204, + "grad_norm": 0.9998756051063538, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 400 + }, + { + "epoch": 0.2679738562091503, + "grad_norm": 0.7523853778839111, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 410 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 0.6548714637756348, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 420 + }, + { + "epoch": 0.28104575163398693, + "grad_norm": 0.6979796290397644, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 430 + }, + { + "epoch": 0.2875816993464052, + "grad_norm": 0.840915322303772, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 440 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.6142978072166443, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 450 + }, + { + "epoch": 0.3006535947712418, + "grad_norm": 0.9482691884040833, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 460 + }, + { + "epoch": 0.30718954248366015, + "grad_norm": 0.7001156806945801, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 470 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.6665455102920532, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 480 + }, + { + "epoch": 0.3202614379084967, + "grad_norm": 0.6012697815895081, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 490 + }, + { + "epoch": 0.32679738562091504, + "grad_norm": 0.8770062327384949, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 500 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.7029962539672852, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 510 + }, + { + "epoch": 0.33986928104575165, + "grad_norm": 0.6682832837104797, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 520 + }, + { + "epoch": 0.3464052287581699, + "grad_norm": 0.5548969507217407, + "learning_rate": 0.0002, + "loss": 1.4242, + "step": 530 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.6640702486038208, + "learning_rate": 0.0002, + "loss": 1.5081, + "step": 540 + }, + { + "epoch": 0.35947712418300654, + "grad_norm": 0.656292200088501, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 550 + }, + { + "epoch": 0.3660130718954248, + "grad_norm": 0.618910551071167, + "learning_rate": 0.0002, + "loss": 1.5415, + "step": 560 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 0.644859790802002, + "learning_rate": 0.0002, + "loss": 1.5178, + "step": 570 + }, + { + "epoch": 0.3790849673202614, + "grad_norm": 0.679042398929596, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 580 + }, + { + "epoch": 0.38562091503267976, + "grad_norm": 0.980681836605072, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 590 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.632219672203064, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 600 + }, + { + "epoch": 0.39869281045751637, + "grad_norm": 0.7003744840621948, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 610 + }, + { + "epoch": 0.40522875816993464, + "grad_norm": 0.7090577483177185, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 620 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 0.657819926738739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 630 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 0.7034208178520203, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 640 + }, + { + "epoch": 0.42483660130718953, + "grad_norm": 0.7274866104125977, + "learning_rate": 0.0002, + "loss": 1.5357, + "step": 650 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 0.5876233577728271, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 660 + }, + { + "epoch": 0.43790849673202614, + "grad_norm": 0.595494270324707, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 670 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.8253804445266724, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 680 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 0.652225911617279, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 690 + }, + { + "epoch": 0.45751633986928103, + "grad_norm": 0.6242014169692993, + "learning_rate": 0.0002, + "loss": 1.5419, + "step": 700 + }, + { + "epoch": 0.46405228758169936, + "grad_norm": 0.7283986210823059, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 710 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.7016081213951111, + "learning_rate": 0.0002, + "loss": 1.43, + "step": 720 + }, + { + "epoch": 0.477124183006536, + "grad_norm": 0.5211893916130066, + "learning_rate": 0.0002, + "loss": 1.4626, + "step": 730 + }, + { + "epoch": 0.48366013071895425, + "grad_norm": 0.6221150159835815, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 740 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 0.76594477891922, + "learning_rate": 0.0002, + "loss": 1.5677, + "step": 750 + }, + { + "epoch": 0.49673202614379086, + "grad_norm": 0.5777859091758728, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 760 + }, + { + "epoch": 0.5032679738562091, + "grad_norm": 0.5793519616127014, + "learning_rate": 0.0002, + "loss": 1.5253, + "step": 770 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 0.5425786375999451, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 780 + }, + { + "epoch": 0.5163398692810458, + "grad_norm": 0.6004197001457214, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 790 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 0.7167016863822937, + "learning_rate": 0.0002, + "loss": 1.5346, + "step": 800 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.710218071937561, + "learning_rate": 0.0002, + "loss": 1.48, + "step": 810 + }, + { + "epoch": 0.5359477124183006, + "grad_norm": 0.699528694152832, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 820 + }, + { + "epoch": 0.5424836601307189, + "grad_norm": 0.579629123210907, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 830 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 0.595407247543335, + "learning_rate": 0.0002, + "loss": 1.3894, + "step": 840 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.544563889503479, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 850 + }, + { + "epoch": 0.5620915032679739, + "grad_norm": 0.553166389465332, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 860 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 0.5645018815994263, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 870 + }, + { + "epoch": 0.5751633986928104, + "grad_norm": 0.6576932668685913, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 880 + }, + { + "epoch": 0.5816993464052288, + "grad_norm": 0.6684197187423706, + "learning_rate": 0.0002, + "loss": 1.5891, + "step": 890 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.6706975698471069, + "learning_rate": 0.0002, + "loss": 1.5348, + "step": 900 + }, + { + "epoch": 0.5947712418300654, + "grad_norm": 0.6762327551841736, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 910 + }, + { + "epoch": 0.6013071895424836, + "grad_norm": 0.764032244682312, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 920 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 0.6996400952339172, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 930 + }, + { + "epoch": 0.6143790849673203, + "grad_norm": 0.686735987663269, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 940 + }, + { + "epoch": 0.6209150326797386, + "grad_norm": 0.6086131930351257, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 950 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.5627856850624084, + "learning_rate": 0.0002, + "loss": 1.4457, + "step": 960 + }, + { + "epoch": 0.6339869281045751, + "grad_norm": 0.5781503319740295, + "learning_rate": 0.0002, + "loss": 1.506, + "step": 970 + }, + { + "epoch": 0.6405228758169934, + "grad_norm": 0.6347246766090393, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 980 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.6581300497055054, + "learning_rate": 0.0002, + "loss": 1.3819, + "step": 990 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 0.8343676924705505, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1000 + }, + { + "epoch": 0.6601307189542484, + "grad_norm": 0.5708910226821899, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 1010 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.6832585334777832, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 1020 + }, + { + "epoch": 0.673202614379085, + "grad_norm": 0.5767837166786194, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1030 + }, + { + "epoch": 0.6797385620915033, + "grad_norm": 0.5637745261192322, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 1040 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 0.8193050026893616, + "learning_rate": 0.0002, + "loss": 1.4325, + "step": 1050 + }, + { + "epoch": 0.6928104575163399, + "grad_norm": 0.6157439351081848, + "learning_rate": 0.0002, + "loss": 1.4196, + "step": 1060 + }, + { + "epoch": 0.6993464052287581, + "grad_norm": 0.7476664781570435, + "learning_rate": 0.0002, + "loss": 1.5547, + "step": 1070 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.8569361567497253, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 1080 + }, + { + "epoch": 0.7124183006535948, + "grad_norm": 0.5671911835670471, + "learning_rate": 0.0002, + "loss": 1.482, + "step": 1090 + }, + { + "epoch": 0.7189542483660131, + "grad_norm": 0.5151128768920898, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 1100 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 0.568037211894989, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 1110 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 0.6756396889686584, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 1120 + }, + { + "epoch": 0.738562091503268, + "grad_norm": 0.638975977897644, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 1130 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 0.7103341221809387, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 1140 + }, + { + "epoch": 0.7516339869281046, + "grad_norm": 0.7403952479362488, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 1150 + }, + { + "epoch": 0.7581699346405228, + "grad_norm": 0.6266511082649231, + "learning_rate": 0.0002, + "loss": 1.4757, + "step": 1160 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.5939070582389832, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1170 + }, + { + "epoch": 0.7712418300653595, + "grad_norm": 0.5735430717468262, + "learning_rate": 0.0002, + "loss": 1.4145, + "step": 1180 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.5155234932899475, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 1190 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.5115423202514648, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 1200 + }, + { + "epoch": 0.7908496732026143, + "grad_norm": 0.693588137626648, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1210 + }, + { + "epoch": 0.7973856209150327, + "grad_norm": 0.5504693984985352, + "learning_rate": 0.0002, + "loss": 1.308, + "step": 1220 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 0.5555992126464844, + "learning_rate": 0.0002, + "loss": 1.5412, + "step": 1230 + }, + { + "epoch": 0.8104575163398693, + "grad_norm": 0.7211785316467285, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1240 + }, + { + "epoch": 0.8169934640522876, + "grad_norm": 0.735003650188446, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 1250 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.5245152711868286, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 1260 + }, + { + "epoch": 0.8300653594771242, + "grad_norm": 0.5883445739746094, + "learning_rate": 0.0002, + "loss": 1.4505, + "step": 1270 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 0.6835859417915344, + "learning_rate": 0.0002, + "loss": 1.3642, + "step": 1280 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 0.6592142581939697, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 1290 + }, + { + "epoch": 0.8496732026143791, + "grad_norm": 0.6087474226951599, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 1300 + }, + { + "epoch": 0.8562091503267973, + "grad_norm": 0.565387487411499, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 1310 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 0.7363151907920837, + "learning_rate": 0.0002, + "loss": 1.4809, + "step": 1320 + }, + { + "epoch": 0.869281045751634, + "grad_norm": 0.5964524149894714, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 1330 + }, + { + "epoch": 0.8758169934640523, + "grad_norm": 0.5169979929924011, + "learning_rate": 0.0002, + "loss": 1.3284, + "step": 1340 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.7063422799110413, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 1350 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.7261926531791687, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 1360 + }, + { + "epoch": 0.8954248366013072, + "grad_norm": 0.6759744882583618, + "learning_rate": 0.0002, + "loss": 1.3619, + "step": 1370 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 0.675051212310791, + "learning_rate": 0.0002, + "loss": 1.4079, + "step": 1380 + }, + { + "epoch": 0.9084967320261438, + "grad_norm": 0.5613595843315125, + "learning_rate": 0.0002, + "loss": 1.6606, + "step": 1390 + }, + { + "epoch": 0.9150326797385621, + "grad_norm": 0.611732006072998, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 1400 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 0.6365187168121338, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 1410 + }, + { + "epoch": 0.9281045751633987, + "grad_norm": 0.7810426354408264, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1420 + }, + { + "epoch": 0.934640522875817, + "grad_norm": 0.593891441822052, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 1430 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.761585533618927, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1440 + }, + { + "epoch": 0.9477124183006536, + "grad_norm": 0.6114464998245239, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1450 + }, + { + "epoch": 0.954248366013072, + "grad_norm": 0.601044774055481, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1460 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 0.5484876036643982, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 1470 + }, + { + "epoch": 0.9673202614379085, + "grad_norm": 0.5383428335189819, + "learning_rate": 0.0002, + "loss": 1.4825, + "step": 1480 + }, + { + "epoch": 0.9738562091503268, + "grad_norm": 0.648106575012207, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 1490 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.6847249865531921, + "learning_rate": 0.0002, + "loss": 1.3638, + "step": 1500 + }, + { + "epoch": 0.9869281045751634, + "grad_norm": 0.6361058354377747, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 1510 + }, + { + "epoch": 0.9934640522875817, + "grad_norm": 0.646392285823822, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 1520 + }, + { + "epoch": 1.0, + "grad_norm": 0.5391159057617188, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1530 + }, + { + "epoch": 1.0, + "eval_loss": 1.4715123176574707, + "eval_runtime": 30.5701, + "eval_samples_per_second": 14.262, + "eval_steps_per_second": 1.799, + "step": 1530 + }, + { + "epoch": 1.0065359477124183, + "grad_norm": 0.5468988418579102, + "learning_rate": 0.0002, + "loss": 1.4827, + "step": 1540 + }, + { + "epoch": 1.0130718954248366, + "grad_norm": 0.629940927028656, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 1550 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 0.6411303281784058, + "learning_rate": 0.0002, + "loss": 1.4259, + "step": 1560 + }, + { + "epoch": 1.026143790849673, + "grad_norm": 0.5619024038314819, + "learning_rate": 0.0002, + "loss": 1.3924, + "step": 1570 + }, + { + "epoch": 1.0326797385620916, + "grad_norm": 0.6093462705612183, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1580 + }, + { + "epoch": 1.0392156862745099, + "grad_norm": 0.5543286204338074, + "learning_rate": 0.0002, + "loss": 1.4547, + "step": 1590 + }, + { + "epoch": 1.0457516339869282, + "grad_norm": 0.6079006195068359, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1600 + }, + { + "epoch": 1.0522875816993464, + "grad_norm": 0.6240813136100769, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 1610 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.6141977310180664, + "learning_rate": 0.0002, + "loss": 1.3504, + "step": 1620 + }, + { + "epoch": 1.065359477124183, + "grad_norm": 0.5920178294181824, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 1630 + }, + { + "epoch": 1.0718954248366013, + "grad_norm": 0.47620782256126404, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 1640 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 0.6826292872428894, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 1650 + }, + { + "epoch": 1.0849673202614378, + "grad_norm": 0.6182006597518921, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 1660 + }, + { + "epoch": 1.091503267973856, + "grad_norm": 0.57639479637146, + "learning_rate": 0.0002, + "loss": 1.2907, + "step": 1670 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 0.6696860194206238, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 1680 + }, + { + "epoch": 1.1045751633986929, + "grad_norm": 0.699221670627594, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 1690 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.7138059139251709, + "learning_rate": 0.0002, + "loss": 1.3667, + "step": 1700 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 0.6930422186851501, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 1710 + }, + { + "epoch": 1.1241830065359477, + "grad_norm": 0.7484048008918762, + "learning_rate": 0.0002, + "loss": 1.5033, + "step": 1720 + }, + { + "epoch": 1.130718954248366, + "grad_norm": 0.5820090174674988, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 1730 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 0.7143406867980957, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1740 + }, + { + "epoch": 1.1437908496732025, + "grad_norm": 0.5597584247589111, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 1750 + }, + { + "epoch": 1.1503267973856208, + "grad_norm": 0.5171173214912415, + "learning_rate": 0.0002, + "loss": 1.5403, + "step": 1760 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 0.5951920747756958, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1770 + }, + { + "epoch": 1.1633986928104576, + "grad_norm": 0.7506247758865356, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 1780 + }, + { + "epoch": 1.1699346405228759, + "grad_norm": 0.5936487913131714, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 1790 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.688450038433075, + "learning_rate": 0.0002, + "loss": 1.3567, + "step": 1800 + }, + { + "epoch": 1.1830065359477124, + "grad_norm": 0.671623170375824, + "learning_rate": 0.0002, + "loss": 1.314, + "step": 1810 + }, + { + "epoch": 1.1895424836601307, + "grad_norm": 0.6911860704421997, + "learning_rate": 0.0002, + "loss": 1.3803, + "step": 1820 + }, + { + "epoch": 1.196078431372549, + "grad_norm": 0.60726398229599, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 1830 + }, + { + "epoch": 1.2026143790849673, + "grad_norm": 0.7542088627815247, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 1840 + }, + { + "epoch": 1.2091503267973855, + "grad_norm": 0.6810969710350037, + "learning_rate": 0.0002, + "loss": 1.4343, + "step": 1850 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 0.579741895198822, + "learning_rate": 0.0002, + "loss": 1.446, + "step": 1860 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.9925695657730103, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 1870 + }, + { + "epoch": 1.2287581699346406, + "grad_norm": 0.5919767618179321, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 1880 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.7377090454101562, + "learning_rate": 0.0002, + "loss": 1.5015, + "step": 1890 + }, + { + "epoch": 1.2418300653594772, + "grad_norm": 0.5753688812255859, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 1900 + }, + { + "epoch": 1.2483660130718954, + "grad_norm": 0.6362486481666565, + "learning_rate": 0.0002, + "loss": 1.3543, + "step": 1910 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 0.5747467875480652, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1920 + }, + { + "epoch": 1.261437908496732, + "grad_norm": 0.6831939220428467, + "learning_rate": 0.0002, + "loss": 1.5082, + "step": 1930 + }, + { + "epoch": 1.2679738562091503, + "grad_norm": 0.6414040327072144, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 1940 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 0.5613330006599426, + "learning_rate": 0.0002, + "loss": 1.5099, + "step": 1950 + }, + { + "epoch": 1.2810457516339868, + "grad_norm": 0.5838454961776733, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 1960 + }, + { + "epoch": 1.287581699346405, + "grad_norm": 0.5367192029953003, + "learning_rate": 0.0002, + "loss": 1.3548, + "step": 1970 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 0.5829346776008606, + "learning_rate": 0.0002, + "loss": 1.4602, + "step": 1980 + }, + { + "epoch": 1.3006535947712419, + "grad_norm": 0.756534218788147, + "learning_rate": 0.0002, + "loss": 1.3821, + "step": 1990 + }, + { + "epoch": 1.3071895424836601, + "grad_norm": 0.48002561926841736, + "learning_rate": 0.0002, + "loss": 1.389, + "step": 2000 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 0.5461082458496094, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 2010 + }, + { + "epoch": 1.3202614379084967, + "grad_norm": 0.570399284362793, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 2020 + }, + { + "epoch": 1.326797385620915, + "grad_norm": 0.5130975842475891, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2030 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.6290071606636047, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 2040 + }, + { + "epoch": 1.3398692810457518, + "grad_norm": 0.6165726184844971, + "learning_rate": 0.0002, + "loss": 1.3873, + "step": 2050 + }, + { + "epoch": 1.34640522875817, + "grad_norm": 0.5302083492279053, + "learning_rate": 0.0002, + "loss": 1.4376, + "step": 2060 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 0.6531406044960022, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 2070 + }, + { + "epoch": 1.3594771241830066, + "grad_norm": 0.5981236100196838, + "learning_rate": 0.0002, + "loss": 1.3632, + "step": 2080 + }, + { + "epoch": 1.3660130718954249, + "grad_norm": 0.8534150123596191, + "learning_rate": 0.0002, + "loss": 1.4846, + "step": 2090 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 0.695918083190918, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 2100 + }, + { + "epoch": 1.3790849673202614, + "grad_norm": 0.5830431580543518, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2110 + }, + { + "epoch": 1.3856209150326797, + "grad_norm": 0.5641306638717651, + "learning_rate": 0.0002, + "loss": 1.5009, + "step": 2120 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 0.6354436874389648, + "learning_rate": 0.0002, + "loss": 1.3985, + "step": 2130 + }, + { + "epoch": 1.3986928104575163, + "grad_norm": 0.5707540512084961, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 2140 + }, + { + "epoch": 1.4052287581699345, + "grad_norm": 0.7308434844017029, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 2150 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.5879750847816467, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2160 + }, + { + "epoch": 1.4183006535947713, + "grad_norm": 0.627909243106842, + "learning_rate": 0.0002, + "loss": 1.3729, + "step": 2170 + }, + { + "epoch": 1.4248366013071896, + "grad_norm": 0.5228193998336792, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 2180 + }, + { + "epoch": 1.4313725490196079, + "grad_norm": 0.6162880659103394, + "learning_rate": 0.0002, + "loss": 1.457, + "step": 2190 + }, + { + "epoch": 1.4379084967320261, + "grad_norm": 0.751610517501831, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 2200 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.5623487234115601, + "learning_rate": 0.0002, + "loss": 1.4105, + "step": 2210 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 0.5293187499046326, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 2220 + }, + { + "epoch": 1.457516339869281, + "grad_norm": 0.5903629660606384, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 2230 + }, + { + "epoch": 1.4640522875816995, + "grad_norm": 0.6084659099578857, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 2240 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 0.5289803147315979, + "learning_rate": 0.0002, + "loss": 1.319, + "step": 2250 + }, + { + "epoch": 1.477124183006536, + "grad_norm": 0.49499568343162537, + "learning_rate": 0.0002, + "loss": 1.3106, + "step": 2260 + }, + { + "epoch": 1.4836601307189543, + "grad_norm": 0.7774190306663513, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 2270 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 0.5932538509368896, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 2280 + }, + { + "epoch": 1.4967320261437909, + "grad_norm": 0.6009492874145508, + "learning_rate": 0.0002, + "loss": 1.3241, + "step": 2290 + }, + { + "epoch": 1.5032679738562091, + "grad_norm": 0.5559343099594116, + "learning_rate": 0.0002, + "loss": 1.3728, + "step": 2300 + }, + { + "epoch": 1.5098039215686274, + "grad_norm": 0.5956196188926697, + "learning_rate": 0.0002, + "loss": 1.2379, + "step": 2310 + }, + { + "epoch": 1.5163398692810457, + "grad_norm": 0.5624083876609802, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 2320 + }, + { + "epoch": 1.522875816993464, + "grad_norm": 0.7195250391960144, + "learning_rate": 0.0002, + "loss": 1.4779, + "step": 2330 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 0.6010490655899048, + "learning_rate": 0.0002, + "loss": 1.2938, + "step": 2340 + }, + { + "epoch": 1.5359477124183005, + "grad_norm": 0.664929211139679, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 2350 + }, + { + "epoch": 1.5424836601307188, + "grad_norm": 0.5158776640892029, + "learning_rate": 0.0002, + "loss": 1.4362, + "step": 2360 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 0.5147154927253723, + "learning_rate": 0.0002, + "loss": 1.2157, + "step": 2370 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.6507977843284607, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 2380 + }, + { + "epoch": 1.5620915032679739, + "grad_norm": 0.5193192362785339, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 2390 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.5982314944267273, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 2400 + }, + { + "epoch": 1.5751633986928104, + "grad_norm": 0.49106258153915405, + "learning_rate": 0.0002, + "loss": 1.3585, + "step": 2410 + }, + { + "epoch": 1.581699346405229, + "grad_norm": 0.6459611654281616, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 2420 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 0.7038363218307495, + "learning_rate": 0.0002, + "loss": 1.3305, + "step": 2430 + }, + { + "epoch": 1.5947712418300655, + "grad_norm": 0.5245680212974548, + "learning_rate": 0.0002, + "loss": 1.3198, + "step": 2440 + }, + { + "epoch": 1.6013071895424837, + "grad_norm": 0.6562076210975647, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 2450 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 0.6491968035697937, + "learning_rate": 0.0002, + "loss": 1.5635, + "step": 2460 + }, + { + "epoch": 1.6143790849673203, + "grad_norm": 0.604034960269928, + "learning_rate": 0.0002, + "loss": 1.3657, + "step": 2470 + }, + { + "epoch": 1.6209150326797386, + "grad_norm": 0.5759671330451965, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 2480 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 0.6157698631286621, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2490 + }, + { + "epoch": 1.6339869281045751, + "grad_norm": 0.6513794660568237, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 2500 + }, + { + "epoch": 1.6405228758169934, + "grad_norm": 0.71990966796875, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 2510 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 0.7316617369651794, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2520 + }, + { + "epoch": 1.65359477124183, + "grad_norm": 0.5475177764892578, + "learning_rate": 0.0002, + "loss": 1.3119, + "step": 2530 + }, + { + "epoch": 1.6601307189542482, + "grad_norm": 0.4911293089389801, + "learning_rate": 0.0002, + "loss": 1.2998, + "step": 2540 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.6122882962226868, + "learning_rate": 0.0002, + "loss": 1.4198, + "step": 2550 + }, + { + "epoch": 1.673202614379085, + "grad_norm": 0.5735281705856323, + "learning_rate": 0.0002, + "loss": 1.3099, + "step": 2560 + }, + { + "epoch": 1.6797385620915033, + "grad_norm": 0.5046352744102478, + "learning_rate": 0.0002, + "loss": 1.2205, + "step": 2570 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 0.6043242812156677, + "learning_rate": 0.0002, + "loss": 1.3191, + "step": 2580 + }, + { + "epoch": 1.6928104575163399, + "grad_norm": 0.5397698283195496, + "learning_rate": 0.0002, + "loss": 1.3079, + "step": 2590 + }, + { + "epoch": 1.6993464052287581, + "grad_norm": 0.8066475987434387, + "learning_rate": 0.0002, + "loss": 1.4916, + "step": 2600 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 0.52901691198349, + "learning_rate": 0.0002, + "loss": 1.3703, + "step": 2610 + }, + { + "epoch": 1.712418300653595, + "grad_norm": 0.7588503956794739, + "learning_rate": 0.0002, + "loss": 1.409, + "step": 2620 + }, + { + "epoch": 1.7189542483660132, + "grad_norm": 0.6012966632843018, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 2630 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 0.5927302837371826, + "learning_rate": 0.0002, + "loss": 1.2583, + "step": 2640 + }, + { + "epoch": 1.7320261437908497, + "grad_norm": 0.5086990594863892, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 2650 + }, + { + "epoch": 1.738562091503268, + "grad_norm": 0.6000628471374512, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2660 + }, + { + "epoch": 1.7450980392156863, + "grad_norm": 0.6560431718826294, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 2670 + }, + { + "epoch": 1.7516339869281046, + "grad_norm": 0.5738165378570557, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 2680 + }, + { + "epoch": 1.7581699346405228, + "grad_norm": 0.5576106905937195, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 2690 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.7298802137374878, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 2700 + }, + { + "epoch": 1.7712418300653594, + "grad_norm": 0.5751826167106628, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 2710 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.6069957613945007, + "learning_rate": 0.0002, + "loss": 1.35, + "step": 2720 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 0.7513017654418945, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 2730 + }, + { + "epoch": 1.7908496732026142, + "grad_norm": 0.6058869957923889, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 2740 + }, + { + "epoch": 1.7973856209150327, + "grad_norm": 0.6805883049964905, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 2750 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 0.6864324808120728, + "learning_rate": 0.0002, + "loss": 1.4062, + "step": 2760 + }, + { + "epoch": 1.8104575163398693, + "grad_norm": 0.6261002421379089, + "learning_rate": 0.0002, + "loss": 1.355, + "step": 2770 + }, + { + "epoch": 1.8169934640522876, + "grad_norm": 0.532684862613678, + "learning_rate": 0.0002, + "loss": 1.5145, + "step": 2780 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 0.6209020018577576, + "learning_rate": 0.0002, + "loss": 1.3248, + "step": 2790 + }, + { + "epoch": 1.8300653594771243, + "grad_norm": 0.67111736536026, + "learning_rate": 0.0002, + "loss": 1.3908, + "step": 2800 + }, + { + "epoch": 1.8366013071895426, + "grad_norm": 0.700467586517334, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 2810 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 0.6968029141426086, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 2820 + }, + { + "epoch": 1.8496732026143792, + "grad_norm": 0.6405863761901855, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 2830 + }, + { + "epoch": 1.8562091503267975, + "grad_norm": 0.5192584991455078, + "learning_rate": 0.0002, + "loss": 1.4035, + "step": 2840 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 0.4888569414615631, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 2850 + }, + { + "epoch": 1.869281045751634, + "grad_norm": 0.7625455856323242, + "learning_rate": 0.0002, + "loss": 1.4324, + "step": 2860 + }, + { + "epoch": 1.8758169934640523, + "grad_norm": 0.9162808656692505, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2870 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.5472783446311951, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2880 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.5221137404441833, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 2890 + }, + { + "epoch": 1.8954248366013071, + "grad_norm": 0.49258849024772644, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 2900 + }, + { + "epoch": 1.9019607843137254, + "grad_norm": 0.5260750651359558, + "learning_rate": 0.0002, + "loss": 1.3503, + "step": 2910 + }, + { + "epoch": 1.9084967320261437, + "grad_norm": 0.6583314538002014, + "learning_rate": 0.0002, + "loss": 1.3381, + "step": 2920 + }, + { + "epoch": 1.915032679738562, + "grad_norm": 0.5728915929794312, + "learning_rate": 0.0002, + "loss": 1.356, + "step": 2930 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 0.7661453485488892, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2940 + }, + { + "epoch": 1.9281045751633987, + "grad_norm": 0.7193911075592041, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2950 + }, + { + "epoch": 1.934640522875817, + "grad_norm": 0.5007768869400024, + "learning_rate": 0.0002, + "loss": 1.287, + "step": 2960 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.626681923866272, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2970 + }, + { + "epoch": 1.9477124183006536, + "grad_norm": 0.8692840933799744, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 2980 + }, + { + "epoch": 1.954248366013072, + "grad_norm": 0.6388291120529175, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 2990 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.7710477113723755, + "learning_rate": 0.0002, + "loss": 1.4593, + "step": 3000 + }, + { + "epoch": 1.9673202614379086, + "grad_norm": 0.641704261302948, + "learning_rate": 0.0002, + "loss": 1.5228, + "step": 3010 + }, + { + "epoch": 1.973856209150327, + "grad_norm": 0.621148943901062, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 3020 + }, + { + "epoch": 1.9803921568627452, + "grad_norm": 0.5119547247886658, + "learning_rate": 0.0002, + "loss": 1.3017, + "step": 3030 + }, + { + "epoch": 1.9869281045751634, + "grad_norm": 0.8104137778282166, + "learning_rate": 0.0002, + "loss": 1.4923, + "step": 3040 + }, + { + "epoch": 1.9934640522875817, + "grad_norm": 0.5856240391731262, + "learning_rate": 0.0002, + "loss": 1.3331, + "step": 3050 + }, + { + "epoch": 2.0, + "grad_norm": 0.5263566374778748, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3060 + }, + { + "epoch": 2.0, + "eval_loss": 1.4276371002197266, + "eval_runtime": 30.5759, + "eval_samples_per_second": 14.26, + "eval_steps_per_second": 1.799, + "step": 3060 + } + ], + "logging_steps": 10, + "max_steps": 12240, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.739930878541824e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..35e722282419bcef977427e4d3675fe3b94ec688 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc25f531ee37172f22a819ab79094fe89aae41504e4c8b696743b5e23d9e7641 +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4ac84c222b4db7e08b3a542e902c817c21e78452 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35003eda35920d58097bfa5b4a6bb46aece58f29cfbfc46af1f150b8e4bb942b +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..393ef27287048b1a7957d989bd487390fade0a27 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f4b257e5f9941c554b614aacbe478f3e231b8140ed9e4e435b36137f35e0d13 +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ba248fd3fb977b0cf0facc03a2cb1f3487feea3 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cf91e502b853d3b5a87bb99bdf09477e284e18af1eb51f31bb51f5b038e5c17 +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..062f5f1ebaf4d2445357abb82b2270ca561fa77d --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:390683d95e3073976e31c984bd808b3e2c9a840f1f23bfdd2e7ab3e19fa5c49f +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8e56e4dad19cb386005d50380f15e2106717beef --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/trainer_state.json @@ -0,0 +1,3270 @@ +{ + "best_metric": 1.4131312370300293, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590", + "epoch": 3.0, + "eval_steps": 10, + "global_step": 4590, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006535947712418301, + "grad_norm": 1.5105072259902954, + "learning_rate": 0.0002, + "loss": 4.7451, + "step": 10 + }, + { + "epoch": 0.013071895424836602, + "grad_norm": 2.1156165599823, + "learning_rate": 0.0002, + "loss": 3.3158, + "step": 20 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 1.0578808784484863, + "learning_rate": 0.0002, + "loss": 2.643, + "step": 30 + }, + { + "epoch": 0.026143790849673203, + "grad_norm": 2.725064516067505, + "learning_rate": 0.0002, + "loss": 2.3948, + "step": 40 + }, + { + "epoch": 0.032679738562091505, + "grad_norm": 2.9575750827789307, + "learning_rate": 0.0002, + "loss": 2.3134, + "step": 50 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 1.2158117294311523, + "learning_rate": 0.0002, + "loss": 2.2778, + "step": 60 + }, + { + "epoch": 0.0457516339869281, + "grad_norm": 1.0850954055786133, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05228758169934641, + "grad_norm": 1.299196720123291, + "learning_rate": 0.0002, + "loss": 1.8872, + "step": 80 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 0.8310191035270691, + "learning_rate": 0.0002, + "loss": 1.947, + "step": 90 + }, + { + "epoch": 0.06535947712418301, + "grad_norm": 0.9854435920715332, + "learning_rate": 0.0002, + "loss": 1.9098, + "step": 100 + }, + { + "epoch": 0.0718954248366013, + "grad_norm": 0.7951157689094543, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 110 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 0.7593062520027161, + "learning_rate": 0.0002, + "loss": 1.9035, + "step": 120 + }, + { + "epoch": 0.08496732026143791, + "grad_norm": 0.6783032417297363, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 130 + }, + { + "epoch": 0.0915032679738562, + "grad_norm": 0.8350756764411926, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 140 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 1.0203173160552979, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 150 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 0.8820539712905884, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 160 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.7286128997802734, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 170 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.7874041795730591, + "learning_rate": 0.0002, + "loss": 1.8841, + "step": 180 + }, + { + "epoch": 0.12418300653594772, + "grad_norm": 0.6630475521087646, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 190 + }, + { + "epoch": 0.13071895424836602, + "grad_norm": 0.686413586139679, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 200 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 0.7793629765510559, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 210 + }, + { + "epoch": 0.1437908496732026, + "grad_norm": 0.6893141865730286, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 220 + }, + { + "epoch": 0.1503267973856209, + "grad_norm": 0.5804724097251892, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 230 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 0.6053574085235596, + "learning_rate": 0.0002, + "loss": 1.5578, + "step": 240 + }, + { + "epoch": 0.16339869281045752, + "grad_norm": 0.7566025853157043, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 250 + }, + { + "epoch": 0.16993464052287582, + "grad_norm": 0.6112990975379944, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 260 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.6839066743850708, + "learning_rate": 0.0002, + "loss": 1.5564, + "step": 270 + }, + { + "epoch": 0.1830065359477124, + "grad_norm": 0.6368117928504944, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 280 + }, + { + "epoch": 0.1895424836601307, + "grad_norm": 0.6144475936889648, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 290 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 0.6743767261505127, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 300 + }, + { + "epoch": 0.20261437908496732, + "grad_norm": 0.6807955503463745, + "learning_rate": 0.0002, + "loss": 1.421, + "step": 310 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 0.6717963814735413, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 320 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 0.5917780995368958, + "learning_rate": 0.0002, + "loss": 1.6842, + "step": 330 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.6783658862113953, + "learning_rate": 0.0002, + "loss": 1.6264, + "step": 340 + }, + { + "epoch": 0.22875816993464052, + "grad_norm": 0.5820256471633911, + "learning_rate": 0.0002, + "loss": 1.4635, + "step": 350 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.5345938801765442, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 360 + }, + { + "epoch": 0.24183006535947713, + "grad_norm": 0.755929172039032, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 370 + }, + { + "epoch": 0.24836601307189543, + "grad_norm": 0.6183189749717712, + "learning_rate": 0.0002, + "loss": 1.5177, + "step": 380 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 0.7277782559394836, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 390 + }, + { + "epoch": 0.26143790849673204, + "grad_norm": 0.9998756051063538, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 400 + }, + { + "epoch": 0.2679738562091503, + "grad_norm": 0.7523853778839111, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 410 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 0.6548714637756348, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 420 + }, + { + "epoch": 0.28104575163398693, + "grad_norm": 0.6979796290397644, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 430 + }, + { + "epoch": 0.2875816993464052, + "grad_norm": 0.840915322303772, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 440 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.6142978072166443, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 450 + }, + { + "epoch": 0.3006535947712418, + "grad_norm": 0.9482691884040833, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 460 + }, + { + "epoch": 0.30718954248366015, + "grad_norm": 0.7001156806945801, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 470 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.6665455102920532, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 480 + }, + { + "epoch": 0.3202614379084967, + "grad_norm": 0.6012697815895081, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 490 + }, + { + "epoch": 0.32679738562091504, + "grad_norm": 0.8770062327384949, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 500 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.7029962539672852, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 510 + }, + { + "epoch": 0.33986928104575165, + "grad_norm": 0.6682832837104797, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 520 + }, + { + "epoch": 0.3464052287581699, + "grad_norm": 0.5548969507217407, + "learning_rate": 0.0002, + "loss": 1.4242, + "step": 530 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.6640702486038208, + "learning_rate": 0.0002, + "loss": 1.5081, + "step": 540 + }, + { + "epoch": 0.35947712418300654, + "grad_norm": 0.656292200088501, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 550 + }, + { + "epoch": 0.3660130718954248, + "grad_norm": 0.618910551071167, + "learning_rate": 0.0002, + "loss": 1.5415, + "step": 560 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 0.644859790802002, + "learning_rate": 0.0002, + "loss": 1.5178, + "step": 570 + }, + { + "epoch": 0.3790849673202614, + "grad_norm": 0.679042398929596, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 580 + }, + { + "epoch": 0.38562091503267976, + "grad_norm": 0.980681836605072, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 590 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.632219672203064, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 600 + }, + { + "epoch": 0.39869281045751637, + "grad_norm": 0.7003744840621948, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 610 + }, + { + "epoch": 0.40522875816993464, + "grad_norm": 0.7090577483177185, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 620 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 0.657819926738739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 630 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 0.7034208178520203, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 640 + }, + { + "epoch": 0.42483660130718953, + "grad_norm": 0.7274866104125977, + "learning_rate": 0.0002, + "loss": 1.5357, + "step": 650 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 0.5876233577728271, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 660 + }, + { + "epoch": 0.43790849673202614, + "grad_norm": 0.595494270324707, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 670 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.8253804445266724, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 680 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 0.652225911617279, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 690 + }, + { + "epoch": 0.45751633986928103, + "grad_norm": 0.6242014169692993, + "learning_rate": 0.0002, + "loss": 1.5419, + "step": 700 + }, + { + "epoch": 0.46405228758169936, + "grad_norm": 0.7283986210823059, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 710 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.7016081213951111, + "learning_rate": 0.0002, + "loss": 1.43, + "step": 720 + }, + { + "epoch": 0.477124183006536, + "grad_norm": 0.5211893916130066, + "learning_rate": 0.0002, + "loss": 1.4626, + "step": 730 + }, + { + "epoch": 0.48366013071895425, + "grad_norm": 0.6221150159835815, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 740 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 0.76594477891922, + "learning_rate": 0.0002, + "loss": 1.5677, + "step": 750 + }, + { + "epoch": 0.49673202614379086, + "grad_norm": 0.5777859091758728, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 760 + }, + { + "epoch": 0.5032679738562091, + "grad_norm": 0.5793519616127014, + "learning_rate": 0.0002, + "loss": 1.5253, + "step": 770 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 0.5425786375999451, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 780 + }, + { + "epoch": 0.5163398692810458, + "grad_norm": 0.6004197001457214, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 790 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 0.7167016863822937, + "learning_rate": 0.0002, + "loss": 1.5346, + "step": 800 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.710218071937561, + "learning_rate": 0.0002, + "loss": 1.48, + "step": 810 + }, + { + "epoch": 0.5359477124183006, + "grad_norm": 0.699528694152832, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 820 + }, + { + "epoch": 0.5424836601307189, + "grad_norm": 0.579629123210907, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 830 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 0.595407247543335, + "learning_rate": 0.0002, + "loss": 1.3894, + "step": 840 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.544563889503479, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 850 + }, + { + "epoch": 0.5620915032679739, + "grad_norm": 0.553166389465332, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 860 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 0.5645018815994263, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 870 + }, + { + "epoch": 0.5751633986928104, + "grad_norm": 0.6576932668685913, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 880 + }, + { + "epoch": 0.5816993464052288, + "grad_norm": 0.6684197187423706, + "learning_rate": 0.0002, + "loss": 1.5891, + "step": 890 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.6706975698471069, + "learning_rate": 0.0002, + "loss": 1.5348, + "step": 900 + }, + { + "epoch": 0.5947712418300654, + "grad_norm": 0.6762327551841736, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 910 + }, + { + "epoch": 0.6013071895424836, + "grad_norm": 0.764032244682312, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 920 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 0.6996400952339172, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 930 + }, + { + "epoch": 0.6143790849673203, + "grad_norm": 0.686735987663269, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 940 + }, + { + "epoch": 0.6209150326797386, + "grad_norm": 0.6086131930351257, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 950 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.5627856850624084, + "learning_rate": 0.0002, + "loss": 1.4457, + "step": 960 + }, + { + "epoch": 0.6339869281045751, + "grad_norm": 0.5781503319740295, + "learning_rate": 0.0002, + "loss": 1.506, + "step": 970 + }, + { + "epoch": 0.6405228758169934, + "grad_norm": 0.6347246766090393, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 980 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.6581300497055054, + "learning_rate": 0.0002, + "loss": 1.3819, + "step": 990 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 0.8343676924705505, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1000 + }, + { + "epoch": 0.6601307189542484, + "grad_norm": 0.5708910226821899, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 1010 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.6832585334777832, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 1020 + }, + { + "epoch": 0.673202614379085, + "grad_norm": 0.5767837166786194, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1030 + }, + { + "epoch": 0.6797385620915033, + "grad_norm": 0.5637745261192322, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 1040 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 0.8193050026893616, + "learning_rate": 0.0002, + "loss": 1.4325, + "step": 1050 + }, + { + "epoch": 0.6928104575163399, + "grad_norm": 0.6157439351081848, + "learning_rate": 0.0002, + "loss": 1.4196, + "step": 1060 + }, + { + "epoch": 0.6993464052287581, + "grad_norm": 0.7476664781570435, + "learning_rate": 0.0002, + "loss": 1.5547, + "step": 1070 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.8569361567497253, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 1080 + }, + { + "epoch": 0.7124183006535948, + "grad_norm": 0.5671911835670471, + "learning_rate": 0.0002, + "loss": 1.482, + "step": 1090 + }, + { + "epoch": 0.7189542483660131, + "grad_norm": 0.5151128768920898, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 1100 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 0.568037211894989, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 1110 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 0.6756396889686584, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 1120 + }, + { + "epoch": 0.738562091503268, + "grad_norm": 0.638975977897644, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 1130 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 0.7103341221809387, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 1140 + }, + { + "epoch": 0.7516339869281046, + "grad_norm": 0.7403952479362488, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 1150 + }, + { + "epoch": 0.7581699346405228, + "grad_norm": 0.6266511082649231, + "learning_rate": 0.0002, + "loss": 1.4757, + "step": 1160 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.5939070582389832, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1170 + }, + { + "epoch": 0.7712418300653595, + "grad_norm": 0.5735430717468262, + "learning_rate": 0.0002, + "loss": 1.4145, + "step": 1180 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.5155234932899475, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 1190 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.5115423202514648, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 1200 + }, + { + "epoch": 0.7908496732026143, + "grad_norm": 0.693588137626648, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1210 + }, + { + "epoch": 0.7973856209150327, + "grad_norm": 0.5504693984985352, + "learning_rate": 0.0002, + "loss": 1.308, + "step": 1220 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 0.5555992126464844, + "learning_rate": 0.0002, + "loss": 1.5412, + "step": 1230 + }, + { + "epoch": 0.8104575163398693, + "grad_norm": 0.7211785316467285, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1240 + }, + { + "epoch": 0.8169934640522876, + "grad_norm": 0.735003650188446, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 1250 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.5245152711868286, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 1260 + }, + { + "epoch": 0.8300653594771242, + "grad_norm": 0.5883445739746094, + "learning_rate": 0.0002, + "loss": 1.4505, + "step": 1270 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 0.6835859417915344, + "learning_rate": 0.0002, + "loss": 1.3642, + "step": 1280 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 0.6592142581939697, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 1290 + }, + { + "epoch": 0.8496732026143791, + "grad_norm": 0.6087474226951599, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 1300 + }, + { + "epoch": 0.8562091503267973, + "grad_norm": 0.565387487411499, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 1310 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 0.7363151907920837, + "learning_rate": 0.0002, + "loss": 1.4809, + "step": 1320 + }, + { + "epoch": 0.869281045751634, + "grad_norm": 0.5964524149894714, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 1330 + }, + { + "epoch": 0.8758169934640523, + "grad_norm": 0.5169979929924011, + "learning_rate": 0.0002, + "loss": 1.3284, + "step": 1340 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.7063422799110413, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 1350 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.7261926531791687, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 1360 + }, + { + "epoch": 0.8954248366013072, + "grad_norm": 0.6759744882583618, + "learning_rate": 0.0002, + "loss": 1.3619, + "step": 1370 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 0.675051212310791, + "learning_rate": 0.0002, + "loss": 1.4079, + "step": 1380 + }, + { + "epoch": 0.9084967320261438, + "grad_norm": 0.5613595843315125, + "learning_rate": 0.0002, + "loss": 1.6606, + "step": 1390 + }, + { + "epoch": 0.9150326797385621, + "grad_norm": 0.611732006072998, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 1400 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 0.6365187168121338, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 1410 + }, + { + "epoch": 0.9281045751633987, + "grad_norm": 0.7810426354408264, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1420 + }, + { + "epoch": 0.934640522875817, + "grad_norm": 0.593891441822052, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 1430 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.761585533618927, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1440 + }, + { + "epoch": 0.9477124183006536, + "grad_norm": 0.6114464998245239, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1450 + }, + { + "epoch": 0.954248366013072, + "grad_norm": 0.601044774055481, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1460 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 0.5484876036643982, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 1470 + }, + { + "epoch": 0.9673202614379085, + "grad_norm": 0.5383428335189819, + "learning_rate": 0.0002, + "loss": 1.4825, + "step": 1480 + }, + { + "epoch": 0.9738562091503268, + "grad_norm": 0.648106575012207, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 1490 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.6847249865531921, + "learning_rate": 0.0002, + "loss": 1.3638, + "step": 1500 + }, + { + "epoch": 0.9869281045751634, + "grad_norm": 0.6361058354377747, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 1510 + }, + { + "epoch": 0.9934640522875817, + "grad_norm": 0.646392285823822, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 1520 + }, + { + "epoch": 1.0, + "grad_norm": 0.5391159057617188, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1530 + }, + { + "epoch": 1.0, + "eval_loss": 1.4715123176574707, + "eval_runtime": 30.5701, + "eval_samples_per_second": 14.262, + "eval_steps_per_second": 1.799, + "step": 1530 + }, + { + "epoch": 1.0065359477124183, + "grad_norm": 0.5468988418579102, + "learning_rate": 0.0002, + "loss": 1.4827, + "step": 1540 + }, + { + "epoch": 1.0130718954248366, + "grad_norm": 0.629940927028656, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 1550 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 0.6411303281784058, + "learning_rate": 0.0002, + "loss": 1.4259, + "step": 1560 + }, + { + "epoch": 1.026143790849673, + "grad_norm": 0.5619024038314819, + "learning_rate": 0.0002, + "loss": 1.3924, + "step": 1570 + }, + { + "epoch": 1.0326797385620916, + "grad_norm": 0.6093462705612183, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1580 + }, + { + "epoch": 1.0392156862745099, + "grad_norm": 0.5543286204338074, + "learning_rate": 0.0002, + "loss": 1.4547, + "step": 1590 + }, + { + "epoch": 1.0457516339869282, + "grad_norm": 0.6079006195068359, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1600 + }, + { + "epoch": 1.0522875816993464, + "grad_norm": 0.6240813136100769, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 1610 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.6141977310180664, + "learning_rate": 0.0002, + "loss": 1.3504, + "step": 1620 + }, + { + "epoch": 1.065359477124183, + "grad_norm": 0.5920178294181824, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 1630 + }, + { + "epoch": 1.0718954248366013, + "grad_norm": 0.47620782256126404, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 1640 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 0.6826292872428894, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 1650 + }, + { + "epoch": 1.0849673202614378, + "grad_norm": 0.6182006597518921, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 1660 + }, + { + "epoch": 1.091503267973856, + "grad_norm": 0.57639479637146, + "learning_rate": 0.0002, + "loss": 1.2907, + "step": 1670 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 0.6696860194206238, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 1680 + }, + { + "epoch": 1.1045751633986929, + "grad_norm": 0.699221670627594, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 1690 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.7138059139251709, + "learning_rate": 0.0002, + "loss": 1.3667, + "step": 1700 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 0.6930422186851501, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 1710 + }, + { + "epoch": 1.1241830065359477, + "grad_norm": 0.7484048008918762, + "learning_rate": 0.0002, + "loss": 1.5033, + "step": 1720 + }, + { + "epoch": 1.130718954248366, + "grad_norm": 0.5820090174674988, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 1730 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 0.7143406867980957, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1740 + }, + { + "epoch": 1.1437908496732025, + "grad_norm": 0.5597584247589111, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 1750 + }, + { + "epoch": 1.1503267973856208, + "grad_norm": 0.5171173214912415, + "learning_rate": 0.0002, + "loss": 1.5403, + "step": 1760 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 0.5951920747756958, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1770 + }, + { + "epoch": 1.1633986928104576, + "grad_norm": 0.7506247758865356, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 1780 + }, + { + "epoch": 1.1699346405228759, + "grad_norm": 0.5936487913131714, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 1790 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.688450038433075, + "learning_rate": 0.0002, + "loss": 1.3567, + "step": 1800 + }, + { + "epoch": 1.1830065359477124, + "grad_norm": 0.671623170375824, + "learning_rate": 0.0002, + "loss": 1.314, + "step": 1810 + }, + { + "epoch": 1.1895424836601307, + "grad_norm": 0.6911860704421997, + "learning_rate": 0.0002, + "loss": 1.3803, + "step": 1820 + }, + { + "epoch": 1.196078431372549, + "grad_norm": 0.60726398229599, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 1830 + }, + { + "epoch": 1.2026143790849673, + "grad_norm": 0.7542088627815247, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 1840 + }, + { + "epoch": 1.2091503267973855, + "grad_norm": 0.6810969710350037, + "learning_rate": 0.0002, + "loss": 1.4343, + "step": 1850 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 0.579741895198822, + "learning_rate": 0.0002, + "loss": 1.446, + "step": 1860 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.9925695657730103, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 1870 + }, + { + "epoch": 1.2287581699346406, + "grad_norm": 0.5919767618179321, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 1880 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.7377090454101562, + "learning_rate": 0.0002, + "loss": 1.5015, + "step": 1890 + }, + { + "epoch": 1.2418300653594772, + "grad_norm": 0.5753688812255859, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 1900 + }, + { + "epoch": 1.2483660130718954, + "grad_norm": 0.6362486481666565, + "learning_rate": 0.0002, + "loss": 1.3543, + "step": 1910 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 0.5747467875480652, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1920 + }, + { + "epoch": 1.261437908496732, + "grad_norm": 0.6831939220428467, + "learning_rate": 0.0002, + "loss": 1.5082, + "step": 1930 + }, + { + "epoch": 1.2679738562091503, + "grad_norm": 0.6414040327072144, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 1940 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 0.5613330006599426, + "learning_rate": 0.0002, + "loss": 1.5099, + "step": 1950 + }, + { + "epoch": 1.2810457516339868, + "grad_norm": 0.5838454961776733, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 1960 + }, + { + "epoch": 1.287581699346405, + "grad_norm": 0.5367192029953003, + "learning_rate": 0.0002, + "loss": 1.3548, + "step": 1970 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 0.5829346776008606, + "learning_rate": 0.0002, + "loss": 1.4602, + "step": 1980 + }, + { + "epoch": 1.3006535947712419, + "grad_norm": 0.756534218788147, + "learning_rate": 0.0002, + "loss": 1.3821, + "step": 1990 + }, + { + "epoch": 1.3071895424836601, + "grad_norm": 0.48002561926841736, + "learning_rate": 0.0002, + "loss": 1.389, + "step": 2000 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 0.5461082458496094, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 2010 + }, + { + "epoch": 1.3202614379084967, + "grad_norm": 0.570399284362793, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 2020 + }, + { + "epoch": 1.326797385620915, + "grad_norm": 0.5130975842475891, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2030 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.6290071606636047, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 2040 + }, + { + "epoch": 1.3398692810457518, + "grad_norm": 0.6165726184844971, + "learning_rate": 0.0002, + "loss": 1.3873, + "step": 2050 + }, + { + "epoch": 1.34640522875817, + "grad_norm": 0.5302083492279053, + "learning_rate": 0.0002, + "loss": 1.4376, + "step": 2060 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 0.6531406044960022, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 2070 + }, + { + "epoch": 1.3594771241830066, + "grad_norm": 0.5981236100196838, + "learning_rate": 0.0002, + "loss": 1.3632, + "step": 2080 + }, + { + "epoch": 1.3660130718954249, + "grad_norm": 0.8534150123596191, + "learning_rate": 0.0002, + "loss": 1.4846, + "step": 2090 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 0.695918083190918, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 2100 + }, + { + "epoch": 1.3790849673202614, + "grad_norm": 0.5830431580543518, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2110 + }, + { + "epoch": 1.3856209150326797, + "grad_norm": 0.5641306638717651, + "learning_rate": 0.0002, + "loss": 1.5009, + "step": 2120 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 0.6354436874389648, + "learning_rate": 0.0002, + "loss": 1.3985, + "step": 2130 + }, + { + "epoch": 1.3986928104575163, + "grad_norm": 0.5707540512084961, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 2140 + }, + { + "epoch": 1.4052287581699345, + "grad_norm": 0.7308434844017029, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 2150 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.5879750847816467, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2160 + }, + { + "epoch": 1.4183006535947713, + "grad_norm": 0.627909243106842, + "learning_rate": 0.0002, + "loss": 1.3729, + "step": 2170 + }, + { + "epoch": 1.4248366013071896, + "grad_norm": 0.5228193998336792, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 2180 + }, + { + "epoch": 1.4313725490196079, + "grad_norm": 0.6162880659103394, + "learning_rate": 0.0002, + "loss": 1.457, + "step": 2190 + }, + { + "epoch": 1.4379084967320261, + "grad_norm": 0.751610517501831, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 2200 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.5623487234115601, + "learning_rate": 0.0002, + "loss": 1.4105, + "step": 2210 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 0.5293187499046326, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 2220 + }, + { + "epoch": 1.457516339869281, + "grad_norm": 0.5903629660606384, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 2230 + }, + { + "epoch": 1.4640522875816995, + "grad_norm": 0.6084659099578857, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 2240 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 0.5289803147315979, + "learning_rate": 0.0002, + "loss": 1.319, + "step": 2250 + }, + { + "epoch": 1.477124183006536, + "grad_norm": 0.49499568343162537, + "learning_rate": 0.0002, + "loss": 1.3106, + "step": 2260 + }, + { + "epoch": 1.4836601307189543, + "grad_norm": 0.7774190306663513, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 2270 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 0.5932538509368896, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 2280 + }, + { + "epoch": 1.4967320261437909, + "grad_norm": 0.6009492874145508, + "learning_rate": 0.0002, + "loss": 1.3241, + "step": 2290 + }, + { + "epoch": 1.5032679738562091, + "grad_norm": 0.5559343099594116, + "learning_rate": 0.0002, + "loss": 1.3728, + "step": 2300 + }, + { + "epoch": 1.5098039215686274, + "grad_norm": 0.5956196188926697, + "learning_rate": 0.0002, + "loss": 1.2379, + "step": 2310 + }, + { + "epoch": 1.5163398692810457, + "grad_norm": 0.5624083876609802, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 2320 + }, + { + "epoch": 1.522875816993464, + "grad_norm": 0.7195250391960144, + "learning_rate": 0.0002, + "loss": 1.4779, + "step": 2330 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 0.6010490655899048, + "learning_rate": 0.0002, + "loss": 1.2938, + "step": 2340 + }, + { + "epoch": 1.5359477124183005, + "grad_norm": 0.664929211139679, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 2350 + }, + { + "epoch": 1.5424836601307188, + "grad_norm": 0.5158776640892029, + "learning_rate": 0.0002, + "loss": 1.4362, + "step": 2360 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 0.5147154927253723, + "learning_rate": 0.0002, + "loss": 1.2157, + "step": 2370 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.6507977843284607, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 2380 + }, + { + "epoch": 1.5620915032679739, + "grad_norm": 0.5193192362785339, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 2390 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.5982314944267273, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 2400 + }, + { + "epoch": 1.5751633986928104, + "grad_norm": 0.49106258153915405, + "learning_rate": 0.0002, + "loss": 1.3585, + "step": 2410 + }, + { + "epoch": 1.581699346405229, + "grad_norm": 0.6459611654281616, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 2420 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 0.7038363218307495, + "learning_rate": 0.0002, + "loss": 1.3305, + "step": 2430 + }, + { + "epoch": 1.5947712418300655, + "grad_norm": 0.5245680212974548, + "learning_rate": 0.0002, + "loss": 1.3198, + "step": 2440 + }, + { + "epoch": 1.6013071895424837, + "grad_norm": 0.6562076210975647, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 2450 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 0.6491968035697937, + "learning_rate": 0.0002, + "loss": 1.5635, + "step": 2460 + }, + { + "epoch": 1.6143790849673203, + "grad_norm": 0.604034960269928, + "learning_rate": 0.0002, + "loss": 1.3657, + "step": 2470 + }, + { + "epoch": 1.6209150326797386, + "grad_norm": 0.5759671330451965, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 2480 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 0.6157698631286621, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2490 + }, + { + "epoch": 1.6339869281045751, + "grad_norm": 0.6513794660568237, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 2500 + }, + { + "epoch": 1.6405228758169934, + "grad_norm": 0.71990966796875, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 2510 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 0.7316617369651794, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2520 + }, + { + "epoch": 1.65359477124183, + "grad_norm": 0.5475177764892578, + "learning_rate": 0.0002, + "loss": 1.3119, + "step": 2530 + }, + { + "epoch": 1.6601307189542482, + "grad_norm": 0.4911293089389801, + "learning_rate": 0.0002, + "loss": 1.2998, + "step": 2540 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.6122882962226868, + "learning_rate": 0.0002, + "loss": 1.4198, + "step": 2550 + }, + { + "epoch": 1.673202614379085, + "grad_norm": 0.5735281705856323, + "learning_rate": 0.0002, + "loss": 1.3099, + "step": 2560 + }, + { + "epoch": 1.6797385620915033, + "grad_norm": 0.5046352744102478, + "learning_rate": 0.0002, + "loss": 1.2205, + "step": 2570 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 0.6043242812156677, + "learning_rate": 0.0002, + "loss": 1.3191, + "step": 2580 + }, + { + "epoch": 1.6928104575163399, + "grad_norm": 0.5397698283195496, + "learning_rate": 0.0002, + "loss": 1.3079, + "step": 2590 + }, + { + "epoch": 1.6993464052287581, + "grad_norm": 0.8066475987434387, + "learning_rate": 0.0002, + "loss": 1.4916, + "step": 2600 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 0.52901691198349, + "learning_rate": 0.0002, + "loss": 1.3703, + "step": 2610 + }, + { + "epoch": 1.712418300653595, + "grad_norm": 0.7588503956794739, + "learning_rate": 0.0002, + "loss": 1.409, + "step": 2620 + }, + { + "epoch": 1.7189542483660132, + "grad_norm": 0.6012966632843018, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 2630 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 0.5927302837371826, + "learning_rate": 0.0002, + "loss": 1.2583, + "step": 2640 + }, + { + "epoch": 1.7320261437908497, + "grad_norm": 0.5086990594863892, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 2650 + }, + { + "epoch": 1.738562091503268, + "grad_norm": 0.6000628471374512, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2660 + }, + { + "epoch": 1.7450980392156863, + "grad_norm": 0.6560431718826294, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 2670 + }, + { + "epoch": 1.7516339869281046, + "grad_norm": 0.5738165378570557, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 2680 + }, + { + "epoch": 1.7581699346405228, + "grad_norm": 0.5576106905937195, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 2690 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.7298802137374878, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 2700 + }, + { + "epoch": 1.7712418300653594, + "grad_norm": 0.5751826167106628, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 2710 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.6069957613945007, + "learning_rate": 0.0002, + "loss": 1.35, + "step": 2720 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 0.7513017654418945, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 2730 + }, + { + "epoch": 1.7908496732026142, + "grad_norm": 0.6058869957923889, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 2740 + }, + { + "epoch": 1.7973856209150327, + "grad_norm": 0.6805883049964905, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 2750 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 0.6864324808120728, + "learning_rate": 0.0002, + "loss": 1.4062, + "step": 2760 + }, + { + "epoch": 1.8104575163398693, + "grad_norm": 0.6261002421379089, + "learning_rate": 0.0002, + "loss": 1.355, + "step": 2770 + }, + { + "epoch": 1.8169934640522876, + "grad_norm": 0.532684862613678, + "learning_rate": 0.0002, + "loss": 1.5145, + "step": 2780 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 0.6209020018577576, + "learning_rate": 0.0002, + "loss": 1.3248, + "step": 2790 + }, + { + "epoch": 1.8300653594771243, + "grad_norm": 0.67111736536026, + "learning_rate": 0.0002, + "loss": 1.3908, + "step": 2800 + }, + { + "epoch": 1.8366013071895426, + "grad_norm": 0.700467586517334, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 2810 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 0.6968029141426086, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 2820 + }, + { + "epoch": 1.8496732026143792, + "grad_norm": 0.6405863761901855, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 2830 + }, + { + "epoch": 1.8562091503267975, + "grad_norm": 0.5192584991455078, + "learning_rate": 0.0002, + "loss": 1.4035, + "step": 2840 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 0.4888569414615631, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 2850 + }, + { + "epoch": 1.869281045751634, + "grad_norm": 0.7625455856323242, + "learning_rate": 0.0002, + "loss": 1.4324, + "step": 2860 + }, + { + "epoch": 1.8758169934640523, + "grad_norm": 0.9162808656692505, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2870 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.5472783446311951, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2880 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.5221137404441833, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 2890 + }, + { + "epoch": 1.8954248366013071, + "grad_norm": 0.49258849024772644, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 2900 + }, + { + "epoch": 1.9019607843137254, + "grad_norm": 0.5260750651359558, + "learning_rate": 0.0002, + "loss": 1.3503, + "step": 2910 + }, + { + "epoch": 1.9084967320261437, + "grad_norm": 0.6583314538002014, + "learning_rate": 0.0002, + "loss": 1.3381, + "step": 2920 + }, + { + "epoch": 1.915032679738562, + "grad_norm": 0.5728915929794312, + "learning_rate": 0.0002, + "loss": 1.356, + "step": 2930 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 0.7661453485488892, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2940 + }, + { + "epoch": 1.9281045751633987, + "grad_norm": 0.7193911075592041, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2950 + }, + { + "epoch": 1.934640522875817, + "grad_norm": 0.5007768869400024, + "learning_rate": 0.0002, + "loss": 1.287, + "step": 2960 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.626681923866272, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2970 + }, + { + "epoch": 1.9477124183006536, + "grad_norm": 0.8692840933799744, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 2980 + }, + { + "epoch": 1.954248366013072, + "grad_norm": 0.6388291120529175, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 2990 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.7710477113723755, + "learning_rate": 0.0002, + "loss": 1.4593, + "step": 3000 + }, + { + "epoch": 1.9673202614379086, + "grad_norm": 0.641704261302948, + "learning_rate": 0.0002, + "loss": 1.5228, + "step": 3010 + }, + { + "epoch": 1.973856209150327, + "grad_norm": 0.621148943901062, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 3020 + }, + { + "epoch": 1.9803921568627452, + "grad_norm": 0.5119547247886658, + "learning_rate": 0.0002, + "loss": 1.3017, + "step": 3030 + }, + { + "epoch": 1.9869281045751634, + "grad_norm": 0.8104137778282166, + "learning_rate": 0.0002, + "loss": 1.4923, + "step": 3040 + }, + { + "epoch": 1.9934640522875817, + "grad_norm": 0.5856240391731262, + "learning_rate": 0.0002, + "loss": 1.3331, + "step": 3050 + }, + { + "epoch": 2.0, + "grad_norm": 0.5263566374778748, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3060 + }, + { + "epoch": 2.0, + "eval_loss": 1.4276371002197266, + "eval_runtime": 30.5759, + "eval_samples_per_second": 14.26, + "eval_steps_per_second": 1.799, + "step": 3060 + }, + { + "epoch": 2.0065359477124183, + "grad_norm": 0.5143898725509644, + "learning_rate": 0.0002, + "loss": 1.1636, + "step": 3070 + }, + { + "epoch": 2.0130718954248366, + "grad_norm": 0.5749367475509644, + "learning_rate": 0.0002, + "loss": 1.3335, + "step": 3080 + }, + { + "epoch": 2.019607843137255, + "grad_norm": 0.5784284472465515, + "learning_rate": 0.0002, + "loss": 1.2784, + "step": 3090 + }, + { + "epoch": 2.026143790849673, + "grad_norm": 0.5933429598808289, + "learning_rate": 0.0002, + "loss": 1.2463, + "step": 3100 + }, + { + "epoch": 2.0326797385620914, + "grad_norm": 0.6748974919319153, + "learning_rate": 0.0002, + "loss": 1.2984, + "step": 3110 + }, + { + "epoch": 2.0392156862745097, + "grad_norm": 0.626399576663971, + "learning_rate": 0.0002, + "loss": 1.2307, + "step": 3120 + }, + { + "epoch": 2.045751633986928, + "grad_norm": 0.6173238754272461, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 3130 + }, + { + "epoch": 2.052287581699346, + "grad_norm": 0.807790219783783, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3140 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 0.6222215890884399, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 3150 + }, + { + "epoch": 2.065359477124183, + "grad_norm": 0.5859580636024475, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 3160 + }, + { + "epoch": 2.0718954248366015, + "grad_norm": 0.581304132938385, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 3170 + }, + { + "epoch": 2.0784313725490198, + "grad_norm": 0.9814971089363098, + "learning_rate": 0.0002, + "loss": 1.1212, + "step": 3180 + }, + { + "epoch": 2.084967320261438, + "grad_norm": 0.6491848230361938, + "learning_rate": 0.0002, + "loss": 1.1962, + "step": 3190 + }, + { + "epoch": 2.0915032679738563, + "grad_norm": 0.613680362701416, + "learning_rate": 0.0002, + "loss": 1.3711, + "step": 3200 + }, + { + "epoch": 2.0980392156862746, + "grad_norm": 0.7318086624145508, + "learning_rate": 0.0002, + "loss": 1.2994, + "step": 3210 + }, + { + "epoch": 2.104575163398693, + "grad_norm": 0.6025661826133728, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 3220 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.6744484305381775, + "learning_rate": 0.0002, + "loss": 1.1374, + "step": 3230 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 0.6062554121017456, + "learning_rate": 0.0002, + "loss": 1.3273, + "step": 3240 + }, + { + "epoch": 2.1241830065359477, + "grad_norm": 0.6801803112030029, + "learning_rate": 0.0002, + "loss": 1.3404, + "step": 3250 + }, + { + "epoch": 2.130718954248366, + "grad_norm": 0.5218925476074219, + "learning_rate": 0.0002, + "loss": 1.4084, + "step": 3260 + }, + { + "epoch": 2.1372549019607843, + "grad_norm": 0.7494263648986816, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 3270 + }, + { + "epoch": 2.1437908496732025, + "grad_norm": 0.7858565449714661, + "learning_rate": 0.0002, + "loss": 1.3059, + "step": 3280 + }, + { + "epoch": 2.150326797385621, + "grad_norm": 0.6836692690849304, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3290 + }, + { + "epoch": 2.156862745098039, + "grad_norm": 0.619848370552063, + "learning_rate": 0.0002, + "loss": 1.1605, + "step": 3300 + }, + { + "epoch": 2.1633986928104574, + "grad_norm": 0.5761294364929199, + "learning_rate": 0.0002, + "loss": 1.3095, + "step": 3310 + }, + { + "epoch": 2.1699346405228757, + "grad_norm": 0.4713786542415619, + "learning_rate": 0.0002, + "loss": 1.2883, + "step": 3320 + }, + { + "epoch": 2.176470588235294, + "grad_norm": 0.7613773345947266, + "learning_rate": 0.0002, + "loss": 1.3817, + "step": 3330 + }, + { + "epoch": 2.183006535947712, + "grad_norm": 0.6642718315124512, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 3340 + }, + { + "epoch": 2.189542483660131, + "grad_norm": 0.7162188291549683, + "learning_rate": 0.0002, + "loss": 1.2048, + "step": 3350 + }, + { + "epoch": 2.196078431372549, + "grad_norm": 0.6916783452033997, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 3360 + }, + { + "epoch": 2.2026143790849675, + "grad_norm": 0.7205567955970764, + "learning_rate": 0.0002, + "loss": 1.3788, + "step": 3370 + }, + { + "epoch": 2.2091503267973858, + "grad_norm": 0.6038199067115784, + "learning_rate": 0.0002, + "loss": 1.2528, + "step": 3380 + }, + { + "epoch": 2.215686274509804, + "grad_norm": 0.6284233927726746, + "learning_rate": 0.0002, + "loss": 1.2079, + "step": 3390 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.7450672388076782, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 3400 + }, + { + "epoch": 2.2287581699346406, + "grad_norm": 0.7755052447319031, + "learning_rate": 0.0002, + "loss": 1.3034, + "step": 3410 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 0.9066099524497986, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 3420 + }, + { + "epoch": 2.241830065359477, + "grad_norm": 0.8578207492828369, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 3430 + }, + { + "epoch": 2.2483660130718954, + "grad_norm": 0.5900213718414307, + "learning_rate": 0.0002, + "loss": 1.3278, + "step": 3440 + }, + { + "epoch": 2.2549019607843137, + "grad_norm": 0.7821717262268066, + "learning_rate": 0.0002, + "loss": 1.3645, + "step": 3450 + }, + { + "epoch": 2.261437908496732, + "grad_norm": 0.6263150572776794, + "learning_rate": 0.0002, + "loss": 1.183, + "step": 3460 + }, + { + "epoch": 2.2679738562091503, + "grad_norm": 0.591799259185791, + "learning_rate": 0.0002, + "loss": 1.178, + "step": 3470 + }, + { + "epoch": 2.2745098039215685, + "grad_norm": 0.5999799966812134, + "learning_rate": 0.0002, + "loss": 1.2198, + "step": 3480 + }, + { + "epoch": 2.281045751633987, + "grad_norm": 0.6227319240570068, + "learning_rate": 0.0002, + "loss": 1.2724, + "step": 3490 + }, + { + "epoch": 2.287581699346405, + "grad_norm": 0.719412624835968, + "learning_rate": 0.0002, + "loss": 1.3865, + "step": 3500 + }, + { + "epoch": 2.2941176470588234, + "grad_norm": 1.0361769199371338, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 3510 + }, + { + "epoch": 2.3006535947712417, + "grad_norm": 0.5506668090820312, + "learning_rate": 0.0002, + "loss": 1.4834, + "step": 3520 + }, + { + "epoch": 2.30718954248366, + "grad_norm": 0.6886829733848572, + "learning_rate": 0.0002, + "loss": 1.2273, + "step": 3530 + }, + { + "epoch": 2.313725490196078, + "grad_norm": 0.6226346492767334, + "learning_rate": 0.0002, + "loss": 1.2296, + "step": 3540 + }, + { + "epoch": 2.3202614379084965, + "grad_norm": 0.8109908103942871, + "learning_rate": 0.0002, + "loss": 1.3087, + "step": 3550 + }, + { + "epoch": 2.326797385620915, + "grad_norm": 0.8505511283874512, + "learning_rate": 0.0002, + "loss": 1.3311, + "step": 3560 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.5763760209083557, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 3570 + }, + { + "epoch": 2.3398692810457518, + "grad_norm": 0.6460059881210327, + "learning_rate": 0.0002, + "loss": 1.4135, + "step": 3580 + }, + { + "epoch": 2.34640522875817, + "grad_norm": 0.7175343036651611, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 3590 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.6012630462646484, + "learning_rate": 0.0002, + "loss": 1.2645, + "step": 3600 + }, + { + "epoch": 2.3594771241830066, + "grad_norm": 0.6513685584068298, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3610 + }, + { + "epoch": 2.366013071895425, + "grad_norm": 0.7465183734893799, + "learning_rate": 0.0002, + "loss": 1.3271, + "step": 3620 + }, + { + "epoch": 2.372549019607843, + "grad_norm": 0.6413124203681946, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3630 + }, + { + "epoch": 2.3790849673202614, + "grad_norm": 0.7209562063217163, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 3640 + }, + { + "epoch": 2.3856209150326797, + "grad_norm": 0.6427558660507202, + "learning_rate": 0.0002, + "loss": 1.1616, + "step": 3650 + }, + { + "epoch": 2.392156862745098, + "grad_norm": 0.593958854675293, + "learning_rate": 0.0002, + "loss": 1.313, + "step": 3660 + }, + { + "epoch": 2.3986928104575163, + "grad_norm": 0.5944608449935913, + "learning_rate": 0.0002, + "loss": 1.2802, + "step": 3670 + }, + { + "epoch": 2.4052287581699345, + "grad_norm": 0.6606248617172241, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3680 + }, + { + "epoch": 2.411764705882353, + "grad_norm": 0.5632851719856262, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 3690 + }, + { + "epoch": 2.418300653594771, + "grad_norm": 0.4976513385772705, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3700 + }, + { + "epoch": 2.4248366013071894, + "grad_norm": 0.6318528056144714, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 3710 + }, + { + "epoch": 2.431372549019608, + "grad_norm": 0.6306707859039307, + "learning_rate": 0.0002, + "loss": 1.1705, + "step": 3720 + }, + { + "epoch": 2.4379084967320264, + "grad_norm": 0.6362553238868713, + "learning_rate": 0.0002, + "loss": 1.3524, + "step": 3730 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.634368896484375, + "learning_rate": 0.0002, + "loss": 1.2345, + "step": 3740 + }, + { + "epoch": 2.450980392156863, + "grad_norm": 0.6623591184616089, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 3750 + }, + { + "epoch": 2.457516339869281, + "grad_norm": 0.6150440573692322, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 3760 + }, + { + "epoch": 2.4640522875816995, + "grad_norm": 0.588935911655426, + "learning_rate": 0.0002, + "loss": 1.2666, + "step": 3770 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 0.7388206124305725, + "learning_rate": 0.0002, + "loss": 1.3918, + "step": 3780 + }, + { + "epoch": 2.477124183006536, + "grad_norm": 0.621825098991394, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 3790 + }, + { + "epoch": 2.4836601307189543, + "grad_norm": 0.7691677212715149, + "learning_rate": 0.0002, + "loss": 1.359, + "step": 3800 + }, + { + "epoch": 2.4901960784313726, + "grad_norm": 1.1661969423294067, + "learning_rate": 0.0002, + "loss": 1.3399, + "step": 3810 + }, + { + "epoch": 2.496732026143791, + "grad_norm": 0.6837884187698364, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 3820 + }, + { + "epoch": 2.503267973856209, + "grad_norm": 0.6978904008865356, + "learning_rate": 0.0002, + "loss": 1.2823, + "step": 3830 + }, + { + "epoch": 2.5098039215686274, + "grad_norm": 0.6121411323547363, + "learning_rate": 0.0002, + "loss": 1.3688, + "step": 3840 + }, + { + "epoch": 2.5163398692810457, + "grad_norm": 0.7813326120376587, + "learning_rate": 0.0002, + "loss": 1.2587, + "step": 3850 + }, + { + "epoch": 2.522875816993464, + "grad_norm": 0.5390260219573975, + "learning_rate": 0.0002, + "loss": 1.1543, + "step": 3860 + }, + { + "epoch": 2.5294117647058822, + "grad_norm": 0.8283252716064453, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3870 + }, + { + "epoch": 2.5359477124183005, + "grad_norm": 0.8527186512947083, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 3880 + }, + { + "epoch": 2.542483660130719, + "grad_norm": 0.8405382633209229, + "learning_rate": 0.0002, + "loss": 1.3469, + "step": 3890 + }, + { + "epoch": 2.549019607843137, + "grad_norm": 0.5650738477706909, + "learning_rate": 0.0002, + "loss": 1.1801, + "step": 3900 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.620121955871582, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 3910 + }, + { + "epoch": 2.5620915032679736, + "grad_norm": 0.5983527898788452, + "learning_rate": 0.0002, + "loss": 1.2524, + "step": 3920 + }, + { + "epoch": 2.568627450980392, + "grad_norm": 0.686623215675354, + "learning_rate": 0.0002, + "loss": 1.4408, + "step": 3930 + }, + { + "epoch": 2.57516339869281, + "grad_norm": 0.6805831789970398, + "learning_rate": 0.0002, + "loss": 1.186, + "step": 3940 + }, + { + "epoch": 2.581699346405229, + "grad_norm": 0.6994825601577759, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 3950 + }, + { + "epoch": 2.588235294117647, + "grad_norm": 0.728549599647522, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 3960 + }, + { + "epoch": 2.5947712418300655, + "grad_norm": 0.775236964225769, + "learning_rate": 0.0002, + "loss": 1.4039, + "step": 3970 + }, + { + "epoch": 2.6013071895424837, + "grad_norm": 0.5057447552680969, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 3980 + }, + { + "epoch": 2.607843137254902, + "grad_norm": 0.6564450263977051, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 3990 + }, + { + "epoch": 2.6143790849673203, + "grad_norm": 0.5342249870300293, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 4000 + }, + { + "epoch": 2.6209150326797386, + "grad_norm": 0.5508961081504822, + "learning_rate": 0.0002, + "loss": 1.3102, + "step": 4010 + }, + { + "epoch": 2.627450980392157, + "grad_norm": 0.5716235637664795, + "learning_rate": 0.0002, + "loss": 1.3636, + "step": 4020 + }, + { + "epoch": 2.633986928104575, + "grad_norm": 0.8049232363700867, + "learning_rate": 0.0002, + "loss": 1.3465, + "step": 4030 + }, + { + "epoch": 2.6405228758169934, + "grad_norm": 0.5574354529380798, + "learning_rate": 0.0002, + "loss": 1.2342, + "step": 4040 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 0.6302093863487244, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 4050 + }, + { + "epoch": 2.65359477124183, + "grad_norm": 1.1868736743927002, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 4060 + }, + { + "epoch": 2.6601307189542482, + "grad_norm": 0.6738120317459106, + "learning_rate": 0.0002, + "loss": 1.1382, + "step": 4070 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.6614423990249634, + "learning_rate": 0.0002, + "loss": 1.2456, + "step": 4080 + }, + { + "epoch": 2.6732026143790852, + "grad_norm": 0.7297604084014893, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 4090 + }, + { + "epoch": 2.6797385620915035, + "grad_norm": 0.9421682357788086, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 4100 + }, + { + "epoch": 2.686274509803922, + "grad_norm": 0.5286222696304321, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 4110 + }, + { + "epoch": 2.69281045751634, + "grad_norm": 0.6849271655082703, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 4120 + }, + { + "epoch": 2.6993464052287583, + "grad_norm": 0.6811320185661316, + "learning_rate": 0.0002, + "loss": 1.2721, + "step": 4130 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 0.4968419373035431, + "learning_rate": 0.0002, + "loss": 1.2897, + "step": 4140 + }, + { + "epoch": 2.712418300653595, + "grad_norm": 0.8074267506599426, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 4150 + }, + { + "epoch": 2.718954248366013, + "grad_norm": 0.6756376028060913, + "learning_rate": 0.0002, + "loss": 1.1759, + "step": 4160 + }, + { + "epoch": 2.7254901960784315, + "grad_norm": 0.6921583414077759, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 4170 + }, + { + "epoch": 2.7320261437908497, + "grad_norm": 0.7049834132194519, + "learning_rate": 0.0002, + "loss": 1.3413, + "step": 4180 + }, + { + "epoch": 2.738562091503268, + "grad_norm": 0.7011390328407288, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 4190 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.6977843642234802, + "learning_rate": 0.0002, + "loss": 1.2364, + "step": 4200 + }, + { + "epoch": 2.7516339869281046, + "grad_norm": 0.6717000603675842, + "learning_rate": 0.0002, + "loss": 1.2533, + "step": 4210 + }, + { + "epoch": 2.758169934640523, + "grad_norm": 1.0223724842071533, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 4220 + }, + { + "epoch": 2.764705882352941, + "grad_norm": 0.6573330760002136, + "learning_rate": 0.0002, + "loss": 1.2451, + "step": 4230 + }, + { + "epoch": 2.7712418300653594, + "grad_norm": 0.6684938073158264, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 4240 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.7426793575286865, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 4250 + }, + { + "epoch": 2.784313725490196, + "grad_norm": 0.557826578617096, + "learning_rate": 0.0002, + "loss": 1.2904, + "step": 4260 + }, + { + "epoch": 2.7908496732026142, + "grad_norm": 0.6669870018959045, + "learning_rate": 0.0002, + "loss": 1.3262, + "step": 4270 + }, + { + "epoch": 2.7973856209150325, + "grad_norm": 0.5349969267845154, + "learning_rate": 0.0002, + "loss": 1.2369, + "step": 4280 + }, + { + "epoch": 2.803921568627451, + "grad_norm": 0.7262802124023438, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 4290 + }, + { + "epoch": 2.810457516339869, + "grad_norm": 0.768211841583252, + "learning_rate": 0.0002, + "loss": 1.3373, + "step": 4300 + }, + { + "epoch": 2.8169934640522873, + "grad_norm": 0.5958252549171448, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 4310 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.8451310396194458, + "learning_rate": 0.0002, + "loss": 1.4113, + "step": 4320 + }, + { + "epoch": 2.8300653594771243, + "grad_norm": 0.6544435024261475, + "learning_rate": 0.0002, + "loss": 1.2454, + "step": 4330 + }, + { + "epoch": 2.8366013071895426, + "grad_norm": 0.6177433133125305, + "learning_rate": 0.0002, + "loss": 1.2777, + "step": 4340 + }, + { + "epoch": 2.843137254901961, + "grad_norm": 0.6324988007545471, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 4350 + }, + { + "epoch": 2.849673202614379, + "grad_norm": 0.6884300708770752, + "learning_rate": 0.0002, + "loss": 1.4117, + "step": 4360 + }, + { + "epoch": 2.8562091503267975, + "grad_norm": 0.8952897191047668, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 4370 + }, + { + "epoch": 2.8627450980392157, + "grad_norm": 1.0260103940963745, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 4380 + }, + { + "epoch": 2.869281045751634, + "grad_norm": 0.9134647250175476, + "learning_rate": 0.0002, + "loss": 1.2893, + "step": 4390 + }, + { + "epoch": 2.8758169934640523, + "grad_norm": 0.5637717843055725, + "learning_rate": 0.0002, + "loss": 1.171, + "step": 4400 + }, + { + "epoch": 2.8823529411764706, + "grad_norm": 0.7530393004417419, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 4410 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.7202680706977844, + "learning_rate": 0.0002, + "loss": 1.29, + "step": 4420 + }, + { + "epoch": 2.895424836601307, + "grad_norm": 0.7177144885063171, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 4430 + }, + { + "epoch": 2.9019607843137254, + "grad_norm": 0.5996816754341125, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 4440 + }, + { + "epoch": 2.9084967320261437, + "grad_norm": 0.6542447209358215, + "learning_rate": 0.0002, + "loss": 1.4816, + "step": 4450 + }, + { + "epoch": 2.915032679738562, + "grad_norm": 1.0753740072250366, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 4460 + }, + { + "epoch": 2.9215686274509802, + "grad_norm": 0.6956136226654053, + "learning_rate": 0.0002, + "loss": 1.3193, + "step": 4470 + }, + { + "epoch": 2.928104575163399, + "grad_norm": 0.7702530026435852, + "learning_rate": 0.0002, + "loss": 1.2486, + "step": 4480 + }, + { + "epoch": 2.9346405228758172, + "grad_norm": 0.7763232588768005, + "learning_rate": 0.0002, + "loss": 1.3371, + "step": 4490 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 0.6393085718154907, + "learning_rate": 0.0002, + "loss": 1.1647, + "step": 4500 + }, + { + "epoch": 2.947712418300654, + "grad_norm": 0.987770676612854, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 4510 + }, + { + "epoch": 2.954248366013072, + "grad_norm": 0.5995016098022461, + "learning_rate": 0.0002, + "loss": 1.1529, + "step": 4520 + }, + { + "epoch": 2.9607843137254903, + "grad_norm": 0.745650053024292, + "learning_rate": 0.0002, + "loss": 1.2358, + "step": 4530 + }, + { + "epoch": 2.9673202614379086, + "grad_norm": 0.7429282069206238, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 4540 + }, + { + "epoch": 2.973856209150327, + "grad_norm": 0.5927486419677734, + "learning_rate": 0.0002, + "loss": 1.2262, + "step": 4550 + }, + { + "epoch": 2.980392156862745, + "grad_norm": 0.6775153875350952, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 4560 + }, + { + "epoch": 2.9869281045751634, + "grad_norm": 0.7128435373306274, + "learning_rate": 0.0002, + "loss": 1.279, + "step": 4570 + }, + { + "epoch": 2.9934640522875817, + "grad_norm": 0.7470937967300415, + "learning_rate": 0.0002, + "loss": 1.2451, + "step": 4580 + }, + { + "epoch": 3.0, + "grad_norm": 0.9295375943183899, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 4590 + }, + { + "epoch": 3.0, + "eval_loss": 1.4131312370300293, + "eval_runtime": 31.8967, + "eval_samples_per_second": 13.669, + "eval_steps_per_second": 1.724, + "step": 4590 + } + ], + "logging_steps": 10, + "max_steps": 12240, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.609896317812736e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..35e722282419bcef977427e4d3675fe3b94ec688 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc25f531ee37172f22a819ab79094fe89aae41504e4c8b696743b5e23d9e7641 +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9a25de9298ae67ee0c6a132a503d40bed8ca8c54 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0e46139fbdbe7c9dba8da0dc7642049d7fa284650682279a059b4fd5f0e13ba +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6d918a481b717f232ee93f55effbcd545f64e2b --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6245e8d34d6bddc9e342779575e283d303ea9f42c30bb851955babfbdc58fbe +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4dd5144f46607e90c6f38c8be1df28fdc445669e --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0014af00868b92f66ddbd9e54f88c848b226c17b47294e56eadd924245e83166 +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c98e036e7354dc6e66ea46ccbcc385851151283f --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1ba7db4a140f522050ecce2a409dc22f2e6b188c5b8eac264d1c306ae2a65f4 +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f56d642e7625a6eda9f34f9583ec3e2acf0919d3 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/trainer_state.json @@ -0,0 +1,4349 @@ +{ + "best_metric": 1.4113320112228394, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 6120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006535947712418301, + "grad_norm": 1.5105072259902954, + "learning_rate": 0.0002, + "loss": 4.7451, + "step": 10 + }, + { + "epoch": 0.013071895424836602, + "grad_norm": 2.1156165599823, + "learning_rate": 0.0002, + "loss": 3.3158, + "step": 20 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 1.0578808784484863, + "learning_rate": 0.0002, + "loss": 2.643, + "step": 30 + }, + { + "epoch": 0.026143790849673203, + "grad_norm": 2.725064516067505, + "learning_rate": 0.0002, + "loss": 2.3948, + "step": 40 + }, + { + "epoch": 0.032679738562091505, + "grad_norm": 2.9575750827789307, + "learning_rate": 0.0002, + "loss": 2.3134, + "step": 50 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 1.2158117294311523, + "learning_rate": 0.0002, + "loss": 2.2778, + "step": 60 + }, + { + "epoch": 0.0457516339869281, + "grad_norm": 1.0850954055786133, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05228758169934641, + "grad_norm": 1.299196720123291, + "learning_rate": 0.0002, + "loss": 1.8872, + "step": 80 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 0.8310191035270691, + "learning_rate": 0.0002, + "loss": 1.947, + "step": 90 + }, + { + "epoch": 0.06535947712418301, + "grad_norm": 0.9854435920715332, + "learning_rate": 0.0002, + "loss": 1.9098, + "step": 100 + }, + { + "epoch": 0.0718954248366013, + "grad_norm": 0.7951157689094543, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 110 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 0.7593062520027161, + "learning_rate": 0.0002, + "loss": 1.9035, + "step": 120 + }, + { + "epoch": 0.08496732026143791, + "grad_norm": 0.6783032417297363, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 130 + }, + { + "epoch": 0.0915032679738562, + "grad_norm": 0.8350756764411926, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 140 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 1.0203173160552979, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 150 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 0.8820539712905884, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 160 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.7286128997802734, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 170 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.7874041795730591, + "learning_rate": 0.0002, + "loss": 1.8841, + "step": 180 + }, + { + "epoch": 0.12418300653594772, + "grad_norm": 0.6630475521087646, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 190 + }, + { + "epoch": 0.13071895424836602, + "grad_norm": 0.686413586139679, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 200 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 0.7793629765510559, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 210 + }, + { + "epoch": 0.1437908496732026, + "grad_norm": 0.6893141865730286, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 220 + }, + { + "epoch": 0.1503267973856209, + "grad_norm": 0.5804724097251892, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 230 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 0.6053574085235596, + "learning_rate": 0.0002, + "loss": 1.5578, + "step": 240 + }, + { + "epoch": 0.16339869281045752, + "grad_norm": 0.7566025853157043, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 250 + }, + { + "epoch": 0.16993464052287582, + "grad_norm": 0.6112990975379944, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 260 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.6839066743850708, + "learning_rate": 0.0002, + "loss": 1.5564, + "step": 270 + }, + { + "epoch": 0.1830065359477124, + "grad_norm": 0.6368117928504944, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 280 + }, + { + "epoch": 0.1895424836601307, + "grad_norm": 0.6144475936889648, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 290 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 0.6743767261505127, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 300 + }, + { + "epoch": 0.20261437908496732, + "grad_norm": 0.6807955503463745, + "learning_rate": 0.0002, + "loss": 1.421, + "step": 310 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 0.6717963814735413, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 320 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 0.5917780995368958, + "learning_rate": 0.0002, + "loss": 1.6842, + "step": 330 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.6783658862113953, + "learning_rate": 0.0002, + "loss": 1.6264, + "step": 340 + }, + { + "epoch": 0.22875816993464052, + "grad_norm": 0.5820256471633911, + "learning_rate": 0.0002, + "loss": 1.4635, + "step": 350 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.5345938801765442, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 360 + }, + { + "epoch": 0.24183006535947713, + "grad_norm": 0.755929172039032, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 370 + }, + { + "epoch": 0.24836601307189543, + "grad_norm": 0.6183189749717712, + "learning_rate": 0.0002, + "loss": 1.5177, + "step": 380 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 0.7277782559394836, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 390 + }, + { + "epoch": 0.26143790849673204, + "grad_norm": 0.9998756051063538, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 400 + }, + { + "epoch": 0.2679738562091503, + "grad_norm": 0.7523853778839111, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 410 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 0.6548714637756348, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 420 + }, + { + "epoch": 0.28104575163398693, + "grad_norm": 0.6979796290397644, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 430 + }, + { + "epoch": 0.2875816993464052, + "grad_norm": 0.840915322303772, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 440 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.6142978072166443, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 450 + }, + { + "epoch": 0.3006535947712418, + "grad_norm": 0.9482691884040833, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 460 + }, + { + "epoch": 0.30718954248366015, + "grad_norm": 0.7001156806945801, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 470 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.6665455102920532, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 480 + }, + { + "epoch": 0.3202614379084967, + "grad_norm": 0.6012697815895081, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 490 + }, + { + "epoch": 0.32679738562091504, + "grad_norm": 0.8770062327384949, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 500 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.7029962539672852, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 510 + }, + { + "epoch": 0.33986928104575165, + "grad_norm": 0.6682832837104797, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 520 + }, + { + "epoch": 0.3464052287581699, + "grad_norm": 0.5548969507217407, + "learning_rate": 0.0002, + "loss": 1.4242, + "step": 530 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.6640702486038208, + "learning_rate": 0.0002, + "loss": 1.5081, + "step": 540 + }, + { + "epoch": 0.35947712418300654, + "grad_norm": 0.656292200088501, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 550 + }, + { + "epoch": 0.3660130718954248, + "grad_norm": 0.618910551071167, + "learning_rate": 0.0002, + "loss": 1.5415, + "step": 560 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 0.644859790802002, + "learning_rate": 0.0002, + "loss": 1.5178, + "step": 570 + }, + { + "epoch": 0.3790849673202614, + "grad_norm": 0.679042398929596, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 580 + }, + { + "epoch": 0.38562091503267976, + "grad_norm": 0.980681836605072, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 590 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.632219672203064, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 600 + }, + { + "epoch": 0.39869281045751637, + "grad_norm": 0.7003744840621948, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 610 + }, + { + "epoch": 0.40522875816993464, + "grad_norm": 0.7090577483177185, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 620 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 0.657819926738739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 630 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 0.7034208178520203, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 640 + }, + { + "epoch": 0.42483660130718953, + "grad_norm": 0.7274866104125977, + "learning_rate": 0.0002, + "loss": 1.5357, + "step": 650 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 0.5876233577728271, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 660 + }, + { + "epoch": 0.43790849673202614, + "grad_norm": 0.595494270324707, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 670 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.8253804445266724, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 680 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 0.652225911617279, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 690 + }, + { + "epoch": 0.45751633986928103, + "grad_norm": 0.6242014169692993, + "learning_rate": 0.0002, + "loss": 1.5419, + "step": 700 + }, + { + "epoch": 0.46405228758169936, + "grad_norm": 0.7283986210823059, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 710 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.7016081213951111, + "learning_rate": 0.0002, + "loss": 1.43, + "step": 720 + }, + { + "epoch": 0.477124183006536, + "grad_norm": 0.5211893916130066, + "learning_rate": 0.0002, + "loss": 1.4626, + "step": 730 + }, + { + "epoch": 0.48366013071895425, + "grad_norm": 0.6221150159835815, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 740 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 0.76594477891922, + "learning_rate": 0.0002, + "loss": 1.5677, + "step": 750 + }, + { + "epoch": 0.49673202614379086, + "grad_norm": 0.5777859091758728, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 760 + }, + { + "epoch": 0.5032679738562091, + "grad_norm": 0.5793519616127014, + "learning_rate": 0.0002, + "loss": 1.5253, + "step": 770 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 0.5425786375999451, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 780 + }, + { + "epoch": 0.5163398692810458, + "grad_norm": 0.6004197001457214, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 790 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 0.7167016863822937, + "learning_rate": 0.0002, + "loss": 1.5346, + "step": 800 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.710218071937561, + "learning_rate": 0.0002, + "loss": 1.48, + "step": 810 + }, + { + "epoch": 0.5359477124183006, + "grad_norm": 0.699528694152832, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 820 + }, + { + "epoch": 0.5424836601307189, + "grad_norm": 0.579629123210907, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 830 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 0.595407247543335, + "learning_rate": 0.0002, + "loss": 1.3894, + "step": 840 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.544563889503479, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 850 + }, + { + "epoch": 0.5620915032679739, + "grad_norm": 0.553166389465332, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 860 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 0.5645018815994263, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 870 + }, + { + "epoch": 0.5751633986928104, + "grad_norm": 0.6576932668685913, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 880 + }, + { + "epoch": 0.5816993464052288, + "grad_norm": 0.6684197187423706, + "learning_rate": 0.0002, + "loss": 1.5891, + "step": 890 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.6706975698471069, + "learning_rate": 0.0002, + "loss": 1.5348, + "step": 900 + }, + { + "epoch": 0.5947712418300654, + "grad_norm": 0.6762327551841736, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 910 + }, + { + "epoch": 0.6013071895424836, + "grad_norm": 0.764032244682312, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 920 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 0.6996400952339172, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 930 + }, + { + "epoch": 0.6143790849673203, + "grad_norm": 0.686735987663269, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 940 + }, + { + "epoch": 0.6209150326797386, + "grad_norm": 0.6086131930351257, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 950 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.5627856850624084, + "learning_rate": 0.0002, + "loss": 1.4457, + "step": 960 + }, + { + "epoch": 0.6339869281045751, + "grad_norm": 0.5781503319740295, + "learning_rate": 0.0002, + "loss": 1.506, + "step": 970 + }, + { + "epoch": 0.6405228758169934, + "grad_norm": 0.6347246766090393, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 980 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.6581300497055054, + "learning_rate": 0.0002, + "loss": 1.3819, + "step": 990 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 0.8343676924705505, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1000 + }, + { + "epoch": 0.6601307189542484, + "grad_norm": 0.5708910226821899, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 1010 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.6832585334777832, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 1020 + }, + { + "epoch": 0.673202614379085, + "grad_norm": 0.5767837166786194, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1030 + }, + { + "epoch": 0.6797385620915033, + "grad_norm": 0.5637745261192322, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 1040 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 0.8193050026893616, + "learning_rate": 0.0002, + "loss": 1.4325, + "step": 1050 + }, + { + "epoch": 0.6928104575163399, + "grad_norm": 0.6157439351081848, + "learning_rate": 0.0002, + "loss": 1.4196, + "step": 1060 + }, + { + "epoch": 0.6993464052287581, + "grad_norm": 0.7476664781570435, + "learning_rate": 0.0002, + "loss": 1.5547, + "step": 1070 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.8569361567497253, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 1080 + }, + { + "epoch": 0.7124183006535948, + "grad_norm": 0.5671911835670471, + "learning_rate": 0.0002, + "loss": 1.482, + "step": 1090 + }, + { + "epoch": 0.7189542483660131, + "grad_norm": 0.5151128768920898, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 1100 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 0.568037211894989, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 1110 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 0.6756396889686584, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 1120 + }, + { + "epoch": 0.738562091503268, + "grad_norm": 0.638975977897644, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 1130 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 0.7103341221809387, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 1140 + }, + { + "epoch": 0.7516339869281046, + "grad_norm": 0.7403952479362488, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 1150 + }, + { + "epoch": 0.7581699346405228, + "grad_norm": 0.6266511082649231, + "learning_rate": 0.0002, + "loss": 1.4757, + "step": 1160 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.5939070582389832, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1170 + }, + { + "epoch": 0.7712418300653595, + "grad_norm": 0.5735430717468262, + "learning_rate": 0.0002, + "loss": 1.4145, + "step": 1180 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.5155234932899475, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 1190 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.5115423202514648, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 1200 + }, + { + "epoch": 0.7908496732026143, + "grad_norm": 0.693588137626648, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1210 + }, + { + "epoch": 0.7973856209150327, + "grad_norm": 0.5504693984985352, + "learning_rate": 0.0002, + "loss": 1.308, + "step": 1220 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 0.5555992126464844, + "learning_rate": 0.0002, + "loss": 1.5412, + "step": 1230 + }, + { + "epoch": 0.8104575163398693, + "grad_norm": 0.7211785316467285, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1240 + }, + { + "epoch": 0.8169934640522876, + "grad_norm": 0.735003650188446, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 1250 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.5245152711868286, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 1260 + }, + { + "epoch": 0.8300653594771242, + "grad_norm": 0.5883445739746094, + "learning_rate": 0.0002, + "loss": 1.4505, + "step": 1270 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 0.6835859417915344, + "learning_rate": 0.0002, + "loss": 1.3642, + "step": 1280 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 0.6592142581939697, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 1290 + }, + { + "epoch": 0.8496732026143791, + "grad_norm": 0.6087474226951599, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 1300 + }, + { + "epoch": 0.8562091503267973, + "grad_norm": 0.565387487411499, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 1310 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 0.7363151907920837, + "learning_rate": 0.0002, + "loss": 1.4809, + "step": 1320 + }, + { + "epoch": 0.869281045751634, + "grad_norm": 0.5964524149894714, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 1330 + }, + { + "epoch": 0.8758169934640523, + "grad_norm": 0.5169979929924011, + "learning_rate": 0.0002, + "loss": 1.3284, + "step": 1340 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.7063422799110413, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 1350 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.7261926531791687, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 1360 + }, + { + "epoch": 0.8954248366013072, + "grad_norm": 0.6759744882583618, + "learning_rate": 0.0002, + "loss": 1.3619, + "step": 1370 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 0.675051212310791, + "learning_rate": 0.0002, + "loss": 1.4079, + "step": 1380 + }, + { + "epoch": 0.9084967320261438, + "grad_norm": 0.5613595843315125, + "learning_rate": 0.0002, + "loss": 1.6606, + "step": 1390 + }, + { + "epoch": 0.9150326797385621, + "grad_norm": 0.611732006072998, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 1400 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 0.6365187168121338, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 1410 + }, + { + "epoch": 0.9281045751633987, + "grad_norm": 0.7810426354408264, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1420 + }, + { + "epoch": 0.934640522875817, + "grad_norm": 0.593891441822052, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 1430 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.761585533618927, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1440 + }, + { + "epoch": 0.9477124183006536, + "grad_norm": 0.6114464998245239, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1450 + }, + { + "epoch": 0.954248366013072, + "grad_norm": 0.601044774055481, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1460 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 0.5484876036643982, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 1470 + }, + { + "epoch": 0.9673202614379085, + "grad_norm": 0.5383428335189819, + "learning_rate": 0.0002, + "loss": 1.4825, + "step": 1480 + }, + { + "epoch": 0.9738562091503268, + "grad_norm": 0.648106575012207, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 1490 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.6847249865531921, + "learning_rate": 0.0002, + "loss": 1.3638, + "step": 1500 + }, + { + "epoch": 0.9869281045751634, + "grad_norm": 0.6361058354377747, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 1510 + }, + { + "epoch": 0.9934640522875817, + "grad_norm": 0.646392285823822, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 1520 + }, + { + "epoch": 1.0, + "grad_norm": 0.5391159057617188, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1530 + }, + { + "epoch": 1.0, + "eval_loss": 1.4715123176574707, + "eval_runtime": 30.5701, + "eval_samples_per_second": 14.262, + "eval_steps_per_second": 1.799, + "step": 1530 + }, + { + "epoch": 1.0065359477124183, + "grad_norm": 0.5468988418579102, + "learning_rate": 0.0002, + "loss": 1.4827, + "step": 1540 + }, + { + "epoch": 1.0130718954248366, + "grad_norm": 0.629940927028656, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 1550 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 0.6411303281784058, + "learning_rate": 0.0002, + "loss": 1.4259, + "step": 1560 + }, + { + "epoch": 1.026143790849673, + "grad_norm": 0.5619024038314819, + "learning_rate": 0.0002, + "loss": 1.3924, + "step": 1570 + }, + { + "epoch": 1.0326797385620916, + "grad_norm": 0.6093462705612183, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1580 + }, + { + "epoch": 1.0392156862745099, + "grad_norm": 0.5543286204338074, + "learning_rate": 0.0002, + "loss": 1.4547, + "step": 1590 + }, + { + "epoch": 1.0457516339869282, + "grad_norm": 0.6079006195068359, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1600 + }, + { + "epoch": 1.0522875816993464, + "grad_norm": 0.6240813136100769, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 1610 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.6141977310180664, + "learning_rate": 0.0002, + "loss": 1.3504, + "step": 1620 + }, + { + "epoch": 1.065359477124183, + "grad_norm": 0.5920178294181824, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 1630 + }, + { + "epoch": 1.0718954248366013, + "grad_norm": 0.47620782256126404, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 1640 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 0.6826292872428894, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 1650 + }, + { + "epoch": 1.0849673202614378, + "grad_norm": 0.6182006597518921, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 1660 + }, + { + "epoch": 1.091503267973856, + "grad_norm": 0.57639479637146, + "learning_rate": 0.0002, + "loss": 1.2907, + "step": 1670 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 0.6696860194206238, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 1680 + }, + { + "epoch": 1.1045751633986929, + "grad_norm": 0.699221670627594, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 1690 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.7138059139251709, + "learning_rate": 0.0002, + "loss": 1.3667, + "step": 1700 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 0.6930422186851501, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 1710 + }, + { + "epoch": 1.1241830065359477, + "grad_norm": 0.7484048008918762, + "learning_rate": 0.0002, + "loss": 1.5033, + "step": 1720 + }, + { + "epoch": 1.130718954248366, + "grad_norm": 0.5820090174674988, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 1730 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 0.7143406867980957, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1740 + }, + { + "epoch": 1.1437908496732025, + "grad_norm": 0.5597584247589111, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 1750 + }, + { + "epoch": 1.1503267973856208, + "grad_norm": 0.5171173214912415, + "learning_rate": 0.0002, + "loss": 1.5403, + "step": 1760 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 0.5951920747756958, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1770 + }, + { + "epoch": 1.1633986928104576, + "grad_norm": 0.7506247758865356, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 1780 + }, + { + "epoch": 1.1699346405228759, + "grad_norm": 0.5936487913131714, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 1790 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.688450038433075, + "learning_rate": 0.0002, + "loss": 1.3567, + "step": 1800 + }, + { + "epoch": 1.1830065359477124, + "grad_norm": 0.671623170375824, + "learning_rate": 0.0002, + "loss": 1.314, + "step": 1810 + }, + { + "epoch": 1.1895424836601307, + "grad_norm": 0.6911860704421997, + "learning_rate": 0.0002, + "loss": 1.3803, + "step": 1820 + }, + { + "epoch": 1.196078431372549, + "grad_norm": 0.60726398229599, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 1830 + }, + { + "epoch": 1.2026143790849673, + "grad_norm": 0.7542088627815247, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 1840 + }, + { + "epoch": 1.2091503267973855, + "grad_norm": 0.6810969710350037, + "learning_rate": 0.0002, + "loss": 1.4343, + "step": 1850 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 0.579741895198822, + "learning_rate": 0.0002, + "loss": 1.446, + "step": 1860 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.9925695657730103, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 1870 + }, + { + "epoch": 1.2287581699346406, + "grad_norm": 0.5919767618179321, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 1880 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.7377090454101562, + "learning_rate": 0.0002, + "loss": 1.5015, + "step": 1890 + }, + { + "epoch": 1.2418300653594772, + "grad_norm": 0.5753688812255859, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 1900 + }, + { + "epoch": 1.2483660130718954, + "grad_norm": 0.6362486481666565, + "learning_rate": 0.0002, + "loss": 1.3543, + "step": 1910 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 0.5747467875480652, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1920 + }, + { + "epoch": 1.261437908496732, + "grad_norm": 0.6831939220428467, + "learning_rate": 0.0002, + "loss": 1.5082, + "step": 1930 + }, + { + "epoch": 1.2679738562091503, + "grad_norm": 0.6414040327072144, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 1940 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 0.5613330006599426, + "learning_rate": 0.0002, + "loss": 1.5099, + "step": 1950 + }, + { + "epoch": 1.2810457516339868, + "grad_norm": 0.5838454961776733, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 1960 + }, + { + "epoch": 1.287581699346405, + "grad_norm": 0.5367192029953003, + "learning_rate": 0.0002, + "loss": 1.3548, + "step": 1970 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 0.5829346776008606, + "learning_rate": 0.0002, + "loss": 1.4602, + "step": 1980 + }, + { + "epoch": 1.3006535947712419, + "grad_norm": 0.756534218788147, + "learning_rate": 0.0002, + "loss": 1.3821, + "step": 1990 + }, + { + "epoch": 1.3071895424836601, + "grad_norm": 0.48002561926841736, + "learning_rate": 0.0002, + "loss": 1.389, + "step": 2000 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 0.5461082458496094, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 2010 + }, + { + "epoch": 1.3202614379084967, + "grad_norm": 0.570399284362793, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 2020 + }, + { + "epoch": 1.326797385620915, + "grad_norm": 0.5130975842475891, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2030 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.6290071606636047, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 2040 + }, + { + "epoch": 1.3398692810457518, + "grad_norm": 0.6165726184844971, + "learning_rate": 0.0002, + "loss": 1.3873, + "step": 2050 + }, + { + "epoch": 1.34640522875817, + "grad_norm": 0.5302083492279053, + "learning_rate": 0.0002, + "loss": 1.4376, + "step": 2060 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 0.6531406044960022, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 2070 + }, + { + "epoch": 1.3594771241830066, + "grad_norm": 0.5981236100196838, + "learning_rate": 0.0002, + "loss": 1.3632, + "step": 2080 + }, + { + "epoch": 1.3660130718954249, + "grad_norm": 0.8534150123596191, + "learning_rate": 0.0002, + "loss": 1.4846, + "step": 2090 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 0.695918083190918, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 2100 + }, + { + "epoch": 1.3790849673202614, + "grad_norm": 0.5830431580543518, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2110 + }, + { + "epoch": 1.3856209150326797, + "grad_norm": 0.5641306638717651, + "learning_rate": 0.0002, + "loss": 1.5009, + "step": 2120 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 0.6354436874389648, + "learning_rate": 0.0002, + "loss": 1.3985, + "step": 2130 + }, + { + "epoch": 1.3986928104575163, + "grad_norm": 0.5707540512084961, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 2140 + }, + { + "epoch": 1.4052287581699345, + "grad_norm": 0.7308434844017029, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 2150 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.5879750847816467, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2160 + }, + { + "epoch": 1.4183006535947713, + "grad_norm": 0.627909243106842, + "learning_rate": 0.0002, + "loss": 1.3729, + "step": 2170 + }, + { + "epoch": 1.4248366013071896, + "grad_norm": 0.5228193998336792, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 2180 + }, + { + "epoch": 1.4313725490196079, + "grad_norm": 0.6162880659103394, + "learning_rate": 0.0002, + "loss": 1.457, + "step": 2190 + }, + { + "epoch": 1.4379084967320261, + "grad_norm": 0.751610517501831, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 2200 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.5623487234115601, + "learning_rate": 0.0002, + "loss": 1.4105, + "step": 2210 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 0.5293187499046326, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 2220 + }, + { + "epoch": 1.457516339869281, + "grad_norm": 0.5903629660606384, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 2230 + }, + { + "epoch": 1.4640522875816995, + "grad_norm": 0.6084659099578857, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 2240 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 0.5289803147315979, + "learning_rate": 0.0002, + "loss": 1.319, + "step": 2250 + }, + { + "epoch": 1.477124183006536, + "grad_norm": 0.49499568343162537, + "learning_rate": 0.0002, + "loss": 1.3106, + "step": 2260 + }, + { + "epoch": 1.4836601307189543, + "grad_norm": 0.7774190306663513, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 2270 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 0.5932538509368896, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 2280 + }, + { + "epoch": 1.4967320261437909, + "grad_norm": 0.6009492874145508, + "learning_rate": 0.0002, + "loss": 1.3241, + "step": 2290 + }, + { + "epoch": 1.5032679738562091, + "grad_norm": 0.5559343099594116, + "learning_rate": 0.0002, + "loss": 1.3728, + "step": 2300 + }, + { + "epoch": 1.5098039215686274, + "grad_norm": 0.5956196188926697, + "learning_rate": 0.0002, + "loss": 1.2379, + "step": 2310 + }, + { + "epoch": 1.5163398692810457, + "grad_norm": 0.5624083876609802, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 2320 + }, + { + "epoch": 1.522875816993464, + "grad_norm": 0.7195250391960144, + "learning_rate": 0.0002, + "loss": 1.4779, + "step": 2330 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 0.6010490655899048, + "learning_rate": 0.0002, + "loss": 1.2938, + "step": 2340 + }, + { + "epoch": 1.5359477124183005, + "grad_norm": 0.664929211139679, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 2350 + }, + { + "epoch": 1.5424836601307188, + "grad_norm": 0.5158776640892029, + "learning_rate": 0.0002, + "loss": 1.4362, + "step": 2360 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 0.5147154927253723, + "learning_rate": 0.0002, + "loss": 1.2157, + "step": 2370 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.6507977843284607, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 2380 + }, + { + "epoch": 1.5620915032679739, + "grad_norm": 0.5193192362785339, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 2390 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.5982314944267273, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 2400 + }, + { + "epoch": 1.5751633986928104, + "grad_norm": 0.49106258153915405, + "learning_rate": 0.0002, + "loss": 1.3585, + "step": 2410 + }, + { + "epoch": 1.581699346405229, + "grad_norm": 0.6459611654281616, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 2420 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 0.7038363218307495, + "learning_rate": 0.0002, + "loss": 1.3305, + "step": 2430 + }, + { + "epoch": 1.5947712418300655, + "grad_norm": 0.5245680212974548, + "learning_rate": 0.0002, + "loss": 1.3198, + "step": 2440 + }, + { + "epoch": 1.6013071895424837, + "grad_norm": 0.6562076210975647, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 2450 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 0.6491968035697937, + "learning_rate": 0.0002, + "loss": 1.5635, + "step": 2460 + }, + { + "epoch": 1.6143790849673203, + "grad_norm": 0.604034960269928, + "learning_rate": 0.0002, + "loss": 1.3657, + "step": 2470 + }, + { + "epoch": 1.6209150326797386, + "grad_norm": 0.5759671330451965, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 2480 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 0.6157698631286621, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2490 + }, + { + "epoch": 1.6339869281045751, + "grad_norm": 0.6513794660568237, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 2500 + }, + { + "epoch": 1.6405228758169934, + "grad_norm": 0.71990966796875, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 2510 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 0.7316617369651794, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2520 + }, + { + "epoch": 1.65359477124183, + "grad_norm": 0.5475177764892578, + "learning_rate": 0.0002, + "loss": 1.3119, + "step": 2530 + }, + { + "epoch": 1.6601307189542482, + "grad_norm": 0.4911293089389801, + "learning_rate": 0.0002, + "loss": 1.2998, + "step": 2540 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.6122882962226868, + "learning_rate": 0.0002, + "loss": 1.4198, + "step": 2550 + }, + { + "epoch": 1.673202614379085, + "grad_norm": 0.5735281705856323, + "learning_rate": 0.0002, + "loss": 1.3099, + "step": 2560 + }, + { + "epoch": 1.6797385620915033, + "grad_norm": 0.5046352744102478, + "learning_rate": 0.0002, + "loss": 1.2205, + "step": 2570 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 0.6043242812156677, + "learning_rate": 0.0002, + "loss": 1.3191, + "step": 2580 + }, + { + "epoch": 1.6928104575163399, + "grad_norm": 0.5397698283195496, + "learning_rate": 0.0002, + "loss": 1.3079, + "step": 2590 + }, + { + "epoch": 1.6993464052287581, + "grad_norm": 0.8066475987434387, + "learning_rate": 0.0002, + "loss": 1.4916, + "step": 2600 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 0.52901691198349, + "learning_rate": 0.0002, + "loss": 1.3703, + "step": 2610 + }, + { + "epoch": 1.712418300653595, + "grad_norm": 0.7588503956794739, + "learning_rate": 0.0002, + "loss": 1.409, + "step": 2620 + }, + { + "epoch": 1.7189542483660132, + "grad_norm": 0.6012966632843018, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 2630 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 0.5927302837371826, + "learning_rate": 0.0002, + "loss": 1.2583, + "step": 2640 + }, + { + "epoch": 1.7320261437908497, + "grad_norm": 0.5086990594863892, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 2650 + }, + { + "epoch": 1.738562091503268, + "grad_norm": 0.6000628471374512, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2660 + }, + { + "epoch": 1.7450980392156863, + "grad_norm": 0.6560431718826294, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 2670 + }, + { + "epoch": 1.7516339869281046, + "grad_norm": 0.5738165378570557, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 2680 + }, + { + "epoch": 1.7581699346405228, + "grad_norm": 0.5576106905937195, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 2690 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.7298802137374878, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 2700 + }, + { + "epoch": 1.7712418300653594, + "grad_norm": 0.5751826167106628, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 2710 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.6069957613945007, + "learning_rate": 0.0002, + "loss": 1.35, + "step": 2720 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 0.7513017654418945, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 2730 + }, + { + "epoch": 1.7908496732026142, + "grad_norm": 0.6058869957923889, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 2740 + }, + { + "epoch": 1.7973856209150327, + "grad_norm": 0.6805883049964905, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 2750 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 0.6864324808120728, + "learning_rate": 0.0002, + "loss": 1.4062, + "step": 2760 + }, + { + "epoch": 1.8104575163398693, + "grad_norm": 0.6261002421379089, + "learning_rate": 0.0002, + "loss": 1.355, + "step": 2770 + }, + { + "epoch": 1.8169934640522876, + "grad_norm": 0.532684862613678, + "learning_rate": 0.0002, + "loss": 1.5145, + "step": 2780 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 0.6209020018577576, + "learning_rate": 0.0002, + "loss": 1.3248, + "step": 2790 + }, + { + "epoch": 1.8300653594771243, + "grad_norm": 0.67111736536026, + "learning_rate": 0.0002, + "loss": 1.3908, + "step": 2800 + }, + { + "epoch": 1.8366013071895426, + "grad_norm": 0.700467586517334, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 2810 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 0.6968029141426086, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 2820 + }, + { + "epoch": 1.8496732026143792, + "grad_norm": 0.6405863761901855, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 2830 + }, + { + "epoch": 1.8562091503267975, + "grad_norm": 0.5192584991455078, + "learning_rate": 0.0002, + "loss": 1.4035, + "step": 2840 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 0.4888569414615631, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 2850 + }, + { + "epoch": 1.869281045751634, + "grad_norm": 0.7625455856323242, + "learning_rate": 0.0002, + "loss": 1.4324, + "step": 2860 + }, + { + "epoch": 1.8758169934640523, + "grad_norm": 0.9162808656692505, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2870 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.5472783446311951, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2880 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.5221137404441833, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 2890 + }, + { + "epoch": 1.8954248366013071, + "grad_norm": 0.49258849024772644, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 2900 + }, + { + "epoch": 1.9019607843137254, + "grad_norm": 0.5260750651359558, + "learning_rate": 0.0002, + "loss": 1.3503, + "step": 2910 + }, + { + "epoch": 1.9084967320261437, + "grad_norm": 0.6583314538002014, + "learning_rate": 0.0002, + "loss": 1.3381, + "step": 2920 + }, + { + "epoch": 1.915032679738562, + "grad_norm": 0.5728915929794312, + "learning_rate": 0.0002, + "loss": 1.356, + "step": 2930 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 0.7661453485488892, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2940 + }, + { + "epoch": 1.9281045751633987, + "grad_norm": 0.7193911075592041, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2950 + }, + { + "epoch": 1.934640522875817, + "grad_norm": 0.5007768869400024, + "learning_rate": 0.0002, + "loss": 1.287, + "step": 2960 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.626681923866272, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2970 + }, + { + "epoch": 1.9477124183006536, + "grad_norm": 0.8692840933799744, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 2980 + }, + { + "epoch": 1.954248366013072, + "grad_norm": 0.6388291120529175, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 2990 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.7710477113723755, + "learning_rate": 0.0002, + "loss": 1.4593, + "step": 3000 + }, + { + "epoch": 1.9673202614379086, + "grad_norm": 0.641704261302948, + "learning_rate": 0.0002, + "loss": 1.5228, + "step": 3010 + }, + { + "epoch": 1.973856209150327, + "grad_norm": 0.621148943901062, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 3020 + }, + { + "epoch": 1.9803921568627452, + "grad_norm": 0.5119547247886658, + "learning_rate": 0.0002, + "loss": 1.3017, + "step": 3030 + }, + { + "epoch": 1.9869281045751634, + "grad_norm": 0.8104137778282166, + "learning_rate": 0.0002, + "loss": 1.4923, + "step": 3040 + }, + { + "epoch": 1.9934640522875817, + "grad_norm": 0.5856240391731262, + "learning_rate": 0.0002, + "loss": 1.3331, + "step": 3050 + }, + { + "epoch": 2.0, + "grad_norm": 0.5263566374778748, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3060 + }, + { + "epoch": 2.0, + "eval_loss": 1.4276371002197266, + "eval_runtime": 30.5759, + "eval_samples_per_second": 14.26, + "eval_steps_per_second": 1.799, + "step": 3060 + }, + { + "epoch": 2.0065359477124183, + "grad_norm": 0.5143898725509644, + "learning_rate": 0.0002, + "loss": 1.1636, + "step": 3070 + }, + { + "epoch": 2.0130718954248366, + "grad_norm": 0.5749367475509644, + "learning_rate": 0.0002, + "loss": 1.3335, + "step": 3080 + }, + { + "epoch": 2.019607843137255, + "grad_norm": 0.5784284472465515, + "learning_rate": 0.0002, + "loss": 1.2784, + "step": 3090 + }, + { + "epoch": 2.026143790849673, + "grad_norm": 0.5933429598808289, + "learning_rate": 0.0002, + "loss": 1.2463, + "step": 3100 + }, + { + "epoch": 2.0326797385620914, + "grad_norm": 0.6748974919319153, + "learning_rate": 0.0002, + "loss": 1.2984, + "step": 3110 + }, + { + "epoch": 2.0392156862745097, + "grad_norm": 0.626399576663971, + "learning_rate": 0.0002, + "loss": 1.2307, + "step": 3120 + }, + { + "epoch": 2.045751633986928, + "grad_norm": 0.6173238754272461, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 3130 + }, + { + "epoch": 2.052287581699346, + "grad_norm": 0.807790219783783, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3140 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 0.6222215890884399, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 3150 + }, + { + "epoch": 2.065359477124183, + "grad_norm": 0.5859580636024475, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 3160 + }, + { + "epoch": 2.0718954248366015, + "grad_norm": 0.581304132938385, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 3170 + }, + { + "epoch": 2.0784313725490198, + "grad_norm": 0.9814971089363098, + "learning_rate": 0.0002, + "loss": 1.1212, + "step": 3180 + }, + { + "epoch": 2.084967320261438, + "grad_norm": 0.6491848230361938, + "learning_rate": 0.0002, + "loss": 1.1962, + "step": 3190 + }, + { + "epoch": 2.0915032679738563, + "grad_norm": 0.613680362701416, + "learning_rate": 0.0002, + "loss": 1.3711, + "step": 3200 + }, + { + "epoch": 2.0980392156862746, + "grad_norm": 0.7318086624145508, + "learning_rate": 0.0002, + "loss": 1.2994, + "step": 3210 + }, + { + "epoch": 2.104575163398693, + "grad_norm": 0.6025661826133728, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 3220 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.6744484305381775, + "learning_rate": 0.0002, + "loss": 1.1374, + "step": 3230 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 0.6062554121017456, + "learning_rate": 0.0002, + "loss": 1.3273, + "step": 3240 + }, + { + "epoch": 2.1241830065359477, + "grad_norm": 0.6801803112030029, + "learning_rate": 0.0002, + "loss": 1.3404, + "step": 3250 + }, + { + "epoch": 2.130718954248366, + "grad_norm": 0.5218925476074219, + "learning_rate": 0.0002, + "loss": 1.4084, + "step": 3260 + }, + { + "epoch": 2.1372549019607843, + "grad_norm": 0.7494263648986816, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 3270 + }, + { + "epoch": 2.1437908496732025, + "grad_norm": 0.7858565449714661, + "learning_rate": 0.0002, + "loss": 1.3059, + "step": 3280 + }, + { + "epoch": 2.150326797385621, + "grad_norm": 0.6836692690849304, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3290 + }, + { + "epoch": 2.156862745098039, + "grad_norm": 0.619848370552063, + "learning_rate": 0.0002, + "loss": 1.1605, + "step": 3300 + }, + { + "epoch": 2.1633986928104574, + "grad_norm": 0.5761294364929199, + "learning_rate": 0.0002, + "loss": 1.3095, + "step": 3310 + }, + { + "epoch": 2.1699346405228757, + "grad_norm": 0.4713786542415619, + "learning_rate": 0.0002, + "loss": 1.2883, + "step": 3320 + }, + { + "epoch": 2.176470588235294, + "grad_norm": 0.7613773345947266, + "learning_rate": 0.0002, + "loss": 1.3817, + "step": 3330 + }, + { + "epoch": 2.183006535947712, + "grad_norm": 0.6642718315124512, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 3340 + }, + { + "epoch": 2.189542483660131, + "grad_norm": 0.7162188291549683, + "learning_rate": 0.0002, + "loss": 1.2048, + "step": 3350 + }, + { + "epoch": 2.196078431372549, + "grad_norm": 0.6916783452033997, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 3360 + }, + { + "epoch": 2.2026143790849675, + "grad_norm": 0.7205567955970764, + "learning_rate": 0.0002, + "loss": 1.3788, + "step": 3370 + }, + { + "epoch": 2.2091503267973858, + "grad_norm": 0.6038199067115784, + "learning_rate": 0.0002, + "loss": 1.2528, + "step": 3380 + }, + { + "epoch": 2.215686274509804, + "grad_norm": 0.6284233927726746, + "learning_rate": 0.0002, + "loss": 1.2079, + "step": 3390 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.7450672388076782, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 3400 + }, + { + "epoch": 2.2287581699346406, + "grad_norm": 0.7755052447319031, + "learning_rate": 0.0002, + "loss": 1.3034, + "step": 3410 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 0.9066099524497986, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 3420 + }, + { + "epoch": 2.241830065359477, + "grad_norm": 0.8578207492828369, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 3430 + }, + { + "epoch": 2.2483660130718954, + "grad_norm": 0.5900213718414307, + "learning_rate": 0.0002, + "loss": 1.3278, + "step": 3440 + }, + { + "epoch": 2.2549019607843137, + "grad_norm": 0.7821717262268066, + "learning_rate": 0.0002, + "loss": 1.3645, + "step": 3450 + }, + { + "epoch": 2.261437908496732, + "grad_norm": 0.6263150572776794, + "learning_rate": 0.0002, + "loss": 1.183, + "step": 3460 + }, + { + "epoch": 2.2679738562091503, + "grad_norm": 0.591799259185791, + "learning_rate": 0.0002, + "loss": 1.178, + "step": 3470 + }, + { + "epoch": 2.2745098039215685, + "grad_norm": 0.5999799966812134, + "learning_rate": 0.0002, + "loss": 1.2198, + "step": 3480 + }, + { + "epoch": 2.281045751633987, + "grad_norm": 0.6227319240570068, + "learning_rate": 0.0002, + "loss": 1.2724, + "step": 3490 + }, + { + "epoch": 2.287581699346405, + "grad_norm": 0.719412624835968, + "learning_rate": 0.0002, + "loss": 1.3865, + "step": 3500 + }, + { + "epoch": 2.2941176470588234, + "grad_norm": 1.0361769199371338, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 3510 + }, + { + "epoch": 2.3006535947712417, + "grad_norm": 0.5506668090820312, + "learning_rate": 0.0002, + "loss": 1.4834, + "step": 3520 + }, + { + "epoch": 2.30718954248366, + "grad_norm": 0.6886829733848572, + "learning_rate": 0.0002, + "loss": 1.2273, + "step": 3530 + }, + { + "epoch": 2.313725490196078, + "grad_norm": 0.6226346492767334, + "learning_rate": 0.0002, + "loss": 1.2296, + "step": 3540 + }, + { + "epoch": 2.3202614379084965, + "grad_norm": 0.8109908103942871, + "learning_rate": 0.0002, + "loss": 1.3087, + "step": 3550 + }, + { + "epoch": 2.326797385620915, + "grad_norm": 0.8505511283874512, + "learning_rate": 0.0002, + "loss": 1.3311, + "step": 3560 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.5763760209083557, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 3570 + }, + { + "epoch": 2.3398692810457518, + "grad_norm": 0.6460059881210327, + "learning_rate": 0.0002, + "loss": 1.4135, + "step": 3580 + }, + { + "epoch": 2.34640522875817, + "grad_norm": 0.7175343036651611, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 3590 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.6012630462646484, + "learning_rate": 0.0002, + "loss": 1.2645, + "step": 3600 + }, + { + "epoch": 2.3594771241830066, + "grad_norm": 0.6513685584068298, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3610 + }, + { + "epoch": 2.366013071895425, + "grad_norm": 0.7465183734893799, + "learning_rate": 0.0002, + "loss": 1.3271, + "step": 3620 + }, + { + "epoch": 2.372549019607843, + "grad_norm": 0.6413124203681946, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3630 + }, + { + "epoch": 2.3790849673202614, + "grad_norm": 0.7209562063217163, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 3640 + }, + { + "epoch": 2.3856209150326797, + "grad_norm": 0.6427558660507202, + "learning_rate": 0.0002, + "loss": 1.1616, + "step": 3650 + }, + { + "epoch": 2.392156862745098, + "grad_norm": 0.593958854675293, + "learning_rate": 0.0002, + "loss": 1.313, + "step": 3660 + }, + { + "epoch": 2.3986928104575163, + "grad_norm": 0.5944608449935913, + "learning_rate": 0.0002, + "loss": 1.2802, + "step": 3670 + }, + { + "epoch": 2.4052287581699345, + "grad_norm": 0.6606248617172241, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3680 + }, + { + "epoch": 2.411764705882353, + "grad_norm": 0.5632851719856262, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 3690 + }, + { + "epoch": 2.418300653594771, + "grad_norm": 0.4976513385772705, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3700 + }, + { + "epoch": 2.4248366013071894, + "grad_norm": 0.6318528056144714, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 3710 + }, + { + "epoch": 2.431372549019608, + "grad_norm": 0.6306707859039307, + "learning_rate": 0.0002, + "loss": 1.1705, + "step": 3720 + }, + { + "epoch": 2.4379084967320264, + "grad_norm": 0.6362553238868713, + "learning_rate": 0.0002, + "loss": 1.3524, + "step": 3730 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.634368896484375, + "learning_rate": 0.0002, + "loss": 1.2345, + "step": 3740 + }, + { + "epoch": 2.450980392156863, + "grad_norm": 0.6623591184616089, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 3750 + }, + { + "epoch": 2.457516339869281, + "grad_norm": 0.6150440573692322, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 3760 + }, + { + "epoch": 2.4640522875816995, + "grad_norm": 0.588935911655426, + "learning_rate": 0.0002, + "loss": 1.2666, + "step": 3770 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 0.7388206124305725, + "learning_rate": 0.0002, + "loss": 1.3918, + "step": 3780 + }, + { + "epoch": 2.477124183006536, + "grad_norm": 0.621825098991394, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 3790 + }, + { + "epoch": 2.4836601307189543, + "grad_norm": 0.7691677212715149, + "learning_rate": 0.0002, + "loss": 1.359, + "step": 3800 + }, + { + "epoch": 2.4901960784313726, + "grad_norm": 1.1661969423294067, + "learning_rate": 0.0002, + "loss": 1.3399, + "step": 3810 + }, + { + "epoch": 2.496732026143791, + "grad_norm": 0.6837884187698364, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 3820 + }, + { + "epoch": 2.503267973856209, + "grad_norm": 0.6978904008865356, + "learning_rate": 0.0002, + "loss": 1.2823, + "step": 3830 + }, + { + "epoch": 2.5098039215686274, + "grad_norm": 0.6121411323547363, + "learning_rate": 0.0002, + "loss": 1.3688, + "step": 3840 + }, + { + "epoch": 2.5163398692810457, + "grad_norm": 0.7813326120376587, + "learning_rate": 0.0002, + "loss": 1.2587, + "step": 3850 + }, + { + "epoch": 2.522875816993464, + "grad_norm": 0.5390260219573975, + "learning_rate": 0.0002, + "loss": 1.1543, + "step": 3860 + }, + { + "epoch": 2.5294117647058822, + "grad_norm": 0.8283252716064453, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3870 + }, + { + "epoch": 2.5359477124183005, + "grad_norm": 0.8527186512947083, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 3880 + }, + { + "epoch": 2.542483660130719, + "grad_norm": 0.8405382633209229, + "learning_rate": 0.0002, + "loss": 1.3469, + "step": 3890 + }, + { + "epoch": 2.549019607843137, + "grad_norm": 0.5650738477706909, + "learning_rate": 0.0002, + "loss": 1.1801, + "step": 3900 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.620121955871582, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 3910 + }, + { + "epoch": 2.5620915032679736, + "grad_norm": 0.5983527898788452, + "learning_rate": 0.0002, + "loss": 1.2524, + "step": 3920 + }, + { + "epoch": 2.568627450980392, + "grad_norm": 0.686623215675354, + "learning_rate": 0.0002, + "loss": 1.4408, + "step": 3930 + }, + { + "epoch": 2.57516339869281, + "grad_norm": 0.6805831789970398, + "learning_rate": 0.0002, + "loss": 1.186, + "step": 3940 + }, + { + "epoch": 2.581699346405229, + "grad_norm": 0.6994825601577759, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 3950 + }, + { + "epoch": 2.588235294117647, + "grad_norm": 0.728549599647522, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 3960 + }, + { + "epoch": 2.5947712418300655, + "grad_norm": 0.775236964225769, + "learning_rate": 0.0002, + "loss": 1.4039, + "step": 3970 + }, + { + "epoch": 2.6013071895424837, + "grad_norm": 0.5057447552680969, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 3980 + }, + { + "epoch": 2.607843137254902, + "grad_norm": 0.6564450263977051, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 3990 + }, + { + "epoch": 2.6143790849673203, + "grad_norm": 0.5342249870300293, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 4000 + }, + { + "epoch": 2.6209150326797386, + "grad_norm": 0.5508961081504822, + "learning_rate": 0.0002, + "loss": 1.3102, + "step": 4010 + }, + { + "epoch": 2.627450980392157, + "grad_norm": 0.5716235637664795, + "learning_rate": 0.0002, + "loss": 1.3636, + "step": 4020 + }, + { + "epoch": 2.633986928104575, + "grad_norm": 0.8049232363700867, + "learning_rate": 0.0002, + "loss": 1.3465, + "step": 4030 + }, + { + "epoch": 2.6405228758169934, + "grad_norm": 0.5574354529380798, + "learning_rate": 0.0002, + "loss": 1.2342, + "step": 4040 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 0.6302093863487244, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 4050 + }, + { + "epoch": 2.65359477124183, + "grad_norm": 1.1868736743927002, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 4060 + }, + { + "epoch": 2.6601307189542482, + "grad_norm": 0.6738120317459106, + "learning_rate": 0.0002, + "loss": 1.1382, + "step": 4070 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.6614423990249634, + "learning_rate": 0.0002, + "loss": 1.2456, + "step": 4080 + }, + { + "epoch": 2.6732026143790852, + "grad_norm": 0.7297604084014893, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 4090 + }, + { + "epoch": 2.6797385620915035, + "grad_norm": 0.9421682357788086, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 4100 + }, + { + "epoch": 2.686274509803922, + "grad_norm": 0.5286222696304321, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 4110 + }, + { + "epoch": 2.69281045751634, + "grad_norm": 0.6849271655082703, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 4120 + }, + { + "epoch": 2.6993464052287583, + "grad_norm": 0.6811320185661316, + "learning_rate": 0.0002, + "loss": 1.2721, + "step": 4130 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 0.4968419373035431, + "learning_rate": 0.0002, + "loss": 1.2897, + "step": 4140 + }, + { + "epoch": 2.712418300653595, + "grad_norm": 0.8074267506599426, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 4150 + }, + { + "epoch": 2.718954248366013, + "grad_norm": 0.6756376028060913, + "learning_rate": 0.0002, + "loss": 1.1759, + "step": 4160 + }, + { + "epoch": 2.7254901960784315, + "grad_norm": 0.6921583414077759, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 4170 + }, + { + "epoch": 2.7320261437908497, + "grad_norm": 0.7049834132194519, + "learning_rate": 0.0002, + "loss": 1.3413, + "step": 4180 + }, + { + "epoch": 2.738562091503268, + "grad_norm": 0.7011390328407288, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 4190 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.6977843642234802, + "learning_rate": 0.0002, + "loss": 1.2364, + "step": 4200 + }, + { + "epoch": 2.7516339869281046, + "grad_norm": 0.6717000603675842, + "learning_rate": 0.0002, + "loss": 1.2533, + "step": 4210 + }, + { + "epoch": 2.758169934640523, + "grad_norm": 1.0223724842071533, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 4220 + }, + { + "epoch": 2.764705882352941, + "grad_norm": 0.6573330760002136, + "learning_rate": 0.0002, + "loss": 1.2451, + "step": 4230 + }, + { + "epoch": 2.7712418300653594, + "grad_norm": 0.6684938073158264, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 4240 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.7426793575286865, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 4250 + }, + { + "epoch": 2.784313725490196, + "grad_norm": 0.557826578617096, + "learning_rate": 0.0002, + "loss": 1.2904, + "step": 4260 + }, + { + "epoch": 2.7908496732026142, + "grad_norm": 0.6669870018959045, + "learning_rate": 0.0002, + "loss": 1.3262, + "step": 4270 + }, + { + "epoch": 2.7973856209150325, + "grad_norm": 0.5349969267845154, + "learning_rate": 0.0002, + "loss": 1.2369, + "step": 4280 + }, + { + "epoch": 2.803921568627451, + "grad_norm": 0.7262802124023438, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 4290 + }, + { + "epoch": 2.810457516339869, + "grad_norm": 0.768211841583252, + "learning_rate": 0.0002, + "loss": 1.3373, + "step": 4300 + }, + { + "epoch": 2.8169934640522873, + "grad_norm": 0.5958252549171448, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 4310 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.8451310396194458, + "learning_rate": 0.0002, + "loss": 1.4113, + "step": 4320 + }, + { + "epoch": 2.8300653594771243, + "grad_norm": 0.6544435024261475, + "learning_rate": 0.0002, + "loss": 1.2454, + "step": 4330 + }, + { + "epoch": 2.8366013071895426, + "grad_norm": 0.6177433133125305, + "learning_rate": 0.0002, + "loss": 1.2777, + "step": 4340 + }, + { + "epoch": 2.843137254901961, + "grad_norm": 0.6324988007545471, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 4350 + }, + { + "epoch": 2.849673202614379, + "grad_norm": 0.6884300708770752, + "learning_rate": 0.0002, + "loss": 1.4117, + "step": 4360 + }, + { + "epoch": 2.8562091503267975, + "grad_norm": 0.8952897191047668, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 4370 + }, + { + "epoch": 2.8627450980392157, + "grad_norm": 1.0260103940963745, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 4380 + }, + { + "epoch": 2.869281045751634, + "grad_norm": 0.9134647250175476, + "learning_rate": 0.0002, + "loss": 1.2893, + "step": 4390 + }, + { + "epoch": 2.8758169934640523, + "grad_norm": 0.5637717843055725, + "learning_rate": 0.0002, + "loss": 1.171, + "step": 4400 + }, + { + "epoch": 2.8823529411764706, + "grad_norm": 0.7530393004417419, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 4410 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.7202680706977844, + "learning_rate": 0.0002, + "loss": 1.29, + "step": 4420 + }, + { + "epoch": 2.895424836601307, + "grad_norm": 0.7177144885063171, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 4430 + }, + { + "epoch": 2.9019607843137254, + "grad_norm": 0.5996816754341125, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 4440 + }, + { + "epoch": 2.9084967320261437, + "grad_norm": 0.6542447209358215, + "learning_rate": 0.0002, + "loss": 1.4816, + "step": 4450 + }, + { + "epoch": 2.915032679738562, + "grad_norm": 1.0753740072250366, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 4460 + }, + { + "epoch": 2.9215686274509802, + "grad_norm": 0.6956136226654053, + "learning_rate": 0.0002, + "loss": 1.3193, + "step": 4470 + }, + { + "epoch": 2.928104575163399, + "grad_norm": 0.7702530026435852, + "learning_rate": 0.0002, + "loss": 1.2486, + "step": 4480 + }, + { + "epoch": 2.9346405228758172, + "grad_norm": 0.7763232588768005, + "learning_rate": 0.0002, + "loss": 1.3371, + "step": 4490 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 0.6393085718154907, + "learning_rate": 0.0002, + "loss": 1.1647, + "step": 4500 + }, + { + "epoch": 2.947712418300654, + "grad_norm": 0.987770676612854, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 4510 + }, + { + "epoch": 2.954248366013072, + "grad_norm": 0.5995016098022461, + "learning_rate": 0.0002, + "loss": 1.1529, + "step": 4520 + }, + { + "epoch": 2.9607843137254903, + "grad_norm": 0.745650053024292, + "learning_rate": 0.0002, + "loss": 1.2358, + "step": 4530 + }, + { + "epoch": 2.9673202614379086, + "grad_norm": 0.7429282069206238, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 4540 + }, + { + "epoch": 2.973856209150327, + "grad_norm": 0.5927486419677734, + "learning_rate": 0.0002, + "loss": 1.2262, + "step": 4550 + }, + { + "epoch": 2.980392156862745, + "grad_norm": 0.6775153875350952, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 4560 + }, + { + "epoch": 2.9869281045751634, + "grad_norm": 0.7128435373306274, + "learning_rate": 0.0002, + "loss": 1.279, + "step": 4570 + }, + { + "epoch": 2.9934640522875817, + "grad_norm": 0.7470937967300415, + "learning_rate": 0.0002, + "loss": 1.2451, + "step": 4580 + }, + { + "epoch": 3.0, + "grad_norm": 0.9295375943183899, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 4590 + }, + { + "epoch": 3.0, + "eval_loss": 1.4131312370300293, + "eval_runtime": 31.8967, + "eval_samples_per_second": 13.669, + "eval_steps_per_second": 1.724, + "step": 4590 + }, + { + "epoch": 3.0065359477124183, + "grad_norm": 0.6926420331001282, + "learning_rate": 0.0002, + "loss": 1.1283, + "step": 4600 + }, + { + "epoch": 3.0130718954248366, + "grad_norm": 0.6656355857849121, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 4610 + }, + { + "epoch": 3.019607843137255, + "grad_norm": 0.9901936650276184, + "learning_rate": 0.0002, + "loss": 1.308, + "step": 4620 + }, + { + "epoch": 3.026143790849673, + "grad_norm": 0.6713474988937378, + "learning_rate": 0.0002, + "loss": 1.22, + "step": 4630 + }, + { + "epoch": 3.0326797385620914, + "grad_norm": 0.6199324131011963, + "learning_rate": 0.0002, + "loss": 1.2249, + "step": 4640 + }, + { + "epoch": 3.0392156862745097, + "grad_norm": 0.7180785536766052, + "learning_rate": 0.0002, + "loss": 1.242, + "step": 4650 + }, + { + "epoch": 3.045751633986928, + "grad_norm": 0.8256588578224182, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 4660 + }, + { + "epoch": 3.052287581699346, + "grad_norm": 0.6637389063835144, + "learning_rate": 0.0002, + "loss": 1.1431, + "step": 4670 + }, + { + "epoch": 3.0588235294117645, + "grad_norm": 0.6980698108673096, + "learning_rate": 0.0002, + "loss": 1.1096, + "step": 4680 + }, + { + "epoch": 3.065359477124183, + "grad_norm": 0.8091534972190857, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 4690 + }, + { + "epoch": 3.0718954248366015, + "grad_norm": 0.5715174078941345, + "learning_rate": 0.0002, + "loss": 1.1652, + "step": 4700 + }, + { + "epoch": 3.0784313725490198, + "grad_norm": 0.735639750957489, + "learning_rate": 0.0002, + "loss": 1.1427, + "step": 4710 + }, + { + "epoch": 3.084967320261438, + "grad_norm": 0.7619708180427551, + "learning_rate": 0.0002, + "loss": 1.1522, + "step": 4720 + }, + { + "epoch": 3.0915032679738563, + "grad_norm": 1.263566017150879, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 4730 + }, + { + "epoch": 3.0980392156862746, + "grad_norm": 0.6600871682167053, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 4740 + }, + { + "epoch": 3.104575163398693, + "grad_norm": 0.717792809009552, + "learning_rate": 0.0002, + "loss": 1.1766, + "step": 4750 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.853714644908905, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 4760 + }, + { + "epoch": 3.1176470588235294, + "grad_norm": 1.1004153490066528, + "learning_rate": 0.0002, + "loss": 1.2031, + "step": 4770 + }, + { + "epoch": 3.1241830065359477, + "grad_norm": 0.8566235899925232, + "learning_rate": 0.0002, + "loss": 1.3295, + "step": 4780 + }, + { + "epoch": 3.130718954248366, + "grad_norm": 0.8315296173095703, + "learning_rate": 0.0002, + "loss": 1.2436, + "step": 4790 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.8020524978637695, + "learning_rate": 0.0002, + "loss": 1.32, + "step": 4800 + }, + { + "epoch": 3.1437908496732025, + "grad_norm": 0.7564275860786438, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 4810 + }, + { + "epoch": 3.150326797385621, + "grad_norm": 0.9077776670455933, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 4820 + }, + { + "epoch": 3.156862745098039, + "grad_norm": 0.6323099732398987, + "learning_rate": 0.0002, + "loss": 1.1399, + "step": 4830 + }, + { + "epoch": 3.1633986928104574, + "grad_norm": 0.6625368595123291, + "learning_rate": 0.0002, + "loss": 1.1983, + "step": 4840 + }, + { + "epoch": 3.1699346405228757, + "grad_norm": 0.8119261860847473, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 4850 + }, + { + "epoch": 3.176470588235294, + "grad_norm": 0.6399450898170471, + "learning_rate": 0.0002, + "loss": 1.0224, + "step": 4860 + }, + { + "epoch": 3.183006535947712, + "grad_norm": 1.0659016370773315, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 4870 + }, + { + "epoch": 3.189542483660131, + "grad_norm": 0.8040369749069214, + "learning_rate": 0.0002, + "loss": 1.2914, + "step": 4880 + }, + { + "epoch": 3.196078431372549, + "grad_norm": 0.7784733176231384, + "learning_rate": 0.0002, + "loss": 1.1996, + "step": 4890 + }, + { + "epoch": 3.2026143790849675, + "grad_norm": 0.9660294651985168, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 4900 + }, + { + "epoch": 3.2091503267973858, + "grad_norm": 1.0676977634429932, + "learning_rate": 0.0002, + "loss": 1.0419, + "step": 4910 + }, + { + "epoch": 3.215686274509804, + "grad_norm": 0.5877565741539001, + "learning_rate": 0.0002, + "loss": 1.0083, + "step": 4920 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.6164032816886902, + "learning_rate": 0.0002, + "loss": 1.1046, + "step": 4930 + }, + { + "epoch": 3.2287581699346406, + "grad_norm": 0.7627606987953186, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 4940 + }, + { + "epoch": 3.235294117647059, + "grad_norm": 0.7442803978919983, + "learning_rate": 0.0002, + "loss": 1.2453, + "step": 4950 + }, + { + "epoch": 3.241830065359477, + "grad_norm": 0.7277812361717224, + "learning_rate": 0.0002, + "loss": 1.1087, + "step": 4960 + }, + { + "epoch": 3.2483660130718954, + "grad_norm": 1.0301902294158936, + "learning_rate": 0.0002, + "loss": 1.2237, + "step": 4970 + }, + { + "epoch": 3.2549019607843137, + "grad_norm": 0.7798232436180115, + "learning_rate": 0.0002, + "loss": 1.1466, + "step": 4980 + }, + { + "epoch": 3.261437908496732, + "grad_norm": 1.210265874862671, + "learning_rate": 0.0002, + "loss": 1.2142, + "step": 4990 + }, + { + "epoch": 3.2679738562091503, + "grad_norm": 0.6677713990211487, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 5000 + }, + { + "epoch": 3.2745098039215685, + "grad_norm": 1.0524500608444214, + "learning_rate": 0.0002, + "loss": 1.3294, + "step": 5010 + }, + { + "epoch": 3.281045751633987, + "grad_norm": 0.7091745734214783, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 5020 + }, + { + "epoch": 3.287581699346405, + "grad_norm": 0.8523224592208862, + "learning_rate": 0.0002, + "loss": 1.1891, + "step": 5030 + }, + { + "epoch": 3.2941176470588234, + "grad_norm": 0.6120608448982239, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 5040 + }, + { + "epoch": 3.3006535947712417, + "grad_norm": 0.7437472939491272, + "learning_rate": 0.0002, + "loss": 1.0603, + "step": 5050 + }, + { + "epoch": 3.30718954248366, + "grad_norm": 0.7611715197563171, + "learning_rate": 0.0002, + "loss": 1.1295, + "step": 5060 + }, + { + "epoch": 3.313725490196078, + "grad_norm": 0.7249704003334045, + "learning_rate": 0.0002, + "loss": 1.0531, + "step": 5070 + }, + { + "epoch": 3.3202614379084965, + "grad_norm": 0.7316247820854187, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 5080 + }, + { + "epoch": 3.326797385620915, + "grad_norm": 0.562412440776825, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 5090 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.7052176594734192, + "learning_rate": 0.0002, + "loss": 1.0736, + "step": 5100 + }, + { + "epoch": 3.3398692810457518, + "grad_norm": 0.7714211344718933, + "learning_rate": 0.0002, + "loss": 1.122, + "step": 5110 + }, + { + "epoch": 3.34640522875817, + "grad_norm": 1.0436055660247803, + "learning_rate": 0.0002, + "loss": 1.1684, + "step": 5120 + }, + { + "epoch": 3.3529411764705883, + "grad_norm": 0.8867271542549133, + "learning_rate": 0.0002, + "loss": 1.0945, + "step": 5130 + }, + { + "epoch": 3.3594771241830066, + "grad_norm": 0.8371267914772034, + "learning_rate": 0.0002, + "loss": 1.159, + "step": 5140 + }, + { + "epoch": 3.366013071895425, + "grad_norm": 0.7257837057113647, + "learning_rate": 0.0002, + "loss": 1.1073, + "step": 5150 + }, + { + "epoch": 3.372549019607843, + "grad_norm": 0.7102002501487732, + "learning_rate": 0.0002, + "loss": 1.1162, + "step": 5160 + }, + { + "epoch": 3.3790849673202614, + "grad_norm": 0.7636350393295288, + "learning_rate": 0.0002, + "loss": 1.2056, + "step": 5170 + }, + { + "epoch": 3.3856209150326797, + "grad_norm": 0.6887359619140625, + "learning_rate": 0.0002, + "loss": 1.0708, + "step": 5180 + }, + { + "epoch": 3.392156862745098, + "grad_norm": 0.8141424655914307, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 5190 + }, + { + "epoch": 3.3986928104575163, + "grad_norm": 0.694423496723175, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 5200 + }, + { + "epoch": 3.4052287581699345, + "grad_norm": 0.914013683795929, + "learning_rate": 0.0002, + "loss": 1.2945, + "step": 5210 + }, + { + "epoch": 3.411764705882353, + "grad_norm": 0.8503239750862122, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 5220 + }, + { + "epoch": 3.418300653594771, + "grad_norm": 0.6196836233139038, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 5230 + }, + { + "epoch": 3.4248366013071894, + "grad_norm": 1.0760811567306519, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 5240 + }, + { + "epoch": 3.431372549019608, + "grad_norm": 0.6524698138237, + "learning_rate": 0.0002, + "loss": 1.1686, + "step": 5250 + }, + { + "epoch": 3.4379084967320264, + "grad_norm": 0.674467921257019, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 5260 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.7690372467041016, + "learning_rate": 0.0002, + "loss": 1.1015, + "step": 5270 + }, + { + "epoch": 3.450980392156863, + "grad_norm": 0.8751813769340515, + "learning_rate": 0.0002, + "loss": 1.2511, + "step": 5280 + }, + { + "epoch": 3.457516339869281, + "grad_norm": 0.750407874584198, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 5290 + }, + { + "epoch": 3.4640522875816995, + "grad_norm": 0.5991823077201843, + "learning_rate": 0.0002, + "loss": 1.0605, + "step": 5300 + }, + { + "epoch": 3.4705882352941178, + "grad_norm": 1.0164772272109985, + "learning_rate": 0.0002, + "loss": 1.2347, + "step": 5310 + }, + { + "epoch": 3.477124183006536, + "grad_norm": 0.8704105019569397, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 5320 + }, + { + "epoch": 3.4836601307189543, + "grad_norm": 0.709102213382721, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 5330 + }, + { + "epoch": 3.4901960784313726, + "grad_norm": 0.6273632049560547, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 5340 + }, + { + "epoch": 3.496732026143791, + "grad_norm": 0.6807359457015991, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 5350 + }, + { + "epoch": 3.503267973856209, + "grad_norm": 0.7085188627243042, + "learning_rate": 0.0002, + "loss": 1.131, + "step": 5360 + }, + { + "epoch": 3.5098039215686274, + "grad_norm": 0.6938307881355286, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 5370 + }, + { + "epoch": 3.5163398692810457, + "grad_norm": 0.8544146418571472, + "learning_rate": 0.0002, + "loss": 1.1397, + "step": 5380 + }, + { + "epoch": 3.522875816993464, + "grad_norm": 0.7889642119407654, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 5390 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.7858421206474304, + "learning_rate": 0.0002, + "loss": 1.1691, + "step": 5400 + }, + { + "epoch": 3.5359477124183005, + "grad_norm": 0.8547123074531555, + "learning_rate": 0.0002, + "loss": 1.2374, + "step": 5410 + }, + { + "epoch": 3.542483660130719, + "grad_norm": 0.8218181133270264, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 5420 + }, + { + "epoch": 3.549019607843137, + "grad_norm": 1.153623342514038, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 5430 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.1321099996566772, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 5440 + }, + { + "epoch": 3.5620915032679736, + "grad_norm": 0.9495334029197693, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 5450 + }, + { + "epoch": 3.568627450980392, + "grad_norm": 0.8743821978569031, + "learning_rate": 0.0002, + "loss": 1.2869, + "step": 5460 + }, + { + "epoch": 3.57516339869281, + "grad_norm": 0.7513086795806885, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 5470 + }, + { + "epoch": 3.581699346405229, + "grad_norm": 1.0139480829238892, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 5480 + }, + { + "epoch": 3.588235294117647, + "grad_norm": 0.6615135073661804, + "learning_rate": 0.0002, + "loss": 1.1706, + "step": 5490 + }, + { + "epoch": 3.5947712418300655, + "grad_norm": 1.180798888206482, + "learning_rate": 0.0002, + "loss": 1.3906, + "step": 5500 + }, + { + "epoch": 3.6013071895424837, + "grad_norm": 0.7085279226303101, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 5510 + }, + { + "epoch": 3.607843137254902, + "grad_norm": 0.540268063545227, + "learning_rate": 0.0002, + "loss": 1.1623, + "step": 5520 + }, + { + "epoch": 3.6143790849673203, + "grad_norm": 0.7905671000480652, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 5530 + }, + { + "epoch": 3.6209150326797386, + "grad_norm": 0.8457717299461365, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 5540 + }, + { + "epoch": 3.627450980392157, + "grad_norm": 0.7102677822113037, + "learning_rate": 0.0002, + "loss": 1.1799, + "step": 5550 + }, + { + "epoch": 3.633986928104575, + "grad_norm": 0.7179514765739441, + "learning_rate": 0.0002, + "loss": 1.2394, + "step": 5560 + }, + { + "epoch": 3.6405228758169934, + "grad_norm": 1.0854148864746094, + "learning_rate": 0.0002, + "loss": 1.2019, + "step": 5570 + }, + { + "epoch": 3.6470588235294117, + "grad_norm": 0.8209951519966125, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 5580 + }, + { + "epoch": 3.65359477124183, + "grad_norm": 0.6944138407707214, + "learning_rate": 0.0002, + "loss": 1.2289, + "step": 5590 + }, + { + "epoch": 3.6601307189542482, + "grad_norm": 0.7675473093986511, + "learning_rate": 0.0002, + "loss": 1.3226, + "step": 5600 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.6683364510536194, + "learning_rate": 0.0002, + "loss": 1.2866, + "step": 5610 + }, + { + "epoch": 3.6732026143790852, + "grad_norm": 0.7920727133750916, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 5620 + }, + { + "epoch": 3.6797385620915035, + "grad_norm": 0.9440218806266785, + "learning_rate": 0.0002, + "loss": 1.2287, + "step": 5630 + }, + { + "epoch": 3.686274509803922, + "grad_norm": 0.6600824594497681, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 5640 + }, + { + "epoch": 3.69281045751634, + "grad_norm": 0.6860619187355042, + "learning_rate": 0.0002, + "loss": 1.191, + "step": 5650 + }, + { + "epoch": 3.6993464052287583, + "grad_norm": 0.6579713225364685, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 5660 + }, + { + "epoch": 3.7058823529411766, + "grad_norm": 0.661081075668335, + "learning_rate": 0.0002, + "loss": 1.1464, + "step": 5670 + }, + { + "epoch": 3.712418300653595, + "grad_norm": 1.0968825817108154, + "learning_rate": 0.0002, + "loss": 1.289, + "step": 5680 + }, + { + "epoch": 3.718954248366013, + "grad_norm": 0.8066844940185547, + "learning_rate": 0.0002, + "loss": 1.192, + "step": 5690 + }, + { + "epoch": 3.7254901960784315, + "grad_norm": 0.8341682553291321, + "learning_rate": 0.0002, + "loss": 1.2322, + "step": 5700 + }, + { + "epoch": 3.7320261437908497, + "grad_norm": 0.6682852506637573, + "learning_rate": 0.0002, + "loss": 1.1473, + "step": 5710 + }, + { + "epoch": 3.738562091503268, + "grad_norm": 0.898595929145813, + "learning_rate": 0.0002, + "loss": 1.1566, + "step": 5720 + }, + { + "epoch": 3.7450980392156863, + "grad_norm": 0.6876054406166077, + "learning_rate": 0.0002, + "loss": 1.0919, + "step": 5730 + }, + { + "epoch": 3.7516339869281046, + "grad_norm": 0.7817103266716003, + "learning_rate": 0.0002, + "loss": 1.2302, + "step": 5740 + }, + { + "epoch": 3.758169934640523, + "grad_norm": 0.5840168595314026, + "learning_rate": 0.0002, + "loss": 1.2439, + "step": 5750 + }, + { + "epoch": 3.764705882352941, + "grad_norm": 0.6263918876647949, + "learning_rate": 0.0002, + "loss": 1.1279, + "step": 5760 + }, + { + "epoch": 3.7712418300653594, + "grad_norm": 0.7948952317237854, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 5770 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.6700998544692993, + "learning_rate": 0.0002, + "loss": 1.149, + "step": 5780 + }, + { + "epoch": 3.784313725490196, + "grad_norm": 1.1169519424438477, + "learning_rate": 0.0002, + "loss": 1.3207, + "step": 5790 + }, + { + "epoch": 3.7908496732026142, + "grad_norm": 0.8354471325874329, + "learning_rate": 0.0002, + "loss": 1.064, + "step": 5800 + }, + { + "epoch": 3.7973856209150325, + "grad_norm": 0.6304181814193726, + "learning_rate": 0.0002, + "loss": 1.2104, + "step": 5810 + }, + { + "epoch": 3.803921568627451, + "grad_norm": 0.6919655799865723, + "learning_rate": 0.0002, + "loss": 1.2059, + "step": 5820 + }, + { + "epoch": 3.810457516339869, + "grad_norm": 0.600385844707489, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 5830 + }, + { + "epoch": 3.8169934640522873, + "grad_norm": 0.8406319618225098, + "learning_rate": 0.0002, + "loss": 1.2324, + "step": 5840 + }, + { + "epoch": 3.8235294117647056, + "grad_norm": 0.7594282031059265, + "learning_rate": 0.0002, + "loss": 1.2418, + "step": 5850 + }, + { + "epoch": 3.8300653594771243, + "grad_norm": 0.8179879784584045, + "learning_rate": 0.0002, + "loss": 1.1903, + "step": 5860 + }, + { + "epoch": 3.8366013071895426, + "grad_norm": 1.141430377960205, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 5870 + }, + { + "epoch": 3.843137254901961, + "grad_norm": 0.6595550775527954, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 5880 + }, + { + "epoch": 3.849673202614379, + "grad_norm": 0.7499435544013977, + "learning_rate": 0.0002, + "loss": 1.2378, + "step": 5890 + }, + { + "epoch": 3.8562091503267975, + "grad_norm": 0.7851517200469971, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 5900 + }, + { + "epoch": 3.8627450980392157, + "grad_norm": 1.0533545017242432, + "learning_rate": 0.0002, + "loss": 1.162, + "step": 5910 + }, + { + "epoch": 3.869281045751634, + "grad_norm": 0.960086464881897, + "learning_rate": 0.0002, + "loss": 1.3576, + "step": 5920 + }, + { + "epoch": 3.8758169934640523, + "grad_norm": 0.9952049851417542, + "learning_rate": 0.0002, + "loss": 1.151, + "step": 5930 + }, + { + "epoch": 3.8823529411764706, + "grad_norm": 0.7884191274642944, + "learning_rate": 0.0002, + "loss": 1.2027, + "step": 5940 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.7461766600608826, + "learning_rate": 0.0002, + "loss": 1.1796, + "step": 5950 + }, + { + "epoch": 3.895424836601307, + "grad_norm": 0.9594355821609497, + "learning_rate": 0.0002, + "loss": 1.2251, + "step": 5960 + }, + { + "epoch": 3.9019607843137254, + "grad_norm": 0.8179471492767334, + "learning_rate": 0.0002, + "loss": 1.1164, + "step": 5970 + }, + { + "epoch": 3.9084967320261437, + "grad_norm": 0.8240267634391785, + "learning_rate": 0.0002, + "loss": 1.2421, + "step": 5980 + }, + { + "epoch": 3.915032679738562, + "grad_norm": 0.7462618350982666, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 5990 + }, + { + "epoch": 3.9215686274509802, + "grad_norm": 0.711207389831543, + "learning_rate": 0.0002, + "loss": 1.2124, + "step": 6000 + }, + { + "epoch": 3.928104575163399, + "grad_norm": 0.6910956501960754, + "learning_rate": 0.0002, + "loss": 1.2119, + "step": 6010 + }, + { + "epoch": 3.9346405228758172, + "grad_norm": 0.749093770980835, + "learning_rate": 0.0002, + "loss": 1.2127, + "step": 6020 + }, + { + "epoch": 3.9411764705882355, + "grad_norm": 1.3332762718200684, + "learning_rate": 0.0002, + "loss": 1.1542, + "step": 6030 + }, + { + "epoch": 3.947712418300654, + "grad_norm": 0.71457439661026, + "learning_rate": 0.0002, + "loss": 1.1442, + "step": 6040 + }, + { + "epoch": 3.954248366013072, + "grad_norm": 1.1205238103866577, + "learning_rate": 0.0002, + "loss": 1.339, + "step": 6050 + }, + { + "epoch": 3.9607843137254903, + "grad_norm": 0.6958928108215332, + "learning_rate": 0.0002, + "loss": 1.2962, + "step": 6060 + }, + { + "epoch": 3.9673202614379086, + "grad_norm": 0.7518056035041809, + "learning_rate": 0.0002, + "loss": 1.1802, + "step": 6070 + }, + { + "epoch": 3.973856209150327, + "grad_norm": 0.8010755777359009, + "learning_rate": 0.0002, + "loss": 1.1179, + "step": 6080 + }, + { + "epoch": 3.980392156862745, + "grad_norm": 0.7492658495903015, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 6090 + }, + { + "epoch": 3.9869281045751634, + "grad_norm": 0.900704562664032, + "learning_rate": 0.0002, + "loss": 1.2113, + "step": 6100 + }, + { + "epoch": 3.9934640522875817, + "grad_norm": 0.7997331619262695, + "learning_rate": 0.0002, + "loss": 1.1106, + "step": 6110 + }, + { + "epoch": 4.0, + "grad_norm": 0.7163209319114685, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 6120 + }, + { + "epoch": 4.0, + "eval_loss": 1.4113320112228394, + "eval_runtime": 33.7199, + "eval_samples_per_second": 12.93, + "eval_steps_per_second": 1.631, + "step": 6120 + } + ], + "logging_steps": 10, + "max_steps": 12240, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.479861757083648e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..35e722282419bcef977427e4d3675fe3b94ec688 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc25f531ee37172f22a819ab79094fe89aae41504e4c8b696743b5e23d9e7641 +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..42757da2e98e15d76082d6943c5f99b90bf573a0 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae26adafc586de04059c8bcf1b7ea29a4bfc43656d2df3ccbc5fb98d67c85bd4 +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f0ba88abb7145c0f96e3d330e000a035284e4eb --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3adddafbad48fa0f66d361b088729b9594d7d901b40b54f0c49c037c3f9be4c +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bfd6bb6adb15181e548ba66c119648c547dc307a --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaf688fb3b028b2cbf957877240fe2f9bbb9f7be9a1796382a4722d9fb5b539f +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5612980a5f6d95b62c57b4b2e536723f259a0828 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b28dc3be2984476e1b98fe5a611b98bf32443bfdeadbedb0d6d08ca284c9087 +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..47667316109e621fedc49ae6867683fcbf05d271 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/trainer_state.json @@ -0,0 +1,5428 @@ +{ + "best_metric": 1.4113320112228394, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120", + "epoch": 5.0, + "eval_steps": 10, + "global_step": 7650, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006535947712418301, + "grad_norm": 1.5105072259902954, + "learning_rate": 0.0002, + "loss": 4.7451, + "step": 10 + }, + { + "epoch": 0.013071895424836602, + "grad_norm": 2.1156165599823, + "learning_rate": 0.0002, + "loss": 3.3158, + "step": 20 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 1.0578808784484863, + "learning_rate": 0.0002, + "loss": 2.643, + "step": 30 + }, + { + "epoch": 0.026143790849673203, + "grad_norm": 2.725064516067505, + "learning_rate": 0.0002, + "loss": 2.3948, + "step": 40 + }, + { + "epoch": 0.032679738562091505, + "grad_norm": 2.9575750827789307, + "learning_rate": 0.0002, + "loss": 2.3134, + "step": 50 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 1.2158117294311523, + "learning_rate": 0.0002, + "loss": 2.2778, + "step": 60 + }, + { + "epoch": 0.0457516339869281, + "grad_norm": 1.0850954055786133, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05228758169934641, + "grad_norm": 1.299196720123291, + "learning_rate": 0.0002, + "loss": 1.8872, + "step": 80 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 0.8310191035270691, + "learning_rate": 0.0002, + "loss": 1.947, + "step": 90 + }, + { + "epoch": 0.06535947712418301, + "grad_norm": 0.9854435920715332, + "learning_rate": 0.0002, + "loss": 1.9098, + "step": 100 + }, + { + "epoch": 0.0718954248366013, + "grad_norm": 0.7951157689094543, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 110 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 0.7593062520027161, + "learning_rate": 0.0002, + "loss": 1.9035, + "step": 120 + }, + { + "epoch": 0.08496732026143791, + "grad_norm": 0.6783032417297363, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 130 + }, + { + "epoch": 0.0915032679738562, + "grad_norm": 0.8350756764411926, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 140 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 1.0203173160552979, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 150 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 0.8820539712905884, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 160 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.7286128997802734, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 170 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.7874041795730591, + "learning_rate": 0.0002, + "loss": 1.8841, + "step": 180 + }, + { + "epoch": 0.12418300653594772, + "grad_norm": 0.6630475521087646, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 190 + }, + { + "epoch": 0.13071895424836602, + "grad_norm": 0.686413586139679, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 200 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 0.7793629765510559, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 210 + }, + { + "epoch": 0.1437908496732026, + "grad_norm": 0.6893141865730286, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 220 + }, + { + "epoch": 0.1503267973856209, + "grad_norm": 0.5804724097251892, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 230 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 0.6053574085235596, + "learning_rate": 0.0002, + "loss": 1.5578, + "step": 240 + }, + { + "epoch": 0.16339869281045752, + "grad_norm": 0.7566025853157043, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 250 + }, + { + "epoch": 0.16993464052287582, + "grad_norm": 0.6112990975379944, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 260 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.6839066743850708, + "learning_rate": 0.0002, + "loss": 1.5564, + "step": 270 + }, + { + "epoch": 0.1830065359477124, + "grad_norm": 0.6368117928504944, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 280 + }, + { + "epoch": 0.1895424836601307, + "grad_norm": 0.6144475936889648, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 290 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 0.6743767261505127, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 300 + }, + { + "epoch": 0.20261437908496732, + "grad_norm": 0.6807955503463745, + "learning_rate": 0.0002, + "loss": 1.421, + "step": 310 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 0.6717963814735413, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 320 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 0.5917780995368958, + "learning_rate": 0.0002, + "loss": 1.6842, + "step": 330 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.6783658862113953, + "learning_rate": 0.0002, + "loss": 1.6264, + "step": 340 + }, + { + "epoch": 0.22875816993464052, + "grad_norm": 0.5820256471633911, + "learning_rate": 0.0002, + "loss": 1.4635, + "step": 350 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.5345938801765442, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 360 + }, + { + "epoch": 0.24183006535947713, + "grad_norm": 0.755929172039032, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 370 + }, + { + "epoch": 0.24836601307189543, + "grad_norm": 0.6183189749717712, + "learning_rate": 0.0002, + "loss": 1.5177, + "step": 380 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 0.7277782559394836, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 390 + }, + { + "epoch": 0.26143790849673204, + "grad_norm": 0.9998756051063538, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 400 + }, + { + "epoch": 0.2679738562091503, + "grad_norm": 0.7523853778839111, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 410 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 0.6548714637756348, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 420 + }, + { + "epoch": 0.28104575163398693, + "grad_norm": 0.6979796290397644, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 430 + }, + { + "epoch": 0.2875816993464052, + "grad_norm": 0.840915322303772, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 440 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.6142978072166443, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 450 + }, + { + "epoch": 0.3006535947712418, + "grad_norm": 0.9482691884040833, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 460 + }, + { + "epoch": 0.30718954248366015, + "grad_norm": 0.7001156806945801, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 470 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.6665455102920532, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 480 + }, + { + "epoch": 0.3202614379084967, + "grad_norm": 0.6012697815895081, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 490 + }, + { + "epoch": 0.32679738562091504, + "grad_norm": 0.8770062327384949, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 500 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.7029962539672852, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 510 + }, + { + "epoch": 0.33986928104575165, + "grad_norm": 0.6682832837104797, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 520 + }, + { + "epoch": 0.3464052287581699, + "grad_norm": 0.5548969507217407, + "learning_rate": 0.0002, + "loss": 1.4242, + "step": 530 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.6640702486038208, + "learning_rate": 0.0002, + "loss": 1.5081, + "step": 540 + }, + { + "epoch": 0.35947712418300654, + "grad_norm": 0.656292200088501, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 550 + }, + { + "epoch": 0.3660130718954248, + "grad_norm": 0.618910551071167, + "learning_rate": 0.0002, + "loss": 1.5415, + "step": 560 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 0.644859790802002, + "learning_rate": 0.0002, + "loss": 1.5178, + "step": 570 + }, + { + "epoch": 0.3790849673202614, + "grad_norm": 0.679042398929596, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 580 + }, + { + "epoch": 0.38562091503267976, + "grad_norm": 0.980681836605072, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 590 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.632219672203064, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 600 + }, + { + "epoch": 0.39869281045751637, + "grad_norm": 0.7003744840621948, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 610 + }, + { + "epoch": 0.40522875816993464, + "grad_norm": 0.7090577483177185, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 620 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 0.657819926738739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 630 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 0.7034208178520203, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 640 + }, + { + "epoch": 0.42483660130718953, + "grad_norm": 0.7274866104125977, + "learning_rate": 0.0002, + "loss": 1.5357, + "step": 650 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 0.5876233577728271, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 660 + }, + { + "epoch": 0.43790849673202614, + "grad_norm": 0.595494270324707, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 670 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.8253804445266724, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 680 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 0.652225911617279, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 690 + }, + { + "epoch": 0.45751633986928103, + "grad_norm": 0.6242014169692993, + "learning_rate": 0.0002, + "loss": 1.5419, + "step": 700 + }, + { + "epoch": 0.46405228758169936, + "grad_norm": 0.7283986210823059, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 710 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.7016081213951111, + "learning_rate": 0.0002, + "loss": 1.43, + "step": 720 + }, + { + "epoch": 0.477124183006536, + "grad_norm": 0.5211893916130066, + "learning_rate": 0.0002, + "loss": 1.4626, + "step": 730 + }, + { + "epoch": 0.48366013071895425, + "grad_norm": 0.6221150159835815, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 740 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 0.76594477891922, + "learning_rate": 0.0002, + "loss": 1.5677, + "step": 750 + }, + { + "epoch": 0.49673202614379086, + "grad_norm": 0.5777859091758728, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 760 + }, + { + "epoch": 0.5032679738562091, + "grad_norm": 0.5793519616127014, + "learning_rate": 0.0002, + "loss": 1.5253, + "step": 770 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 0.5425786375999451, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 780 + }, + { + "epoch": 0.5163398692810458, + "grad_norm": 0.6004197001457214, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 790 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 0.7167016863822937, + "learning_rate": 0.0002, + "loss": 1.5346, + "step": 800 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.710218071937561, + "learning_rate": 0.0002, + "loss": 1.48, + "step": 810 + }, + { + "epoch": 0.5359477124183006, + "grad_norm": 0.699528694152832, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 820 + }, + { + "epoch": 0.5424836601307189, + "grad_norm": 0.579629123210907, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 830 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 0.595407247543335, + "learning_rate": 0.0002, + "loss": 1.3894, + "step": 840 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.544563889503479, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 850 + }, + { + "epoch": 0.5620915032679739, + "grad_norm": 0.553166389465332, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 860 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 0.5645018815994263, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 870 + }, + { + "epoch": 0.5751633986928104, + "grad_norm": 0.6576932668685913, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 880 + }, + { + "epoch": 0.5816993464052288, + "grad_norm": 0.6684197187423706, + "learning_rate": 0.0002, + "loss": 1.5891, + "step": 890 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.6706975698471069, + "learning_rate": 0.0002, + "loss": 1.5348, + "step": 900 + }, + { + "epoch": 0.5947712418300654, + "grad_norm": 0.6762327551841736, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 910 + }, + { + "epoch": 0.6013071895424836, + "grad_norm": 0.764032244682312, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 920 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 0.6996400952339172, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 930 + }, + { + "epoch": 0.6143790849673203, + "grad_norm": 0.686735987663269, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 940 + }, + { + "epoch": 0.6209150326797386, + "grad_norm": 0.6086131930351257, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 950 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.5627856850624084, + "learning_rate": 0.0002, + "loss": 1.4457, + "step": 960 + }, + { + "epoch": 0.6339869281045751, + "grad_norm": 0.5781503319740295, + "learning_rate": 0.0002, + "loss": 1.506, + "step": 970 + }, + { + "epoch": 0.6405228758169934, + "grad_norm": 0.6347246766090393, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 980 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.6581300497055054, + "learning_rate": 0.0002, + "loss": 1.3819, + "step": 990 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 0.8343676924705505, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1000 + }, + { + "epoch": 0.6601307189542484, + "grad_norm": 0.5708910226821899, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 1010 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.6832585334777832, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 1020 + }, + { + "epoch": 0.673202614379085, + "grad_norm": 0.5767837166786194, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1030 + }, + { + "epoch": 0.6797385620915033, + "grad_norm": 0.5637745261192322, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 1040 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 0.8193050026893616, + "learning_rate": 0.0002, + "loss": 1.4325, + "step": 1050 + }, + { + "epoch": 0.6928104575163399, + "grad_norm": 0.6157439351081848, + "learning_rate": 0.0002, + "loss": 1.4196, + "step": 1060 + }, + { + "epoch": 0.6993464052287581, + "grad_norm": 0.7476664781570435, + "learning_rate": 0.0002, + "loss": 1.5547, + "step": 1070 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.8569361567497253, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 1080 + }, + { + "epoch": 0.7124183006535948, + "grad_norm": 0.5671911835670471, + "learning_rate": 0.0002, + "loss": 1.482, + "step": 1090 + }, + { + "epoch": 0.7189542483660131, + "grad_norm": 0.5151128768920898, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 1100 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 0.568037211894989, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 1110 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 0.6756396889686584, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 1120 + }, + { + "epoch": 0.738562091503268, + "grad_norm": 0.638975977897644, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 1130 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 0.7103341221809387, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 1140 + }, + { + "epoch": 0.7516339869281046, + "grad_norm": 0.7403952479362488, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 1150 + }, + { + "epoch": 0.7581699346405228, + "grad_norm": 0.6266511082649231, + "learning_rate": 0.0002, + "loss": 1.4757, + "step": 1160 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.5939070582389832, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1170 + }, + { + "epoch": 0.7712418300653595, + "grad_norm": 0.5735430717468262, + "learning_rate": 0.0002, + "loss": 1.4145, + "step": 1180 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.5155234932899475, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 1190 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.5115423202514648, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 1200 + }, + { + "epoch": 0.7908496732026143, + "grad_norm": 0.693588137626648, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1210 + }, + { + "epoch": 0.7973856209150327, + "grad_norm": 0.5504693984985352, + "learning_rate": 0.0002, + "loss": 1.308, + "step": 1220 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 0.5555992126464844, + "learning_rate": 0.0002, + "loss": 1.5412, + "step": 1230 + }, + { + "epoch": 0.8104575163398693, + "grad_norm": 0.7211785316467285, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1240 + }, + { + "epoch": 0.8169934640522876, + "grad_norm": 0.735003650188446, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 1250 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.5245152711868286, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 1260 + }, + { + "epoch": 0.8300653594771242, + "grad_norm": 0.5883445739746094, + "learning_rate": 0.0002, + "loss": 1.4505, + "step": 1270 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 0.6835859417915344, + "learning_rate": 0.0002, + "loss": 1.3642, + "step": 1280 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 0.6592142581939697, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 1290 + }, + { + "epoch": 0.8496732026143791, + "grad_norm": 0.6087474226951599, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 1300 + }, + { + "epoch": 0.8562091503267973, + "grad_norm": 0.565387487411499, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 1310 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 0.7363151907920837, + "learning_rate": 0.0002, + "loss": 1.4809, + "step": 1320 + }, + { + "epoch": 0.869281045751634, + "grad_norm": 0.5964524149894714, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 1330 + }, + { + "epoch": 0.8758169934640523, + "grad_norm": 0.5169979929924011, + "learning_rate": 0.0002, + "loss": 1.3284, + "step": 1340 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.7063422799110413, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 1350 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.7261926531791687, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 1360 + }, + { + "epoch": 0.8954248366013072, + "grad_norm": 0.6759744882583618, + "learning_rate": 0.0002, + "loss": 1.3619, + "step": 1370 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 0.675051212310791, + "learning_rate": 0.0002, + "loss": 1.4079, + "step": 1380 + }, + { + "epoch": 0.9084967320261438, + "grad_norm": 0.5613595843315125, + "learning_rate": 0.0002, + "loss": 1.6606, + "step": 1390 + }, + { + "epoch": 0.9150326797385621, + "grad_norm": 0.611732006072998, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 1400 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 0.6365187168121338, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 1410 + }, + { + "epoch": 0.9281045751633987, + "grad_norm": 0.7810426354408264, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1420 + }, + { + "epoch": 0.934640522875817, + "grad_norm": 0.593891441822052, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 1430 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.761585533618927, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1440 + }, + { + "epoch": 0.9477124183006536, + "grad_norm": 0.6114464998245239, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1450 + }, + { + "epoch": 0.954248366013072, + "grad_norm": 0.601044774055481, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1460 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 0.5484876036643982, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 1470 + }, + { + "epoch": 0.9673202614379085, + "grad_norm": 0.5383428335189819, + "learning_rate": 0.0002, + "loss": 1.4825, + "step": 1480 + }, + { + "epoch": 0.9738562091503268, + "grad_norm": 0.648106575012207, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 1490 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.6847249865531921, + "learning_rate": 0.0002, + "loss": 1.3638, + "step": 1500 + }, + { + "epoch": 0.9869281045751634, + "grad_norm": 0.6361058354377747, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 1510 + }, + { + "epoch": 0.9934640522875817, + "grad_norm": 0.646392285823822, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 1520 + }, + { + "epoch": 1.0, + "grad_norm": 0.5391159057617188, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1530 + }, + { + "epoch": 1.0, + "eval_loss": 1.4715123176574707, + "eval_runtime": 30.5701, + "eval_samples_per_second": 14.262, + "eval_steps_per_second": 1.799, + "step": 1530 + }, + { + "epoch": 1.0065359477124183, + "grad_norm": 0.5468988418579102, + "learning_rate": 0.0002, + "loss": 1.4827, + "step": 1540 + }, + { + "epoch": 1.0130718954248366, + "grad_norm": 0.629940927028656, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 1550 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 0.6411303281784058, + "learning_rate": 0.0002, + "loss": 1.4259, + "step": 1560 + }, + { + "epoch": 1.026143790849673, + "grad_norm": 0.5619024038314819, + "learning_rate": 0.0002, + "loss": 1.3924, + "step": 1570 + }, + { + "epoch": 1.0326797385620916, + "grad_norm": 0.6093462705612183, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1580 + }, + { + "epoch": 1.0392156862745099, + "grad_norm": 0.5543286204338074, + "learning_rate": 0.0002, + "loss": 1.4547, + "step": 1590 + }, + { + "epoch": 1.0457516339869282, + "grad_norm": 0.6079006195068359, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1600 + }, + { + "epoch": 1.0522875816993464, + "grad_norm": 0.6240813136100769, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 1610 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.6141977310180664, + "learning_rate": 0.0002, + "loss": 1.3504, + "step": 1620 + }, + { + "epoch": 1.065359477124183, + "grad_norm": 0.5920178294181824, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 1630 + }, + { + "epoch": 1.0718954248366013, + "grad_norm": 0.47620782256126404, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 1640 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 0.6826292872428894, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 1650 + }, + { + "epoch": 1.0849673202614378, + "grad_norm": 0.6182006597518921, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 1660 + }, + { + "epoch": 1.091503267973856, + "grad_norm": 0.57639479637146, + "learning_rate": 0.0002, + "loss": 1.2907, + "step": 1670 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 0.6696860194206238, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 1680 + }, + { + "epoch": 1.1045751633986929, + "grad_norm": 0.699221670627594, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 1690 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.7138059139251709, + "learning_rate": 0.0002, + "loss": 1.3667, + "step": 1700 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 0.6930422186851501, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 1710 + }, + { + "epoch": 1.1241830065359477, + "grad_norm": 0.7484048008918762, + "learning_rate": 0.0002, + "loss": 1.5033, + "step": 1720 + }, + { + "epoch": 1.130718954248366, + "grad_norm": 0.5820090174674988, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 1730 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 0.7143406867980957, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1740 + }, + { + "epoch": 1.1437908496732025, + "grad_norm": 0.5597584247589111, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 1750 + }, + { + "epoch": 1.1503267973856208, + "grad_norm": 0.5171173214912415, + "learning_rate": 0.0002, + "loss": 1.5403, + "step": 1760 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 0.5951920747756958, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1770 + }, + { + "epoch": 1.1633986928104576, + "grad_norm": 0.7506247758865356, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 1780 + }, + { + "epoch": 1.1699346405228759, + "grad_norm": 0.5936487913131714, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 1790 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.688450038433075, + "learning_rate": 0.0002, + "loss": 1.3567, + "step": 1800 + }, + { + "epoch": 1.1830065359477124, + "grad_norm": 0.671623170375824, + "learning_rate": 0.0002, + "loss": 1.314, + "step": 1810 + }, + { + "epoch": 1.1895424836601307, + "grad_norm": 0.6911860704421997, + "learning_rate": 0.0002, + "loss": 1.3803, + "step": 1820 + }, + { + "epoch": 1.196078431372549, + "grad_norm": 0.60726398229599, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 1830 + }, + { + "epoch": 1.2026143790849673, + "grad_norm": 0.7542088627815247, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 1840 + }, + { + "epoch": 1.2091503267973855, + "grad_norm": 0.6810969710350037, + "learning_rate": 0.0002, + "loss": 1.4343, + "step": 1850 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 0.579741895198822, + "learning_rate": 0.0002, + "loss": 1.446, + "step": 1860 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.9925695657730103, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 1870 + }, + { + "epoch": 1.2287581699346406, + "grad_norm": 0.5919767618179321, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 1880 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.7377090454101562, + "learning_rate": 0.0002, + "loss": 1.5015, + "step": 1890 + }, + { + "epoch": 1.2418300653594772, + "grad_norm": 0.5753688812255859, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 1900 + }, + { + "epoch": 1.2483660130718954, + "grad_norm": 0.6362486481666565, + "learning_rate": 0.0002, + "loss": 1.3543, + "step": 1910 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 0.5747467875480652, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1920 + }, + { + "epoch": 1.261437908496732, + "grad_norm": 0.6831939220428467, + "learning_rate": 0.0002, + "loss": 1.5082, + "step": 1930 + }, + { + "epoch": 1.2679738562091503, + "grad_norm": 0.6414040327072144, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 1940 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 0.5613330006599426, + "learning_rate": 0.0002, + "loss": 1.5099, + "step": 1950 + }, + { + "epoch": 1.2810457516339868, + "grad_norm": 0.5838454961776733, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 1960 + }, + { + "epoch": 1.287581699346405, + "grad_norm": 0.5367192029953003, + "learning_rate": 0.0002, + "loss": 1.3548, + "step": 1970 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 0.5829346776008606, + "learning_rate": 0.0002, + "loss": 1.4602, + "step": 1980 + }, + { + "epoch": 1.3006535947712419, + "grad_norm": 0.756534218788147, + "learning_rate": 0.0002, + "loss": 1.3821, + "step": 1990 + }, + { + "epoch": 1.3071895424836601, + "grad_norm": 0.48002561926841736, + "learning_rate": 0.0002, + "loss": 1.389, + "step": 2000 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 0.5461082458496094, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 2010 + }, + { + "epoch": 1.3202614379084967, + "grad_norm": 0.570399284362793, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 2020 + }, + { + "epoch": 1.326797385620915, + "grad_norm": 0.5130975842475891, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2030 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.6290071606636047, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 2040 + }, + { + "epoch": 1.3398692810457518, + "grad_norm": 0.6165726184844971, + "learning_rate": 0.0002, + "loss": 1.3873, + "step": 2050 + }, + { + "epoch": 1.34640522875817, + "grad_norm": 0.5302083492279053, + "learning_rate": 0.0002, + "loss": 1.4376, + "step": 2060 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 0.6531406044960022, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 2070 + }, + { + "epoch": 1.3594771241830066, + "grad_norm": 0.5981236100196838, + "learning_rate": 0.0002, + "loss": 1.3632, + "step": 2080 + }, + { + "epoch": 1.3660130718954249, + "grad_norm": 0.8534150123596191, + "learning_rate": 0.0002, + "loss": 1.4846, + "step": 2090 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 0.695918083190918, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 2100 + }, + { + "epoch": 1.3790849673202614, + "grad_norm": 0.5830431580543518, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2110 + }, + { + "epoch": 1.3856209150326797, + "grad_norm": 0.5641306638717651, + "learning_rate": 0.0002, + "loss": 1.5009, + "step": 2120 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 0.6354436874389648, + "learning_rate": 0.0002, + "loss": 1.3985, + "step": 2130 + }, + { + "epoch": 1.3986928104575163, + "grad_norm": 0.5707540512084961, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 2140 + }, + { + "epoch": 1.4052287581699345, + "grad_norm": 0.7308434844017029, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 2150 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.5879750847816467, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2160 + }, + { + "epoch": 1.4183006535947713, + "grad_norm": 0.627909243106842, + "learning_rate": 0.0002, + "loss": 1.3729, + "step": 2170 + }, + { + "epoch": 1.4248366013071896, + "grad_norm": 0.5228193998336792, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 2180 + }, + { + "epoch": 1.4313725490196079, + "grad_norm": 0.6162880659103394, + "learning_rate": 0.0002, + "loss": 1.457, + "step": 2190 + }, + { + "epoch": 1.4379084967320261, + "grad_norm": 0.751610517501831, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 2200 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.5623487234115601, + "learning_rate": 0.0002, + "loss": 1.4105, + "step": 2210 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 0.5293187499046326, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 2220 + }, + { + "epoch": 1.457516339869281, + "grad_norm": 0.5903629660606384, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 2230 + }, + { + "epoch": 1.4640522875816995, + "grad_norm": 0.6084659099578857, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 2240 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 0.5289803147315979, + "learning_rate": 0.0002, + "loss": 1.319, + "step": 2250 + }, + { + "epoch": 1.477124183006536, + "grad_norm": 0.49499568343162537, + "learning_rate": 0.0002, + "loss": 1.3106, + "step": 2260 + }, + { + "epoch": 1.4836601307189543, + "grad_norm": 0.7774190306663513, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 2270 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 0.5932538509368896, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 2280 + }, + { + "epoch": 1.4967320261437909, + "grad_norm": 0.6009492874145508, + "learning_rate": 0.0002, + "loss": 1.3241, + "step": 2290 + }, + { + "epoch": 1.5032679738562091, + "grad_norm": 0.5559343099594116, + "learning_rate": 0.0002, + "loss": 1.3728, + "step": 2300 + }, + { + "epoch": 1.5098039215686274, + "grad_norm": 0.5956196188926697, + "learning_rate": 0.0002, + "loss": 1.2379, + "step": 2310 + }, + { + "epoch": 1.5163398692810457, + "grad_norm": 0.5624083876609802, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 2320 + }, + { + "epoch": 1.522875816993464, + "grad_norm": 0.7195250391960144, + "learning_rate": 0.0002, + "loss": 1.4779, + "step": 2330 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 0.6010490655899048, + "learning_rate": 0.0002, + "loss": 1.2938, + "step": 2340 + }, + { + "epoch": 1.5359477124183005, + "grad_norm": 0.664929211139679, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 2350 + }, + { + "epoch": 1.5424836601307188, + "grad_norm": 0.5158776640892029, + "learning_rate": 0.0002, + "loss": 1.4362, + "step": 2360 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 0.5147154927253723, + "learning_rate": 0.0002, + "loss": 1.2157, + "step": 2370 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.6507977843284607, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 2380 + }, + { + "epoch": 1.5620915032679739, + "grad_norm": 0.5193192362785339, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 2390 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.5982314944267273, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 2400 + }, + { + "epoch": 1.5751633986928104, + "grad_norm": 0.49106258153915405, + "learning_rate": 0.0002, + "loss": 1.3585, + "step": 2410 + }, + { + "epoch": 1.581699346405229, + "grad_norm": 0.6459611654281616, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 2420 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 0.7038363218307495, + "learning_rate": 0.0002, + "loss": 1.3305, + "step": 2430 + }, + { + "epoch": 1.5947712418300655, + "grad_norm": 0.5245680212974548, + "learning_rate": 0.0002, + "loss": 1.3198, + "step": 2440 + }, + { + "epoch": 1.6013071895424837, + "grad_norm": 0.6562076210975647, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 2450 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 0.6491968035697937, + "learning_rate": 0.0002, + "loss": 1.5635, + "step": 2460 + }, + { + "epoch": 1.6143790849673203, + "grad_norm": 0.604034960269928, + "learning_rate": 0.0002, + "loss": 1.3657, + "step": 2470 + }, + { + "epoch": 1.6209150326797386, + "grad_norm": 0.5759671330451965, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 2480 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 0.6157698631286621, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2490 + }, + { + "epoch": 1.6339869281045751, + "grad_norm": 0.6513794660568237, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 2500 + }, + { + "epoch": 1.6405228758169934, + "grad_norm": 0.71990966796875, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 2510 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 0.7316617369651794, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2520 + }, + { + "epoch": 1.65359477124183, + "grad_norm": 0.5475177764892578, + "learning_rate": 0.0002, + "loss": 1.3119, + "step": 2530 + }, + { + "epoch": 1.6601307189542482, + "grad_norm": 0.4911293089389801, + "learning_rate": 0.0002, + "loss": 1.2998, + "step": 2540 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.6122882962226868, + "learning_rate": 0.0002, + "loss": 1.4198, + "step": 2550 + }, + { + "epoch": 1.673202614379085, + "grad_norm": 0.5735281705856323, + "learning_rate": 0.0002, + "loss": 1.3099, + "step": 2560 + }, + { + "epoch": 1.6797385620915033, + "grad_norm": 0.5046352744102478, + "learning_rate": 0.0002, + "loss": 1.2205, + "step": 2570 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 0.6043242812156677, + "learning_rate": 0.0002, + "loss": 1.3191, + "step": 2580 + }, + { + "epoch": 1.6928104575163399, + "grad_norm": 0.5397698283195496, + "learning_rate": 0.0002, + "loss": 1.3079, + "step": 2590 + }, + { + "epoch": 1.6993464052287581, + "grad_norm": 0.8066475987434387, + "learning_rate": 0.0002, + "loss": 1.4916, + "step": 2600 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 0.52901691198349, + "learning_rate": 0.0002, + "loss": 1.3703, + "step": 2610 + }, + { + "epoch": 1.712418300653595, + "grad_norm": 0.7588503956794739, + "learning_rate": 0.0002, + "loss": 1.409, + "step": 2620 + }, + { + "epoch": 1.7189542483660132, + "grad_norm": 0.6012966632843018, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 2630 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 0.5927302837371826, + "learning_rate": 0.0002, + "loss": 1.2583, + "step": 2640 + }, + { + "epoch": 1.7320261437908497, + "grad_norm": 0.5086990594863892, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 2650 + }, + { + "epoch": 1.738562091503268, + "grad_norm": 0.6000628471374512, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2660 + }, + { + "epoch": 1.7450980392156863, + "grad_norm": 0.6560431718826294, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 2670 + }, + { + "epoch": 1.7516339869281046, + "grad_norm": 0.5738165378570557, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 2680 + }, + { + "epoch": 1.7581699346405228, + "grad_norm": 0.5576106905937195, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 2690 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.7298802137374878, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 2700 + }, + { + "epoch": 1.7712418300653594, + "grad_norm": 0.5751826167106628, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 2710 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.6069957613945007, + "learning_rate": 0.0002, + "loss": 1.35, + "step": 2720 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 0.7513017654418945, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 2730 + }, + { + "epoch": 1.7908496732026142, + "grad_norm": 0.6058869957923889, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 2740 + }, + { + "epoch": 1.7973856209150327, + "grad_norm": 0.6805883049964905, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 2750 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 0.6864324808120728, + "learning_rate": 0.0002, + "loss": 1.4062, + "step": 2760 + }, + { + "epoch": 1.8104575163398693, + "grad_norm": 0.6261002421379089, + "learning_rate": 0.0002, + "loss": 1.355, + "step": 2770 + }, + { + "epoch": 1.8169934640522876, + "grad_norm": 0.532684862613678, + "learning_rate": 0.0002, + "loss": 1.5145, + "step": 2780 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 0.6209020018577576, + "learning_rate": 0.0002, + "loss": 1.3248, + "step": 2790 + }, + { + "epoch": 1.8300653594771243, + "grad_norm": 0.67111736536026, + "learning_rate": 0.0002, + "loss": 1.3908, + "step": 2800 + }, + { + "epoch": 1.8366013071895426, + "grad_norm": 0.700467586517334, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 2810 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 0.6968029141426086, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 2820 + }, + { + "epoch": 1.8496732026143792, + "grad_norm": 0.6405863761901855, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 2830 + }, + { + "epoch": 1.8562091503267975, + "grad_norm": 0.5192584991455078, + "learning_rate": 0.0002, + "loss": 1.4035, + "step": 2840 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 0.4888569414615631, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 2850 + }, + { + "epoch": 1.869281045751634, + "grad_norm": 0.7625455856323242, + "learning_rate": 0.0002, + "loss": 1.4324, + "step": 2860 + }, + { + "epoch": 1.8758169934640523, + "grad_norm": 0.9162808656692505, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2870 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.5472783446311951, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2880 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.5221137404441833, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 2890 + }, + { + "epoch": 1.8954248366013071, + "grad_norm": 0.49258849024772644, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 2900 + }, + { + "epoch": 1.9019607843137254, + "grad_norm": 0.5260750651359558, + "learning_rate": 0.0002, + "loss": 1.3503, + "step": 2910 + }, + { + "epoch": 1.9084967320261437, + "grad_norm": 0.6583314538002014, + "learning_rate": 0.0002, + "loss": 1.3381, + "step": 2920 + }, + { + "epoch": 1.915032679738562, + "grad_norm": 0.5728915929794312, + "learning_rate": 0.0002, + "loss": 1.356, + "step": 2930 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 0.7661453485488892, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2940 + }, + { + "epoch": 1.9281045751633987, + "grad_norm": 0.7193911075592041, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2950 + }, + { + "epoch": 1.934640522875817, + "grad_norm": 0.5007768869400024, + "learning_rate": 0.0002, + "loss": 1.287, + "step": 2960 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.626681923866272, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2970 + }, + { + "epoch": 1.9477124183006536, + "grad_norm": 0.8692840933799744, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 2980 + }, + { + "epoch": 1.954248366013072, + "grad_norm": 0.6388291120529175, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 2990 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.7710477113723755, + "learning_rate": 0.0002, + "loss": 1.4593, + "step": 3000 + }, + { + "epoch": 1.9673202614379086, + "grad_norm": 0.641704261302948, + "learning_rate": 0.0002, + "loss": 1.5228, + "step": 3010 + }, + { + "epoch": 1.973856209150327, + "grad_norm": 0.621148943901062, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 3020 + }, + { + "epoch": 1.9803921568627452, + "grad_norm": 0.5119547247886658, + "learning_rate": 0.0002, + "loss": 1.3017, + "step": 3030 + }, + { + "epoch": 1.9869281045751634, + "grad_norm": 0.8104137778282166, + "learning_rate": 0.0002, + "loss": 1.4923, + "step": 3040 + }, + { + "epoch": 1.9934640522875817, + "grad_norm": 0.5856240391731262, + "learning_rate": 0.0002, + "loss": 1.3331, + "step": 3050 + }, + { + "epoch": 2.0, + "grad_norm": 0.5263566374778748, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3060 + }, + { + "epoch": 2.0, + "eval_loss": 1.4276371002197266, + "eval_runtime": 30.5759, + "eval_samples_per_second": 14.26, + "eval_steps_per_second": 1.799, + "step": 3060 + }, + { + "epoch": 2.0065359477124183, + "grad_norm": 0.5143898725509644, + "learning_rate": 0.0002, + "loss": 1.1636, + "step": 3070 + }, + { + "epoch": 2.0130718954248366, + "grad_norm": 0.5749367475509644, + "learning_rate": 0.0002, + "loss": 1.3335, + "step": 3080 + }, + { + "epoch": 2.019607843137255, + "grad_norm": 0.5784284472465515, + "learning_rate": 0.0002, + "loss": 1.2784, + "step": 3090 + }, + { + "epoch": 2.026143790849673, + "grad_norm": 0.5933429598808289, + "learning_rate": 0.0002, + "loss": 1.2463, + "step": 3100 + }, + { + "epoch": 2.0326797385620914, + "grad_norm": 0.6748974919319153, + "learning_rate": 0.0002, + "loss": 1.2984, + "step": 3110 + }, + { + "epoch": 2.0392156862745097, + "grad_norm": 0.626399576663971, + "learning_rate": 0.0002, + "loss": 1.2307, + "step": 3120 + }, + { + "epoch": 2.045751633986928, + "grad_norm": 0.6173238754272461, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 3130 + }, + { + "epoch": 2.052287581699346, + "grad_norm": 0.807790219783783, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3140 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 0.6222215890884399, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 3150 + }, + { + "epoch": 2.065359477124183, + "grad_norm": 0.5859580636024475, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 3160 + }, + { + "epoch": 2.0718954248366015, + "grad_norm": 0.581304132938385, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 3170 + }, + { + "epoch": 2.0784313725490198, + "grad_norm": 0.9814971089363098, + "learning_rate": 0.0002, + "loss": 1.1212, + "step": 3180 + }, + { + "epoch": 2.084967320261438, + "grad_norm": 0.6491848230361938, + "learning_rate": 0.0002, + "loss": 1.1962, + "step": 3190 + }, + { + "epoch": 2.0915032679738563, + "grad_norm": 0.613680362701416, + "learning_rate": 0.0002, + "loss": 1.3711, + "step": 3200 + }, + { + "epoch": 2.0980392156862746, + "grad_norm": 0.7318086624145508, + "learning_rate": 0.0002, + "loss": 1.2994, + "step": 3210 + }, + { + "epoch": 2.104575163398693, + "grad_norm": 0.6025661826133728, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 3220 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.6744484305381775, + "learning_rate": 0.0002, + "loss": 1.1374, + "step": 3230 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 0.6062554121017456, + "learning_rate": 0.0002, + "loss": 1.3273, + "step": 3240 + }, + { + "epoch": 2.1241830065359477, + "grad_norm": 0.6801803112030029, + "learning_rate": 0.0002, + "loss": 1.3404, + "step": 3250 + }, + { + "epoch": 2.130718954248366, + "grad_norm": 0.5218925476074219, + "learning_rate": 0.0002, + "loss": 1.4084, + "step": 3260 + }, + { + "epoch": 2.1372549019607843, + "grad_norm": 0.7494263648986816, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 3270 + }, + { + "epoch": 2.1437908496732025, + "grad_norm": 0.7858565449714661, + "learning_rate": 0.0002, + "loss": 1.3059, + "step": 3280 + }, + { + "epoch": 2.150326797385621, + "grad_norm": 0.6836692690849304, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3290 + }, + { + "epoch": 2.156862745098039, + "grad_norm": 0.619848370552063, + "learning_rate": 0.0002, + "loss": 1.1605, + "step": 3300 + }, + { + "epoch": 2.1633986928104574, + "grad_norm": 0.5761294364929199, + "learning_rate": 0.0002, + "loss": 1.3095, + "step": 3310 + }, + { + "epoch": 2.1699346405228757, + "grad_norm": 0.4713786542415619, + "learning_rate": 0.0002, + "loss": 1.2883, + "step": 3320 + }, + { + "epoch": 2.176470588235294, + "grad_norm": 0.7613773345947266, + "learning_rate": 0.0002, + "loss": 1.3817, + "step": 3330 + }, + { + "epoch": 2.183006535947712, + "grad_norm": 0.6642718315124512, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 3340 + }, + { + "epoch": 2.189542483660131, + "grad_norm": 0.7162188291549683, + "learning_rate": 0.0002, + "loss": 1.2048, + "step": 3350 + }, + { + "epoch": 2.196078431372549, + "grad_norm": 0.6916783452033997, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 3360 + }, + { + "epoch": 2.2026143790849675, + "grad_norm": 0.7205567955970764, + "learning_rate": 0.0002, + "loss": 1.3788, + "step": 3370 + }, + { + "epoch": 2.2091503267973858, + "grad_norm": 0.6038199067115784, + "learning_rate": 0.0002, + "loss": 1.2528, + "step": 3380 + }, + { + "epoch": 2.215686274509804, + "grad_norm": 0.6284233927726746, + "learning_rate": 0.0002, + "loss": 1.2079, + "step": 3390 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.7450672388076782, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 3400 + }, + { + "epoch": 2.2287581699346406, + "grad_norm": 0.7755052447319031, + "learning_rate": 0.0002, + "loss": 1.3034, + "step": 3410 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 0.9066099524497986, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 3420 + }, + { + "epoch": 2.241830065359477, + "grad_norm": 0.8578207492828369, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 3430 + }, + { + "epoch": 2.2483660130718954, + "grad_norm": 0.5900213718414307, + "learning_rate": 0.0002, + "loss": 1.3278, + "step": 3440 + }, + { + "epoch": 2.2549019607843137, + "grad_norm": 0.7821717262268066, + "learning_rate": 0.0002, + "loss": 1.3645, + "step": 3450 + }, + { + "epoch": 2.261437908496732, + "grad_norm": 0.6263150572776794, + "learning_rate": 0.0002, + "loss": 1.183, + "step": 3460 + }, + { + "epoch": 2.2679738562091503, + "grad_norm": 0.591799259185791, + "learning_rate": 0.0002, + "loss": 1.178, + "step": 3470 + }, + { + "epoch": 2.2745098039215685, + "grad_norm": 0.5999799966812134, + "learning_rate": 0.0002, + "loss": 1.2198, + "step": 3480 + }, + { + "epoch": 2.281045751633987, + "grad_norm": 0.6227319240570068, + "learning_rate": 0.0002, + "loss": 1.2724, + "step": 3490 + }, + { + "epoch": 2.287581699346405, + "grad_norm": 0.719412624835968, + "learning_rate": 0.0002, + "loss": 1.3865, + "step": 3500 + }, + { + "epoch": 2.2941176470588234, + "grad_norm": 1.0361769199371338, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 3510 + }, + { + "epoch": 2.3006535947712417, + "grad_norm": 0.5506668090820312, + "learning_rate": 0.0002, + "loss": 1.4834, + "step": 3520 + }, + { + "epoch": 2.30718954248366, + "grad_norm": 0.6886829733848572, + "learning_rate": 0.0002, + "loss": 1.2273, + "step": 3530 + }, + { + "epoch": 2.313725490196078, + "grad_norm": 0.6226346492767334, + "learning_rate": 0.0002, + "loss": 1.2296, + "step": 3540 + }, + { + "epoch": 2.3202614379084965, + "grad_norm": 0.8109908103942871, + "learning_rate": 0.0002, + "loss": 1.3087, + "step": 3550 + }, + { + "epoch": 2.326797385620915, + "grad_norm": 0.8505511283874512, + "learning_rate": 0.0002, + "loss": 1.3311, + "step": 3560 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.5763760209083557, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 3570 + }, + { + "epoch": 2.3398692810457518, + "grad_norm": 0.6460059881210327, + "learning_rate": 0.0002, + "loss": 1.4135, + "step": 3580 + }, + { + "epoch": 2.34640522875817, + "grad_norm": 0.7175343036651611, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 3590 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.6012630462646484, + "learning_rate": 0.0002, + "loss": 1.2645, + "step": 3600 + }, + { + "epoch": 2.3594771241830066, + "grad_norm": 0.6513685584068298, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3610 + }, + { + "epoch": 2.366013071895425, + "grad_norm": 0.7465183734893799, + "learning_rate": 0.0002, + "loss": 1.3271, + "step": 3620 + }, + { + "epoch": 2.372549019607843, + "grad_norm": 0.6413124203681946, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3630 + }, + { + "epoch": 2.3790849673202614, + "grad_norm": 0.7209562063217163, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 3640 + }, + { + "epoch": 2.3856209150326797, + "grad_norm": 0.6427558660507202, + "learning_rate": 0.0002, + "loss": 1.1616, + "step": 3650 + }, + { + "epoch": 2.392156862745098, + "grad_norm": 0.593958854675293, + "learning_rate": 0.0002, + "loss": 1.313, + "step": 3660 + }, + { + "epoch": 2.3986928104575163, + "grad_norm": 0.5944608449935913, + "learning_rate": 0.0002, + "loss": 1.2802, + "step": 3670 + }, + { + "epoch": 2.4052287581699345, + "grad_norm": 0.6606248617172241, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3680 + }, + { + "epoch": 2.411764705882353, + "grad_norm": 0.5632851719856262, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 3690 + }, + { + "epoch": 2.418300653594771, + "grad_norm": 0.4976513385772705, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3700 + }, + { + "epoch": 2.4248366013071894, + "grad_norm": 0.6318528056144714, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 3710 + }, + { + "epoch": 2.431372549019608, + "grad_norm": 0.6306707859039307, + "learning_rate": 0.0002, + "loss": 1.1705, + "step": 3720 + }, + { + "epoch": 2.4379084967320264, + "grad_norm": 0.6362553238868713, + "learning_rate": 0.0002, + "loss": 1.3524, + "step": 3730 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.634368896484375, + "learning_rate": 0.0002, + "loss": 1.2345, + "step": 3740 + }, + { + "epoch": 2.450980392156863, + "grad_norm": 0.6623591184616089, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 3750 + }, + { + "epoch": 2.457516339869281, + "grad_norm": 0.6150440573692322, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 3760 + }, + { + "epoch": 2.4640522875816995, + "grad_norm": 0.588935911655426, + "learning_rate": 0.0002, + "loss": 1.2666, + "step": 3770 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 0.7388206124305725, + "learning_rate": 0.0002, + "loss": 1.3918, + "step": 3780 + }, + { + "epoch": 2.477124183006536, + "grad_norm": 0.621825098991394, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 3790 + }, + { + "epoch": 2.4836601307189543, + "grad_norm": 0.7691677212715149, + "learning_rate": 0.0002, + "loss": 1.359, + "step": 3800 + }, + { + "epoch": 2.4901960784313726, + "grad_norm": 1.1661969423294067, + "learning_rate": 0.0002, + "loss": 1.3399, + "step": 3810 + }, + { + "epoch": 2.496732026143791, + "grad_norm": 0.6837884187698364, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 3820 + }, + { + "epoch": 2.503267973856209, + "grad_norm": 0.6978904008865356, + "learning_rate": 0.0002, + "loss": 1.2823, + "step": 3830 + }, + { + "epoch": 2.5098039215686274, + "grad_norm": 0.6121411323547363, + "learning_rate": 0.0002, + "loss": 1.3688, + "step": 3840 + }, + { + "epoch": 2.5163398692810457, + "grad_norm": 0.7813326120376587, + "learning_rate": 0.0002, + "loss": 1.2587, + "step": 3850 + }, + { + "epoch": 2.522875816993464, + "grad_norm": 0.5390260219573975, + "learning_rate": 0.0002, + "loss": 1.1543, + "step": 3860 + }, + { + "epoch": 2.5294117647058822, + "grad_norm": 0.8283252716064453, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3870 + }, + { + "epoch": 2.5359477124183005, + "grad_norm": 0.8527186512947083, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 3880 + }, + { + "epoch": 2.542483660130719, + "grad_norm": 0.8405382633209229, + "learning_rate": 0.0002, + "loss": 1.3469, + "step": 3890 + }, + { + "epoch": 2.549019607843137, + "grad_norm": 0.5650738477706909, + "learning_rate": 0.0002, + "loss": 1.1801, + "step": 3900 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.620121955871582, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 3910 + }, + { + "epoch": 2.5620915032679736, + "grad_norm": 0.5983527898788452, + "learning_rate": 0.0002, + "loss": 1.2524, + "step": 3920 + }, + { + "epoch": 2.568627450980392, + "grad_norm": 0.686623215675354, + "learning_rate": 0.0002, + "loss": 1.4408, + "step": 3930 + }, + { + "epoch": 2.57516339869281, + "grad_norm": 0.6805831789970398, + "learning_rate": 0.0002, + "loss": 1.186, + "step": 3940 + }, + { + "epoch": 2.581699346405229, + "grad_norm": 0.6994825601577759, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 3950 + }, + { + "epoch": 2.588235294117647, + "grad_norm": 0.728549599647522, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 3960 + }, + { + "epoch": 2.5947712418300655, + "grad_norm": 0.775236964225769, + "learning_rate": 0.0002, + "loss": 1.4039, + "step": 3970 + }, + { + "epoch": 2.6013071895424837, + "grad_norm": 0.5057447552680969, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 3980 + }, + { + "epoch": 2.607843137254902, + "grad_norm": 0.6564450263977051, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 3990 + }, + { + "epoch": 2.6143790849673203, + "grad_norm": 0.5342249870300293, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 4000 + }, + { + "epoch": 2.6209150326797386, + "grad_norm": 0.5508961081504822, + "learning_rate": 0.0002, + "loss": 1.3102, + "step": 4010 + }, + { + "epoch": 2.627450980392157, + "grad_norm": 0.5716235637664795, + "learning_rate": 0.0002, + "loss": 1.3636, + "step": 4020 + }, + { + "epoch": 2.633986928104575, + "grad_norm": 0.8049232363700867, + "learning_rate": 0.0002, + "loss": 1.3465, + "step": 4030 + }, + { + "epoch": 2.6405228758169934, + "grad_norm": 0.5574354529380798, + "learning_rate": 0.0002, + "loss": 1.2342, + "step": 4040 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 0.6302093863487244, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 4050 + }, + { + "epoch": 2.65359477124183, + "grad_norm": 1.1868736743927002, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 4060 + }, + { + "epoch": 2.6601307189542482, + "grad_norm": 0.6738120317459106, + "learning_rate": 0.0002, + "loss": 1.1382, + "step": 4070 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.6614423990249634, + "learning_rate": 0.0002, + "loss": 1.2456, + "step": 4080 + }, + { + "epoch": 2.6732026143790852, + "grad_norm": 0.7297604084014893, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 4090 + }, + { + "epoch": 2.6797385620915035, + "grad_norm": 0.9421682357788086, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 4100 + }, + { + "epoch": 2.686274509803922, + "grad_norm": 0.5286222696304321, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 4110 + }, + { + "epoch": 2.69281045751634, + "grad_norm": 0.6849271655082703, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 4120 + }, + { + "epoch": 2.6993464052287583, + "grad_norm": 0.6811320185661316, + "learning_rate": 0.0002, + "loss": 1.2721, + "step": 4130 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 0.4968419373035431, + "learning_rate": 0.0002, + "loss": 1.2897, + "step": 4140 + }, + { + "epoch": 2.712418300653595, + "grad_norm": 0.8074267506599426, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 4150 + }, + { + "epoch": 2.718954248366013, + "grad_norm": 0.6756376028060913, + "learning_rate": 0.0002, + "loss": 1.1759, + "step": 4160 + }, + { + "epoch": 2.7254901960784315, + "grad_norm": 0.6921583414077759, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 4170 + }, + { + "epoch": 2.7320261437908497, + "grad_norm": 0.7049834132194519, + "learning_rate": 0.0002, + "loss": 1.3413, + "step": 4180 + }, + { + "epoch": 2.738562091503268, + "grad_norm": 0.7011390328407288, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 4190 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.6977843642234802, + "learning_rate": 0.0002, + "loss": 1.2364, + "step": 4200 + }, + { + "epoch": 2.7516339869281046, + "grad_norm": 0.6717000603675842, + "learning_rate": 0.0002, + "loss": 1.2533, + "step": 4210 + }, + { + "epoch": 2.758169934640523, + "grad_norm": 1.0223724842071533, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 4220 + }, + { + "epoch": 2.764705882352941, + "grad_norm": 0.6573330760002136, + "learning_rate": 0.0002, + "loss": 1.2451, + "step": 4230 + }, + { + "epoch": 2.7712418300653594, + "grad_norm": 0.6684938073158264, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 4240 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.7426793575286865, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 4250 + }, + { + "epoch": 2.784313725490196, + "grad_norm": 0.557826578617096, + "learning_rate": 0.0002, + "loss": 1.2904, + "step": 4260 + }, + { + "epoch": 2.7908496732026142, + "grad_norm": 0.6669870018959045, + "learning_rate": 0.0002, + "loss": 1.3262, + "step": 4270 + }, + { + "epoch": 2.7973856209150325, + "grad_norm": 0.5349969267845154, + "learning_rate": 0.0002, + "loss": 1.2369, + "step": 4280 + }, + { + "epoch": 2.803921568627451, + "grad_norm": 0.7262802124023438, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 4290 + }, + { + "epoch": 2.810457516339869, + "grad_norm": 0.768211841583252, + "learning_rate": 0.0002, + "loss": 1.3373, + "step": 4300 + }, + { + "epoch": 2.8169934640522873, + "grad_norm": 0.5958252549171448, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 4310 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.8451310396194458, + "learning_rate": 0.0002, + "loss": 1.4113, + "step": 4320 + }, + { + "epoch": 2.8300653594771243, + "grad_norm": 0.6544435024261475, + "learning_rate": 0.0002, + "loss": 1.2454, + "step": 4330 + }, + { + "epoch": 2.8366013071895426, + "grad_norm": 0.6177433133125305, + "learning_rate": 0.0002, + "loss": 1.2777, + "step": 4340 + }, + { + "epoch": 2.843137254901961, + "grad_norm": 0.6324988007545471, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 4350 + }, + { + "epoch": 2.849673202614379, + "grad_norm": 0.6884300708770752, + "learning_rate": 0.0002, + "loss": 1.4117, + "step": 4360 + }, + { + "epoch": 2.8562091503267975, + "grad_norm": 0.8952897191047668, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 4370 + }, + { + "epoch": 2.8627450980392157, + "grad_norm": 1.0260103940963745, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 4380 + }, + { + "epoch": 2.869281045751634, + "grad_norm": 0.9134647250175476, + "learning_rate": 0.0002, + "loss": 1.2893, + "step": 4390 + }, + { + "epoch": 2.8758169934640523, + "grad_norm": 0.5637717843055725, + "learning_rate": 0.0002, + "loss": 1.171, + "step": 4400 + }, + { + "epoch": 2.8823529411764706, + "grad_norm": 0.7530393004417419, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 4410 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.7202680706977844, + "learning_rate": 0.0002, + "loss": 1.29, + "step": 4420 + }, + { + "epoch": 2.895424836601307, + "grad_norm": 0.7177144885063171, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 4430 + }, + { + "epoch": 2.9019607843137254, + "grad_norm": 0.5996816754341125, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 4440 + }, + { + "epoch": 2.9084967320261437, + "grad_norm": 0.6542447209358215, + "learning_rate": 0.0002, + "loss": 1.4816, + "step": 4450 + }, + { + "epoch": 2.915032679738562, + "grad_norm": 1.0753740072250366, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 4460 + }, + { + "epoch": 2.9215686274509802, + "grad_norm": 0.6956136226654053, + "learning_rate": 0.0002, + "loss": 1.3193, + "step": 4470 + }, + { + "epoch": 2.928104575163399, + "grad_norm": 0.7702530026435852, + "learning_rate": 0.0002, + "loss": 1.2486, + "step": 4480 + }, + { + "epoch": 2.9346405228758172, + "grad_norm": 0.7763232588768005, + "learning_rate": 0.0002, + "loss": 1.3371, + "step": 4490 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 0.6393085718154907, + "learning_rate": 0.0002, + "loss": 1.1647, + "step": 4500 + }, + { + "epoch": 2.947712418300654, + "grad_norm": 0.987770676612854, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 4510 + }, + { + "epoch": 2.954248366013072, + "grad_norm": 0.5995016098022461, + "learning_rate": 0.0002, + "loss": 1.1529, + "step": 4520 + }, + { + "epoch": 2.9607843137254903, + "grad_norm": 0.745650053024292, + "learning_rate": 0.0002, + "loss": 1.2358, + "step": 4530 + }, + { + "epoch": 2.9673202614379086, + "grad_norm": 0.7429282069206238, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 4540 + }, + { + "epoch": 2.973856209150327, + "grad_norm": 0.5927486419677734, + "learning_rate": 0.0002, + "loss": 1.2262, + "step": 4550 + }, + { + "epoch": 2.980392156862745, + "grad_norm": 0.6775153875350952, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 4560 + }, + { + "epoch": 2.9869281045751634, + "grad_norm": 0.7128435373306274, + "learning_rate": 0.0002, + "loss": 1.279, + "step": 4570 + }, + { + "epoch": 2.9934640522875817, + "grad_norm": 0.7470937967300415, + "learning_rate": 0.0002, + "loss": 1.2451, + "step": 4580 + }, + { + "epoch": 3.0, + "grad_norm": 0.9295375943183899, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 4590 + }, + { + "epoch": 3.0, + "eval_loss": 1.4131312370300293, + "eval_runtime": 31.8967, + "eval_samples_per_second": 13.669, + "eval_steps_per_second": 1.724, + "step": 4590 + }, + { + "epoch": 3.0065359477124183, + "grad_norm": 0.6926420331001282, + "learning_rate": 0.0002, + "loss": 1.1283, + "step": 4600 + }, + { + "epoch": 3.0130718954248366, + "grad_norm": 0.6656355857849121, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 4610 + }, + { + "epoch": 3.019607843137255, + "grad_norm": 0.9901936650276184, + "learning_rate": 0.0002, + "loss": 1.308, + "step": 4620 + }, + { + "epoch": 3.026143790849673, + "grad_norm": 0.6713474988937378, + "learning_rate": 0.0002, + "loss": 1.22, + "step": 4630 + }, + { + "epoch": 3.0326797385620914, + "grad_norm": 0.6199324131011963, + "learning_rate": 0.0002, + "loss": 1.2249, + "step": 4640 + }, + { + "epoch": 3.0392156862745097, + "grad_norm": 0.7180785536766052, + "learning_rate": 0.0002, + "loss": 1.242, + "step": 4650 + }, + { + "epoch": 3.045751633986928, + "grad_norm": 0.8256588578224182, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 4660 + }, + { + "epoch": 3.052287581699346, + "grad_norm": 0.6637389063835144, + "learning_rate": 0.0002, + "loss": 1.1431, + "step": 4670 + }, + { + "epoch": 3.0588235294117645, + "grad_norm": 0.6980698108673096, + "learning_rate": 0.0002, + "loss": 1.1096, + "step": 4680 + }, + { + "epoch": 3.065359477124183, + "grad_norm": 0.8091534972190857, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 4690 + }, + { + "epoch": 3.0718954248366015, + "grad_norm": 0.5715174078941345, + "learning_rate": 0.0002, + "loss": 1.1652, + "step": 4700 + }, + { + "epoch": 3.0784313725490198, + "grad_norm": 0.735639750957489, + "learning_rate": 0.0002, + "loss": 1.1427, + "step": 4710 + }, + { + "epoch": 3.084967320261438, + "grad_norm": 0.7619708180427551, + "learning_rate": 0.0002, + "loss": 1.1522, + "step": 4720 + }, + { + "epoch": 3.0915032679738563, + "grad_norm": 1.263566017150879, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 4730 + }, + { + "epoch": 3.0980392156862746, + "grad_norm": 0.6600871682167053, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 4740 + }, + { + "epoch": 3.104575163398693, + "grad_norm": 0.717792809009552, + "learning_rate": 0.0002, + "loss": 1.1766, + "step": 4750 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.853714644908905, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 4760 + }, + { + "epoch": 3.1176470588235294, + "grad_norm": 1.1004153490066528, + "learning_rate": 0.0002, + "loss": 1.2031, + "step": 4770 + }, + { + "epoch": 3.1241830065359477, + "grad_norm": 0.8566235899925232, + "learning_rate": 0.0002, + "loss": 1.3295, + "step": 4780 + }, + { + "epoch": 3.130718954248366, + "grad_norm": 0.8315296173095703, + "learning_rate": 0.0002, + "loss": 1.2436, + "step": 4790 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.8020524978637695, + "learning_rate": 0.0002, + "loss": 1.32, + "step": 4800 + }, + { + "epoch": 3.1437908496732025, + "grad_norm": 0.7564275860786438, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 4810 + }, + { + "epoch": 3.150326797385621, + "grad_norm": 0.9077776670455933, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 4820 + }, + { + "epoch": 3.156862745098039, + "grad_norm": 0.6323099732398987, + "learning_rate": 0.0002, + "loss": 1.1399, + "step": 4830 + }, + { + "epoch": 3.1633986928104574, + "grad_norm": 0.6625368595123291, + "learning_rate": 0.0002, + "loss": 1.1983, + "step": 4840 + }, + { + "epoch": 3.1699346405228757, + "grad_norm": 0.8119261860847473, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 4850 + }, + { + "epoch": 3.176470588235294, + "grad_norm": 0.6399450898170471, + "learning_rate": 0.0002, + "loss": 1.0224, + "step": 4860 + }, + { + "epoch": 3.183006535947712, + "grad_norm": 1.0659016370773315, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 4870 + }, + { + "epoch": 3.189542483660131, + "grad_norm": 0.8040369749069214, + "learning_rate": 0.0002, + "loss": 1.2914, + "step": 4880 + }, + { + "epoch": 3.196078431372549, + "grad_norm": 0.7784733176231384, + "learning_rate": 0.0002, + "loss": 1.1996, + "step": 4890 + }, + { + "epoch": 3.2026143790849675, + "grad_norm": 0.9660294651985168, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 4900 + }, + { + "epoch": 3.2091503267973858, + "grad_norm": 1.0676977634429932, + "learning_rate": 0.0002, + "loss": 1.0419, + "step": 4910 + }, + { + "epoch": 3.215686274509804, + "grad_norm": 0.5877565741539001, + "learning_rate": 0.0002, + "loss": 1.0083, + "step": 4920 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.6164032816886902, + "learning_rate": 0.0002, + "loss": 1.1046, + "step": 4930 + }, + { + "epoch": 3.2287581699346406, + "grad_norm": 0.7627606987953186, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 4940 + }, + { + "epoch": 3.235294117647059, + "grad_norm": 0.7442803978919983, + "learning_rate": 0.0002, + "loss": 1.2453, + "step": 4950 + }, + { + "epoch": 3.241830065359477, + "grad_norm": 0.7277812361717224, + "learning_rate": 0.0002, + "loss": 1.1087, + "step": 4960 + }, + { + "epoch": 3.2483660130718954, + "grad_norm": 1.0301902294158936, + "learning_rate": 0.0002, + "loss": 1.2237, + "step": 4970 + }, + { + "epoch": 3.2549019607843137, + "grad_norm": 0.7798232436180115, + "learning_rate": 0.0002, + "loss": 1.1466, + "step": 4980 + }, + { + "epoch": 3.261437908496732, + "grad_norm": 1.210265874862671, + "learning_rate": 0.0002, + "loss": 1.2142, + "step": 4990 + }, + { + "epoch": 3.2679738562091503, + "grad_norm": 0.6677713990211487, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 5000 + }, + { + "epoch": 3.2745098039215685, + "grad_norm": 1.0524500608444214, + "learning_rate": 0.0002, + "loss": 1.3294, + "step": 5010 + }, + { + "epoch": 3.281045751633987, + "grad_norm": 0.7091745734214783, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 5020 + }, + { + "epoch": 3.287581699346405, + "grad_norm": 0.8523224592208862, + "learning_rate": 0.0002, + "loss": 1.1891, + "step": 5030 + }, + { + "epoch": 3.2941176470588234, + "grad_norm": 0.6120608448982239, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 5040 + }, + { + "epoch": 3.3006535947712417, + "grad_norm": 0.7437472939491272, + "learning_rate": 0.0002, + "loss": 1.0603, + "step": 5050 + }, + { + "epoch": 3.30718954248366, + "grad_norm": 0.7611715197563171, + "learning_rate": 0.0002, + "loss": 1.1295, + "step": 5060 + }, + { + "epoch": 3.313725490196078, + "grad_norm": 0.7249704003334045, + "learning_rate": 0.0002, + "loss": 1.0531, + "step": 5070 + }, + { + "epoch": 3.3202614379084965, + "grad_norm": 0.7316247820854187, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 5080 + }, + { + "epoch": 3.326797385620915, + "grad_norm": 0.562412440776825, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 5090 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.7052176594734192, + "learning_rate": 0.0002, + "loss": 1.0736, + "step": 5100 + }, + { + "epoch": 3.3398692810457518, + "grad_norm": 0.7714211344718933, + "learning_rate": 0.0002, + "loss": 1.122, + "step": 5110 + }, + { + "epoch": 3.34640522875817, + "grad_norm": 1.0436055660247803, + "learning_rate": 0.0002, + "loss": 1.1684, + "step": 5120 + }, + { + "epoch": 3.3529411764705883, + "grad_norm": 0.8867271542549133, + "learning_rate": 0.0002, + "loss": 1.0945, + "step": 5130 + }, + { + "epoch": 3.3594771241830066, + "grad_norm": 0.8371267914772034, + "learning_rate": 0.0002, + "loss": 1.159, + "step": 5140 + }, + { + "epoch": 3.366013071895425, + "grad_norm": 0.7257837057113647, + "learning_rate": 0.0002, + "loss": 1.1073, + "step": 5150 + }, + { + "epoch": 3.372549019607843, + "grad_norm": 0.7102002501487732, + "learning_rate": 0.0002, + "loss": 1.1162, + "step": 5160 + }, + { + "epoch": 3.3790849673202614, + "grad_norm": 0.7636350393295288, + "learning_rate": 0.0002, + "loss": 1.2056, + "step": 5170 + }, + { + "epoch": 3.3856209150326797, + "grad_norm": 0.6887359619140625, + "learning_rate": 0.0002, + "loss": 1.0708, + "step": 5180 + }, + { + "epoch": 3.392156862745098, + "grad_norm": 0.8141424655914307, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 5190 + }, + { + "epoch": 3.3986928104575163, + "grad_norm": 0.694423496723175, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 5200 + }, + { + "epoch": 3.4052287581699345, + "grad_norm": 0.914013683795929, + "learning_rate": 0.0002, + "loss": 1.2945, + "step": 5210 + }, + { + "epoch": 3.411764705882353, + "grad_norm": 0.8503239750862122, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 5220 + }, + { + "epoch": 3.418300653594771, + "grad_norm": 0.6196836233139038, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 5230 + }, + { + "epoch": 3.4248366013071894, + "grad_norm": 1.0760811567306519, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 5240 + }, + { + "epoch": 3.431372549019608, + "grad_norm": 0.6524698138237, + "learning_rate": 0.0002, + "loss": 1.1686, + "step": 5250 + }, + { + "epoch": 3.4379084967320264, + "grad_norm": 0.674467921257019, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 5260 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.7690372467041016, + "learning_rate": 0.0002, + "loss": 1.1015, + "step": 5270 + }, + { + "epoch": 3.450980392156863, + "grad_norm": 0.8751813769340515, + "learning_rate": 0.0002, + "loss": 1.2511, + "step": 5280 + }, + { + "epoch": 3.457516339869281, + "grad_norm": 0.750407874584198, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 5290 + }, + { + "epoch": 3.4640522875816995, + "grad_norm": 0.5991823077201843, + "learning_rate": 0.0002, + "loss": 1.0605, + "step": 5300 + }, + { + "epoch": 3.4705882352941178, + "grad_norm": 1.0164772272109985, + "learning_rate": 0.0002, + "loss": 1.2347, + "step": 5310 + }, + { + "epoch": 3.477124183006536, + "grad_norm": 0.8704105019569397, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 5320 + }, + { + "epoch": 3.4836601307189543, + "grad_norm": 0.709102213382721, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 5330 + }, + { + "epoch": 3.4901960784313726, + "grad_norm": 0.6273632049560547, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 5340 + }, + { + "epoch": 3.496732026143791, + "grad_norm": 0.6807359457015991, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 5350 + }, + { + "epoch": 3.503267973856209, + "grad_norm": 0.7085188627243042, + "learning_rate": 0.0002, + "loss": 1.131, + "step": 5360 + }, + { + "epoch": 3.5098039215686274, + "grad_norm": 0.6938307881355286, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 5370 + }, + { + "epoch": 3.5163398692810457, + "grad_norm": 0.8544146418571472, + "learning_rate": 0.0002, + "loss": 1.1397, + "step": 5380 + }, + { + "epoch": 3.522875816993464, + "grad_norm": 0.7889642119407654, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 5390 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.7858421206474304, + "learning_rate": 0.0002, + "loss": 1.1691, + "step": 5400 + }, + { + "epoch": 3.5359477124183005, + "grad_norm": 0.8547123074531555, + "learning_rate": 0.0002, + "loss": 1.2374, + "step": 5410 + }, + { + "epoch": 3.542483660130719, + "grad_norm": 0.8218181133270264, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 5420 + }, + { + "epoch": 3.549019607843137, + "grad_norm": 1.153623342514038, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 5430 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.1321099996566772, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 5440 + }, + { + "epoch": 3.5620915032679736, + "grad_norm": 0.9495334029197693, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 5450 + }, + { + "epoch": 3.568627450980392, + "grad_norm": 0.8743821978569031, + "learning_rate": 0.0002, + "loss": 1.2869, + "step": 5460 + }, + { + "epoch": 3.57516339869281, + "grad_norm": 0.7513086795806885, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 5470 + }, + { + "epoch": 3.581699346405229, + "grad_norm": 1.0139480829238892, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 5480 + }, + { + "epoch": 3.588235294117647, + "grad_norm": 0.6615135073661804, + "learning_rate": 0.0002, + "loss": 1.1706, + "step": 5490 + }, + { + "epoch": 3.5947712418300655, + "grad_norm": 1.180798888206482, + "learning_rate": 0.0002, + "loss": 1.3906, + "step": 5500 + }, + { + "epoch": 3.6013071895424837, + "grad_norm": 0.7085279226303101, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 5510 + }, + { + "epoch": 3.607843137254902, + "grad_norm": 0.540268063545227, + "learning_rate": 0.0002, + "loss": 1.1623, + "step": 5520 + }, + { + "epoch": 3.6143790849673203, + "grad_norm": 0.7905671000480652, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 5530 + }, + { + "epoch": 3.6209150326797386, + "grad_norm": 0.8457717299461365, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 5540 + }, + { + "epoch": 3.627450980392157, + "grad_norm": 0.7102677822113037, + "learning_rate": 0.0002, + "loss": 1.1799, + "step": 5550 + }, + { + "epoch": 3.633986928104575, + "grad_norm": 0.7179514765739441, + "learning_rate": 0.0002, + "loss": 1.2394, + "step": 5560 + }, + { + "epoch": 3.6405228758169934, + "grad_norm": 1.0854148864746094, + "learning_rate": 0.0002, + "loss": 1.2019, + "step": 5570 + }, + { + "epoch": 3.6470588235294117, + "grad_norm": 0.8209951519966125, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 5580 + }, + { + "epoch": 3.65359477124183, + "grad_norm": 0.6944138407707214, + "learning_rate": 0.0002, + "loss": 1.2289, + "step": 5590 + }, + { + "epoch": 3.6601307189542482, + "grad_norm": 0.7675473093986511, + "learning_rate": 0.0002, + "loss": 1.3226, + "step": 5600 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.6683364510536194, + "learning_rate": 0.0002, + "loss": 1.2866, + "step": 5610 + }, + { + "epoch": 3.6732026143790852, + "grad_norm": 0.7920727133750916, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 5620 + }, + { + "epoch": 3.6797385620915035, + "grad_norm": 0.9440218806266785, + "learning_rate": 0.0002, + "loss": 1.2287, + "step": 5630 + }, + { + "epoch": 3.686274509803922, + "grad_norm": 0.6600824594497681, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 5640 + }, + { + "epoch": 3.69281045751634, + "grad_norm": 0.6860619187355042, + "learning_rate": 0.0002, + "loss": 1.191, + "step": 5650 + }, + { + "epoch": 3.6993464052287583, + "grad_norm": 0.6579713225364685, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 5660 + }, + { + "epoch": 3.7058823529411766, + "grad_norm": 0.661081075668335, + "learning_rate": 0.0002, + "loss": 1.1464, + "step": 5670 + }, + { + "epoch": 3.712418300653595, + "grad_norm": 1.0968825817108154, + "learning_rate": 0.0002, + "loss": 1.289, + "step": 5680 + }, + { + "epoch": 3.718954248366013, + "grad_norm": 0.8066844940185547, + "learning_rate": 0.0002, + "loss": 1.192, + "step": 5690 + }, + { + "epoch": 3.7254901960784315, + "grad_norm": 0.8341682553291321, + "learning_rate": 0.0002, + "loss": 1.2322, + "step": 5700 + }, + { + "epoch": 3.7320261437908497, + "grad_norm": 0.6682852506637573, + "learning_rate": 0.0002, + "loss": 1.1473, + "step": 5710 + }, + { + "epoch": 3.738562091503268, + "grad_norm": 0.898595929145813, + "learning_rate": 0.0002, + "loss": 1.1566, + "step": 5720 + }, + { + "epoch": 3.7450980392156863, + "grad_norm": 0.6876054406166077, + "learning_rate": 0.0002, + "loss": 1.0919, + "step": 5730 + }, + { + "epoch": 3.7516339869281046, + "grad_norm": 0.7817103266716003, + "learning_rate": 0.0002, + "loss": 1.2302, + "step": 5740 + }, + { + "epoch": 3.758169934640523, + "grad_norm": 0.5840168595314026, + "learning_rate": 0.0002, + "loss": 1.2439, + "step": 5750 + }, + { + "epoch": 3.764705882352941, + "grad_norm": 0.6263918876647949, + "learning_rate": 0.0002, + "loss": 1.1279, + "step": 5760 + }, + { + "epoch": 3.7712418300653594, + "grad_norm": 0.7948952317237854, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 5770 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.6700998544692993, + "learning_rate": 0.0002, + "loss": 1.149, + "step": 5780 + }, + { + "epoch": 3.784313725490196, + "grad_norm": 1.1169519424438477, + "learning_rate": 0.0002, + "loss": 1.3207, + "step": 5790 + }, + { + "epoch": 3.7908496732026142, + "grad_norm": 0.8354471325874329, + "learning_rate": 0.0002, + "loss": 1.064, + "step": 5800 + }, + { + "epoch": 3.7973856209150325, + "grad_norm": 0.6304181814193726, + "learning_rate": 0.0002, + "loss": 1.2104, + "step": 5810 + }, + { + "epoch": 3.803921568627451, + "grad_norm": 0.6919655799865723, + "learning_rate": 0.0002, + "loss": 1.2059, + "step": 5820 + }, + { + "epoch": 3.810457516339869, + "grad_norm": 0.600385844707489, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 5830 + }, + { + "epoch": 3.8169934640522873, + "grad_norm": 0.8406319618225098, + "learning_rate": 0.0002, + "loss": 1.2324, + "step": 5840 + }, + { + "epoch": 3.8235294117647056, + "grad_norm": 0.7594282031059265, + "learning_rate": 0.0002, + "loss": 1.2418, + "step": 5850 + }, + { + "epoch": 3.8300653594771243, + "grad_norm": 0.8179879784584045, + "learning_rate": 0.0002, + "loss": 1.1903, + "step": 5860 + }, + { + "epoch": 3.8366013071895426, + "grad_norm": 1.141430377960205, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 5870 + }, + { + "epoch": 3.843137254901961, + "grad_norm": 0.6595550775527954, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 5880 + }, + { + "epoch": 3.849673202614379, + "grad_norm": 0.7499435544013977, + "learning_rate": 0.0002, + "loss": 1.2378, + "step": 5890 + }, + { + "epoch": 3.8562091503267975, + "grad_norm": 0.7851517200469971, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 5900 + }, + { + "epoch": 3.8627450980392157, + "grad_norm": 1.0533545017242432, + "learning_rate": 0.0002, + "loss": 1.162, + "step": 5910 + }, + { + "epoch": 3.869281045751634, + "grad_norm": 0.960086464881897, + "learning_rate": 0.0002, + "loss": 1.3576, + "step": 5920 + }, + { + "epoch": 3.8758169934640523, + "grad_norm": 0.9952049851417542, + "learning_rate": 0.0002, + "loss": 1.151, + "step": 5930 + }, + { + "epoch": 3.8823529411764706, + "grad_norm": 0.7884191274642944, + "learning_rate": 0.0002, + "loss": 1.2027, + "step": 5940 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.7461766600608826, + "learning_rate": 0.0002, + "loss": 1.1796, + "step": 5950 + }, + { + "epoch": 3.895424836601307, + "grad_norm": 0.9594355821609497, + "learning_rate": 0.0002, + "loss": 1.2251, + "step": 5960 + }, + { + "epoch": 3.9019607843137254, + "grad_norm": 0.8179471492767334, + "learning_rate": 0.0002, + "loss": 1.1164, + "step": 5970 + }, + { + "epoch": 3.9084967320261437, + "grad_norm": 0.8240267634391785, + "learning_rate": 0.0002, + "loss": 1.2421, + "step": 5980 + }, + { + "epoch": 3.915032679738562, + "grad_norm": 0.7462618350982666, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 5990 + }, + { + "epoch": 3.9215686274509802, + "grad_norm": 0.711207389831543, + "learning_rate": 0.0002, + "loss": 1.2124, + "step": 6000 + }, + { + "epoch": 3.928104575163399, + "grad_norm": 0.6910956501960754, + "learning_rate": 0.0002, + "loss": 1.2119, + "step": 6010 + }, + { + "epoch": 3.9346405228758172, + "grad_norm": 0.749093770980835, + "learning_rate": 0.0002, + "loss": 1.2127, + "step": 6020 + }, + { + "epoch": 3.9411764705882355, + "grad_norm": 1.3332762718200684, + "learning_rate": 0.0002, + "loss": 1.1542, + "step": 6030 + }, + { + "epoch": 3.947712418300654, + "grad_norm": 0.71457439661026, + "learning_rate": 0.0002, + "loss": 1.1442, + "step": 6040 + }, + { + "epoch": 3.954248366013072, + "grad_norm": 1.1205238103866577, + "learning_rate": 0.0002, + "loss": 1.339, + "step": 6050 + }, + { + "epoch": 3.9607843137254903, + "grad_norm": 0.6958928108215332, + "learning_rate": 0.0002, + "loss": 1.2962, + "step": 6060 + }, + { + "epoch": 3.9673202614379086, + "grad_norm": 0.7518056035041809, + "learning_rate": 0.0002, + "loss": 1.1802, + "step": 6070 + }, + { + "epoch": 3.973856209150327, + "grad_norm": 0.8010755777359009, + "learning_rate": 0.0002, + "loss": 1.1179, + "step": 6080 + }, + { + "epoch": 3.980392156862745, + "grad_norm": 0.7492658495903015, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 6090 + }, + { + "epoch": 3.9869281045751634, + "grad_norm": 0.900704562664032, + "learning_rate": 0.0002, + "loss": 1.2113, + "step": 6100 + }, + { + "epoch": 3.9934640522875817, + "grad_norm": 0.7997331619262695, + "learning_rate": 0.0002, + "loss": 1.1106, + "step": 6110 + }, + { + "epoch": 4.0, + "grad_norm": 0.7163209319114685, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 6120 + }, + { + "epoch": 4.0, + "eval_loss": 1.4113320112228394, + "eval_runtime": 33.7199, + "eval_samples_per_second": 12.93, + "eval_steps_per_second": 1.631, + "step": 6120 + }, + { + "epoch": 4.006535947712418, + "grad_norm": 0.9527022838592529, + "learning_rate": 0.0002, + "loss": 1.0423, + "step": 6130 + }, + { + "epoch": 4.0130718954248366, + "grad_norm": 0.7603210210800171, + "learning_rate": 0.0002, + "loss": 1.101, + "step": 6140 + }, + { + "epoch": 4.019607843137255, + "grad_norm": 1.127387523651123, + "learning_rate": 0.0002, + "loss": 1.1834, + "step": 6150 + }, + { + "epoch": 4.026143790849673, + "grad_norm": 0.8290133476257324, + "learning_rate": 0.0002, + "loss": 1.0734, + "step": 6160 + }, + { + "epoch": 4.032679738562091, + "grad_norm": 0.9912241101264954, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 6170 + }, + { + "epoch": 4.03921568627451, + "grad_norm": 0.947005033493042, + "learning_rate": 0.0002, + "loss": 1.0719, + "step": 6180 + }, + { + "epoch": 4.045751633986928, + "grad_norm": 0.707466185092926, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 6190 + }, + { + "epoch": 4.052287581699346, + "grad_norm": 1.0604327917099, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 6200 + }, + { + "epoch": 4.0588235294117645, + "grad_norm": 0.7848685383796692, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 6210 + }, + { + "epoch": 4.065359477124183, + "grad_norm": 0.8475256562232971, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 6220 + }, + { + "epoch": 4.071895424836601, + "grad_norm": 0.9759448766708374, + "learning_rate": 0.0002, + "loss": 1.1104, + "step": 6230 + }, + { + "epoch": 4.078431372549019, + "grad_norm": 0.9324519038200378, + "learning_rate": 0.0002, + "loss": 1.1538, + "step": 6240 + }, + { + "epoch": 4.084967320261438, + "grad_norm": 0.8723901510238647, + "learning_rate": 0.0002, + "loss": 1.0817, + "step": 6250 + }, + { + "epoch": 4.091503267973856, + "grad_norm": 0.8343415856361389, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 6260 + }, + { + "epoch": 4.098039215686274, + "grad_norm": 0.7490310072898865, + "learning_rate": 0.0002, + "loss": 0.9887, + "step": 6270 + }, + { + "epoch": 4.104575163398692, + "grad_norm": 0.8961182832717896, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 6280 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.7124854922294617, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 6290 + }, + { + "epoch": 4.117647058823529, + "grad_norm": 0.8338138461112976, + "learning_rate": 0.0002, + "loss": 1.0081, + "step": 6300 + }, + { + "epoch": 4.124183006535947, + "grad_norm": 0.8075833320617676, + "learning_rate": 0.0002, + "loss": 1.1091, + "step": 6310 + }, + { + "epoch": 4.130718954248366, + "grad_norm": 0.8069391846656799, + "learning_rate": 0.0002, + "loss": 1.0193, + "step": 6320 + }, + { + "epoch": 4.137254901960785, + "grad_norm": 0.9567893147468567, + "learning_rate": 0.0002, + "loss": 0.948, + "step": 6330 + }, + { + "epoch": 4.143790849673203, + "grad_norm": 1.2184662818908691, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 6340 + }, + { + "epoch": 4.150326797385621, + "grad_norm": 1.030976414680481, + "learning_rate": 0.0002, + "loss": 1.0756, + "step": 6350 + }, + { + "epoch": 4.1568627450980395, + "grad_norm": 0.9749957323074341, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 6360 + }, + { + "epoch": 4.163398692810458, + "grad_norm": 0.7089483141899109, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 6370 + }, + { + "epoch": 4.169934640522876, + "grad_norm": 1.1084946393966675, + "learning_rate": 0.0002, + "loss": 1.2175, + "step": 6380 + }, + { + "epoch": 4.176470588235294, + "grad_norm": 0.7998497486114502, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 6390 + }, + { + "epoch": 4.183006535947713, + "grad_norm": 0.8997811675071716, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 6400 + }, + { + "epoch": 4.189542483660131, + "grad_norm": 0.8359479904174805, + "learning_rate": 0.0002, + "loss": 1.0704, + "step": 6410 + }, + { + "epoch": 4.196078431372549, + "grad_norm": 0.9087472558021545, + "learning_rate": 0.0002, + "loss": 1.1056, + "step": 6420 + }, + { + "epoch": 4.2026143790849675, + "grad_norm": 1.1100451946258545, + "learning_rate": 0.0002, + "loss": 1.0657, + "step": 6430 + }, + { + "epoch": 4.209150326797386, + "grad_norm": 0.9376999735832214, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 6440 + }, + { + "epoch": 4.215686274509804, + "grad_norm": 0.8179266452789307, + "learning_rate": 0.0002, + "loss": 1.0862, + "step": 6450 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.9953271746635437, + "learning_rate": 0.0002, + "loss": 1.0679, + "step": 6460 + }, + { + "epoch": 4.228758169934641, + "grad_norm": 0.8476650714874268, + "learning_rate": 0.0002, + "loss": 1.1034, + "step": 6470 + }, + { + "epoch": 4.235294117647059, + "grad_norm": 0.8406323194503784, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 6480 + }, + { + "epoch": 4.241830065359477, + "grad_norm": 0.819134533405304, + "learning_rate": 0.0002, + "loss": 1.057, + "step": 6490 + }, + { + "epoch": 4.248366013071895, + "grad_norm": 0.7764983773231506, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 6500 + }, + { + "epoch": 4.254901960784314, + "grad_norm": 0.8252112865447998, + "learning_rate": 0.0002, + "loss": 1.1593, + "step": 6510 + }, + { + "epoch": 4.261437908496732, + "grad_norm": 0.7941019535064697, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 6520 + }, + { + "epoch": 4.26797385620915, + "grad_norm": 0.7673905491828918, + "learning_rate": 0.0002, + "loss": 1.0296, + "step": 6530 + }, + { + "epoch": 4.2745098039215685, + "grad_norm": 0.8749890327453613, + "learning_rate": 0.0002, + "loss": 1.1387, + "step": 6540 + }, + { + "epoch": 4.281045751633987, + "grad_norm": 0.7343207597732544, + "learning_rate": 0.0002, + "loss": 1.0595, + "step": 6550 + }, + { + "epoch": 4.287581699346405, + "grad_norm": 1.2786651849746704, + "learning_rate": 0.0002, + "loss": 1.1715, + "step": 6560 + }, + { + "epoch": 4.294117647058823, + "grad_norm": 1.316875696182251, + "learning_rate": 0.0002, + "loss": 1.0514, + "step": 6570 + }, + { + "epoch": 4.300653594771242, + "grad_norm": 0.8349189162254333, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 6580 + }, + { + "epoch": 4.30718954248366, + "grad_norm": 0.7510647177696228, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 6590 + }, + { + "epoch": 4.313725490196078, + "grad_norm": 0.932420551776886, + "learning_rate": 0.0002, + "loss": 1.1387, + "step": 6600 + }, + { + "epoch": 4.3202614379084965, + "grad_norm": 0.8510616421699524, + "learning_rate": 0.0002, + "loss": 1.1115, + "step": 6610 + }, + { + "epoch": 4.326797385620915, + "grad_norm": 0.7661547064781189, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 6620 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 1.0370930433273315, + "learning_rate": 0.0002, + "loss": 1.2064, + "step": 6630 + }, + { + "epoch": 4.339869281045751, + "grad_norm": 0.9302158951759338, + "learning_rate": 0.0002, + "loss": 1.1064, + "step": 6640 + }, + { + "epoch": 4.34640522875817, + "grad_norm": 0.9203811883926392, + "learning_rate": 0.0002, + "loss": 0.968, + "step": 6650 + }, + { + "epoch": 4.352941176470588, + "grad_norm": 0.9986332654953003, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 6660 + }, + { + "epoch": 4.359477124183006, + "grad_norm": 0.8001713156700134, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 6670 + }, + { + "epoch": 4.366013071895424, + "grad_norm": 0.829714298248291, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 6680 + }, + { + "epoch": 4.372549019607844, + "grad_norm": 0.8253079056739807, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 6690 + }, + { + "epoch": 4.379084967320262, + "grad_norm": 0.824666440486908, + "learning_rate": 0.0002, + "loss": 1.1087, + "step": 6700 + }, + { + "epoch": 4.38562091503268, + "grad_norm": 0.8872972130775452, + "learning_rate": 0.0002, + "loss": 1.1968, + "step": 6710 + }, + { + "epoch": 4.392156862745098, + "grad_norm": 0.8729761838912964, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 6720 + }, + { + "epoch": 4.398692810457517, + "grad_norm": 1.1367264986038208, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 6730 + }, + { + "epoch": 4.405228758169935, + "grad_norm": 0.9699058532714844, + "learning_rate": 0.0002, + "loss": 1.0184, + "step": 6740 + }, + { + "epoch": 4.411764705882353, + "grad_norm": 0.8266763687133789, + "learning_rate": 0.0002, + "loss": 1.006, + "step": 6750 + }, + { + "epoch": 4.4183006535947715, + "grad_norm": 1.0249767303466797, + "learning_rate": 0.0002, + "loss": 1.0735, + "step": 6760 + }, + { + "epoch": 4.42483660130719, + "grad_norm": 0.73606938123703, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 6770 + }, + { + "epoch": 4.431372549019608, + "grad_norm": 1.4050679206848145, + "learning_rate": 0.0002, + "loss": 1.1037, + "step": 6780 + }, + { + "epoch": 4.437908496732026, + "grad_norm": 1.1114081144332886, + "learning_rate": 0.0002, + "loss": 1.1418, + "step": 6790 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.8031067848205566, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 6800 + }, + { + "epoch": 4.450980392156863, + "grad_norm": 0.8513566851615906, + "learning_rate": 0.0002, + "loss": 1.0753, + "step": 6810 + }, + { + "epoch": 4.457516339869281, + "grad_norm": 1.332741379737854, + "learning_rate": 0.0002, + "loss": 1.1852, + "step": 6820 + }, + { + "epoch": 4.4640522875816995, + "grad_norm": 1.5032578706741333, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 6830 + }, + { + "epoch": 4.470588235294118, + "grad_norm": 0.7677283883094788, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 6840 + }, + { + "epoch": 4.477124183006536, + "grad_norm": 0.989148736000061, + "learning_rate": 0.0002, + "loss": 1.1501, + "step": 6850 + }, + { + "epoch": 4.483660130718954, + "grad_norm": 1.5316275358200073, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 6860 + }, + { + "epoch": 4.490196078431373, + "grad_norm": 0.9427124261856079, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 6870 + }, + { + "epoch": 4.496732026143791, + "grad_norm": 1.215287685394287, + "learning_rate": 0.0002, + "loss": 1.1314, + "step": 6880 + }, + { + "epoch": 4.503267973856209, + "grad_norm": 0.7286760210990906, + "learning_rate": 0.0002, + "loss": 1.0809, + "step": 6890 + }, + { + "epoch": 4.509803921568627, + "grad_norm": 0.874829888343811, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 6900 + }, + { + "epoch": 4.516339869281046, + "grad_norm": 0.8058359622955322, + "learning_rate": 0.0002, + "loss": 1.0233, + "step": 6910 + }, + { + "epoch": 4.522875816993464, + "grad_norm": 1.248195767402649, + "learning_rate": 0.0002, + "loss": 1.0463, + "step": 6920 + }, + { + "epoch": 4.529411764705882, + "grad_norm": 0.8033645749092102, + "learning_rate": 0.0002, + "loss": 1.0347, + "step": 6930 + }, + { + "epoch": 4.5359477124183005, + "grad_norm": 1.7361950874328613, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 6940 + }, + { + "epoch": 4.542483660130719, + "grad_norm": 0.8058095574378967, + "learning_rate": 0.0002, + "loss": 0.9856, + "step": 6950 + }, + { + "epoch": 4.549019607843137, + "grad_norm": 1.254089593887329, + "learning_rate": 0.0002, + "loss": 1.0057, + "step": 6960 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 0.9180455803871155, + "learning_rate": 0.0002, + "loss": 1.1723, + "step": 6970 + }, + { + "epoch": 4.562091503267974, + "grad_norm": 0.6677682399749756, + "learning_rate": 0.0002, + "loss": 1.0559, + "step": 6980 + }, + { + "epoch": 4.568627450980392, + "grad_norm": 0.8127354383468628, + "learning_rate": 0.0002, + "loss": 1.0453, + "step": 6990 + }, + { + "epoch": 4.57516339869281, + "grad_norm": 1.0263001918792725, + "learning_rate": 0.0002, + "loss": 1.0828, + "step": 7000 + }, + { + "epoch": 4.5816993464052285, + "grad_norm": 0.9641909003257751, + "learning_rate": 0.0002, + "loss": 1.0703, + "step": 7010 + }, + { + "epoch": 4.588235294117647, + "grad_norm": 0.9440861344337463, + "learning_rate": 0.0002, + "loss": 1.179, + "step": 7020 + }, + { + "epoch": 4.594771241830065, + "grad_norm": 0.9539011716842651, + "learning_rate": 0.0002, + "loss": 1.0931, + "step": 7030 + }, + { + "epoch": 4.601307189542483, + "grad_norm": 1.0449910163879395, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 7040 + }, + { + "epoch": 4.607843137254902, + "grad_norm": 0.8766893744468689, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 7050 + }, + { + "epoch": 4.61437908496732, + "grad_norm": 0.6983462572097778, + "learning_rate": 0.0002, + "loss": 1.0169, + "step": 7060 + }, + { + "epoch": 4.620915032679738, + "grad_norm": 0.9505505561828613, + "learning_rate": 0.0002, + "loss": 1.1778, + "step": 7070 + }, + { + "epoch": 4.627450980392156, + "grad_norm": 1.2506657838821411, + "learning_rate": 0.0002, + "loss": 1.121, + "step": 7080 + }, + { + "epoch": 4.633986928104575, + "grad_norm": 0.9602801203727722, + "learning_rate": 0.0002, + "loss": 1.1329, + "step": 7090 + }, + { + "epoch": 4.640522875816993, + "grad_norm": 0.7398977875709534, + "learning_rate": 0.0002, + "loss": 1.1499, + "step": 7100 + }, + { + "epoch": 4.647058823529412, + "grad_norm": 1.3862425088882446, + "learning_rate": 0.0002, + "loss": 1.0769, + "step": 7110 + }, + { + "epoch": 4.65359477124183, + "grad_norm": 1.1451990604400635, + "learning_rate": 0.0002, + "loss": 1.0571, + "step": 7120 + }, + { + "epoch": 4.660130718954249, + "grad_norm": 0.9010422229766846, + "learning_rate": 0.0002, + "loss": 1.1271, + "step": 7130 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.7102518081665039, + "learning_rate": 0.0002, + "loss": 1.0165, + "step": 7140 + }, + { + "epoch": 4.673202614379085, + "grad_norm": 0.7963796257972717, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7150 + }, + { + "epoch": 4.6797385620915035, + "grad_norm": 0.7726007699966431, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 7160 + }, + { + "epoch": 4.686274509803922, + "grad_norm": 0.8097564578056335, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 7170 + }, + { + "epoch": 4.69281045751634, + "grad_norm": 0.9070925116539001, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 7180 + }, + { + "epoch": 4.699346405228758, + "grad_norm": 0.7543528079986572, + "learning_rate": 0.0002, + "loss": 1.0315, + "step": 7190 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 0.9900904893875122, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 7200 + }, + { + "epoch": 4.712418300653595, + "grad_norm": 0.8033412098884583, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 7210 + }, + { + "epoch": 4.718954248366013, + "grad_norm": 0.8440839052200317, + "learning_rate": 0.0002, + "loss": 1.1773, + "step": 7220 + }, + { + "epoch": 4.7254901960784315, + "grad_norm": 0.9325555562973022, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 7230 + }, + { + "epoch": 4.73202614379085, + "grad_norm": 0.7881146669387817, + "learning_rate": 0.0002, + "loss": 1.1384, + "step": 7240 + }, + { + "epoch": 4.738562091503268, + "grad_norm": 0.884453296661377, + "learning_rate": 0.0002, + "loss": 1.1219, + "step": 7250 + }, + { + "epoch": 4.745098039215686, + "grad_norm": 0.9274539351463318, + "learning_rate": 0.0002, + "loss": 1.1036, + "step": 7260 + }, + { + "epoch": 4.751633986928105, + "grad_norm": 1.2367479801177979, + "learning_rate": 0.0002, + "loss": 1.0906, + "step": 7270 + }, + { + "epoch": 4.758169934640523, + "grad_norm": 0.9499821066856384, + "learning_rate": 0.0002, + "loss": 1.0741, + "step": 7280 + }, + { + "epoch": 4.764705882352941, + "grad_norm": 2.1918580532073975, + "learning_rate": 0.0002, + "loss": 1.1625, + "step": 7290 + }, + { + "epoch": 4.771241830065359, + "grad_norm": 0.8221880793571472, + "learning_rate": 0.0002, + "loss": 0.954, + "step": 7300 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.871972918510437, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 7310 + }, + { + "epoch": 4.784313725490196, + "grad_norm": 0.8034510612487793, + "learning_rate": 0.0002, + "loss": 1.0599, + "step": 7320 + }, + { + "epoch": 4.790849673202614, + "grad_norm": 0.8959605693817139, + "learning_rate": 0.0002, + "loss": 1.1059, + "step": 7330 + }, + { + "epoch": 4.7973856209150325, + "grad_norm": 1.2326215505599976, + "learning_rate": 0.0002, + "loss": 1.0176, + "step": 7340 + }, + { + "epoch": 4.803921568627451, + "grad_norm": 0.9725791811943054, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 7350 + }, + { + "epoch": 4.810457516339869, + "grad_norm": 0.7240816354751587, + "learning_rate": 0.0002, + "loss": 1.1229, + "step": 7360 + }, + { + "epoch": 4.816993464052287, + "grad_norm": 0.8265769481658936, + "learning_rate": 0.0002, + "loss": 1.0669, + "step": 7370 + }, + { + "epoch": 4.823529411764706, + "grad_norm": 0.8888696432113647, + "learning_rate": 0.0002, + "loss": 1.042, + "step": 7380 + }, + { + "epoch": 4.830065359477124, + "grad_norm": 0.7776556015014648, + "learning_rate": 0.0002, + "loss": 1.0981, + "step": 7390 + }, + { + "epoch": 4.836601307189542, + "grad_norm": 0.8772371411323547, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7400 + }, + { + "epoch": 4.8431372549019605, + "grad_norm": 0.9786531925201416, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7410 + }, + { + "epoch": 4.849673202614379, + "grad_norm": 0.9059745073318481, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 7420 + }, + { + "epoch": 4.856209150326797, + "grad_norm": 0.7422552108764648, + "learning_rate": 0.0002, + "loss": 1.0324, + "step": 7430 + }, + { + "epoch": 4.862745098039216, + "grad_norm": 1.3040380477905273, + "learning_rate": 0.0002, + "loss": 1.0423, + "step": 7440 + }, + { + "epoch": 4.8692810457516345, + "grad_norm": 1.3278473615646362, + "learning_rate": 0.0002, + "loss": 1.1161, + "step": 7450 + }, + { + "epoch": 4.875816993464053, + "grad_norm": 1.2705849409103394, + "learning_rate": 0.0002, + "loss": 1.0713, + "step": 7460 + }, + { + "epoch": 4.882352941176471, + "grad_norm": 0.8837892413139343, + "learning_rate": 0.0002, + "loss": 1.0034, + "step": 7470 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.8670691251754761, + "learning_rate": 0.0002, + "loss": 1.1716, + "step": 7480 + }, + { + "epoch": 4.895424836601308, + "grad_norm": 0.9662758111953735, + "learning_rate": 0.0002, + "loss": 1.1723, + "step": 7490 + }, + { + "epoch": 4.901960784313726, + "grad_norm": 0.8188302516937256, + "learning_rate": 0.0002, + "loss": 1.1056, + "step": 7500 + }, + { + "epoch": 4.908496732026144, + "grad_norm": 0.769442617893219, + "learning_rate": 0.0002, + "loss": 1.0419, + "step": 7510 + }, + { + "epoch": 4.915032679738562, + "grad_norm": 1.1465084552764893, + "learning_rate": 0.0002, + "loss": 1.1671, + "step": 7520 + }, + { + "epoch": 4.921568627450981, + "grad_norm": 1.253214955329895, + "learning_rate": 0.0002, + "loss": 1.0768, + "step": 7530 + }, + { + "epoch": 4.928104575163399, + "grad_norm": 0.7922375202178955, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 7540 + }, + { + "epoch": 4.934640522875817, + "grad_norm": 0.8306851387023926, + "learning_rate": 0.0002, + "loss": 1.1256, + "step": 7550 + }, + { + "epoch": 4.9411764705882355, + "grad_norm": 0.8486151099205017, + "learning_rate": 0.0002, + "loss": 1.206, + "step": 7560 + }, + { + "epoch": 4.947712418300654, + "grad_norm": 1.2601467370986938, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 7570 + }, + { + "epoch": 4.954248366013072, + "grad_norm": 0.7980747818946838, + "learning_rate": 0.0002, + "loss": 1.1078, + "step": 7580 + }, + { + "epoch": 4.96078431372549, + "grad_norm": 0.8653254508972168, + "learning_rate": 0.0002, + "loss": 1.0607, + "step": 7590 + }, + { + "epoch": 4.967320261437909, + "grad_norm": 0.9680571556091309, + "learning_rate": 0.0002, + "loss": 1.0292, + "step": 7600 + }, + { + "epoch": 4.973856209150327, + "grad_norm": 0.9554466605186462, + "learning_rate": 0.0002, + "loss": 1.1795, + "step": 7610 + }, + { + "epoch": 4.980392156862745, + "grad_norm": 1.3693897724151611, + "learning_rate": 0.0002, + "loss": 1.0935, + "step": 7620 + }, + { + "epoch": 4.9869281045751634, + "grad_norm": 0.7809282541275024, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 7630 + }, + { + "epoch": 4.993464052287582, + "grad_norm": 0.7528006434440613, + "learning_rate": 0.0002, + "loss": 1.0844, + "step": 7640 + }, + { + "epoch": 5.0, + "grad_norm": 1.7491309642791748, + "learning_rate": 0.0002, + "loss": 0.9951, + "step": 7650 + }, + { + "epoch": 5.0, + "eval_loss": 1.4197258949279785, + "eval_runtime": 33.6327, + "eval_samples_per_second": 12.964, + "eval_steps_per_second": 1.635, + "step": 7650 + } + ], + "logging_steps": 10, + "max_steps": 12240, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.34982719635456e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..35e722282419bcef977427e4d3675fe3b94ec688 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc25f531ee37172f22a819ab79094fe89aae41504e4c8b696743b5e23d9e7641 +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0a815a13a8a67babedd2d8bbc4583115dc94bdcc --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fbd4b8470273ecd80cc3a19d933def3cdeffe6e1729bef1ba024865e4886a51 +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1dc76144022dd2b16db1d8ed4a853c1f26910035 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80e58d2e4b8f746e1207bfe178bcfe44664c6c98e027ad47204c8b6738773847 +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..56eef58f5144cf90942b24fa10bf716b36463061 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78bd575ea1ef2db1c66988ade36263f1e07ea2252e737172bd8ae82ecf081c4a +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e90f720eb8a510417707100c1ea1a627e13e392 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77fc51c9f2de817ea625082e092892004411da15fca41ad0dc86bc55cb1abd77 +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..452e63cd783d5ca3a2a3136c5f42c705ab75dd78 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/trainer_state.json @@ -0,0 +1,6507 @@ +{ + "best_metric": 1.4113320112228394, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 9180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006535947712418301, + "grad_norm": 1.5105072259902954, + "learning_rate": 0.0002, + "loss": 4.7451, + "step": 10 + }, + { + "epoch": 0.013071895424836602, + "grad_norm": 2.1156165599823, + "learning_rate": 0.0002, + "loss": 3.3158, + "step": 20 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 1.0578808784484863, + "learning_rate": 0.0002, + "loss": 2.643, + "step": 30 + }, + { + "epoch": 0.026143790849673203, + "grad_norm": 2.725064516067505, + "learning_rate": 0.0002, + "loss": 2.3948, + "step": 40 + }, + { + "epoch": 0.032679738562091505, + "grad_norm": 2.9575750827789307, + "learning_rate": 0.0002, + "loss": 2.3134, + "step": 50 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 1.2158117294311523, + "learning_rate": 0.0002, + "loss": 2.2778, + "step": 60 + }, + { + "epoch": 0.0457516339869281, + "grad_norm": 1.0850954055786133, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05228758169934641, + "grad_norm": 1.299196720123291, + "learning_rate": 0.0002, + "loss": 1.8872, + "step": 80 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 0.8310191035270691, + "learning_rate": 0.0002, + "loss": 1.947, + "step": 90 + }, + { + "epoch": 0.06535947712418301, + "grad_norm": 0.9854435920715332, + "learning_rate": 0.0002, + "loss": 1.9098, + "step": 100 + }, + { + "epoch": 0.0718954248366013, + "grad_norm": 0.7951157689094543, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 110 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 0.7593062520027161, + "learning_rate": 0.0002, + "loss": 1.9035, + "step": 120 + }, + { + "epoch": 0.08496732026143791, + "grad_norm": 0.6783032417297363, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 130 + }, + { + "epoch": 0.0915032679738562, + "grad_norm": 0.8350756764411926, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 140 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 1.0203173160552979, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 150 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 0.8820539712905884, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 160 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.7286128997802734, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 170 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.7874041795730591, + "learning_rate": 0.0002, + "loss": 1.8841, + "step": 180 + }, + { + "epoch": 0.12418300653594772, + "grad_norm": 0.6630475521087646, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 190 + }, + { + "epoch": 0.13071895424836602, + "grad_norm": 0.686413586139679, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 200 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 0.7793629765510559, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 210 + }, + { + "epoch": 0.1437908496732026, + "grad_norm": 0.6893141865730286, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 220 + }, + { + "epoch": 0.1503267973856209, + "grad_norm": 0.5804724097251892, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 230 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 0.6053574085235596, + "learning_rate": 0.0002, + "loss": 1.5578, + "step": 240 + }, + { + "epoch": 0.16339869281045752, + "grad_norm": 0.7566025853157043, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 250 + }, + { + "epoch": 0.16993464052287582, + "grad_norm": 0.6112990975379944, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 260 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.6839066743850708, + "learning_rate": 0.0002, + "loss": 1.5564, + "step": 270 + }, + { + "epoch": 0.1830065359477124, + "grad_norm": 0.6368117928504944, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 280 + }, + { + "epoch": 0.1895424836601307, + "grad_norm": 0.6144475936889648, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 290 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 0.6743767261505127, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 300 + }, + { + "epoch": 0.20261437908496732, + "grad_norm": 0.6807955503463745, + "learning_rate": 0.0002, + "loss": 1.421, + "step": 310 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 0.6717963814735413, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 320 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 0.5917780995368958, + "learning_rate": 0.0002, + "loss": 1.6842, + "step": 330 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.6783658862113953, + "learning_rate": 0.0002, + "loss": 1.6264, + "step": 340 + }, + { + "epoch": 0.22875816993464052, + "grad_norm": 0.5820256471633911, + "learning_rate": 0.0002, + "loss": 1.4635, + "step": 350 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.5345938801765442, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 360 + }, + { + "epoch": 0.24183006535947713, + "grad_norm": 0.755929172039032, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 370 + }, + { + "epoch": 0.24836601307189543, + "grad_norm": 0.6183189749717712, + "learning_rate": 0.0002, + "loss": 1.5177, + "step": 380 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 0.7277782559394836, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 390 + }, + { + "epoch": 0.26143790849673204, + "grad_norm": 0.9998756051063538, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 400 + }, + { + "epoch": 0.2679738562091503, + "grad_norm": 0.7523853778839111, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 410 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 0.6548714637756348, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 420 + }, + { + "epoch": 0.28104575163398693, + "grad_norm": 0.6979796290397644, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 430 + }, + { + "epoch": 0.2875816993464052, + "grad_norm": 0.840915322303772, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 440 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.6142978072166443, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 450 + }, + { + "epoch": 0.3006535947712418, + "grad_norm": 0.9482691884040833, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 460 + }, + { + "epoch": 0.30718954248366015, + "grad_norm": 0.7001156806945801, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 470 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.6665455102920532, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 480 + }, + { + "epoch": 0.3202614379084967, + "grad_norm": 0.6012697815895081, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 490 + }, + { + "epoch": 0.32679738562091504, + "grad_norm": 0.8770062327384949, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 500 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.7029962539672852, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 510 + }, + { + "epoch": 0.33986928104575165, + "grad_norm": 0.6682832837104797, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 520 + }, + { + "epoch": 0.3464052287581699, + "grad_norm": 0.5548969507217407, + "learning_rate": 0.0002, + "loss": 1.4242, + "step": 530 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.6640702486038208, + "learning_rate": 0.0002, + "loss": 1.5081, + "step": 540 + }, + { + "epoch": 0.35947712418300654, + "grad_norm": 0.656292200088501, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 550 + }, + { + "epoch": 0.3660130718954248, + "grad_norm": 0.618910551071167, + "learning_rate": 0.0002, + "loss": 1.5415, + "step": 560 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 0.644859790802002, + "learning_rate": 0.0002, + "loss": 1.5178, + "step": 570 + }, + { + "epoch": 0.3790849673202614, + "grad_norm": 0.679042398929596, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 580 + }, + { + "epoch": 0.38562091503267976, + "grad_norm": 0.980681836605072, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 590 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.632219672203064, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 600 + }, + { + "epoch": 0.39869281045751637, + "grad_norm": 0.7003744840621948, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 610 + }, + { + "epoch": 0.40522875816993464, + "grad_norm": 0.7090577483177185, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 620 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 0.657819926738739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 630 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 0.7034208178520203, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 640 + }, + { + "epoch": 0.42483660130718953, + "grad_norm": 0.7274866104125977, + "learning_rate": 0.0002, + "loss": 1.5357, + "step": 650 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 0.5876233577728271, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 660 + }, + { + "epoch": 0.43790849673202614, + "grad_norm": 0.595494270324707, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 670 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.8253804445266724, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 680 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 0.652225911617279, + "learning_rate": 0.0002, + "loss": 1.5199, + "step": 690 + }, + { + "epoch": 0.45751633986928103, + "grad_norm": 0.6242014169692993, + "learning_rate": 0.0002, + "loss": 1.5419, + "step": 700 + }, + { + "epoch": 0.46405228758169936, + "grad_norm": 0.7283986210823059, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 710 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.7016081213951111, + "learning_rate": 0.0002, + "loss": 1.43, + "step": 720 + }, + { + "epoch": 0.477124183006536, + "grad_norm": 0.5211893916130066, + "learning_rate": 0.0002, + "loss": 1.4626, + "step": 730 + }, + { + "epoch": 0.48366013071895425, + "grad_norm": 0.6221150159835815, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 740 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 0.76594477891922, + "learning_rate": 0.0002, + "loss": 1.5677, + "step": 750 + }, + { + "epoch": 0.49673202614379086, + "grad_norm": 0.5777859091758728, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 760 + }, + { + "epoch": 0.5032679738562091, + "grad_norm": 0.5793519616127014, + "learning_rate": 0.0002, + "loss": 1.5253, + "step": 770 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 0.5425786375999451, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 780 + }, + { + "epoch": 0.5163398692810458, + "grad_norm": 0.6004197001457214, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 790 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 0.7167016863822937, + "learning_rate": 0.0002, + "loss": 1.5346, + "step": 800 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.710218071937561, + "learning_rate": 0.0002, + "loss": 1.48, + "step": 810 + }, + { + "epoch": 0.5359477124183006, + "grad_norm": 0.699528694152832, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 820 + }, + { + "epoch": 0.5424836601307189, + "grad_norm": 0.579629123210907, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 830 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 0.595407247543335, + "learning_rate": 0.0002, + "loss": 1.3894, + "step": 840 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.544563889503479, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 850 + }, + { + "epoch": 0.5620915032679739, + "grad_norm": 0.553166389465332, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 860 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 0.5645018815994263, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 870 + }, + { + "epoch": 0.5751633986928104, + "grad_norm": 0.6576932668685913, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 880 + }, + { + "epoch": 0.5816993464052288, + "grad_norm": 0.6684197187423706, + "learning_rate": 0.0002, + "loss": 1.5891, + "step": 890 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.6706975698471069, + "learning_rate": 0.0002, + "loss": 1.5348, + "step": 900 + }, + { + "epoch": 0.5947712418300654, + "grad_norm": 0.6762327551841736, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 910 + }, + { + "epoch": 0.6013071895424836, + "grad_norm": 0.764032244682312, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 920 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 0.6996400952339172, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 930 + }, + { + "epoch": 0.6143790849673203, + "grad_norm": 0.686735987663269, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 940 + }, + { + "epoch": 0.6209150326797386, + "grad_norm": 0.6086131930351257, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 950 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.5627856850624084, + "learning_rate": 0.0002, + "loss": 1.4457, + "step": 960 + }, + { + "epoch": 0.6339869281045751, + "grad_norm": 0.5781503319740295, + "learning_rate": 0.0002, + "loss": 1.506, + "step": 970 + }, + { + "epoch": 0.6405228758169934, + "grad_norm": 0.6347246766090393, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 980 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.6581300497055054, + "learning_rate": 0.0002, + "loss": 1.3819, + "step": 990 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 0.8343676924705505, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1000 + }, + { + "epoch": 0.6601307189542484, + "grad_norm": 0.5708910226821899, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 1010 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.6832585334777832, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 1020 + }, + { + "epoch": 0.673202614379085, + "grad_norm": 0.5767837166786194, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1030 + }, + { + "epoch": 0.6797385620915033, + "grad_norm": 0.5637745261192322, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 1040 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 0.8193050026893616, + "learning_rate": 0.0002, + "loss": 1.4325, + "step": 1050 + }, + { + "epoch": 0.6928104575163399, + "grad_norm": 0.6157439351081848, + "learning_rate": 0.0002, + "loss": 1.4196, + "step": 1060 + }, + { + "epoch": 0.6993464052287581, + "grad_norm": 0.7476664781570435, + "learning_rate": 0.0002, + "loss": 1.5547, + "step": 1070 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.8569361567497253, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 1080 + }, + { + "epoch": 0.7124183006535948, + "grad_norm": 0.5671911835670471, + "learning_rate": 0.0002, + "loss": 1.482, + "step": 1090 + }, + { + "epoch": 0.7189542483660131, + "grad_norm": 0.5151128768920898, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 1100 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 0.568037211894989, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 1110 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 0.6756396889686584, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 1120 + }, + { + "epoch": 0.738562091503268, + "grad_norm": 0.638975977897644, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 1130 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 0.7103341221809387, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 1140 + }, + { + "epoch": 0.7516339869281046, + "grad_norm": 0.7403952479362488, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 1150 + }, + { + "epoch": 0.7581699346405228, + "grad_norm": 0.6266511082649231, + "learning_rate": 0.0002, + "loss": 1.4757, + "step": 1160 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.5939070582389832, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1170 + }, + { + "epoch": 0.7712418300653595, + "grad_norm": 0.5735430717468262, + "learning_rate": 0.0002, + "loss": 1.4145, + "step": 1180 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.5155234932899475, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 1190 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.5115423202514648, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 1200 + }, + { + "epoch": 0.7908496732026143, + "grad_norm": 0.693588137626648, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1210 + }, + { + "epoch": 0.7973856209150327, + "grad_norm": 0.5504693984985352, + "learning_rate": 0.0002, + "loss": 1.308, + "step": 1220 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 0.5555992126464844, + "learning_rate": 0.0002, + "loss": 1.5412, + "step": 1230 + }, + { + "epoch": 0.8104575163398693, + "grad_norm": 0.7211785316467285, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1240 + }, + { + "epoch": 0.8169934640522876, + "grad_norm": 0.735003650188446, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 1250 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.5245152711868286, + "learning_rate": 0.0002, + "loss": 1.5836, + "step": 1260 + }, + { + "epoch": 0.8300653594771242, + "grad_norm": 0.5883445739746094, + "learning_rate": 0.0002, + "loss": 1.4505, + "step": 1270 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 0.6835859417915344, + "learning_rate": 0.0002, + "loss": 1.3642, + "step": 1280 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 0.6592142581939697, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 1290 + }, + { + "epoch": 0.8496732026143791, + "grad_norm": 0.6087474226951599, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 1300 + }, + { + "epoch": 0.8562091503267973, + "grad_norm": 0.565387487411499, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 1310 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 0.7363151907920837, + "learning_rate": 0.0002, + "loss": 1.4809, + "step": 1320 + }, + { + "epoch": 0.869281045751634, + "grad_norm": 0.5964524149894714, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 1330 + }, + { + "epoch": 0.8758169934640523, + "grad_norm": 0.5169979929924011, + "learning_rate": 0.0002, + "loss": 1.3284, + "step": 1340 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.7063422799110413, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 1350 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.7261926531791687, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 1360 + }, + { + "epoch": 0.8954248366013072, + "grad_norm": 0.6759744882583618, + "learning_rate": 0.0002, + "loss": 1.3619, + "step": 1370 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 0.675051212310791, + "learning_rate": 0.0002, + "loss": 1.4079, + "step": 1380 + }, + { + "epoch": 0.9084967320261438, + "grad_norm": 0.5613595843315125, + "learning_rate": 0.0002, + "loss": 1.6606, + "step": 1390 + }, + { + "epoch": 0.9150326797385621, + "grad_norm": 0.611732006072998, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 1400 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 0.6365187168121338, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 1410 + }, + { + "epoch": 0.9281045751633987, + "grad_norm": 0.7810426354408264, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1420 + }, + { + "epoch": 0.934640522875817, + "grad_norm": 0.593891441822052, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 1430 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.761585533618927, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 1440 + }, + { + "epoch": 0.9477124183006536, + "grad_norm": 0.6114464998245239, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1450 + }, + { + "epoch": 0.954248366013072, + "grad_norm": 0.601044774055481, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1460 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 0.5484876036643982, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 1470 + }, + { + "epoch": 0.9673202614379085, + "grad_norm": 0.5383428335189819, + "learning_rate": 0.0002, + "loss": 1.4825, + "step": 1480 + }, + { + "epoch": 0.9738562091503268, + "grad_norm": 0.648106575012207, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 1490 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.6847249865531921, + "learning_rate": 0.0002, + "loss": 1.3638, + "step": 1500 + }, + { + "epoch": 0.9869281045751634, + "grad_norm": 0.6361058354377747, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 1510 + }, + { + "epoch": 0.9934640522875817, + "grad_norm": 0.646392285823822, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 1520 + }, + { + "epoch": 1.0, + "grad_norm": 0.5391159057617188, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1530 + }, + { + "epoch": 1.0, + "eval_loss": 1.4715123176574707, + "eval_runtime": 30.5701, + "eval_samples_per_second": 14.262, + "eval_steps_per_second": 1.799, + "step": 1530 + }, + { + "epoch": 1.0065359477124183, + "grad_norm": 0.5468988418579102, + "learning_rate": 0.0002, + "loss": 1.4827, + "step": 1540 + }, + { + "epoch": 1.0130718954248366, + "grad_norm": 0.629940927028656, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 1550 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 0.6411303281784058, + "learning_rate": 0.0002, + "loss": 1.4259, + "step": 1560 + }, + { + "epoch": 1.026143790849673, + "grad_norm": 0.5619024038314819, + "learning_rate": 0.0002, + "loss": 1.3924, + "step": 1570 + }, + { + "epoch": 1.0326797385620916, + "grad_norm": 0.6093462705612183, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1580 + }, + { + "epoch": 1.0392156862745099, + "grad_norm": 0.5543286204338074, + "learning_rate": 0.0002, + "loss": 1.4547, + "step": 1590 + }, + { + "epoch": 1.0457516339869282, + "grad_norm": 0.6079006195068359, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 1600 + }, + { + "epoch": 1.0522875816993464, + "grad_norm": 0.6240813136100769, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 1610 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.6141977310180664, + "learning_rate": 0.0002, + "loss": 1.3504, + "step": 1620 + }, + { + "epoch": 1.065359477124183, + "grad_norm": 0.5920178294181824, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 1630 + }, + { + "epoch": 1.0718954248366013, + "grad_norm": 0.47620782256126404, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 1640 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 0.6826292872428894, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 1650 + }, + { + "epoch": 1.0849673202614378, + "grad_norm": 0.6182006597518921, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 1660 + }, + { + "epoch": 1.091503267973856, + "grad_norm": 0.57639479637146, + "learning_rate": 0.0002, + "loss": 1.2907, + "step": 1670 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 0.6696860194206238, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 1680 + }, + { + "epoch": 1.1045751633986929, + "grad_norm": 0.699221670627594, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 1690 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.7138059139251709, + "learning_rate": 0.0002, + "loss": 1.3667, + "step": 1700 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 0.6930422186851501, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 1710 + }, + { + "epoch": 1.1241830065359477, + "grad_norm": 0.7484048008918762, + "learning_rate": 0.0002, + "loss": 1.5033, + "step": 1720 + }, + { + "epoch": 1.130718954248366, + "grad_norm": 0.5820090174674988, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 1730 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 0.7143406867980957, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1740 + }, + { + "epoch": 1.1437908496732025, + "grad_norm": 0.5597584247589111, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 1750 + }, + { + "epoch": 1.1503267973856208, + "grad_norm": 0.5171173214912415, + "learning_rate": 0.0002, + "loss": 1.5403, + "step": 1760 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 0.5951920747756958, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1770 + }, + { + "epoch": 1.1633986928104576, + "grad_norm": 0.7506247758865356, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 1780 + }, + { + "epoch": 1.1699346405228759, + "grad_norm": 0.5936487913131714, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 1790 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.688450038433075, + "learning_rate": 0.0002, + "loss": 1.3567, + "step": 1800 + }, + { + "epoch": 1.1830065359477124, + "grad_norm": 0.671623170375824, + "learning_rate": 0.0002, + "loss": 1.314, + "step": 1810 + }, + { + "epoch": 1.1895424836601307, + "grad_norm": 0.6911860704421997, + "learning_rate": 0.0002, + "loss": 1.3803, + "step": 1820 + }, + { + "epoch": 1.196078431372549, + "grad_norm": 0.60726398229599, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 1830 + }, + { + "epoch": 1.2026143790849673, + "grad_norm": 0.7542088627815247, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 1840 + }, + { + "epoch": 1.2091503267973855, + "grad_norm": 0.6810969710350037, + "learning_rate": 0.0002, + "loss": 1.4343, + "step": 1850 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 0.579741895198822, + "learning_rate": 0.0002, + "loss": 1.446, + "step": 1860 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.9925695657730103, + "learning_rate": 0.0002, + "loss": 1.4564, + "step": 1870 + }, + { + "epoch": 1.2287581699346406, + "grad_norm": 0.5919767618179321, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 1880 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.7377090454101562, + "learning_rate": 0.0002, + "loss": 1.5015, + "step": 1890 + }, + { + "epoch": 1.2418300653594772, + "grad_norm": 0.5753688812255859, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 1900 + }, + { + "epoch": 1.2483660130718954, + "grad_norm": 0.6362486481666565, + "learning_rate": 0.0002, + "loss": 1.3543, + "step": 1910 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 0.5747467875480652, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1920 + }, + { + "epoch": 1.261437908496732, + "grad_norm": 0.6831939220428467, + "learning_rate": 0.0002, + "loss": 1.5082, + "step": 1930 + }, + { + "epoch": 1.2679738562091503, + "grad_norm": 0.6414040327072144, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 1940 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 0.5613330006599426, + "learning_rate": 0.0002, + "loss": 1.5099, + "step": 1950 + }, + { + "epoch": 1.2810457516339868, + "grad_norm": 0.5838454961776733, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 1960 + }, + { + "epoch": 1.287581699346405, + "grad_norm": 0.5367192029953003, + "learning_rate": 0.0002, + "loss": 1.3548, + "step": 1970 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 0.5829346776008606, + "learning_rate": 0.0002, + "loss": 1.4602, + "step": 1980 + }, + { + "epoch": 1.3006535947712419, + "grad_norm": 0.756534218788147, + "learning_rate": 0.0002, + "loss": 1.3821, + "step": 1990 + }, + { + "epoch": 1.3071895424836601, + "grad_norm": 0.48002561926841736, + "learning_rate": 0.0002, + "loss": 1.389, + "step": 2000 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 0.5461082458496094, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 2010 + }, + { + "epoch": 1.3202614379084967, + "grad_norm": 0.570399284362793, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 2020 + }, + { + "epoch": 1.326797385620915, + "grad_norm": 0.5130975842475891, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2030 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.6290071606636047, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 2040 + }, + { + "epoch": 1.3398692810457518, + "grad_norm": 0.6165726184844971, + "learning_rate": 0.0002, + "loss": 1.3873, + "step": 2050 + }, + { + "epoch": 1.34640522875817, + "grad_norm": 0.5302083492279053, + "learning_rate": 0.0002, + "loss": 1.4376, + "step": 2060 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 0.6531406044960022, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 2070 + }, + { + "epoch": 1.3594771241830066, + "grad_norm": 0.5981236100196838, + "learning_rate": 0.0002, + "loss": 1.3632, + "step": 2080 + }, + { + "epoch": 1.3660130718954249, + "grad_norm": 0.8534150123596191, + "learning_rate": 0.0002, + "loss": 1.4846, + "step": 2090 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 0.695918083190918, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 2100 + }, + { + "epoch": 1.3790849673202614, + "grad_norm": 0.5830431580543518, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2110 + }, + { + "epoch": 1.3856209150326797, + "grad_norm": 0.5641306638717651, + "learning_rate": 0.0002, + "loss": 1.5009, + "step": 2120 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 0.6354436874389648, + "learning_rate": 0.0002, + "loss": 1.3985, + "step": 2130 + }, + { + "epoch": 1.3986928104575163, + "grad_norm": 0.5707540512084961, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 2140 + }, + { + "epoch": 1.4052287581699345, + "grad_norm": 0.7308434844017029, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 2150 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.5879750847816467, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2160 + }, + { + "epoch": 1.4183006535947713, + "grad_norm": 0.627909243106842, + "learning_rate": 0.0002, + "loss": 1.3729, + "step": 2170 + }, + { + "epoch": 1.4248366013071896, + "grad_norm": 0.5228193998336792, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 2180 + }, + { + "epoch": 1.4313725490196079, + "grad_norm": 0.6162880659103394, + "learning_rate": 0.0002, + "loss": 1.457, + "step": 2190 + }, + { + "epoch": 1.4379084967320261, + "grad_norm": 0.751610517501831, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 2200 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.5623487234115601, + "learning_rate": 0.0002, + "loss": 1.4105, + "step": 2210 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 0.5293187499046326, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 2220 + }, + { + "epoch": 1.457516339869281, + "grad_norm": 0.5903629660606384, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 2230 + }, + { + "epoch": 1.4640522875816995, + "grad_norm": 0.6084659099578857, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 2240 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 0.5289803147315979, + "learning_rate": 0.0002, + "loss": 1.319, + "step": 2250 + }, + { + "epoch": 1.477124183006536, + "grad_norm": 0.49499568343162537, + "learning_rate": 0.0002, + "loss": 1.3106, + "step": 2260 + }, + { + "epoch": 1.4836601307189543, + "grad_norm": 0.7774190306663513, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 2270 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 0.5932538509368896, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 2280 + }, + { + "epoch": 1.4967320261437909, + "grad_norm": 0.6009492874145508, + "learning_rate": 0.0002, + "loss": 1.3241, + "step": 2290 + }, + { + "epoch": 1.5032679738562091, + "grad_norm": 0.5559343099594116, + "learning_rate": 0.0002, + "loss": 1.3728, + "step": 2300 + }, + { + "epoch": 1.5098039215686274, + "grad_norm": 0.5956196188926697, + "learning_rate": 0.0002, + "loss": 1.2379, + "step": 2310 + }, + { + "epoch": 1.5163398692810457, + "grad_norm": 0.5624083876609802, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 2320 + }, + { + "epoch": 1.522875816993464, + "grad_norm": 0.7195250391960144, + "learning_rate": 0.0002, + "loss": 1.4779, + "step": 2330 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 0.6010490655899048, + "learning_rate": 0.0002, + "loss": 1.2938, + "step": 2340 + }, + { + "epoch": 1.5359477124183005, + "grad_norm": 0.664929211139679, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 2350 + }, + { + "epoch": 1.5424836601307188, + "grad_norm": 0.5158776640892029, + "learning_rate": 0.0002, + "loss": 1.4362, + "step": 2360 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 0.5147154927253723, + "learning_rate": 0.0002, + "loss": 1.2157, + "step": 2370 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.6507977843284607, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 2380 + }, + { + "epoch": 1.5620915032679739, + "grad_norm": 0.5193192362785339, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 2390 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.5982314944267273, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 2400 + }, + { + "epoch": 1.5751633986928104, + "grad_norm": 0.49106258153915405, + "learning_rate": 0.0002, + "loss": 1.3585, + "step": 2410 + }, + { + "epoch": 1.581699346405229, + "grad_norm": 0.6459611654281616, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 2420 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 0.7038363218307495, + "learning_rate": 0.0002, + "loss": 1.3305, + "step": 2430 + }, + { + "epoch": 1.5947712418300655, + "grad_norm": 0.5245680212974548, + "learning_rate": 0.0002, + "loss": 1.3198, + "step": 2440 + }, + { + "epoch": 1.6013071895424837, + "grad_norm": 0.6562076210975647, + "learning_rate": 0.0002, + "loss": 1.4756, + "step": 2450 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 0.6491968035697937, + "learning_rate": 0.0002, + "loss": 1.5635, + "step": 2460 + }, + { + "epoch": 1.6143790849673203, + "grad_norm": 0.604034960269928, + "learning_rate": 0.0002, + "loss": 1.3657, + "step": 2470 + }, + { + "epoch": 1.6209150326797386, + "grad_norm": 0.5759671330451965, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 2480 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 0.6157698631286621, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2490 + }, + { + "epoch": 1.6339869281045751, + "grad_norm": 0.6513794660568237, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 2500 + }, + { + "epoch": 1.6405228758169934, + "grad_norm": 0.71990966796875, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 2510 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 0.7316617369651794, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 2520 + }, + { + "epoch": 1.65359477124183, + "grad_norm": 0.5475177764892578, + "learning_rate": 0.0002, + "loss": 1.3119, + "step": 2530 + }, + { + "epoch": 1.6601307189542482, + "grad_norm": 0.4911293089389801, + "learning_rate": 0.0002, + "loss": 1.2998, + "step": 2540 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.6122882962226868, + "learning_rate": 0.0002, + "loss": 1.4198, + "step": 2550 + }, + { + "epoch": 1.673202614379085, + "grad_norm": 0.5735281705856323, + "learning_rate": 0.0002, + "loss": 1.3099, + "step": 2560 + }, + { + "epoch": 1.6797385620915033, + "grad_norm": 0.5046352744102478, + "learning_rate": 0.0002, + "loss": 1.2205, + "step": 2570 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 0.6043242812156677, + "learning_rate": 0.0002, + "loss": 1.3191, + "step": 2580 + }, + { + "epoch": 1.6928104575163399, + "grad_norm": 0.5397698283195496, + "learning_rate": 0.0002, + "loss": 1.3079, + "step": 2590 + }, + { + "epoch": 1.6993464052287581, + "grad_norm": 0.8066475987434387, + "learning_rate": 0.0002, + "loss": 1.4916, + "step": 2600 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 0.52901691198349, + "learning_rate": 0.0002, + "loss": 1.3703, + "step": 2610 + }, + { + "epoch": 1.712418300653595, + "grad_norm": 0.7588503956794739, + "learning_rate": 0.0002, + "loss": 1.409, + "step": 2620 + }, + { + "epoch": 1.7189542483660132, + "grad_norm": 0.6012966632843018, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 2630 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 0.5927302837371826, + "learning_rate": 0.0002, + "loss": 1.2583, + "step": 2640 + }, + { + "epoch": 1.7320261437908497, + "grad_norm": 0.5086990594863892, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 2650 + }, + { + "epoch": 1.738562091503268, + "grad_norm": 0.6000628471374512, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2660 + }, + { + "epoch": 1.7450980392156863, + "grad_norm": 0.6560431718826294, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 2670 + }, + { + "epoch": 1.7516339869281046, + "grad_norm": 0.5738165378570557, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 2680 + }, + { + "epoch": 1.7581699346405228, + "grad_norm": 0.5576106905937195, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 2690 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.7298802137374878, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 2700 + }, + { + "epoch": 1.7712418300653594, + "grad_norm": 0.5751826167106628, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 2710 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.6069957613945007, + "learning_rate": 0.0002, + "loss": 1.35, + "step": 2720 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 0.7513017654418945, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 2730 + }, + { + "epoch": 1.7908496732026142, + "grad_norm": 0.6058869957923889, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 2740 + }, + { + "epoch": 1.7973856209150327, + "grad_norm": 0.6805883049964905, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 2750 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 0.6864324808120728, + "learning_rate": 0.0002, + "loss": 1.4062, + "step": 2760 + }, + { + "epoch": 1.8104575163398693, + "grad_norm": 0.6261002421379089, + "learning_rate": 0.0002, + "loss": 1.355, + "step": 2770 + }, + { + "epoch": 1.8169934640522876, + "grad_norm": 0.532684862613678, + "learning_rate": 0.0002, + "loss": 1.5145, + "step": 2780 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 0.6209020018577576, + "learning_rate": 0.0002, + "loss": 1.3248, + "step": 2790 + }, + { + "epoch": 1.8300653594771243, + "grad_norm": 0.67111736536026, + "learning_rate": 0.0002, + "loss": 1.3908, + "step": 2800 + }, + { + "epoch": 1.8366013071895426, + "grad_norm": 0.700467586517334, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 2810 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 0.6968029141426086, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 2820 + }, + { + "epoch": 1.8496732026143792, + "grad_norm": 0.6405863761901855, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 2830 + }, + { + "epoch": 1.8562091503267975, + "grad_norm": 0.5192584991455078, + "learning_rate": 0.0002, + "loss": 1.4035, + "step": 2840 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 0.4888569414615631, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 2850 + }, + { + "epoch": 1.869281045751634, + "grad_norm": 0.7625455856323242, + "learning_rate": 0.0002, + "loss": 1.4324, + "step": 2860 + }, + { + "epoch": 1.8758169934640523, + "grad_norm": 0.9162808656692505, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2870 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.5472783446311951, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2880 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.5221137404441833, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 2890 + }, + { + "epoch": 1.8954248366013071, + "grad_norm": 0.49258849024772644, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 2900 + }, + { + "epoch": 1.9019607843137254, + "grad_norm": 0.5260750651359558, + "learning_rate": 0.0002, + "loss": 1.3503, + "step": 2910 + }, + { + "epoch": 1.9084967320261437, + "grad_norm": 0.6583314538002014, + "learning_rate": 0.0002, + "loss": 1.3381, + "step": 2920 + }, + { + "epoch": 1.915032679738562, + "grad_norm": 0.5728915929794312, + "learning_rate": 0.0002, + "loss": 1.356, + "step": 2930 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 0.7661453485488892, + "learning_rate": 0.0002, + "loss": 1.3993, + "step": 2940 + }, + { + "epoch": 1.9281045751633987, + "grad_norm": 0.7193911075592041, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2950 + }, + { + "epoch": 1.934640522875817, + "grad_norm": 0.5007768869400024, + "learning_rate": 0.0002, + "loss": 1.287, + "step": 2960 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.626681923866272, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2970 + }, + { + "epoch": 1.9477124183006536, + "grad_norm": 0.8692840933799744, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 2980 + }, + { + "epoch": 1.954248366013072, + "grad_norm": 0.6388291120529175, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 2990 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.7710477113723755, + "learning_rate": 0.0002, + "loss": 1.4593, + "step": 3000 + }, + { + "epoch": 1.9673202614379086, + "grad_norm": 0.641704261302948, + "learning_rate": 0.0002, + "loss": 1.5228, + "step": 3010 + }, + { + "epoch": 1.973856209150327, + "grad_norm": 0.621148943901062, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 3020 + }, + { + "epoch": 1.9803921568627452, + "grad_norm": 0.5119547247886658, + "learning_rate": 0.0002, + "loss": 1.3017, + "step": 3030 + }, + { + "epoch": 1.9869281045751634, + "grad_norm": 0.8104137778282166, + "learning_rate": 0.0002, + "loss": 1.4923, + "step": 3040 + }, + { + "epoch": 1.9934640522875817, + "grad_norm": 0.5856240391731262, + "learning_rate": 0.0002, + "loss": 1.3331, + "step": 3050 + }, + { + "epoch": 2.0, + "grad_norm": 0.5263566374778748, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3060 + }, + { + "epoch": 2.0, + "eval_loss": 1.4276371002197266, + "eval_runtime": 30.5759, + "eval_samples_per_second": 14.26, + "eval_steps_per_second": 1.799, + "step": 3060 + }, + { + "epoch": 2.0065359477124183, + "grad_norm": 0.5143898725509644, + "learning_rate": 0.0002, + "loss": 1.1636, + "step": 3070 + }, + { + "epoch": 2.0130718954248366, + "grad_norm": 0.5749367475509644, + "learning_rate": 0.0002, + "loss": 1.3335, + "step": 3080 + }, + { + "epoch": 2.019607843137255, + "grad_norm": 0.5784284472465515, + "learning_rate": 0.0002, + "loss": 1.2784, + "step": 3090 + }, + { + "epoch": 2.026143790849673, + "grad_norm": 0.5933429598808289, + "learning_rate": 0.0002, + "loss": 1.2463, + "step": 3100 + }, + { + "epoch": 2.0326797385620914, + "grad_norm": 0.6748974919319153, + "learning_rate": 0.0002, + "loss": 1.2984, + "step": 3110 + }, + { + "epoch": 2.0392156862745097, + "grad_norm": 0.626399576663971, + "learning_rate": 0.0002, + "loss": 1.2307, + "step": 3120 + }, + { + "epoch": 2.045751633986928, + "grad_norm": 0.6173238754272461, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 3130 + }, + { + "epoch": 2.052287581699346, + "grad_norm": 0.807790219783783, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3140 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 0.6222215890884399, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 3150 + }, + { + "epoch": 2.065359477124183, + "grad_norm": 0.5859580636024475, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 3160 + }, + { + "epoch": 2.0718954248366015, + "grad_norm": 0.581304132938385, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 3170 + }, + { + "epoch": 2.0784313725490198, + "grad_norm": 0.9814971089363098, + "learning_rate": 0.0002, + "loss": 1.1212, + "step": 3180 + }, + { + "epoch": 2.084967320261438, + "grad_norm": 0.6491848230361938, + "learning_rate": 0.0002, + "loss": 1.1962, + "step": 3190 + }, + { + "epoch": 2.0915032679738563, + "grad_norm": 0.613680362701416, + "learning_rate": 0.0002, + "loss": 1.3711, + "step": 3200 + }, + { + "epoch": 2.0980392156862746, + "grad_norm": 0.7318086624145508, + "learning_rate": 0.0002, + "loss": 1.2994, + "step": 3210 + }, + { + "epoch": 2.104575163398693, + "grad_norm": 0.6025661826133728, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 3220 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.6744484305381775, + "learning_rate": 0.0002, + "loss": 1.1374, + "step": 3230 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 0.6062554121017456, + "learning_rate": 0.0002, + "loss": 1.3273, + "step": 3240 + }, + { + "epoch": 2.1241830065359477, + "grad_norm": 0.6801803112030029, + "learning_rate": 0.0002, + "loss": 1.3404, + "step": 3250 + }, + { + "epoch": 2.130718954248366, + "grad_norm": 0.5218925476074219, + "learning_rate": 0.0002, + "loss": 1.4084, + "step": 3260 + }, + { + "epoch": 2.1372549019607843, + "grad_norm": 0.7494263648986816, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 3270 + }, + { + "epoch": 2.1437908496732025, + "grad_norm": 0.7858565449714661, + "learning_rate": 0.0002, + "loss": 1.3059, + "step": 3280 + }, + { + "epoch": 2.150326797385621, + "grad_norm": 0.6836692690849304, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3290 + }, + { + "epoch": 2.156862745098039, + "grad_norm": 0.619848370552063, + "learning_rate": 0.0002, + "loss": 1.1605, + "step": 3300 + }, + { + "epoch": 2.1633986928104574, + "grad_norm": 0.5761294364929199, + "learning_rate": 0.0002, + "loss": 1.3095, + "step": 3310 + }, + { + "epoch": 2.1699346405228757, + "grad_norm": 0.4713786542415619, + "learning_rate": 0.0002, + "loss": 1.2883, + "step": 3320 + }, + { + "epoch": 2.176470588235294, + "grad_norm": 0.7613773345947266, + "learning_rate": 0.0002, + "loss": 1.3817, + "step": 3330 + }, + { + "epoch": 2.183006535947712, + "grad_norm": 0.6642718315124512, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 3340 + }, + { + "epoch": 2.189542483660131, + "grad_norm": 0.7162188291549683, + "learning_rate": 0.0002, + "loss": 1.2048, + "step": 3350 + }, + { + "epoch": 2.196078431372549, + "grad_norm": 0.6916783452033997, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 3360 + }, + { + "epoch": 2.2026143790849675, + "grad_norm": 0.7205567955970764, + "learning_rate": 0.0002, + "loss": 1.3788, + "step": 3370 + }, + { + "epoch": 2.2091503267973858, + "grad_norm": 0.6038199067115784, + "learning_rate": 0.0002, + "loss": 1.2528, + "step": 3380 + }, + { + "epoch": 2.215686274509804, + "grad_norm": 0.6284233927726746, + "learning_rate": 0.0002, + "loss": 1.2079, + "step": 3390 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.7450672388076782, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 3400 + }, + { + "epoch": 2.2287581699346406, + "grad_norm": 0.7755052447319031, + "learning_rate": 0.0002, + "loss": 1.3034, + "step": 3410 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 0.9066099524497986, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 3420 + }, + { + "epoch": 2.241830065359477, + "grad_norm": 0.8578207492828369, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 3430 + }, + { + "epoch": 2.2483660130718954, + "grad_norm": 0.5900213718414307, + "learning_rate": 0.0002, + "loss": 1.3278, + "step": 3440 + }, + { + "epoch": 2.2549019607843137, + "grad_norm": 0.7821717262268066, + "learning_rate": 0.0002, + "loss": 1.3645, + "step": 3450 + }, + { + "epoch": 2.261437908496732, + "grad_norm": 0.6263150572776794, + "learning_rate": 0.0002, + "loss": 1.183, + "step": 3460 + }, + { + "epoch": 2.2679738562091503, + "grad_norm": 0.591799259185791, + "learning_rate": 0.0002, + "loss": 1.178, + "step": 3470 + }, + { + "epoch": 2.2745098039215685, + "grad_norm": 0.5999799966812134, + "learning_rate": 0.0002, + "loss": 1.2198, + "step": 3480 + }, + { + "epoch": 2.281045751633987, + "grad_norm": 0.6227319240570068, + "learning_rate": 0.0002, + "loss": 1.2724, + "step": 3490 + }, + { + "epoch": 2.287581699346405, + "grad_norm": 0.719412624835968, + "learning_rate": 0.0002, + "loss": 1.3865, + "step": 3500 + }, + { + "epoch": 2.2941176470588234, + "grad_norm": 1.0361769199371338, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 3510 + }, + { + "epoch": 2.3006535947712417, + "grad_norm": 0.5506668090820312, + "learning_rate": 0.0002, + "loss": 1.4834, + "step": 3520 + }, + { + "epoch": 2.30718954248366, + "grad_norm": 0.6886829733848572, + "learning_rate": 0.0002, + "loss": 1.2273, + "step": 3530 + }, + { + "epoch": 2.313725490196078, + "grad_norm": 0.6226346492767334, + "learning_rate": 0.0002, + "loss": 1.2296, + "step": 3540 + }, + { + "epoch": 2.3202614379084965, + "grad_norm": 0.8109908103942871, + "learning_rate": 0.0002, + "loss": 1.3087, + "step": 3550 + }, + { + "epoch": 2.326797385620915, + "grad_norm": 0.8505511283874512, + "learning_rate": 0.0002, + "loss": 1.3311, + "step": 3560 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.5763760209083557, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 3570 + }, + { + "epoch": 2.3398692810457518, + "grad_norm": 0.6460059881210327, + "learning_rate": 0.0002, + "loss": 1.4135, + "step": 3580 + }, + { + "epoch": 2.34640522875817, + "grad_norm": 0.7175343036651611, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 3590 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.6012630462646484, + "learning_rate": 0.0002, + "loss": 1.2645, + "step": 3600 + }, + { + "epoch": 2.3594771241830066, + "grad_norm": 0.6513685584068298, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3610 + }, + { + "epoch": 2.366013071895425, + "grad_norm": 0.7465183734893799, + "learning_rate": 0.0002, + "loss": 1.3271, + "step": 3620 + }, + { + "epoch": 2.372549019607843, + "grad_norm": 0.6413124203681946, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3630 + }, + { + "epoch": 2.3790849673202614, + "grad_norm": 0.7209562063217163, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 3640 + }, + { + "epoch": 2.3856209150326797, + "grad_norm": 0.6427558660507202, + "learning_rate": 0.0002, + "loss": 1.1616, + "step": 3650 + }, + { + "epoch": 2.392156862745098, + "grad_norm": 0.593958854675293, + "learning_rate": 0.0002, + "loss": 1.313, + "step": 3660 + }, + { + "epoch": 2.3986928104575163, + "grad_norm": 0.5944608449935913, + "learning_rate": 0.0002, + "loss": 1.2802, + "step": 3670 + }, + { + "epoch": 2.4052287581699345, + "grad_norm": 0.6606248617172241, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3680 + }, + { + "epoch": 2.411764705882353, + "grad_norm": 0.5632851719856262, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 3690 + }, + { + "epoch": 2.418300653594771, + "grad_norm": 0.4976513385772705, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3700 + }, + { + "epoch": 2.4248366013071894, + "grad_norm": 0.6318528056144714, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 3710 + }, + { + "epoch": 2.431372549019608, + "grad_norm": 0.6306707859039307, + "learning_rate": 0.0002, + "loss": 1.1705, + "step": 3720 + }, + { + "epoch": 2.4379084967320264, + "grad_norm": 0.6362553238868713, + "learning_rate": 0.0002, + "loss": 1.3524, + "step": 3730 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.634368896484375, + "learning_rate": 0.0002, + "loss": 1.2345, + "step": 3740 + }, + { + "epoch": 2.450980392156863, + "grad_norm": 0.6623591184616089, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 3750 + }, + { + "epoch": 2.457516339869281, + "grad_norm": 0.6150440573692322, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 3760 + }, + { + "epoch": 2.4640522875816995, + "grad_norm": 0.588935911655426, + "learning_rate": 0.0002, + "loss": 1.2666, + "step": 3770 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 0.7388206124305725, + "learning_rate": 0.0002, + "loss": 1.3918, + "step": 3780 + }, + { + "epoch": 2.477124183006536, + "grad_norm": 0.621825098991394, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 3790 + }, + { + "epoch": 2.4836601307189543, + "grad_norm": 0.7691677212715149, + "learning_rate": 0.0002, + "loss": 1.359, + "step": 3800 + }, + { + "epoch": 2.4901960784313726, + "grad_norm": 1.1661969423294067, + "learning_rate": 0.0002, + "loss": 1.3399, + "step": 3810 + }, + { + "epoch": 2.496732026143791, + "grad_norm": 0.6837884187698364, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 3820 + }, + { + "epoch": 2.503267973856209, + "grad_norm": 0.6978904008865356, + "learning_rate": 0.0002, + "loss": 1.2823, + "step": 3830 + }, + { + "epoch": 2.5098039215686274, + "grad_norm": 0.6121411323547363, + "learning_rate": 0.0002, + "loss": 1.3688, + "step": 3840 + }, + { + "epoch": 2.5163398692810457, + "grad_norm": 0.7813326120376587, + "learning_rate": 0.0002, + "loss": 1.2587, + "step": 3850 + }, + { + "epoch": 2.522875816993464, + "grad_norm": 0.5390260219573975, + "learning_rate": 0.0002, + "loss": 1.1543, + "step": 3860 + }, + { + "epoch": 2.5294117647058822, + "grad_norm": 0.8283252716064453, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3870 + }, + { + "epoch": 2.5359477124183005, + "grad_norm": 0.8527186512947083, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 3880 + }, + { + "epoch": 2.542483660130719, + "grad_norm": 0.8405382633209229, + "learning_rate": 0.0002, + "loss": 1.3469, + "step": 3890 + }, + { + "epoch": 2.549019607843137, + "grad_norm": 0.5650738477706909, + "learning_rate": 0.0002, + "loss": 1.1801, + "step": 3900 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.620121955871582, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 3910 + }, + { + "epoch": 2.5620915032679736, + "grad_norm": 0.5983527898788452, + "learning_rate": 0.0002, + "loss": 1.2524, + "step": 3920 + }, + { + "epoch": 2.568627450980392, + "grad_norm": 0.686623215675354, + "learning_rate": 0.0002, + "loss": 1.4408, + "step": 3930 + }, + { + "epoch": 2.57516339869281, + "grad_norm": 0.6805831789970398, + "learning_rate": 0.0002, + "loss": 1.186, + "step": 3940 + }, + { + "epoch": 2.581699346405229, + "grad_norm": 0.6994825601577759, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 3950 + }, + { + "epoch": 2.588235294117647, + "grad_norm": 0.728549599647522, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 3960 + }, + { + "epoch": 2.5947712418300655, + "grad_norm": 0.775236964225769, + "learning_rate": 0.0002, + "loss": 1.4039, + "step": 3970 + }, + { + "epoch": 2.6013071895424837, + "grad_norm": 0.5057447552680969, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 3980 + }, + { + "epoch": 2.607843137254902, + "grad_norm": 0.6564450263977051, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 3990 + }, + { + "epoch": 2.6143790849673203, + "grad_norm": 0.5342249870300293, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 4000 + }, + { + "epoch": 2.6209150326797386, + "grad_norm": 0.5508961081504822, + "learning_rate": 0.0002, + "loss": 1.3102, + "step": 4010 + }, + { + "epoch": 2.627450980392157, + "grad_norm": 0.5716235637664795, + "learning_rate": 0.0002, + "loss": 1.3636, + "step": 4020 + }, + { + "epoch": 2.633986928104575, + "grad_norm": 0.8049232363700867, + "learning_rate": 0.0002, + "loss": 1.3465, + "step": 4030 + }, + { + "epoch": 2.6405228758169934, + "grad_norm": 0.5574354529380798, + "learning_rate": 0.0002, + "loss": 1.2342, + "step": 4040 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 0.6302093863487244, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 4050 + }, + { + "epoch": 2.65359477124183, + "grad_norm": 1.1868736743927002, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 4060 + }, + { + "epoch": 2.6601307189542482, + "grad_norm": 0.6738120317459106, + "learning_rate": 0.0002, + "loss": 1.1382, + "step": 4070 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.6614423990249634, + "learning_rate": 0.0002, + "loss": 1.2456, + "step": 4080 + }, + { + "epoch": 2.6732026143790852, + "grad_norm": 0.7297604084014893, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 4090 + }, + { + "epoch": 2.6797385620915035, + "grad_norm": 0.9421682357788086, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 4100 + }, + { + "epoch": 2.686274509803922, + "grad_norm": 0.5286222696304321, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 4110 + }, + { + "epoch": 2.69281045751634, + "grad_norm": 0.6849271655082703, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 4120 + }, + { + "epoch": 2.6993464052287583, + "grad_norm": 0.6811320185661316, + "learning_rate": 0.0002, + "loss": 1.2721, + "step": 4130 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 0.4968419373035431, + "learning_rate": 0.0002, + "loss": 1.2897, + "step": 4140 + }, + { + "epoch": 2.712418300653595, + "grad_norm": 0.8074267506599426, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 4150 + }, + { + "epoch": 2.718954248366013, + "grad_norm": 0.6756376028060913, + "learning_rate": 0.0002, + "loss": 1.1759, + "step": 4160 + }, + { + "epoch": 2.7254901960784315, + "grad_norm": 0.6921583414077759, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 4170 + }, + { + "epoch": 2.7320261437908497, + "grad_norm": 0.7049834132194519, + "learning_rate": 0.0002, + "loss": 1.3413, + "step": 4180 + }, + { + "epoch": 2.738562091503268, + "grad_norm": 0.7011390328407288, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 4190 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.6977843642234802, + "learning_rate": 0.0002, + "loss": 1.2364, + "step": 4200 + }, + { + "epoch": 2.7516339869281046, + "grad_norm": 0.6717000603675842, + "learning_rate": 0.0002, + "loss": 1.2533, + "step": 4210 + }, + { + "epoch": 2.758169934640523, + "grad_norm": 1.0223724842071533, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 4220 + }, + { + "epoch": 2.764705882352941, + "grad_norm": 0.6573330760002136, + "learning_rate": 0.0002, + "loss": 1.2451, + "step": 4230 + }, + { + "epoch": 2.7712418300653594, + "grad_norm": 0.6684938073158264, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 4240 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.7426793575286865, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 4250 + }, + { + "epoch": 2.784313725490196, + "grad_norm": 0.557826578617096, + "learning_rate": 0.0002, + "loss": 1.2904, + "step": 4260 + }, + { + "epoch": 2.7908496732026142, + "grad_norm": 0.6669870018959045, + "learning_rate": 0.0002, + "loss": 1.3262, + "step": 4270 + }, + { + "epoch": 2.7973856209150325, + "grad_norm": 0.5349969267845154, + "learning_rate": 0.0002, + "loss": 1.2369, + "step": 4280 + }, + { + "epoch": 2.803921568627451, + "grad_norm": 0.7262802124023438, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 4290 + }, + { + "epoch": 2.810457516339869, + "grad_norm": 0.768211841583252, + "learning_rate": 0.0002, + "loss": 1.3373, + "step": 4300 + }, + { + "epoch": 2.8169934640522873, + "grad_norm": 0.5958252549171448, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 4310 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.8451310396194458, + "learning_rate": 0.0002, + "loss": 1.4113, + "step": 4320 + }, + { + "epoch": 2.8300653594771243, + "grad_norm": 0.6544435024261475, + "learning_rate": 0.0002, + "loss": 1.2454, + "step": 4330 + }, + { + "epoch": 2.8366013071895426, + "grad_norm": 0.6177433133125305, + "learning_rate": 0.0002, + "loss": 1.2777, + "step": 4340 + }, + { + "epoch": 2.843137254901961, + "grad_norm": 0.6324988007545471, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 4350 + }, + { + "epoch": 2.849673202614379, + "grad_norm": 0.6884300708770752, + "learning_rate": 0.0002, + "loss": 1.4117, + "step": 4360 + }, + { + "epoch": 2.8562091503267975, + "grad_norm": 0.8952897191047668, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 4370 + }, + { + "epoch": 2.8627450980392157, + "grad_norm": 1.0260103940963745, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 4380 + }, + { + "epoch": 2.869281045751634, + "grad_norm": 0.9134647250175476, + "learning_rate": 0.0002, + "loss": 1.2893, + "step": 4390 + }, + { + "epoch": 2.8758169934640523, + "grad_norm": 0.5637717843055725, + "learning_rate": 0.0002, + "loss": 1.171, + "step": 4400 + }, + { + "epoch": 2.8823529411764706, + "grad_norm": 0.7530393004417419, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 4410 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.7202680706977844, + "learning_rate": 0.0002, + "loss": 1.29, + "step": 4420 + }, + { + "epoch": 2.895424836601307, + "grad_norm": 0.7177144885063171, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 4430 + }, + { + "epoch": 2.9019607843137254, + "grad_norm": 0.5996816754341125, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 4440 + }, + { + "epoch": 2.9084967320261437, + "grad_norm": 0.6542447209358215, + "learning_rate": 0.0002, + "loss": 1.4816, + "step": 4450 + }, + { + "epoch": 2.915032679738562, + "grad_norm": 1.0753740072250366, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 4460 + }, + { + "epoch": 2.9215686274509802, + "grad_norm": 0.6956136226654053, + "learning_rate": 0.0002, + "loss": 1.3193, + "step": 4470 + }, + { + "epoch": 2.928104575163399, + "grad_norm": 0.7702530026435852, + "learning_rate": 0.0002, + "loss": 1.2486, + "step": 4480 + }, + { + "epoch": 2.9346405228758172, + "grad_norm": 0.7763232588768005, + "learning_rate": 0.0002, + "loss": 1.3371, + "step": 4490 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 0.6393085718154907, + "learning_rate": 0.0002, + "loss": 1.1647, + "step": 4500 + }, + { + "epoch": 2.947712418300654, + "grad_norm": 0.987770676612854, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 4510 + }, + { + "epoch": 2.954248366013072, + "grad_norm": 0.5995016098022461, + "learning_rate": 0.0002, + "loss": 1.1529, + "step": 4520 + }, + { + "epoch": 2.9607843137254903, + "grad_norm": 0.745650053024292, + "learning_rate": 0.0002, + "loss": 1.2358, + "step": 4530 + }, + { + "epoch": 2.9673202614379086, + "grad_norm": 0.7429282069206238, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 4540 + }, + { + "epoch": 2.973856209150327, + "grad_norm": 0.5927486419677734, + "learning_rate": 0.0002, + "loss": 1.2262, + "step": 4550 + }, + { + "epoch": 2.980392156862745, + "grad_norm": 0.6775153875350952, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 4560 + }, + { + "epoch": 2.9869281045751634, + "grad_norm": 0.7128435373306274, + "learning_rate": 0.0002, + "loss": 1.279, + "step": 4570 + }, + { + "epoch": 2.9934640522875817, + "grad_norm": 0.7470937967300415, + "learning_rate": 0.0002, + "loss": 1.2451, + "step": 4580 + }, + { + "epoch": 3.0, + "grad_norm": 0.9295375943183899, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 4590 + }, + { + "epoch": 3.0, + "eval_loss": 1.4131312370300293, + "eval_runtime": 31.8967, + "eval_samples_per_second": 13.669, + "eval_steps_per_second": 1.724, + "step": 4590 + }, + { + "epoch": 3.0065359477124183, + "grad_norm": 0.6926420331001282, + "learning_rate": 0.0002, + "loss": 1.1283, + "step": 4600 + }, + { + "epoch": 3.0130718954248366, + "grad_norm": 0.6656355857849121, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 4610 + }, + { + "epoch": 3.019607843137255, + "grad_norm": 0.9901936650276184, + "learning_rate": 0.0002, + "loss": 1.308, + "step": 4620 + }, + { + "epoch": 3.026143790849673, + "grad_norm": 0.6713474988937378, + "learning_rate": 0.0002, + "loss": 1.22, + "step": 4630 + }, + { + "epoch": 3.0326797385620914, + "grad_norm": 0.6199324131011963, + "learning_rate": 0.0002, + "loss": 1.2249, + "step": 4640 + }, + { + "epoch": 3.0392156862745097, + "grad_norm": 0.7180785536766052, + "learning_rate": 0.0002, + "loss": 1.242, + "step": 4650 + }, + { + "epoch": 3.045751633986928, + "grad_norm": 0.8256588578224182, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 4660 + }, + { + "epoch": 3.052287581699346, + "grad_norm": 0.6637389063835144, + "learning_rate": 0.0002, + "loss": 1.1431, + "step": 4670 + }, + { + "epoch": 3.0588235294117645, + "grad_norm": 0.6980698108673096, + "learning_rate": 0.0002, + "loss": 1.1096, + "step": 4680 + }, + { + "epoch": 3.065359477124183, + "grad_norm": 0.8091534972190857, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 4690 + }, + { + "epoch": 3.0718954248366015, + "grad_norm": 0.5715174078941345, + "learning_rate": 0.0002, + "loss": 1.1652, + "step": 4700 + }, + { + "epoch": 3.0784313725490198, + "grad_norm": 0.735639750957489, + "learning_rate": 0.0002, + "loss": 1.1427, + "step": 4710 + }, + { + "epoch": 3.084967320261438, + "grad_norm": 0.7619708180427551, + "learning_rate": 0.0002, + "loss": 1.1522, + "step": 4720 + }, + { + "epoch": 3.0915032679738563, + "grad_norm": 1.263566017150879, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 4730 + }, + { + "epoch": 3.0980392156862746, + "grad_norm": 0.6600871682167053, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 4740 + }, + { + "epoch": 3.104575163398693, + "grad_norm": 0.717792809009552, + "learning_rate": 0.0002, + "loss": 1.1766, + "step": 4750 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.853714644908905, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 4760 + }, + { + "epoch": 3.1176470588235294, + "grad_norm": 1.1004153490066528, + "learning_rate": 0.0002, + "loss": 1.2031, + "step": 4770 + }, + { + "epoch": 3.1241830065359477, + "grad_norm": 0.8566235899925232, + "learning_rate": 0.0002, + "loss": 1.3295, + "step": 4780 + }, + { + "epoch": 3.130718954248366, + "grad_norm": 0.8315296173095703, + "learning_rate": 0.0002, + "loss": 1.2436, + "step": 4790 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.8020524978637695, + "learning_rate": 0.0002, + "loss": 1.32, + "step": 4800 + }, + { + "epoch": 3.1437908496732025, + "grad_norm": 0.7564275860786438, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 4810 + }, + { + "epoch": 3.150326797385621, + "grad_norm": 0.9077776670455933, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 4820 + }, + { + "epoch": 3.156862745098039, + "grad_norm": 0.6323099732398987, + "learning_rate": 0.0002, + "loss": 1.1399, + "step": 4830 + }, + { + "epoch": 3.1633986928104574, + "grad_norm": 0.6625368595123291, + "learning_rate": 0.0002, + "loss": 1.1983, + "step": 4840 + }, + { + "epoch": 3.1699346405228757, + "grad_norm": 0.8119261860847473, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 4850 + }, + { + "epoch": 3.176470588235294, + "grad_norm": 0.6399450898170471, + "learning_rate": 0.0002, + "loss": 1.0224, + "step": 4860 + }, + { + "epoch": 3.183006535947712, + "grad_norm": 1.0659016370773315, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 4870 + }, + { + "epoch": 3.189542483660131, + "grad_norm": 0.8040369749069214, + "learning_rate": 0.0002, + "loss": 1.2914, + "step": 4880 + }, + { + "epoch": 3.196078431372549, + "grad_norm": 0.7784733176231384, + "learning_rate": 0.0002, + "loss": 1.1996, + "step": 4890 + }, + { + "epoch": 3.2026143790849675, + "grad_norm": 0.9660294651985168, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 4900 + }, + { + "epoch": 3.2091503267973858, + "grad_norm": 1.0676977634429932, + "learning_rate": 0.0002, + "loss": 1.0419, + "step": 4910 + }, + { + "epoch": 3.215686274509804, + "grad_norm": 0.5877565741539001, + "learning_rate": 0.0002, + "loss": 1.0083, + "step": 4920 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.6164032816886902, + "learning_rate": 0.0002, + "loss": 1.1046, + "step": 4930 + }, + { + "epoch": 3.2287581699346406, + "grad_norm": 0.7627606987953186, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 4940 + }, + { + "epoch": 3.235294117647059, + "grad_norm": 0.7442803978919983, + "learning_rate": 0.0002, + "loss": 1.2453, + "step": 4950 + }, + { + "epoch": 3.241830065359477, + "grad_norm": 0.7277812361717224, + "learning_rate": 0.0002, + "loss": 1.1087, + "step": 4960 + }, + { + "epoch": 3.2483660130718954, + "grad_norm": 1.0301902294158936, + "learning_rate": 0.0002, + "loss": 1.2237, + "step": 4970 + }, + { + "epoch": 3.2549019607843137, + "grad_norm": 0.7798232436180115, + "learning_rate": 0.0002, + "loss": 1.1466, + "step": 4980 + }, + { + "epoch": 3.261437908496732, + "grad_norm": 1.210265874862671, + "learning_rate": 0.0002, + "loss": 1.2142, + "step": 4990 + }, + { + "epoch": 3.2679738562091503, + "grad_norm": 0.6677713990211487, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 5000 + }, + { + "epoch": 3.2745098039215685, + "grad_norm": 1.0524500608444214, + "learning_rate": 0.0002, + "loss": 1.3294, + "step": 5010 + }, + { + "epoch": 3.281045751633987, + "grad_norm": 0.7091745734214783, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 5020 + }, + { + "epoch": 3.287581699346405, + "grad_norm": 0.8523224592208862, + "learning_rate": 0.0002, + "loss": 1.1891, + "step": 5030 + }, + { + "epoch": 3.2941176470588234, + "grad_norm": 0.6120608448982239, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 5040 + }, + { + "epoch": 3.3006535947712417, + "grad_norm": 0.7437472939491272, + "learning_rate": 0.0002, + "loss": 1.0603, + "step": 5050 + }, + { + "epoch": 3.30718954248366, + "grad_norm": 0.7611715197563171, + "learning_rate": 0.0002, + "loss": 1.1295, + "step": 5060 + }, + { + "epoch": 3.313725490196078, + "grad_norm": 0.7249704003334045, + "learning_rate": 0.0002, + "loss": 1.0531, + "step": 5070 + }, + { + "epoch": 3.3202614379084965, + "grad_norm": 0.7316247820854187, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 5080 + }, + { + "epoch": 3.326797385620915, + "grad_norm": 0.562412440776825, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 5090 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.7052176594734192, + "learning_rate": 0.0002, + "loss": 1.0736, + "step": 5100 + }, + { + "epoch": 3.3398692810457518, + "grad_norm": 0.7714211344718933, + "learning_rate": 0.0002, + "loss": 1.122, + "step": 5110 + }, + { + "epoch": 3.34640522875817, + "grad_norm": 1.0436055660247803, + "learning_rate": 0.0002, + "loss": 1.1684, + "step": 5120 + }, + { + "epoch": 3.3529411764705883, + "grad_norm": 0.8867271542549133, + "learning_rate": 0.0002, + "loss": 1.0945, + "step": 5130 + }, + { + "epoch": 3.3594771241830066, + "grad_norm": 0.8371267914772034, + "learning_rate": 0.0002, + "loss": 1.159, + "step": 5140 + }, + { + "epoch": 3.366013071895425, + "grad_norm": 0.7257837057113647, + "learning_rate": 0.0002, + "loss": 1.1073, + "step": 5150 + }, + { + "epoch": 3.372549019607843, + "grad_norm": 0.7102002501487732, + "learning_rate": 0.0002, + "loss": 1.1162, + "step": 5160 + }, + { + "epoch": 3.3790849673202614, + "grad_norm": 0.7636350393295288, + "learning_rate": 0.0002, + "loss": 1.2056, + "step": 5170 + }, + { + "epoch": 3.3856209150326797, + "grad_norm": 0.6887359619140625, + "learning_rate": 0.0002, + "loss": 1.0708, + "step": 5180 + }, + { + "epoch": 3.392156862745098, + "grad_norm": 0.8141424655914307, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 5190 + }, + { + "epoch": 3.3986928104575163, + "grad_norm": 0.694423496723175, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 5200 + }, + { + "epoch": 3.4052287581699345, + "grad_norm": 0.914013683795929, + "learning_rate": 0.0002, + "loss": 1.2945, + "step": 5210 + }, + { + "epoch": 3.411764705882353, + "grad_norm": 0.8503239750862122, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 5220 + }, + { + "epoch": 3.418300653594771, + "grad_norm": 0.6196836233139038, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 5230 + }, + { + "epoch": 3.4248366013071894, + "grad_norm": 1.0760811567306519, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 5240 + }, + { + "epoch": 3.431372549019608, + "grad_norm": 0.6524698138237, + "learning_rate": 0.0002, + "loss": 1.1686, + "step": 5250 + }, + { + "epoch": 3.4379084967320264, + "grad_norm": 0.674467921257019, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 5260 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.7690372467041016, + "learning_rate": 0.0002, + "loss": 1.1015, + "step": 5270 + }, + { + "epoch": 3.450980392156863, + "grad_norm": 0.8751813769340515, + "learning_rate": 0.0002, + "loss": 1.2511, + "step": 5280 + }, + { + "epoch": 3.457516339869281, + "grad_norm": 0.750407874584198, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 5290 + }, + { + "epoch": 3.4640522875816995, + "grad_norm": 0.5991823077201843, + "learning_rate": 0.0002, + "loss": 1.0605, + "step": 5300 + }, + { + "epoch": 3.4705882352941178, + "grad_norm": 1.0164772272109985, + "learning_rate": 0.0002, + "loss": 1.2347, + "step": 5310 + }, + { + "epoch": 3.477124183006536, + "grad_norm": 0.8704105019569397, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 5320 + }, + { + "epoch": 3.4836601307189543, + "grad_norm": 0.709102213382721, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 5330 + }, + { + "epoch": 3.4901960784313726, + "grad_norm": 0.6273632049560547, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 5340 + }, + { + "epoch": 3.496732026143791, + "grad_norm": 0.6807359457015991, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 5350 + }, + { + "epoch": 3.503267973856209, + "grad_norm": 0.7085188627243042, + "learning_rate": 0.0002, + "loss": 1.131, + "step": 5360 + }, + { + "epoch": 3.5098039215686274, + "grad_norm": 0.6938307881355286, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 5370 + }, + { + "epoch": 3.5163398692810457, + "grad_norm": 0.8544146418571472, + "learning_rate": 0.0002, + "loss": 1.1397, + "step": 5380 + }, + { + "epoch": 3.522875816993464, + "grad_norm": 0.7889642119407654, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 5390 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.7858421206474304, + "learning_rate": 0.0002, + "loss": 1.1691, + "step": 5400 + }, + { + "epoch": 3.5359477124183005, + "grad_norm": 0.8547123074531555, + "learning_rate": 0.0002, + "loss": 1.2374, + "step": 5410 + }, + { + "epoch": 3.542483660130719, + "grad_norm": 0.8218181133270264, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 5420 + }, + { + "epoch": 3.549019607843137, + "grad_norm": 1.153623342514038, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 5430 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.1321099996566772, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 5440 + }, + { + "epoch": 3.5620915032679736, + "grad_norm": 0.9495334029197693, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 5450 + }, + { + "epoch": 3.568627450980392, + "grad_norm": 0.8743821978569031, + "learning_rate": 0.0002, + "loss": 1.2869, + "step": 5460 + }, + { + "epoch": 3.57516339869281, + "grad_norm": 0.7513086795806885, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 5470 + }, + { + "epoch": 3.581699346405229, + "grad_norm": 1.0139480829238892, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 5480 + }, + { + "epoch": 3.588235294117647, + "grad_norm": 0.6615135073661804, + "learning_rate": 0.0002, + "loss": 1.1706, + "step": 5490 + }, + { + "epoch": 3.5947712418300655, + "grad_norm": 1.180798888206482, + "learning_rate": 0.0002, + "loss": 1.3906, + "step": 5500 + }, + { + "epoch": 3.6013071895424837, + "grad_norm": 0.7085279226303101, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 5510 + }, + { + "epoch": 3.607843137254902, + "grad_norm": 0.540268063545227, + "learning_rate": 0.0002, + "loss": 1.1623, + "step": 5520 + }, + { + "epoch": 3.6143790849673203, + "grad_norm": 0.7905671000480652, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 5530 + }, + { + "epoch": 3.6209150326797386, + "grad_norm": 0.8457717299461365, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 5540 + }, + { + "epoch": 3.627450980392157, + "grad_norm": 0.7102677822113037, + "learning_rate": 0.0002, + "loss": 1.1799, + "step": 5550 + }, + { + "epoch": 3.633986928104575, + "grad_norm": 0.7179514765739441, + "learning_rate": 0.0002, + "loss": 1.2394, + "step": 5560 + }, + { + "epoch": 3.6405228758169934, + "grad_norm": 1.0854148864746094, + "learning_rate": 0.0002, + "loss": 1.2019, + "step": 5570 + }, + { + "epoch": 3.6470588235294117, + "grad_norm": 0.8209951519966125, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 5580 + }, + { + "epoch": 3.65359477124183, + "grad_norm": 0.6944138407707214, + "learning_rate": 0.0002, + "loss": 1.2289, + "step": 5590 + }, + { + "epoch": 3.6601307189542482, + "grad_norm": 0.7675473093986511, + "learning_rate": 0.0002, + "loss": 1.3226, + "step": 5600 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.6683364510536194, + "learning_rate": 0.0002, + "loss": 1.2866, + "step": 5610 + }, + { + "epoch": 3.6732026143790852, + "grad_norm": 0.7920727133750916, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 5620 + }, + { + "epoch": 3.6797385620915035, + "grad_norm": 0.9440218806266785, + "learning_rate": 0.0002, + "loss": 1.2287, + "step": 5630 + }, + { + "epoch": 3.686274509803922, + "grad_norm": 0.6600824594497681, + "learning_rate": 0.0002, + "loss": 1.2444, + "step": 5640 + }, + { + "epoch": 3.69281045751634, + "grad_norm": 0.6860619187355042, + "learning_rate": 0.0002, + "loss": 1.191, + "step": 5650 + }, + { + "epoch": 3.6993464052287583, + "grad_norm": 0.6579713225364685, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 5660 + }, + { + "epoch": 3.7058823529411766, + "grad_norm": 0.661081075668335, + "learning_rate": 0.0002, + "loss": 1.1464, + "step": 5670 + }, + { + "epoch": 3.712418300653595, + "grad_norm": 1.0968825817108154, + "learning_rate": 0.0002, + "loss": 1.289, + "step": 5680 + }, + { + "epoch": 3.718954248366013, + "grad_norm": 0.8066844940185547, + "learning_rate": 0.0002, + "loss": 1.192, + "step": 5690 + }, + { + "epoch": 3.7254901960784315, + "grad_norm": 0.8341682553291321, + "learning_rate": 0.0002, + "loss": 1.2322, + "step": 5700 + }, + { + "epoch": 3.7320261437908497, + "grad_norm": 0.6682852506637573, + "learning_rate": 0.0002, + "loss": 1.1473, + "step": 5710 + }, + { + "epoch": 3.738562091503268, + "grad_norm": 0.898595929145813, + "learning_rate": 0.0002, + "loss": 1.1566, + "step": 5720 + }, + { + "epoch": 3.7450980392156863, + "grad_norm": 0.6876054406166077, + "learning_rate": 0.0002, + "loss": 1.0919, + "step": 5730 + }, + { + "epoch": 3.7516339869281046, + "grad_norm": 0.7817103266716003, + "learning_rate": 0.0002, + "loss": 1.2302, + "step": 5740 + }, + { + "epoch": 3.758169934640523, + "grad_norm": 0.5840168595314026, + "learning_rate": 0.0002, + "loss": 1.2439, + "step": 5750 + }, + { + "epoch": 3.764705882352941, + "grad_norm": 0.6263918876647949, + "learning_rate": 0.0002, + "loss": 1.1279, + "step": 5760 + }, + { + "epoch": 3.7712418300653594, + "grad_norm": 0.7948952317237854, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 5770 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.6700998544692993, + "learning_rate": 0.0002, + "loss": 1.149, + "step": 5780 + }, + { + "epoch": 3.784313725490196, + "grad_norm": 1.1169519424438477, + "learning_rate": 0.0002, + "loss": 1.3207, + "step": 5790 + }, + { + "epoch": 3.7908496732026142, + "grad_norm": 0.8354471325874329, + "learning_rate": 0.0002, + "loss": 1.064, + "step": 5800 + }, + { + "epoch": 3.7973856209150325, + "grad_norm": 0.6304181814193726, + "learning_rate": 0.0002, + "loss": 1.2104, + "step": 5810 + }, + { + "epoch": 3.803921568627451, + "grad_norm": 0.6919655799865723, + "learning_rate": 0.0002, + "loss": 1.2059, + "step": 5820 + }, + { + "epoch": 3.810457516339869, + "grad_norm": 0.600385844707489, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 5830 + }, + { + "epoch": 3.8169934640522873, + "grad_norm": 0.8406319618225098, + "learning_rate": 0.0002, + "loss": 1.2324, + "step": 5840 + }, + { + "epoch": 3.8235294117647056, + "grad_norm": 0.7594282031059265, + "learning_rate": 0.0002, + "loss": 1.2418, + "step": 5850 + }, + { + "epoch": 3.8300653594771243, + "grad_norm": 0.8179879784584045, + "learning_rate": 0.0002, + "loss": 1.1903, + "step": 5860 + }, + { + "epoch": 3.8366013071895426, + "grad_norm": 1.141430377960205, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 5870 + }, + { + "epoch": 3.843137254901961, + "grad_norm": 0.6595550775527954, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 5880 + }, + { + "epoch": 3.849673202614379, + "grad_norm": 0.7499435544013977, + "learning_rate": 0.0002, + "loss": 1.2378, + "step": 5890 + }, + { + "epoch": 3.8562091503267975, + "grad_norm": 0.7851517200469971, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 5900 + }, + { + "epoch": 3.8627450980392157, + "grad_norm": 1.0533545017242432, + "learning_rate": 0.0002, + "loss": 1.162, + "step": 5910 + }, + { + "epoch": 3.869281045751634, + "grad_norm": 0.960086464881897, + "learning_rate": 0.0002, + "loss": 1.3576, + "step": 5920 + }, + { + "epoch": 3.8758169934640523, + "grad_norm": 0.9952049851417542, + "learning_rate": 0.0002, + "loss": 1.151, + "step": 5930 + }, + { + "epoch": 3.8823529411764706, + "grad_norm": 0.7884191274642944, + "learning_rate": 0.0002, + "loss": 1.2027, + "step": 5940 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.7461766600608826, + "learning_rate": 0.0002, + "loss": 1.1796, + "step": 5950 + }, + { + "epoch": 3.895424836601307, + "grad_norm": 0.9594355821609497, + "learning_rate": 0.0002, + "loss": 1.2251, + "step": 5960 + }, + { + "epoch": 3.9019607843137254, + "grad_norm": 0.8179471492767334, + "learning_rate": 0.0002, + "loss": 1.1164, + "step": 5970 + }, + { + "epoch": 3.9084967320261437, + "grad_norm": 0.8240267634391785, + "learning_rate": 0.0002, + "loss": 1.2421, + "step": 5980 + }, + { + "epoch": 3.915032679738562, + "grad_norm": 0.7462618350982666, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 5990 + }, + { + "epoch": 3.9215686274509802, + "grad_norm": 0.711207389831543, + "learning_rate": 0.0002, + "loss": 1.2124, + "step": 6000 + }, + { + "epoch": 3.928104575163399, + "grad_norm": 0.6910956501960754, + "learning_rate": 0.0002, + "loss": 1.2119, + "step": 6010 + }, + { + "epoch": 3.9346405228758172, + "grad_norm": 0.749093770980835, + "learning_rate": 0.0002, + "loss": 1.2127, + "step": 6020 + }, + { + "epoch": 3.9411764705882355, + "grad_norm": 1.3332762718200684, + "learning_rate": 0.0002, + "loss": 1.1542, + "step": 6030 + }, + { + "epoch": 3.947712418300654, + "grad_norm": 0.71457439661026, + "learning_rate": 0.0002, + "loss": 1.1442, + "step": 6040 + }, + { + "epoch": 3.954248366013072, + "grad_norm": 1.1205238103866577, + "learning_rate": 0.0002, + "loss": 1.339, + "step": 6050 + }, + { + "epoch": 3.9607843137254903, + "grad_norm": 0.6958928108215332, + "learning_rate": 0.0002, + "loss": 1.2962, + "step": 6060 + }, + { + "epoch": 3.9673202614379086, + "grad_norm": 0.7518056035041809, + "learning_rate": 0.0002, + "loss": 1.1802, + "step": 6070 + }, + { + "epoch": 3.973856209150327, + "grad_norm": 0.8010755777359009, + "learning_rate": 0.0002, + "loss": 1.1179, + "step": 6080 + }, + { + "epoch": 3.980392156862745, + "grad_norm": 0.7492658495903015, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 6090 + }, + { + "epoch": 3.9869281045751634, + "grad_norm": 0.900704562664032, + "learning_rate": 0.0002, + "loss": 1.2113, + "step": 6100 + }, + { + "epoch": 3.9934640522875817, + "grad_norm": 0.7997331619262695, + "learning_rate": 0.0002, + "loss": 1.1106, + "step": 6110 + }, + { + "epoch": 4.0, + "grad_norm": 0.7163209319114685, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 6120 + }, + { + "epoch": 4.0, + "eval_loss": 1.4113320112228394, + "eval_runtime": 33.7199, + "eval_samples_per_second": 12.93, + "eval_steps_per_second": 1.631, + "step": 6120 + }, + { + "epoch": 4.006535947712418, + "grad_norm": 0.9527022838592529, + "learning_rate": 0.0002, + "loss": 1.0423, + "step": 6130 + }, + { + "epoch": 4.0130718954248366, + "grad_norm": 0.7603210210800171, + "learning_rate": 0.0002, + "loss": 1.101, + "step": 6140 + }, + { + "epoch": 4.019607843137255, + "grad_norm": 1.127387523651123, + "learning_rate": 0.0002, + "loss": 1.1834, + "step": 6150 + }, + { + "epoch": 4.026143790849673, + "grad_norm": 0.8290133476257324, + "learning_rate": 0.0002, + "loss": 1.0734, + "step": 6160 + }, + { + "epoch": 4.032679738562091, + "grad_norm": 0.9912241101264954, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 6170 + }, + { + "epoch": 4.03921568627451, + "grad_norm": 0.947005033493042, + "learning_rate": 0.0002, + "loss": 1.0719, + "step": 6180 + }, + { + "epoch": 4.045751633986928, + "grad_norm": 0.707466185092926, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 6190 + }, + { + "epoch": 4.052287581699346, + "grad_norm": 1.0604327917099, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 6200 + }, + { + "epoch": 4.0588235294117645, + "grad_norm": 0.7848685383796692, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 6210 + }, + { + "epoch": 4.065359477124183, + "grad_norm": 0.8475256562232971, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 6220 + }, + { + "epoch": 4.071895424836601, + "grad_norm": 0.9759448766708374, + "learning_rate": 0.0002, + "loss": 1.1104, + "step": 6230 + }, + { + "epoch": 4.078431372549019, + "grad_norm": 0.9324519038200378, + "learning_rate": 0.0002, + "loss": 1.1538, + "step": 6240 + }, + { + "epoch": 4.084967320261438, + "grad_norm": 0.8723901510238647, + "learning_rate": 0.0002, + "loss": 1.0817, + "step": 6250 + }, + { + "epoch": 4.091503267973856, + "grad_norm": 0.8343415856361389, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 6260 + }, + { + "epoch": 4.098039215686274, + "grad_norm": 0.7490310072898865, + "learning_rate": 0.0002, + "loss": 0.9887, + "step": 6270 + }, + { + "epoch": 4.104575163398692, + "grad_norm": 0.8961182832717896, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 6280 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.7124854922294617, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 6290 + }, + { + "epoch": 4.117647058823529, + "grad_norm": 0.8338138461112976, + "learning_rate": 0.0002, + "loss": 1.0081, + "step": 6300 + }, + { + "epoch": 4.124183006535947, + "grad_norm": 0.8075833320617676, + "learning_rate": 0.0002, + "loss": 1.1091, + "step": 6310 + }, + { + "epoch": 4.130718954248366, + "grad_norm": 0.8069391846656799, + "learning_rate": 0.0002, + "loss": 1.0193, + "step": 6320 + }, + { + "epoch": 4.137254901960785, + "grad_norm": 0.9567893147468567, + "learning_rate": 0.0002, + "loss": 0.948, + "step": 6330 + }, + { + "epoch": 4.143790849673203, + "grad_norm": 1.2184662818908691, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 6340 + }, + { + "epoch": 4.150326797385621, + "grad_norm": 1.030976414680481, + "learning_rate": 0.0002, + "loss": 1.0756, + "step": 6350 + }, + { + "epoch": 4.1568627450980395, + "grad_norm": 0.9749957323074341, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 6360 + }, + { + "epoch": 4.163398692810458, + "grad_norm": 0.7089483141899109, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 6370 + }, + { + "epoch": 4.169934640522876, + "grad_norm": 1.1084946393966675, + "learning_rate": 0.0002, + "loss": 1.2175, + "step": 6380 + }, + { + "epoch": 4.176470588235294, + "grad_norm": 0.7998497486114502, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 6390 + }, + { + "epoch": 4.183006535947713, + "grad_norm": 0.8997811675071716, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 6400 + }, + { + "epoch": 4.189542483660131, + "grad_norm": 0.8359479904174805, + "learning_rate": 0.0002, + "loss": 1.0704, + "step": 6410 + }, + { + "epoch": 4.196078431372549, + "grad_norm": 0.9087472558021545, + "learning_rate": 0.0002, + "loss": 1.1056, + "step": 6420 + }, + { + "epoch": 4.2026143790849675, + "grad_norm": 1.1100451946258545, + "learning_rate": 0.0002, + "loss": 1.0657, + "step": 6430 + }, + { + "epoch": 4.209150326797386, + "grad_norm": 0.9376999735832214, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 6440 + }, + { + "epoch": 4.215686274509804, + "grad_norm": 0.8179266452789307, + "learning_rate": 0.0002, + "loss": 1.0862, + "step": 6450 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.9953271746635437, + "learning_rate": 0.0002, + "loss": 1.0679, + "step": 6460 + }, + { + "epoch": 4.228758169934641, + "grad_norm": 0.8476650714874268, + "learning_rate": 0.0002, + "loss": 1.1034, + "step": 6470 + }, + { + "epoch": 4.235294117647059, + "grad_norm": 0.8406323194503784, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 6480 + }, + { + "epoch": 4.241830065359477, + "grad_norm": 0.819134533405304, + "learning_rate": 0.0002, + "loss": 1.057, + "step": 6490 + }, + { + "epoch": 4.248366013071895, + "grad_norm": 0.7764983773231506, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 6500 + }, + { + "epoch": 4.254901960784314, + "grad_norm": 0.8252112865447998, + "learning_rate": 0.0002, + "loss": 1.1593, + "step": 6510 + }, + { + "epoch": 4.261437908496732, + "grad_norm": 0.7941019535064697, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 6520 + }, + { + "epoch": 4.26797385620915, + "grad_norm": 0.7673905491828918, + "learning_rate": 0.0002, + "loss": 1.0296, + "step": 6530 + }, + { + "epoch": 4.2745098039215685, + "grad_norm": 0.8749890327453613, + "learning_rate": 0.0002, + "loss": 1.1387, + "step": 6540 + }, + { + "epoch": 4.281045751633987, + "grad_norm": 0.7343207597732544, + "learning_rate": 0.0002, + "loss": 1.0595, + "step": 6550 + }, + { + "epoch": 4.287581699346405, + "grad_norm": 1.2786651849746704, + "learning_rate": 0.0002, + "loss": 1.1715, + "step": 6560 + }, + { + "epoch": 4.294117647058823, + "grad_norm": 1.316875696182251, + "learning_rate": 0.0002, + "loss": 1.0514, + "step": 6570 + }, + { + "epoch": 4.300653594771242, + "grad_norm": 0.8349189162254333, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 6580 + }, + { + "epoch": 4.30718954248366, + "grad_norm": 0.7510647177696228, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 6590 + }, + { + "epoch": 4.313725490196078, + "grad_norm": 0.932420551776886, + "learning_rate": 0.0002, + "loss": 1.1387, + "step": 6600 + }, + { + "epoch": 4.3202614379084965, + "grad_norm": 0.8510616421699524, + "learning_rate": 0.0002, + "loss": 1.1115, + "step": 6610 + }, + { + "epoch": 4.326797385620915, + "grad_norm": 0.7661547064781189, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 6620 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 1.0370930433273315, + "learning_rate": 0.0002, + "loss": 1.2064, + "step": 6630 + }, + { + "epoch": 4.339869281045751, + "grad_norm": 0.9302158951759338, + "learning_rate": 0.0002, + "loss": 1.1064, + "step": 6640 + }, + { + "epoch": 4.34640522875817, + "grad_norm": 0.9203811883926392, + "learning_rate": 0.0002, + "loss": 0.968, + "step": 6650 + }, + { + "epoch": 4.352941176470588, + "grad_norm": 0.9986332654953003, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 6660 + }, + { + "epoch": 4.359477124183006, + "grad_norm": 0.8001713156700134, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 6670 + }, + { + "epoch": 4.366013071895424, + "grad_norm": 0.829714298248291, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 6680 + }, + { + "epoch": 4.372549019607844, + "grad_norm": 0.8253079056739807, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 6690 + }, + { + "epoch": 4.379084967320262, + "grad_norm": 0.824666440486908, + "learning_rate": 0.0002, + "loss": 1.1087, + "step": 6700 + }, + { + "epoch": 4.38562091503268, + "grad_norm": 0.8872972130775452, + "learning_rate": 0.0002, + "loss": 1.1968, + "step": 6710 + }, + { + "epoch": 4.392156862745098, + "grad_norm": 0.8729761838912964, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 6720 + }, + { + "epoch": 4.398692810457517, + "grad_norm": 1.1367264986038208, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 6730 + }, + { + "epoch": 4.405228758169935, + "grad_norm": 0.9699058532714844, + "learning_rate": 0.0002, + "loss": 1.0184, + "step": 6740 + }, + { + "epoch": 4.411764705882353, + "grad_norm": 0.8266763687133789, + "learning_rate": 0.0002, + "loss": 1.006, + "step": 6750 + }, + { + "epoch": 4.4183006535947715, + "grad_norm": 1.0249767303466797, + "learning_rate": 0.0002, + "loss": 1.0735, + "step": 6760 + }, + { + "epoch": 4.42483660130719, + "grad_norm": 0.73606938123703, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 6770 + }, + { + "epoch": 4.431372549019608, + "grad_norm": 1.4050679206848145, + "learning_rate": 0.0002, + "loss": 1.1037, + "step": 6780 + }, + { + "epoch": 4.437908496732026, + "grad_norm": 1.1114081144332886, + "learning_rate": 0.0002, + "loss": 1.1418, + "step": 6790 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.8031067848205566, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 6800 + }, + { + "epoch": 4.450980392156863, + "grad_norm": 0.8513566851615906, + "learning_rate": 0.0002, + "loss": 1.0753, + "step": 6810 + }, + { + "epoch": 4.457516339869281, + "grad_norm": 1.332741379737854, + "learning_rate": 0.0002, + "loss": 1.1852, + "step": 6820 + }, + { + "epoch": 4.4640522875816995, + "grad_norm": 1.5032578706741333, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 6830 + }, + { + "epoch": 4.470588235294118, + "grad_norm": 0.7677283883094788, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 6840 + }, + { + "epoch": 4.477124183006536, + "grad_norm": 0.989148736000061, + "learning_rate": 0.0002, + "loss": 1.1501, + "step": 6850 + }, + { + "epoch": 4.483660130718954, + "grad_norm": 1.5316275358200073, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 6860 + }, + { + "epoch": 4.490196078431373, + "grad_norm": 0.9427124261856079, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 6870 + }, + { + "epoch": 4.496732026143791, + "grad_norm": 1.215287685394287, + "learning_rate": 0.0002, + "loss": 1.1314, + "step": 6880 + }, + { + "epoch": 4.503267973856209, + "grad_norm": 0.7286760210990906, + "learning_rate": 0.0002, + "loss": 1.0809, + "step": 6890 + }, + { + "epoch": 4.509803921568627, + "grad_norm": 0.874829888343811, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 6900 + }, + { + "epoch": 4.516339869281046, + "grad_norm": 0.8058359622955322, + "learning_rate": 0.0002, + "loss": 1.0233, + "step": 6910 + }, + { + "epoch": 4.522875816993464, + "grad_norm": 1.248195767402649, + "learning_rate": 0.0002, + "loss": 1.0463, + "step": 6920 + }, + { + "epoch": 4.529411764705882, + "grad_norm": 0.8033645749092102, + "learning_rate": 0.0002, + "loss": 1.0347, + "step": 6930 + }, + { + "epoch": 4.5359477124183005, + "grad_norm": 1.7361950874328613, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 6940 + }, + { + "epoch": 4.542483660130719, + "grad_norm": 0.8058095574378967, + "learning_rate": 0.0002, + "loss": 0.9856, + "step": 6950 + }, + { + "epoch": 4.549019607843137, + "grad_norm": 1.254089593887329, + "learning_rate": 0.0002, + "loss": 1.0057, + "step": 6960 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 0.9180455803871155, + "learning_rate": 0.0002, + "loss": 1.1723, + "step": 6970 + }, + { + "epoch": 4.562091503267974, + "grad_norm": 0.6677682399749756, + "learning_rate": 0.0002, + "loss": 1.0559, + "step": 6980 + }, + { + "epoch": 4.568627450980392, + "grad_norm": 0.8127354383468628, + "learning_rate": 0.0002, + "loss": 1.0453, + "step": 6990 + }, + { + "epoch": 4.57516339869281, + "grad_norm": 1.0263001918792725, + "learning_rate": 0.0002, + "loss": 1.0828, + "step": 7000 + }, + { + "epoch": 4.5816993464052285, + "grad_norm": 0.9641909003257751, + "learning_rate": 0.0002, + "loss": 1.0703, + "step": 7010 + }, + { + "epoch": 4.588235294117647, + "grad_norm": 0.9440861344337463, + "learning_rate": 0.0002, + "loss": 1.179, + "step": 7020 + }, + { + "epoch": 4.594771241830065, + "grad_norm": 0.9539011716842651, + "learning_rate": 0.0002, + "loss": 1.0931, + "step": 7030 + }, + { + "epoch": 4.601307189542483, + "grad_norm": 1.0449910163879395, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 7040 + }, + { + "epoch": 4.607843137254902, + "grad_norm": 0.8766893744468689, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 7050 + }, + { + "epoch": 4.61437908496732, + "grad_norm": 0.6983462572097778, + "learning_rate": 0.0002, + "loss": 1.0169, + "step": 7060 + }, + { + "epoch": 4.620915032679738, + "grad_norm": 0.9505505561828613, + "learning_rate": 0.0002, + "loss": 1.1778, + "step": 7070 + }, + { + "epoch": 4.627450980392156, + "grad_norm": 1.2506657838821411, + "learning_rate": 0.0002, + "loss": 1.121, + "step": 7080 + }, + { + "epoch": 4.633986928104575, + "grad_norm": 0.9602801203727722, + "learning_rate": 0.0002, + "loss": 1.1329, + "step": 7090 + }, + { + "epoch": 4.640522875816993, + "grad_norm": 0.7398977875709534, + "learning_rate": 0.0002, + "loss": 1.1499, + "step": 7100 + }, + { + "epoch": 4.647058823529412, + "grad_norm": 1.3862425088882446, + "learning_rate": 0.0002, + "loss": 1.0769, + "step": 7110 + }, + { + "epoch": 4.65359477124183, + "grad_norm": 1.1451990604400635, + "learning_rate": 0.0002, + "loss": 1.0571, + "step": 7120 + }, + { + "epoch": 4.660130718954249, + "grad_norm": 0.9010422229766846, + "learning_rate": 0.0002, + "loss": 1.1271, + "step": 7130 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.7102518081665039, + "learning_rate": 0.0002, + "loss": 1.0165, + "step": 7140 + }, + { + "epoch": 4.673202614379085, + "grad_norm": 0.7963796257972717, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7150 + }, + { + "epoch": 4.6797385620915035, + "grad_norm": 0.7726007699966431, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 7160 + }, + { + "epoch": 4.686274509803922, + "grad_norm": 0.8097564578056335, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 7170 + }, + { + "epoch": 4.69281045751634, + "grad_norm": 0.9070925116539001, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 7180 + }, + { + "epoch": 4.699346405228758, + "grad_norm": 0.7543528079986572, + "learning_rate": 0.0002, + "loss": 1.0315, + "step": 7190 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 0.9900904893875122, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 7200 + }, + { + "epoch": 4.712418300653595, + "grad_norm": 0.8033412098884583, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 7210 + }, + { + "epoch": 4.718954248366013, + "grad_norm": 0.8440839052200317, + "learning_rate": 0.0002, + "loss": 1.1773, + "step": 7220 + }, + { + "epoch": 4.7254901960784315, + "grad_norm": 0.9325555562973022, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 7230 + }, + { + "epoch": 4.73202614379085, + "grad_norm": 0.7881146669387817, + "learning_rate": 0.0002, + "loss": 1.1384, + "step": 7240 + }, + { + "epoch": 4.738562091503268, + "grad_norm": 0.884453296661377, + "learning_rate": 0.0002, + "loss": 1.1219, + "step": 7250 + }, + { + "epoch": 4.745098039215686, + "grad_norm": 0.9274539351463318, + "learning_rate": 0.0002, + "loss": 1.1036, + "step": 7260 + }, + { + "epoch": 4.751633986928105, + "grad_norm": 1.2367479801177979, + "learning_rate": 0.0002, + "loss": 1.0906, + "step": 7270 + }, + { + "epoch": 4.758169934640523, + "grad_norm": 0.9499821066856384, + "learning_rate": 0.0002, + "loss": 1.0741, + "step": 7280 + }, + { + "epoch": 4.764705882352941, + "grad_norm": 2.1918580532073975, + "learning_rate": 0.0002, + "loss": 1.1625, + "step": 7290 + }, + { + "epoch": 4.771241830065359, + "grad_norm": 0.8221880793571472, + "learning_rate": 0.0002, + "loss": 0.954, + "step": 7300 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.871972918510437, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 7310 + }, + { + "epoch": 4.784313725490196, + "grad_norm": 0.8034510612487793, + "learning_rate": 0.0002, + "loss": 1.0599, + "step": 7320 + }, + { + "epoch": 4.790849673202614, + "grad_norm": 0.8959605693817139, + "learning_rate": 0.0002, + "loss": 1.1059, + "step": 7330 + }, + { + "epoch": 4.7973856209150325, + "grad_norm": 1.2326215505599976, + "learning_rate": 0.0002, + "loss": 1.0176, + "step": 7340 + }, + { + "epoch": 4.803921568627451, + "grad_norm": 0.9725791811943054, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 7350 + }, + { + "epoch": 4.810457516339869, + "grad_norm": 0.7240816354751587, + "learning_rate": 0.0002, + "loss": 1.1229, + "step": 7360 + }, + { + "epoch": 4.816993464052287, + "grad_norm": 0.8265769481658936, + "learning_rate": 0.0002, + "loss": 1.0669, + "step": 7370 + }, + { + "epoch": 4.823529411764706, + "grad_norm": 0.8888696432113647, + "learning_rate": 0.0002, + "loss": 1.042, + "step": 7380 + }, + { + "epoch": 4.830065359477124, + "grad_norm": 0.7776556015014648, + "learning_rate": 0.0002, + "loss": 1.0981, + "step": 7390 + }, + { + "epoch": 4.836601307189542, + "grad_norm": 0.8772371411323547, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7400 + }, + { + "epoch": 4.8431372549019605, + "grad_norm": 0.9786531925201416, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7410 + }, + { + "epoch": 4.849673202614379, + "grad_norm": 0.9059745073318481, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 7420 + }, + { + "epoch": 4.856209150326797, + "grad_norm": 0.7422552108764648, + "learning_rate": 0.0002, + "loss": 1.0324, + "step": 7430 + }, + { + "epoch": 4.862745098039216, + "grad_norm": 1.3040380477905273, + "learning_rate": 0.0002, + "loss": 1.0423, + "step": 7440 + }, + { + "epoch": 4.8692810457516345, + "grad_norm": 1.3278473615646362, + "learning_rate": 0.0002, + "loss": 1.1161, + "step": 7450 + }, + { + "epoch": 4.875816993464053, + "grad_norm": 1.2705849409103394, + "learning_rate": 0.0002, + "loss": 1.0713, + "step": 7460 + }, + { + "epoch": 4.882352941176471, + "grad_norm": 0.8837892413139343, + "learning_rate": 0.0002, + "loss": 1.0034, + "step": 7470 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.8670691251754761, + "learning_rate": 0.0002, + "loss": 1.1716, + "step": 7480 + }, + { + "epoch": 4.895424836601308, + "grad_norm": 0.9662758111953735, + "learning_rate": 0.0002, + "loss": 1.1723, + "step": 7490 + }, + { + "epoch": 4.901960784313726, + "grad_norm": 0.8188302516937256, + "learning_rate": 0.0002, + "loss": 1.1056, + "step": 7500 + }, + { + "epoch": 4.908496732026144, + "grad_norm": 0.769442617893219, + "learning_rate": 0.0002, + "loss": 1.0419, + "step": 7510 + }, + { + "epoch": 4.915032679738562, + "grad_norm": 1.1465084552764893, + "learning_rate": 0.0002, + "loss": 1.1671, + "step": 7520 + }, + { + "epoch": 4.921568627450981, + "grad_norm": 1.253214955329895, + "learning_rate": 0.0002, + "loss": 1.0768, + "step": 7530 + }, + { + "epoch": 4.928104575163399, + "grad_norm": 0.7922375202178955, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 7540 + }, + { + "epoch": 4.934640522875817, + "grad_norm": 0.8306851387023926, + "learning_rate": 0.0002, + "loss": 1.1256, + "step": 7550 + }, + { + "epoch": 4.9411764705882355, + "grad_norm": 0.8486151099205017, + "learning_rate": 0.0002, + "loss": 1.206, + "step": 7560 + }, + { + "epoch": 4.947712418300654, + "grad_norm": 1.2601467370986938, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 7570 + }, + { + "epoch": 4.954248366013072, + "grad_norm": 0.7980747818946838, + "learning_rate": 0.0002, + "loss": 1.1078, + "step": 7580 + }, + { + "epoch": 4.96078431372549, + "grad_norm": 0.8653254508972168, + "learning_rate": 0.0002, + "loss": 1.0607, + "step": 7590 + }, + { + "epoch": 4.967320261437909, + "grad_norm": 0.9680571556091309, + "learning_rate": 0.0002, + "loss": 1.0292, + "step": 7600 + }, + { + "epoch": 4.973856209150327, + "grad_norm": 0.9554466605186462, + "learning_rate": 0.0002, + "loss": 1.1795, + "step": 7610 + }, + { + "epoch": 4.980392156862745, + "grad_norm": 1.3693897724151611, + "learning_rate": 0.0002, + "loss": 1.0935, + "step": 7620 + }, + { + "epoch": 4.9869281045751634, + "grad_norm": 0.7809282541275024, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 7630 + }, + { + "epoch": 4.993464052287582, + "grad_norm": 0.7528006434440613, + "learning_rate": 0.0002, + "loss": 1.0844, + "step": 7640 + }, + { + "epoch": 5.0, + "grad_norm": 1.7491309642791748, + "learning_rate": 0.0002, + "loss": 0.9951, + "step": 7650 + }, + { + "epoch": 5.0, + "eval_loss": 1.4197258949279785, + "eval_runtime": 33.6327, + "eval_samples_per_second": 12.964, + "eval_steps_per_second": 1.635, + "step": 7650 + }, + { + "epoch": 5.006535947712418, + "grad_norm": 0.8840063214302063, + "learning_rate": 0.0002, + "loss": 0.9744, + "step": 7660 + }, + { + "epoch": 5.0130718954248366, + "grad_norm": 1.0118401050567627, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 7670 + }, + { + "epoch": 5.019607843137255, + "grad_norm": 1.0040518045425415, + "learning_rate": 0.0002, + "loss": 1.1667, + "step": 7680 + }, + { + "epoch": 5.026143790849673, + "grad_norm": 0.7541199922561646, + "learning_rate": 0.0002, + "loss": 0.9426, + "step": 7690 + }, + { + "epoch": 5.032679738562091, + "grad_norm": 0.9106482863426208, + "learning_rate": 0.0002, + "loss": 1.0797, + "step": 7700 + }, + { + "epoch": 5.03921568627451, + "grad_norm": 1.3691469430923462, + "learning_rate": 0.0002, + "loss": 1.0096, + "step": 7710 + }, + { + "epoch": 5.045751633986928, + "grad_norm": 0.9449689388275146, + "learning_rate": 0.0002, + "loss": 0.9889, + "step": 7720 + }, + { + "epoch": 5.052287581699346, + "grad_norm": 1.1678508520126343, + "learning_rate": 0.0002, + "loss": 0.9087, + "step": 7730 + }, + { + "epoch": 5.0588235294117645, + "grad_norm": 1.1296145915985107, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 7740 + }, + { + "epoch": 5.065359477124183, + "grad_norm": 0.7863904237747192, + "learning_rate": 0.0002, + "loss": 0.9339, + "step": 7750 + }, + { + "epoch": 5.071895424836601, + "grad_norm": 0.8691433072090149, + "learning_rate": 0.0002, + "loss": 1.0135, + "step": 7760 + }, + { + "epoch": 5.078431372549019, + "grad_norm": 1.0722088813781738, + "learning_rate": 0.0002, + "loss": 0.9776, + "step": 7770 + }, + { + "epoch": 5.084967320261438, + "grad_norm": 0.9625038504600525, + "learning_rate": 0.0002, + "loss": 1.0595, + "step": 7780 + }, + { + "epoch": 5.091503267973856, + "grad_norm": 1.2618783712387085, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 7790 + }, + { + "epoch": 5.098039215686274, + "grad_norm": 0.9970650672912598, + "learning_rate": 0.0002, + "loss": 0.9396, + "step": 7800 + }, + { + "epoch": 5.104575163398692, + "grad_norm": 1.3946677446365356, + "learning_rate": 0.0002, + "loss": 0.9186, + "step": 7810 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 1.0260052680969238, + "learning_rate": 0.0002, + "loss": 0.9957, + "step": 7820 + }, + { + "epoch": 5.117647058823529, + "grad_norm": 1.105521559715271, + "learning_rate": 0.0002, + "loss": 0.9865, + "step": 7830 + }, + { + "epoch": 5.124183006535947, + "grad_norm": 1.003641128540039, + "learning_rate": 0.0002, + "loss": 0.9788, + "step": 7840 + }, + { + "epoch": 5.130718954248366, + "grad_norm": 1.0315021276474, + "learning_rate": 0.0002, + "loss": 0.9688, + "step": 7850 + }, + { + "epoch": 5.137254901960785, + "grad_norm": 0.9469530582427979, + "learning_rate": 0.0002, + "loss": 1.0001, + "step": 7860 + }, + { + "epoch": 5.143790849673203, + "grad_norm": 1.3244667053222656, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 7870 + }, + { + "epoch": 5.150326797385621, + "grad_norm": 1.1732033491134644, + "learning_rate": 0.0002, + "loss": 0.9657, + "step": 7880 + }, + { + "epoch": 5.1568627450980395, + "grad_norm": 1.3129149675369263, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 7890 + }, + { + "epoch": 5.163398692810458, + "grad_norm": 0.8589454293251038, + "learning_rate": 0.0002, + "loss": 0.9894, + "step": 7900 + }, + { + "epoch": 5.169934640522876, + "grad_norm": 0.8954233527183533, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 7910 + }, + { + "epoch": 5.176470588235294, + "grad_norm": 0.7426522970199585, + "learning_rate": 0.0002, + "loss": 0.8741, + "step": 7920 + }, + { + "epoch": 5.183006535947713, + "grad_norm": 1.1990121603012085, + "learning_rate": 0.0002, + "loss": 1.0106, + "step": 7930 + }, + { + "epoch": 5.189542483660131, + "grad_norm": 0.8867580890655518, + "learning_rate": 0.0002, + "loss": 0.9453, + "step": 7940 + }, + { + "epoch": 5.196078431372549, + "grad_norm": 1.016276478767395, + "learning_rate": 0.0002, + "loss": 0.9727, + "step": 7950 + }, + { + "epoch": 5.2026143790849675, + "grad_norm": 1.0210685729980469, + "learning_rate": 0.0002, + "loss": 0.9908, + "step": 7960 + }, + { + "epoch": 5.209150326797386, + "grad_norm": 1.0093122720718384, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 7970 + }, + { + "epoch": 5.215686274509804, + "grad_norm": 0.9746801853179932, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 7980 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 0.9113537073135376, + "learning_rate": 0.0002, + "loss": 1.0611, + "step": 7990 + }, + { + "epoch": 5.228758169934641, + "grad_norm": 1.2782206535339355, + "learning_rate": 0.0002, + "loss": 0.9167, + "step": 8000 + }, + { + "epoch": 5.235294117647059, + "grad_norm": 1.3223118782043457, + "learning_rate": 0.0002, + "loss": 1.0212, + "step": 8010 + }, + { + "epoch": 5.241830065359477, + "grad_norm": 0.7898629307746887, + "learning_rate": 0.0002, + "loss": 0.9244, + "step": 8020 + }, + { + "epoch": 5.248366013071895, + "grad_norm": 0.9822350740432739, + "learning_rate": 0.0002, + "loss": 1.0574, + "step": 8030 + }, + { + "epoch": 5.254901960784314, + "grad_norm": 1.5114340782165527, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 8040 + }, + { + "epoch": 5.261437908496732, + "grad_norm": 0.859006941318512, + "learning_rate": 0.0002, + "loss": 0.9816, + "step": 8050 + }, + { + "epoch": 5.26797385620915, + "grad_norm": 1.0495043992996216, + "learning_rate": 0.0002, + "loss": 0.9445, + "step": 8060 + }, + { + "epoch": 5.2745098039215685, + "grad_norm": 1.329483151435852, + "learning_rate": 0.0002, + "loss": 0.9724, + "step": 8070 + }, + { + "epoch": 5.281045751633987, + "grad_norm": 1.1333061456680298, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 8080 + }, + { + "epoch": 5.287581699346405, + "grad_norm": 0.8153108358383179, + "learning_rate": 0.0002, + "loss": 0.9577, + "step": 8090 + }, + { + "epoch": 5.294117647058823, + "grad_norm": 0.9395004510879517, + "learning_rate": 0.0002, + "loss": 0.9002, + "step": 8100 + }, + { + "epoch": 5.300653594771242, + "grad_norm": 0.8907593488693237, + "learning_rate": 0.0002, + "loss": 1.0371, + "step": 8110 + }, + { + "epoch": 5.30718954248366, + "grad_norm": 0.9808667898178101, + "learning_rate": 0.0002, + "loss": 0.9301, + "step": 8120 + }, + { + "epoch": 5.313725490196078, + "grad_norm": 0.984779417514801, + "learning_rate": 0.0002, + "loss": 1.0136, + "step": 8130 + }, + { + "epoch": 5.3202614379084965, + "grad_norm": 0.9787270426750183, + "learning_rate": 0.0002, + "loss": 0.9621, + "step": 8140 + }, + { + "epoch": 5.326797385620915, + "grad_norm": 0.9857710599899292, + "learning_rate": 0.0002, + "loss": 0.9336, + "step": 8150 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.9774303436279297, + "learning_rate": 0.0002, + "loss": 0.9884, + "step": 8160 + }, + { + "epoch": 5.339869281045751, + "grad_norm": 0.677925169467926, + "learning_rate": 0.0002, + "loss": 1.0561, + "step": 8170 + }, + { + "epoch": 5.34640522875817, + "grad_norm": 0.9576456546783447, + "learning_rate": 0.0002, + "loss": 1.1345, + "step": 8180 + }, + { + "epoch": 5.352941176470588, + "grad_norm": 1.8970937728881836, + "learning_rate": 0.0002, + "loss": 0.9554, + "step": 8190 + }, + { + "epoch": 5.359477124183006, + "grad_norm": 0.9458389282226562, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 8200 + }, + { + "epoch": 5.366013071895424, + "grad_norm": 1.761794924736023, + "learning_rate": 0.0002, + "loss": 1.0365, + "step": 8210 + }, + { + "epoch": 5.372549019607844, + "grad_norm": 1.0693724155426025, + "learning_rate": 0.0002, + "loss": 0.9426, + "step": 8220 + }, + { + "epoch": 5.379084967320262, + "grad_norm": 0.9025877714157104, + "learning_rate": 0.0002, + "loss": 1.0299, + "step": 8230 + }, + { + "epoch": 5.38562091503268, + "grad_norm": 1.258857250213623, + "learning_rate": 0.0002, + "loss": 0.9652, + "step": 8240 + }, + { + "epoch": 5.392156862745098, + "grad_norm": 1.084849238395691, + "learning_rate": 0.0002, + "loss": 0.9735, + "step": 8250 + }, + { + "epoch": 5.398692810457517, + "grad_norm": 0.9530340433120728, + "learning_rate": 0.0002, + "loss": 0.9999, + "step": 8260 + }, + { + "epoch": 5.405228758169935, + "grad_norm": 0.830240786075592, + "learning_rate": 0.0002, + "loss": 1.0268, + "step": 8270 + }, + { + "epoch": 5.411764705882353, + "grad_norm": 1.5807015895843506, + "learning_rate": 0.0002, + "loss": 1.0332, + "step": 8280 + }, + { + "epoch": 5.4183006535947715, + "grad_norm": 0.9486905336380005, + "learning_rate": 0.0002, + "loss": 0.9146, + "step": 8290 + }, + { + "epoch": 5.42483660130719, + "grad_norm": 1.0415093898773193, + "learning_rate": 0.0002, + "loss": 1.0336, + "step": 8300 + }, + { + "epoch": 5.431372549019608, + "grad_norm": 1.0501102209091187, + "learning_rate": 0.0002, + "loss": 0.8933, + "step": 8310 + }, + { + "epoch": 5.437908496732026, + "grad_norm": 0.9751836061477661, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 8320 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 1.5529173612594604, + "learning_rate": 0.0002, + "loss": 1.0755, + "step": 8330 + }, + { + "epoch": 5.450980392156863, + "grad_norm": 0.8314350247383118, + "learning_rate": 0.0002, + "loss": 0.9814, + "step": 8340 + }, + { + "epoch": 5.457516339869281, + "grad_norm": 1.2555103302001953, + "learning_rate": 0.0002, + "loss": 1.0596, + "step": 8350 + }, + { + "epoch": 5.4640522875816995, + "grad_norm": 0.9408367872238159, + "learning_rate": 0.0002, + "loss": 1.0127, + "step": 8360 + }, + { + "epoch": 5.470588235294118, + "grad_norm": 0.9483312964439392, + "learning_rate": 0.0002, + "loss": 0.9241, + "step": 8370 + }, + { + "epoch": 5.477124183006536, + "grad_norm": 0.957905650138855, + "learning_rate": 0.0002, + "loss": 0.9678, + "step": 8380 + }, + { + "epoch": 5.483660130718954, + "grad_norm": 1.4000147581100464, + "learning_rate": 0.0002, + "loss": 1.0985, + "step": 8390 + }, + { + "epoch": 5.490196078431373, + "grad_norm": 1.7032461166381836, + "learning_rate": 0.0002, + "loss": 0.9966, + "step": 8400 + }, + { + "epoch": 5.496732026143791, + "grad_norm": 0.8978716731071472, + "learning_rate": 0.0002, + "loss": 0.9539, + "step": 8410 + }, + { + "epoch": 5.503267973856209, + "grad_norm": 0.8659300804138184, + "learning_rate": 0.0002, + "loss": 0.9544, + "step": 8420 + }, + { + "epoch": 5.509803921568627, + "grad_norm": 1.3629727363586426, + "learning_rate": 0.0002, + "loss": 1.0526, + "step": 8430 + }, + { + "epoch": 5.516339869281046, + "grad_norm": 1.2741984128952026, + "learning_rate": 0.0002, + "loss": 0.9696, + "step": 8440 + }, + { + "epoch": 5.522875816993464, + "grad_norm": 1.3867180347442627, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 8450 + }, + { + "epoch": 5.529411764705882, + "grad_norm": 1.0662001371383667, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 8460 + }, + { + "epoch": 5.5359477124183005, + "grad_norm": 1.7005380392074585, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 8470 + }, + { + "epoch": 5.542483660130719, + "grad_norm": 1.3730385303497314, + "learning_rate": 0.0002, + "loss": 1.0221, + "step": 8480 + }, + { + "epoch": 5.549019607843137, + "grad_norm": 1.7737441062927246, + "learning_rate": 0.0002, + "loss": 0.9586, + "step": 8490 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.907487690448761, + "learning_rate": 0.0002, + "loss": 0.9729, + "step": 8500 + }, + { + "epoch": 5.562091503267974, + "grad_norm": 0.8882441520690918, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 8510 + }, + { + "epoch": 5.568627450980392, + "grad_norm": 0.8655388951301575, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 8520 + }, + { + "epoch": 5.57516339869281, + "grad_norm": 1.379992961883545, + "learning_rate": 0.0002, + "loss": 0.9523, + "step": 8530 + }, + { + "epoch": 5.5816993464052285, + "grad_norm": 1.0021201372146606, + "learning_rate": 0.0002, + "loss": 1.0174, + "step": 8540 + }, + { + "epoch": 5.588235294117647, + "grad_norm": 1.2636926174163818, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 8550 + }, + { + "epoch": 5.594771241830065, + "grad_norm": 1.279025912284851, + "learning_rate": 0.0002, + "loss": 1.0243, + "step": 8560 + }, + { + "epoch": 5.601307189542483, + "grad_norm": 0.8885834217071533, + "learning_rate": 0.0002, + "loss": 0.9917, + "step": 8570 + }, + { + "epoch": 5.607843137254902, + "grad_norm": 1.1975032091140747, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 8580 + }, + { + "epoch": 5.61437908496732, + "grad_norm": 1.005470871925354, + "learning_rate": 0.0002, + "loss": 1.0363, + "step": 8590 + }, + { + "epoch": 5.620915032679738, + "grad_norm": 1.104286551475525, + "learning_rate": 0.0002, + "loss": 0.9947, + "step": 8600 + }, + { + "epoch": 5.627450980392156, + "grad_norm": 1.435445785522461, + "learning_rate": 0.0002, + "loss": 1.0585, + "step": 8610 + }, + { + "epoch": 5.633986928104575, + "grad_norm": 1.0270172357559204, + "learning_rate": 0.0002, + "loss": 0.9156, + "step": 8620 + }, + { + "epoch": 5.640522875816993, + "grad_norm": 1.0929527282714844, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 8630 + }, + { + "epoch": 5.647058823529412, + "grad_norm": 1.1061221361160278, + "learning_rate": 0.0002, + "loss": 0.9694, + "step": 8640 + }, + { + "epoch": 5.65359477124183, + "grad_norm": 0.9563149213790894, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 8650 + }, + { + "epoch": 5.660130718954249, + "grad_norm": 1.0434954166412354, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 8660 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 1.3695117235183716, + "learning_rate": 0.0002, + "loss": 0.9463, + "step": 8670 + }, + { + "epoch": 5.673202614379085, + "grad_norm": 1.0540564060211182, + "learning_rate": 0.0002, + "loss": 0.9441, + "step": 8680 + }, + { + "epoch": 5.6797385620915035, + "grad_norm": 1.5942492485046387, + "learning_rate": 0.0002, + "loss": 0.9755, + "step": 8690 + }, + { + "epoch": 5.686274509803922, + "grad_norm": 0.9485495090484619, + "learning_rate": 0.0002, + "loss": 1.0071, + "step": 8700 + }, + { + "epoch": 5.69281045751634, + "grad_norm": 1.1483162641525269, + "learning_rate": 0.0002, + "loss": 0.9998, + "step": 8710 + }, + { + "epoch": 5.699346405228758, + "grad_norm": 0.9075471758842468, + "learning_rate": 0.0002, + "loss": 0.9578, + "step": 8720 + }, + { + "epoch": 5.705882352941177, + "grad_norm": 1.7908551692962646, + "learning_rate": 0.0002, + "loss": 0.9488, + "step": 8730 + }, + { + "epoch": 5.712418300653595, + "grad_norm": 0.8867162466049194, + "learning_rate": 0.0002, + "loss": 1.0163, + "step": 8740 + }, + { + "epoch": 5.718954248366013, + "grad_norm": 1.7165148258209229, + "learning_rate": 0.0002, + "loss": 1.0041, + "step": 8750 + }, + { + "epoch": 5.7254901960784315, + "grad_norm": 0.9529356956481934, + "learning_rate": 0.0002, + "loss": 1.1061, + "step": 8760 + }, + { + "epoch": 5.73202614379085, + "grad_norm": 1.01852548122406, + "learning_rate": 0.0002, + "loss": 1.1119, + "step": 8770 + }, + { + "epoch": 5.738562091503268, + "grad_norm": 0.9538423418998718, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 8780 + }, + { + "epoch": 5.745098039215686, + "grad_norm": 0.9007737636566162, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 8790 + }, + { + "epoch": 5.751633986928105, + "grad_norm": 0.9107874035835266, + "learning_rate": 0.0002, + "loss": 0.9766, + "step": 8800 + }, + { + "epoch": 5.758169934640523, + "grad_norm": 0.7379238605499268, + "learning_rate": 0.0002, + "loss": 0.9212, + "step": 8810 + }, + { + "epoch": 5.764705882352941, + "grad_norm": 1.072645902633667, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 8820 + }, + { + "epoch": 5.771241830065359, + "grad_norm": 1.002008080482483, + "learning_rate": 0.0002, + "loss": 1.0845, + "step": 8830 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 1.0435924530029297, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 8840 + }, + { + "epoch": 5.784313725490196, + "grad_norm": 0.9874551296234131, + "learning_rate": 0.0002, + "loss": 0.9458, + "step": 8850 + }, + { + "epoch": 5.790849673202614, + "grad_norm": 1.1729662418365479, + "learning_rate": 0.0002, + "loss": 1.1241, + "step": 8860 + }, + { + "epoch": 5.7973856209150325, + "grad_norm": 1.3300775289535522, + "learning_rate": 0.0002, + "loss": 1.0451, + "step": 8870 + }, + { + "epoch": 5.803921568627451, + "grad_norm": 1.612707257270813, + "learning_rate": 0.0002, + "loss": 1.0989, + "step": 8880 + }, + { + "epoch": 5.810457516339869, + "grad_norm": 0.9047797322273254, + "learning_rate": 0.0002, + "loss": 0.9119, + "step": 8890 + }, + { + "epoch": 5.816993464052287, + "grad_norm": 1.0958741903305054, + "learning_rate": 0.0002, + "loss": 0.989, + "step": 8900 + }, + { + "epoch": 5.823529411764706, + "grad_norm": 1.0099612474441528, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 8910 + }, + { + "epoch": 5.830065359477124, + "grad_norm": 0.8442328572273254, + "learning_rate": 0.0002, + "loss": 1.0623, + "step": 8920 + }, + { + "epoch": 5.836601307189542, + "grad_norm": 1.1388301849365234, + "learning_rate": 0.0002, + "loss": 0.9134, + "step": 8930 + }, + { + "epoch": 5.8431372549019605, + "grad_norm": 0.8296026587486267, + "learning_rate": 0.0002, + "loss": 1.0019, + "step": 8940 + }, + { + "epoch": 5.849673202614379, + "grad_norm": 1.0843533277511597, + "learning_rate": 0.0002, + "loss": 1.0363, + "step": 8950 + }, + { + "epoch": 5.856209150326797, + "grad_norm": 0.8496834635734558, + "learning_rate": 0.0002, + "loss": 1.0009, + "step": 8960 + }, + { + "epoch": 5.862745098039216, + "grad_norm": 1.6894690990447998, + "learning_rate": 0.0002, + "loss": 0.9927, + "step": 8970 + }, + { + "epoch": 5.8692810457516345, + "grad_norm": 1.0012282133102417, + "learning_rate": 0.0002, + "loss": 1.0939, + "step": 8980 + }, + { + "epoch": 5.875816993464053, + "grad_norm": 0.8521103262901306, + "learning_rate": 0.0002, + "loss": 0.9722, + "step": 8990 + }, + { + "epoch": 5.882352941176471, + "grad_norm": 1.246841311454773, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 9000 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.9941892027854919, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 9010 + }, + { + "epoch": 5.895424836601308, + "grad_norm": 1.067413568496704, + "learning_rate": 0.0002, + "loss": 0.8754, + "step": 9020 + }, + { + "epoch": 5.901960784313726, + "grad_norm": 1.0045088529586792, + "learning_rate": 0.0002, + "loss": 1.0153, + "step": 9030 + }, + { + "epoch": 5.908496732026144, + "grad_norm": 1.383063554763794, + "learning_rate": 0.0002, + "loss": 1.0134, + "step": 9040 + }, + { + "epoch": 5.915032679738562, + "grad_norm": 0.8754428625106812, + "learning_rate": 0.0002, + "loss": 1.0845, + "step": 9050 + }, + { + "epoch": 5.921568627450981, + "grad_norm": 0.8577388525009155, + "learning_rate": 0.0002, + "loss": 0.9571, + "step": 9060 + }, + { + "epoch": 5.928104575163399, + "grad_norm": 0.8718975186347961, + "learning_rate": 0.0002, + "loss": 1.0532, + "step": 9070 + }, + { + "epoch": 5.934640522875817, + "grad_norm": 1.1762131452560425, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 9080 + }, + { + "epoch": 5.9411764705882355, + "grad_norm": 1.1025866270065308, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 9090 + }, + { + "epoch": 5.947712418300654, + "grad_norm": 1.0439870357513428, + "learning_rate": 0.0002, + "loss": 0.9155, + "step": 9100 + }, + { + "epoch": 5.954248366013072, + "grad_norm": 1.2411525249481201, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 9110 + }, + { + "epoch": 5.96078431372549, + "grad_norm": 1.0317714214324951, + "learning_rate": 0.0002, + "loss": 0.9747, + "step": 9120 + }, + { + "epoch": 5.967320261437909, + "grad_norm": 0.9880492091178894, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 9130 + }, + { + "epoch": 5.973856209150327, + "grad_norm": 0.9039815664291382, + "learning_rate": 0.0002, + "loss": 1.0459, + "step": 9140 + }, + { + "epoch": 5.980392156862745, + "grad_norm": 0.9049116373062134, + "learning_rate": 0.0002, + "loss": 1.0413, + "step": 9150 + }, + { + "epoch": 5.9869281045751634, + "grad_norm": 0.996749222278595, + "learning_rate": 0.0002, + "loss": 0.9792, + "step": 9160 + }, + { + "epoch": 5.993464052287582, + "grad_norm": 0.8716062307357788, + "learning_rate": 0.0002, + "loss": 0.8857, + "step": 9170 + }, + { + "epoch": 6.0, + "grad_norm": 1.3081294298171997, + "learning_rate": 0.0002, + "loss": 1.019, + "step": 9180 + }, + { + "epoch": 6.0, + "eval_loss": 1.45111083984375, + "eval_runtime": 34.7121, + "eval_samples_per_second": 12.56, + "eval_steps_per_second": 1.584, + "step": 9180 + } + ], + "logging_steps": 10, + "max_steps": 12240, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1219792635625472e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..35e722282419bcef977427e4d3675fe3b94ec688 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc25f531ee37172f22a819ab79094fe89aae41504e4c8b696743b5e23d9e7641 +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..35e722282419bcef977427e4d3675fe3b94ec688 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc25f531ee37172f22a819ab79094fe89aae41504e4c8b696743b5e23d9e7641 +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/training_log.jsonl b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9a6dd0a9b7b5310236d26ee5defe5aabea697f6b --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 1.0, "step": 1530, "epoch_duration": 1647.488445520401, "total_accumulated_duration": 1647.488445520401, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 5628.7490234375}, "avg_memory_reserved": {"GPU_0": 6182.0}, "peak_memory_reserved": {"GPU_0": 6182.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7451, "grad_norm": 1.5105072259902954, "learning_rate": 0.0002, "epoch": 0.006535947712418301, "step": 10}, {"loss": 3.3158, "grad_norm": 2.1156165599823, "learning_rate": 0.0002, "epoch": 0.013071895424836602, "step": 20}, {"loss": 2.643, "grad_norm": 1.0578808784484863, "learning_rate": 0.0002, "epoch": 0.0196078431372549, "step": 30}, {"loss": 2.3948, "grad_norm": 2.725064516067505, "learning_rate": 0.0002, "epoch": 0.026143790849673203, "step": 40}, {"loss": 2.3134, "grad_norm": 2.9575750827789307, "learning_rate": 0.0002, "epoch": 0.032679738562091505, "step": 50}, {"loss": 2.2778, "grad_norm": 1.2158117294311523, "learning_rate": 0.0002, "epoch": 0.0392156862745098, "step": 60}, {"loss": 1.9742, "grad_norm": 1.0850954055786133, "learning_rate": 0.0002, "epoch": 0.0457516339869281, "step": 70}, {"loss": 1.8872, "grad_norm": 1.299196720123291, "learning_rate": 0.0002, "epoch": 0.05228758169934641, "step": 80}, {"loss": 1.947, "grad_norm": 0.8310191035270691, "learning_rate": 0.0002, "epoch": 0.058823529411764705, "step": 90}, {"loss": 1.9098, "grad_norm": 0.9854435920715332, "learning_rate": 0.0002, "epoch": 0.06535947712418301, "step": 100}, {"loss": 1.7508, "grad_norm": 0.7951157689094543, "learning_rate": 0.0002, "epoch": 0.0718954248366013, "step": 110}, {"loss": 1.9035, "grad_norm": 0.7593062520027161, "learning_rate": 0.0002, "epoch": 0.0784313725490196, "step": 120}, {"loss": 1.8517, "grad_norm": 0.6783032417297363, "learning_rate": 0.0002, "epoch": 0.08496732026143791, "step": 130}, {"loss": 1.6805, "grad_norm": 0.8350756764411926, "learning_rate": 0.0002, "epoch": 0.0915032679738562, "step": 140}, {"loss": 1.6123, "grad_norm": 1.0203173160552979, "learning_rate": 0.0002, "epoch": 0.09803921568627451, "step": 150}, {"loss": 1.7248, "grad_norm": 0.8820539712905884, "learning_rate": 0.0002, "epoch": 0.10457516339869281, "step": 160}, {"loss": 1.6762, "grad_norm": 0.7286128997802734, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 170}, {"loss": 1.8841, "grad_norm": 0.7874041795730591, "learning_rate": 0.0002, "epoch": 0.11764705882352941, "step": 180}, {"loss": 1.5656, "grad_norm": 0.6630475521087646, "learning_rate": 0.0002, "epoch": 0.12418300653594772, "step": 190}, {"loss": 1.6149, "grad_norm": 0.686413586139679, "learning_rate": 0.0002, "epoch": 0.13071895424836602, "step": 200}, {"loss": 1.6227, "grad_norm": 0.7793629765510559, "learning_rate": 0.0002, "epoch": 0.13725490196078433, "step": 210}, {"loss": 1.7223, "grad_norm": 0.6893141865730286, "learning_rate": 0.0002, "epoch": 0.1437908496732026, "step": 220}, {"loss": 1.6808, "grad_norm": 0.5804724097251892, "learning_rate": 0.0002, "epoch": 0.1503267973856209, "step": 230}, {"loss": 1.5578, "grad_norm": 0.6053574085235596, "learning_rate": 0.0002, "epoch": 0.1568627450980392, "step": 240}, {"loss": 1.7394, "grad_norm": 0.7566025853157043, "learning_rate": 0.0002, "epoch": 0.16339869281045752, "step": 250}, {"loss": 1.6216, "grad_norm": 0.6112990975379944, "learning_rate": 0.0002, "epoch": 0.16993464052287582, "step": 260}, {"loss": 1.5564, "grad_norm": 0.6839066743850708, "learning_rate": 0.0002, "epoch": 0.17647058823529413, "step": 270}, {"loss": 1.7129, "grad_norm": 0.6368117928504944, "learning_rate": 0.0002, "epoch": 0.1830065359477124, "step": 280}, {"loss": 1.5646, "grad_norm": 0.6144475936889648, "learning_rate": 0.0002, "epoch": 0.1895424836601307, "step": 290}, {"loss": 1.8383, "grad_norm": 0.6743767261505127, "learning_rate": 0.0002, "epoch": 0.19607843137254902, "step": 300}, {"loss": 1.421, "grad_norm": 0.6807955503463745, "learning_rate": 0.0002, "epoch": 0.20261437908496732, "step": 310}, {"loss": 1.5961, "grad_norm": 0.6717963814735413, "learning_rate": 0.0002, "epoch": 0.20915032679738563, "step": 320}, {"loss": 1.6842, "grad_norm": 0.5917780995368958, "learning_rate": 0.0002, "epoch": 0.21568627450980393, "step": 330}, {"loss": 1.6264, "grad_norm": 0.6783658862113953, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 340}, {"loss": 1.4635, "grad_norm": 0.5820256471633911, "learning_rate": 0.0002, "epoch": 0.22875816993464052, "step": 350}, {"loss": 1.6514, "grad_norm": 0.5345938801765442, "learning_rate": 0.0002, "epoch": 0.23529411764705882, "step": 360}, {"loss": 1.6441, "grad_norm": 0.755929172039032, "learning_rate": 0.0002, "epoch": 0.24183006535947713, "step": 370}, {"loss": 1.5177, "grad_norm": 0.6183189749717712, "learning_rate": 0.0002, "epoch": 0.24836601307189543, "step": 380}, {"loss": 1.5935, "grad_norm": 0.7277782559394836, "learning_rate": 0.0002, "epoch": 0.2549019607843137, "step": 390}, {"loss": 1.6957, "grad_norm": 0.9998756051063538, "learning_rate": 0.0002, "epoch": 0.26143790849673204, "step": 400}, {"loss": 1.5738, "grad_norm": 0.7523853778839111, "learning_rate": 0.0002, "epoch": 0.2679738562091503, "step": 410}, {"loss": 1.5649, "grad_norm": 0.6548714637756348, "learning_rate": 0.0002, "epoch": 0.27450980392156865, "step": 420}, {"loss": 1.4564, "grad_norm": 0.6979796290397644, "learning_rate": 0.0002, "epoch": 0.28104575163398693, "step": 430}, {"loss": 1.5927, "grad_norm": 0.840915322303772, "learning_rate": 0.0002, "epoch": 0.2875816993464052, "step": 440}, {"loss": 1.5199, "grad_norm": 0.6142978072166443, "learning_rate": 0.0002, "epoch": 0.29411764705882354, "step": 450}, {"loss": 1.4903, "grad_norm": 0.9482691884040833, "learning_rate": 0.0002, "epoch": 0.3006535947712418, "step": 460}, {"loss": 1.6553, "grad_norm": 0.7001156806945801, "learning_rate": 0.0002, "epoch": 0.30718954248366015, "step": 470}, {"loss": 1.5957, "grad_norm": 0.6665455102920532, "learning_rate": 0.0002, "epoch": 0.3137254901960784, "step": 480}, {"loss": 1.587, "grad_norm": 0.6012697815895081, "learning_rate": 0.0002, "epoch": 0.3202614379084967, "step": 490}, {"loss": 1.4468, "grad_norm": 0.8770062327384949, "learning_rate": 0.0002, "epoch": 0.32679738562091504, "step": 500}, {"loss": 1.3558, "grad_norm": 0.7029962539672852, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 510}, {"loss": 1.4435, "grad_norm": 0.6682832837104797, "learning_rate": 0.0002, "epoch": 0.33986928104575165, "step": 520}, {"loss": 1.4242, "grad_norm": 0.5548969507217407, "learning_rate": 0.0002, "epoch": 0.3464052287581699, "step": 530}, {"loss": 1.5081, "grad_norm": 0.6640702486038208, "learning_rate": 0.0002, "epoch": 0.35294117647058826, "step": 540}, {"loss": 1.4998, "grad_norm": 0.656292200088501, "learning_rate": 0.0002, "epoch": 0.35947712418300654, "step": 550}, {"loss": 1.5415, "grad_norm": 0.618910551071167, "learning_rate": 0.0002, "epoch": 0.3660130718954248, "step": 560}, {"loss": 1.5178, "grad_norm": 0.644859790802002, "learning_rate": 0.0002, "epoch": 0.37254901960784315, "step": 570}, {"loss": 1.645, "grad_norm": 0.679042398929596, "learning_rate": 0.0002, "epoch": 0.3790849673202614, "step": 580}, {"loss": 1.5193, "grad_norm": 0.980681836605072, "learning_rate": 0.0002, "epoch": 0.38562091503267976, "step": 590}, {"loss": 1.4262, "grad_norm": 0.632219672203064, "learning_rate": 0.0002, "epoch": 0.39215686274509803, "step": 600}, {"loss": 1.5533, "grad_norm": 0.7003744840621948, "learning_rate": 0.0002, "epoch": 0.39869281045751637, "step": 610}, {"loss": 1.7747, "grad_norm": 0.7090577483177185, "learning_rate": 0.0002, "epoch": 0.40522875816993464, "step": 620}, {"loss": 1.7506, "grad_norm": 0.657819926738739, "learning_rate": 0.0002, "epoch": 0.4117647058823529, "step": 630}, {"loss": 1.621, "grad_norm": 0.7034208178520203, "learning_rate": 0.0002, "epoch": 0.41830065359477125, "step": 640}, {"loss": 1.5357, "grad_norm": 0.7274866104125977, "learning_rate": 0.0002, "epoch": 0.42483660130718953, "step": 650}, {"loss": 1.6304, "grad_norm": 0.5876233577728271, "learning_rate": 0.0002, "epoch": 0.43137254901960786, "step": 660}, {"loss": 1.7683, "grad_norm": 0.595494270324707, "learning_rate": 0.0002, "epoch": 0.43790849673202614, "step": 670}, {"loss": 1.5117, "grad_norm": 0.8253804445266724, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 680}, {"loss": 1.5199, "grad_norm": 0.652225911617279, "learning_rate": 0.0002, "epoch": 0.45098039215686275, "step": 690}, {"loss": 1.5419, "grad_norm": 0.6242014169692993, "learning_rate": 0.0002, "epoch": 0.45751633986928103, "step": 700}, {"loss": 1.53, "grad_norm": 0.7283986210823059, "learning_rate": 0.0002, "epoch": 0.46405228758169936, "step": 710}, {"loss": 1.43, "grad_norm": 0.7016081213951111, "learning_rate": 0.0002, "epoch": 0.47058823529411764, "step": 720}, {"loss": 1.4626, "grad_norm": 0.5211893916130066, "learning_rate": 0.0002, "epoch": 0.477124183006536, "step": 730}, {"loss": 1.6885, "grad_norm": 0.6221150159835815, "learning_rate": 0.0002, "epoch": 0.48366013071895425, "step": 740}, {"loss": 1.5677, "grad_norm": 0.76594477891922, "learning_rate": 0.0002, "epoch": 0.49019607843137253, "step": 750}, {"loss": 1.4982, "grad_norm": 0.5777859091758728, "learning_rate": 0.0002, "epoch": 0.49673202614379086, "step": 760}, {"loss": 1.5253, "grad_norm": 0.5793519616127014, "learning_rate": 0.0002, "epoch": 0.5032679738562091, "step": 770}, {"loss": 1.3562, "grad_norm": 0.5425786375999451, "learning_rate": 0.0002, "epoch": 0.5098039215686274, "step": 780}, {"loss": 1.3398, "grad_norm": 0.6004197001457214, "learning_rate": 0.0002, "epoch": 0.5163398692810458, "step": 790}, {"loss": 1.5346, "grad_norm": 0.7167016863822937, "learning_rate": 0.0002, "epoch": 0.5228758169934641, "step": 800}, {"loss": 1.48, "grad_norm": 0.710218071937561, "learning_rate": 0.0002, "epoch": 0.5294117647058824, "step": 810}, {"loss": 1.3943, "grad_norm": 0.699528694152832, "learning_rate": 0.0002, "epoch": 0.5359477124183006, "step": 820}, {"loss": 1.6014, "grad_norm": 0.579629123210907, "learning_rate": 0.0002, "epoch": 0.5424836601307189, "step": 830}, {"loss": 1.3894, "grad_norm": 0.595407247543335, "learning_rate": 0.0002, "epoch": 0.5490196078431373, "step": 840}, {"loss": 1.6394, "grad_norm": 0.544563889503479, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 850}, {"loss": 1.4692, "grad_norm": 0.553166389465332, "learning_rate": 0.0002, "epoch": 0.5620915032679739, "step": 860}, {"loss": 1.5155, "grad_norm": 0.5645018815994263, "learning_rate": 0.0002, "epoch": 0.5686274509803921, "step": 870}, {"loss": 1.7019, "grad_norm": 0.6576932668685913, "learning_rate": 0.0002, "epoch": 0.5751633986928104, "step": 880}, {"loss": 1.5891, "grad_norm": 0.6684197187423706, "learning_rate": 0.0002, "epoch": 0.5816993464052288, "step": 890}, {"loss": 1.5348, "grad_norm": 0.6706975698471069, "learning_rate": 0.0002, "epoch": 0.5882352941176471, "step": 900}, {"loss": 1.4038, "grad_norm": 0.6762327551841736, "learning_rate": 0.0002, "epoch": 0.5947712418300654, "step": 910}, {"loss": 1.61, "grad_norm": 0.764032244682312, "learning_rate": 0.0002, "epoch": 0.6013071895424836, "step": 920}, {"loss": 1.436, "grad_norm": 0.6996400952339172, "learning_rate": 0.0002, "epoch": 0.6078431372549019, "step": 930}, {"loss": 1.6038, "grad_norm": 0.686735987663269, "learning_rate": 0.0002, "epoch": 0.6143790849673203, "step": 940}, {"loss": 1.5194, "grad_norm": 0.6086131930351257, "learning_rate": 0.0002, "epoch": 0.6209150326797386, "step": 950}, {"loss": 1.4457, "grad_norm": 0.5627856850624084, "learning_rate": 0.0002, "epoch": 0.6274509803921569, "step": 960}, {"loss": 1.506, "grad_norm": 0.5781503319740295, "learning_rate": 0.0002, "epoch": 0.6339869281045751, "step": 970}, {"loss": 1.5668, "grad_norm": 0.6347246766090393, "learning_rate": 0.0002, "epoch": 0.6405228758169934, "step": 980}, {"loss": 1.3819, "grad_norm": 0.6581300497055054, "learning_rate": 0.0002, "epoch": 0.6470588235294118, "step": 990}, {"loss": 1.6425, "grad_norm": 0.8343676924705505, "learning_rate": 0.0002, "epoch": 0.6535947712418301, "step": 1000}, {"loss": 1.5188, "grad_norm": 0.5708910226821899, "learning_rate": 0.0002, "epoch": 0.6601307189542484, "step": 1010}, {"loss": 1.3882, "grad_norm": 0.6832585334777832, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 1020}, {"loss": 1.645, "grad_norm": 0.5767837166786194, "learning_rate": 0.0002, "epoch": 0.673202614379085, "step": 1030}, {"loss": 1.4206, "grad_norm": 0.5637745261192322, "learning_rate": 0.0002, "epoch": 0.6797385620915033, "step": 1040}, {"loss": 1.4325, "grad_norm": 0.8193050026893616, "learning_rate": 0.0002, "epoch": 0.6862745098039216, "step": 1050}, {"loss": 1.4196, "grad_norm": 0.6157439351081848, "learning_rate": 0.0002, "epoch": 0.6928104575163399, "step": 1060}, {"loss": 1.5547, "grad_norm": 0.7476664781570435, "learning_rate": 0.0002, "epoch": 0.6993464052287581, "step": 1070}, {"loss": 1.5337, "grad_norm": 0.8569361567497253, "learning_rate": 0.0002, "epoch": 0.7058823529411765, "step": 1080}, {"loss": 1.482, "grad_norm": 0.5671911835670471, "learning_rate": 0.0002, "epoch": 0.7124183006535948, "step": 1090}, {"loss": 1.5398, "grad_norm": 0.5151128768920898, "learning_rate": 0.0002, "epoch": 0.7189542483660131, "step": 1100}, {"loss": 1.4848, "grad_norm": 0.568037211894989, "learning_rate": 0.0002, "epoch": 0.7254901960784313, "step": 1110}, {"loss": 1.4708, "grad_norm": 0.6756396889686584, "learning_rate": 0.0002, "epoch": 0.7320261437908496, "step": 1120}, {"loss": 1.4017, "grad_norm": 0.638975977897644, "learning_rate": 0.0002, "epoch": 0.738562091503268, "step": 1130}, {"loss": 1.6028, "grad_norm": 0.7103341221809387, "learning_rate": 0.0002, "epoch": 0.7450980392156863, "step": 1140}, {"loss": 1.3766, "grad_norm": 0.7403952479362488, "learning_rate": 0.0002, "epoch": 0.7516339869281046, "step": 1150}, {"loss": 1.4757, "grad_norm": 0.6266511082649231, "learning_rate": 0.0002, "epoch": 0.7581699346405228, "step": 1160}, {"loss": 1.4468, "grad_norm": 0.5939070582389832, "learning_rate": 0.0002, "epoch": 0.7647058823529411, "step": 1170}, {"loss": 1.4145, "grad_norm": 0.5735430717468262, "learning_rate": 0.0002, "epoch": 0.7712418300653595, "step": 1180}, {"loss": 1.3891, "grad_norm": 0.5155234932899475, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 1190}, {"loss": 1.4942, "grad_norm": 0.5115423202514648, "learning_rate": 0.0002, "epoch": 0.7843137254901961, "step": 1200}, {"loss": 1.4508, "grad_norm": 0.693588137626648, "learning_rate": 0.0002, "epoch": 0.7908496732026143, "step": 1210}, {"loss": 1.308, "grad_norm": 0.5504693984985352, "learning_rate": 0.0002, "epoch": 0.7973856209150327, "step": 1220}, {"loss": 1.5412, "grad_norm": 0.5555992126464844, "learning_rate": 0.0002, "epoch": 0.803921568627451, "step": 1230}, {"loss": 1.5506, "grad_norm": 0.7211785316467285, "learning_rate": 0.0002, "epoch": 0.8104575163398693, "step": 1240}, {"loss": 1.6163, "grad_norm": 0.735003650188446, "learning_rate": 0.0002, "epoch": 0.8169934640522876, "step": 1250}, {"loss": 1.5836, "grad_norm": 0.5245152711868286, "learning_rate": 0.0002, "epoch": 0.8235294117647058, "step": 1260}, {"loss": 1.4505, "grad_norm": 0.5883445739746094, "learning_rate": 0.0002, "epoch": 0.8300653594771242, "step": 1270}, {"loss": 1.3642, "grad_norm": 0.6835859417915344, "learning_rate": 0.0002, "epoch": 0.8366013071895425, "step": 1280}, {"loss": 1.5526, "grad_norm": 0.6592142581939697, "learning_rate": 0.0002, "epoch": 0.8431372549019608, "step": 1290}, {"loss": 1.52, "grad_norm": 0.6087474226951599, "learning_rate": 0.0002, "epoch": 0.8496732026143791, "step": 1300}, {"loss": 1.3807, "grad_norm": 0.565387487411499, "learning_rate": 0.0002, "epoch": 0.8562091503267973, "step": 1310}, {"loss": 1.4809, "grad_norm": 0.7363151907920837, "learning_rate": 0.0002, "epoch": 0.8627450980392157, "step": 1320}, {"loss": 1.5683, "grad_norm": 0.5964524149894714, "learning_rate": 0.0002, "epoch": 0.869281045751634, "step": 1330}, {"loss": 1.3284, "grad_norm": 0.5169979929924011, "learning_rate": 0.0002, "epoch": 0.8758169934640523, "step": 1340}, {"loss": 1.6279, "grad_norm": 0.7063422799110413, "learning_rate": 0.0002, "epoch": 0.8823529411764706, "step": 1350}, {"loss": 1.3072, "grad_norm": 0.7261926531791687, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 1360}, {"loss": 1.3619, "grad_norm": 0.6759744882583618, "learning_rate": 0.0002, "epoch": 0.8954248366013072, "step": 1370}, {"loss": 1.4079, "grad_norm": 0.675051212310791, "learning_rate": 0.0002, "epoch": 0.9019607843137255, "step": 1380}, {"loss": 1.6606, "grad_norm": 0.5613595843315125, "learning_rate": 0.0002, "epoch": 0.9084967320261438, "step": 1390}, {"loss": 1.414, "grad_norm": 0.611732006072998, "learning_rate": 0.0002, "epoch": 0.9150326797385621, "step": 1400}, {"loss": 1.5766, "grad_norm": 0.6365187168121338, "learning_rate": 0.0002, "epoch": 0.9215686274509803, "step": 1410}, {"loss": 1.7832, "grad_norm": 0.7810426354408264, "learning_rate": 0.0002, "epoch": 0.9281045751633987, "step": 1420}, {"loss": 1.5377, "grad_norm": 0.593891441822052, "learning_rate": 0.0002, "epoch": 0.934640522875817, "step": 1430}, {"loss": 1.4468, "grad_norm": 0.761585533618927, "learning_rate": 0.0002, "epoch": 0.9411764705882353, "step": 1440}, {"loss": 1.589, "grad_norm": 0.6114464998245239, "learning_rate": 0.0002, "epoch": 0.9477124183006536, "step": 1450}, {"loss": 1.4973, "grad_norm": 0.601044774055481, "learning_rate": 0.0002, "epoch": 0.954248366013072, "step": 1460}, {"loss": 1.4162, "grad_norm": 0.5484876036643982, "learning_rate": 0.0002, "epoch": 0.9607843137254902, "step": 1470}, {"loss": 1.4825, "grad_norm": 0.5383428335189819, "learning_rate": 0.0002, "epoch": 0.9673202614379085, "step": 1480}, {"loss": 1.5543, "grad_norm": 0.648106575012207, "learning_rate": 0.0002, "epoch": 0.9738562091503268, "step": 1490}, {"loss": 1.3638, "grad_norm": 0.6847249865531921, "learning_rate": 0.0002, "epoch": 0.9803921568627451, "step": 1500}, {"loss": 1.4247, "grad_norm": 0.6361058354377747, "learning_rate": 0.0002, "epoch": 0.9869281045751634, "step": 1510}, {"loss": 1.5131, "grad_norm": 0.646392285823822, "learning_rate": 0.0002, "epoch": 0.9934640522875817, "step": 1520}, {"loss": 1.3738, "grad_norm": 0.5391159057617188, "learning_rate": 0.0002, "epoch": 1.0, "step": 1530}]} +{"epoch": 2.0, "step": 3060, "epoch_duration": 1650.3675973415375, "total_accumulated_duration": 3297.8560428619385, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.1748046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-1530", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7451, "grad_norm": 1.5105072259902954, "learning_rate": 0.0002, "epoch": 0.006535947712418301, "step": 10}, {"loss": 3.3158, "grad_norm": 2.1156165599823, "learning_rate": 0.0002, "epoch": 0.013071895424836602, "step": 20}, {"loss": 2.643, "grad_norm": 1.0578808784484863, "learning_rate": 0.0002, "epoch": 0.0196078431372549, "step": 30}, {"loss": 2.3948, "grad_norm": 2.725064516067505, "learning_rate": 0.0002, "epoch": 0.026143790849673203, "step": 40}, {"loss": 2.3134, "grad_norm": 2.9575750827789307, "learning_rate": 0.0002, "epoch": 0.032679738562091505, "step": 50}, {"loss": 2.2778, "grad_norm": 1.2158117294311523, "learning_rate": 0.0002, "epoch": 0.0392156862745098, "step": 60}, {"loss": 1.9742, "grad_norm": 1.0850954055786133, "learning_rate": 0.0002, "epoch": 0.0457516339869281, "step": 70}, {"loss": 1.8872, "grad_norm": 1.299196720123291, "learning_rate": 0.0002, "epoch": 0.05228758169934641, "step": 80}, {"loss": 1.947, "grad_norm": 0.8310191035270691, "learning_rate": 0.0002, "epoch": 0.058823529411764705, "step": 90}, {"loss": 1.9098, "grad_norm": 0.9854435920715332, "learning_rate": 0.0002, "epoch": 0.06535947712418301, "step": 100}, {"loss": 1.7508, "grad_norm": 0.7951157689094543, "learning_rate": 0.0002, "epoch": 0.0718954248366013, "step": 110}, {"loss": 1.9035, "grad_norm": 0.7593062520027161, "learning_rate": 0.0002, "epoch": 0.0784313725490196, "step": 120}, {"loss": 1.8517, "grad_norm": 0.6783032417297363, "learning_rate": 0.0002, "epoch": 0.08496732026143791, "step": 130}, {"loss": 1.6805, "grad_norm": 0.8350756764411926, "learning_rate": 0.0002, "epoch": 0.0915032679738562, "step": 140}, {"loss": 1.6123, "grad_norm": 1.0203173160552979, "learning_rate": 0.0002, "epoch": 0.09803921568627451, "step": 150}, {"loss": 1.7248, "grad_norm": 0.8820539712905884, "learning_rate": 0.0002, "epoch": 0.10457516339869281, "step": 160}, {"loss": 1.6762, "grad_norm": 0.7286128997802734, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 170}, {"loss": 1.8841, "grad_norm": 0.7874041795730591, "learning_rate": 0.0002, "epoch": 0.11764705882352941, "step": 180}, {"loss": 1.5656, "grad_norm": 0.6630475521087646, "learning_rate": 0.0002, "epoch": 0.12418300653594772, "step": 190}, {"loss": 1.6149, "grad_norm": 0.686413586139679, "learning_rate": 0.0002, "epoch": 0.13071895424836602, "step": 200}, {"loss": 1.6227, "grad_norm": 0.7793629765510559, "learning_rate": 0.0002, "epoch": 0.13725490196078433, "step": 210}, {"loss": 1.7223, "grad_norm": 0.6893141865730286, "learning_rate": 0.0002, "epoch": 0.1437908496732026, "step": 220}, {"loss": 1.6808, "grad_norm": 0.5804724097251892, "learning_rate": 0.0002, "epoch": 0.1503267973856209, "step": 230}, {"loss": 1.5578, "grad_norm": 0.6053574085235596, "learning_rate": 0.0002, "epoch": 0.1568627450980392, "step": 240}, {"loss": 1.7394, "grad_norm": 0.7566025853157043, "learning_rate": 0.0002, "epoch": 0.16339869281045752, "step": 250}, {"loss": 1.6216, "grad_norm": 0.6112990975379944, "learning_rate": 0.0002, "epoch": 0.16993464052287582, "step": 260}, {"loss": 1.5564, "grad_norm": 0.6839066743850708, "learning_rate": 0.0002, "epoch": 0.17647058823529413, "step": 270}, {"loss": 1.7129, "grad_norm": 0.6368117928504944, "learning_rate": 0.0002, "epoch": 0.1830065359477124, "step": 280}, {"loss": 1.5646, "grad_norm": 0.6144475936889648, "learning_rate": 0.0002, "epoch": 0.1895424836601307, "step": 290}, {"loss": 1.8383, "grad_norm": 0.6743767261505127, "learning_rate": 0.0002, "epoch": 0.19607843137254902, "step": 300}, {"loss": 1.421, "grad_norm": 0.6807955503463745, "learning_rate": 0.0002, "epoch": 0.20261437908496732, "step": 310}, {"loss": 1.5961, "grad_norm": 0.6717963814735413, "learning_rate": 0.0002, "epoch": 0.20915032679738563, "step": 320}, {"loss": 1.6842, "grad_norm": 0.5917780995368958, "learning_rate": 0.0002, "epoch": 0.21568627450980393, "step": 330}, {"loss": 1.6264, "grad_norm": 0.6783658862113953, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 340}, {"loss": 1.4635, "grad_norm": 0.5820256471633911, "learning_rate": 0.0002, "epoch": 0.22875816993464052, "step": 350}, {"loss": 1.6514, "grad_norm": 0.5345938801765442, "learning_rate": 0.0002, "epoch": 0.23529411764705882, "step": 360}, {"loss": 1.6441, "grad_norm": 0.755929172039032, "learning_rate": 0.0002, "epoch": 0.24183006535947713, "step": 370}, {"loss": 1.5177, "grad_norm": 0.6183189749717712, "learning_rate": 0.0002, "epoch": 0.24836601307189543, "step": 380}, {"loss": 1.5935, "grad_norm": 0.7277782559394836, "learning_rate": 0.0002, "epoch": 0.2549019607843137, "step": 390}, {"loss": 1.6957, "grad_norm": 0.9998756051063538, "learning_rate": 0.0002, "epoch": 0.26143790849673204, "step": 400}, {"loss": 1.5738, "grad_norm": 0.7523853778839111, "learning_rate": 0.0002, "epoch": 0.2679738562091503, "step": 410}, {"loss": 1.5649, "grad_norm": 0.6548714637756348, "learning_rate": 0.0002, "epoch": 0.27450980392156865, "step": 420}, {"loss": 1.4564, "grad_norm": 0.6979796290397644, "learning_rate": 0.0002, "epoch": 0.28104575163398693, "step": 430}, {"loss": 1.5927, "grad_norm": 0.840915322303772, "learning_rate": 0.0002, "epoch": 0.2875816993464052, "step": 440}, {"loss": 1.5199, "grad_norm": 0.6142978072166443, "learning_rate": 0.0002, "epoch": 0.29411764705882354, "step": 450}, {"loss": 1.4903, "grad_norm": 0.9482691884040833, "learning_rate": 0.0002, "epoch": 0.3006535947712418, "step": 460}, {"loss": 1.6553, "grad_norm": 0.7001156806945801, "learning_rate": 0.0002, "epoch": 0.30718954248366015, "step": 470}, {"loss": 1.5957, "grad_norm": 0.6665455102920532, "learning_rate": 0.0002, "epoch": 0.3137254901960784, "step": 480}, {"loss": 1.587, "grad_norm": 0.6012697815895081, "learning_rate": 0.0002, "epoch": 0.3202614379084967, "step": 490}, {"loss": 1.4468, "grad_norm": 0.8770062327384949, "learning_rate": 0.0002, "epoch": 0.32679738562091504, "step": 500}, {"loss": 1.3558, "grad_norm": 0.7029962539672852, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 510}, {"loss": 1.4435, "grad_norm": 0.6682832837104797, "learning_rate": 0.0002, "epoch": 0.33986928104575165, "step": 520}, {"loss": 1.4242, "grad_norm": 0.5548969507217407, "learning_rate": 0.0002, "epoch": 0.3464052287581699, "step": 530}, {"loss": 1.5081, "grad_norm": 0.6640702486038208, "learning_rate": 0.0002, "epoch": 0.35294117647058826, "step": 540}, {"loss": 1.4998, "grad_norm": 0.656292200088501, "learning_rate": 0.0002, "epoch": 0.35947712418300654, "step": 550}, {"loss": 1.5415, "grad_norm": 0.618910551071167, "learning_rate": 0.0002, "epoch": 0.3660130718954248, "step": 560}, {"loss": 1.5178, "grad_norm": 0.644859790802002, "learning_rate": 0.0002, "epoch": 0.37254901960784315, "step": 570}, {"loss": 1.645, "grad_norm": 0.679042398929596, "learning_rate": 0.0002, "epoch": 0.3790849673202614, "step": 580}, {"loss": 1.5193, "grad_norm": 0.980681836605072, "learning_rate": 0.0002, "epoch": 0.38562091503267976, "step": 590}, {"loss": 1.4262, "grad_norm": 0.632219672203064, "learning_rate": 0.0002, "epoch": 0.39215686274509803, "step": 600}, {"loss": 1.5533, "grad_norm": 0.7003744840621948, "learning_rate": 0.0002, "epoch": 0.39869281045751637, "step": 610}, {"loss": 1.7747, "grad_norm": 0.7090577483177185, "learning_rate": 0.0002, "epoch": 0.40522875816993464, "step": 620}, {"loss": 1.7506, "grad_norm": 0.657819926738739, "learning_rate": 0.0002, "epoch": 0.4117647058823529, "step": 630}, {"loss": 1.621, "grad_norm": 0.7034208178520203, "learning_rate": 0.0002, "epoch": 0.41830065359477125, "step": 640}, {"loss": 1.5357, "grad_norm": 0.7274866104125977, "learning_rate": 0.0002, "epoch": 0.42483660130718953, "step": 650}, {"loss": 1.6304, "grad_norm": 0.5876233577728271, "learning_rate": 0.0002, "epoch": 0.43137254901960786, "step": 660}, {"loss": 1.7683, "grad_norm": 0.595494270324707, "learning_rate": 0.0002, "epoch": 0.43790849673202614, "step": 670}, {"loss": 1.5117, "grad_norm": 0.8253804445266724, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 680}, {"loss": 1.5199, "grad_norm": 0.652225911617279, "learning_rate": 0.0002, "epoch": 0.45098039215686275, "step": 690}, {"loss": 1.5419, "grad_norm": 0.6242014169692993, "learning_rate": 0.0002, "epoch": 0.45751633986928103, "step": 700}, {"loss": 1.53, "grad_norm": 0.7283986210823059, "learning_rate": 0.0002, "epoch": 0.46405228758169936, "step": 710}, {"loss": 1.43, "grad_norm": 0.7016081213951111, "learning_rate": 0.0002, "epoch": 0.47058823529411764, "step": 720}, {"loss": 1.4626, "grad_norm": 0.5211893916130066, "learning_rate": 0.0002, "epoch": 0.477124183006536, "step": 730}, {"loss": 1.6885, "grad_norm": 0.6221150159835815, "learning_rate": 0.0002, "epoch": 0.48366013071895425, "step": 740}, {"loss": 1.5677, "grad_norm": 0.76594477891922, "learning_rate": 0.0002, "epoch": 0.49019607843137253, "step": 750}, {"loss": 1.4982, "grad_norm": 0.5777859091758728, "learning_rate": 0.0002, "epoch": 0.49673202614379086, "step": 760}, {"loss": 1.5253, "grad_norm": 0.5793519616127014, "learning_rate": 0.0002, "epoch": 0.5032679738562091, "step": 770}, {"loss": 1.3562, "grad_norm": 0.5425786375999451, "learning_rate": 0.0002, "epoch": 0.5098039215686274, "step": 780}, {"loss": 1.3398, "grad_norm": 0.6004197001457214, "learning_rate": 0.0002, "epoch": 0.5163398692810458, "step": 790}, {"loss": 1.5346, "grad_norm": 0.7167016863822937, "learning_rate": 0.0002, "epoch": 0.5228758169934641, "step": 800}, {"loss": 1.48, "grad_norm": 0.710218071937561, "learning_rate": 0.0002, "epoch": 0.5294117647058824, "step": 810}, {"loss": 1.3943, "grad_norm": 0.699528694152832, "learning_rate": 0.0002, "epoch": 0.5359477124183006, "step": 820}, {"loss": 1.6014, "grad_norm": 0.579629123210907, "learning_rate": 0.0002, "epoch": 0.5424836601307189, "step": 830}, {"loss": 1.3894, "grad_norm": 0.595407247543335, "learning_rate": 0.0002, "epoch": 0.5490196078431373, "step": 840}, {"loss": 1.6394, "grad_norm": 0.544563889503479, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 850}, {"loss": 1.4692, "grad_norm": 0.553166389465332, "learning_rate": 0.0002, "epoch": 0.5620915032679739, "step": 860}, {"loss": 1.5155, "grad_norm": 0.5645018815994263, "learning_rate": 0.0002, "epoch": 0.5686274509803921, "step": 870}, {"loss": 1.7019, "grad_norm": 0.6576932668685913, "learning_rate": 0.0002, "epoch": 0.5751633986928104, "step": 880}, {"loss": 1.5891, "grad_norm": 0.6684197187423706, "learning_rate": 0.0002, "epoch": 0.5816993464052288, "step": 890}, {"loss": 1.5348, "grad_norm": 0.6706975698471069, "learning_rate": 0.0002, "epoch": 0.5882352941176471, "step": 900}, {"loss": 1.4038, "grad_norm": 0.6762327551841736, "learning_rate": 0.0002, "epoch": 0.5947712418300654, "step": 910}, {"loss": 1.61, "grad_norm": 0.764032244682312, "learning_rate": 0.0002, "epoch": 0.6013071895424836, "step": 920}, {"loss": 1.436, "grad_norm": 0.6996400952339172, "learning_rate": 0.0002, "epoch": 0.6078431372549019, "step": 930}, {"loss": 1.6038, "grad_norm": 0.686735987663269, "learning_rate": 0.0002, "epoch": 0.6143790849673203, "step": 940}, {"loss": 1.5194, "grad_norm": 0.6086131930351257, "learning_rate": 0.0002, "epoch": 0.6209150326797386, "step": 950}, {"loss": 1.4457, "grad_norm": 0.5627856850624084, "learning_rate": 0.0002, "epoch": 0.6274509803921569, "step": 960}, {"loss": 1.506, "grad_norm": 0.5781503319740295, "learning_rate": 0.0002, "epoch": 0.6339869281045751, "step": 970}, {"loss": 1.5668, "grad_norm": 0.6347246766090393, "learning_rate": 0.0002, "epoch": 0.6405228758169934, "step": 980}, {"loss": 1.3819, "grad_norm": 0.6581300497055054, "learning_rate": 0.0002, "epoch": 0.6470588235294118, "step": 990}, {"loss": 1.6425, "grad_norm": 0.8343676924705505, "learning_rate": 0.0002, "epoch": 0.6535947712418301, "step": 1000}, {"loss": 1.5188, "grad_norm": 0.5708910226821899, "learning_rate": 0.0002, "epoch": 0.6601307189542484, "step": 1010}, {"loss": 1.3882, "grad_norm": 0.6832585334777832, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 1020}, {"loss": 1.645, "grad_norm": 0.5767837166786194, "learning_rate": 0.0002, "epoch": 0.673202614379085, "step": 1030}, {"loss": 1.4206, "grad_norm": 0.5637745261192322, "learning_rate": 0.0002, "epoch": 0.6797385620915033, "step": 1040}, {"loss": 1.4325, "grad_norm": 0.8193050026893616, "learning_rate": 0.0002, "epoch": 0.6862745098039216, "step": 1050}, {"loss": 1.4196, "grad_norm": 0.6157439351081848, "learning_rate": 0.0002, "epoch": 0.6928104575163399, "step": 1060}, {"loss": 1.5547, "grad_norm": 0.7476664781570435, "learning_rate": 0.0002, "epoch": 0.6993464052287581, "step": 1070}, {"loss": 1.5337, "grad_norm": 0.8569361567497253, "learning_rate": 0.0002, "epoch": 0.7058823529411765, "step": 1080}, {"loss": 1.482, "grad_norm": 0.5671911835670471, "learning_rate": 0.0002, "epoch": 0.7124183006535948, "step": 1090}, {"loss": 1.5398, "grad_norm": 0.5151128768920898, "learning_rate": 0.0002, "epoch": 0.7189542483660131, "step": 1100}, {"loss": 1.4848, "grad_norm": 0.568037211894989, "learning_rate": 0.0002, "epoch": 0.7254901960784313, "step": 1110}, {"loss": 1.4708, "grad_norm": 0.6756396889686584, "learning_rate": 0.0002, "epoch": 0.7320261437908496, "step": 1120}, {"loss": 1.4017, "grad_norm": 0.638975977897644, "learning_rate": 0.0002, "epoch": 0.738562091503268, "step": 1130}, {"loss": 1.6028, "grad_norm": 0.7103341221809387, "learning_rate": 0.0002, "epoch": 0.7450980392156863, "step": 1140}, {"loss": 1.3766, "grad_norm": 0.7403952479362488, "learning_rate": 0.0002, "epoch": 0.7516339869281046, "step": 1150}, {"loss": 1.4757, "grad_norm": 0.6266511082649231, "learning_rate": 0.0002, "epoch": 0.7581699346405228, "step": 1160}, {"loss": 1.4468, "grad_norm": 0.5939070582389832, "learning_rate": 0.0002, "epoch": 0.7647058823529411, "step": 1170}, {"loss": 1.4145, "grad_norm": 0.5735430717468262, "learning_rate": 0.0002, "epoch": 0.7712418300653595, "step": 1180}, {"loss": 1.3891, "grad_norm": 0.5155234932899475, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 1190}, {"loss": 1.4942, "grad_norm": 0.5115423202514648, "learning_rate": 0.0002, "epoch": 0.7843137254901961, "step": 1200}, {"loss": 1.4508, "grad_norm": 0.693588137626648, "learning_rate": 0.0002, "epoch": 0.7908496732026143, "step": 1210}, {"loss": 1.308, "grad_norm": 0.5504693984985352, "learning_rate": 0.0002, "epoch": 0.7973856209150327, "step": 1220}, {"loss": 1.5412, "grad_norm": 0.5555992126464844, "learning_rate": 0.0002, "epoch": 0.803921568627451, "step": 1230}, {"loss": 1.5506, "grad_norm": 0.7211785316467285, "learning_rate": 0.0002, "epoch": 0.8104575163398693, "step": 1240}, {"loss": 1.6163, "grad_norm": 0.735003650188446, "learning_rate": 0.0002, "epoch": 0.8169934640522876, "step": 1250}, {"loss": 1.5836, "grad_norm": 0.5245152711868286, "learning_rate": 0.0002, "epoch": 0.8235294117647058, "step": 1260}, {"loss": 1.4505, "grad_norm": 0.5883445739746094, "learning_rate": 0.0002, "epoch": 0.8300653594771242, "step": 1270}, {"loss": 1.3642, "grad_norm": 0.6835859417915344, "learning_rate": 0.0002, "epoch": 0.8366013071895425, "step": 1280}, {"loss": 1.5526, "grad_norm": 0.6592142581939697, "learning_rate": 0.0002, "epoch": 0.8431372549019608, "step": 1290}, {"loss": 1.52, "grad_norm": 0.6087474226951599, "learning_rate": 0.0002, "epoch": 0.8496732026143791, "step": 1300}, {"loss": 1.3807, "grad_norm": 0.565387487411499, "learning_rate": 0.0002, "epoch": 0.8562091503267973, "step": 1310}, {"loss": 1.4809, "grad_norm": 0.7363151907920837, "learning_rate": 0.0002, "epoch": 0.8627450980392157, "step": 1320}, {"loss": 1.5683, "grad_norm": 0.5964524149894714, "learning_rate": 0.0002, "epoch": 0.869281045751634, "step": 1330}, {"loss": 1.3284, "grad_norm": 0.5169979929924011, "learning_rate": 0.0002, "epoch": 0.8758169934640523, "step": 1340}, {"loss": 1.6279, "grad_norm": 0.7063422799110413, "learning_rate": 0.0002, "epoch": 0.8823529411764706, "step": 1350}, {"loss": 1.3072, "grad_norm": 0.7261926531791687, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 1360}, {"loss": 1.3619, "grad_norm": 0.6759744882583618, "learning_rate": 0.0002, "epoch": 0.8954248366013072, "step": 1370}, {"loss": 1.4079, "grad_norm": 0.675051212310791, "learning_rate": 0.0002, "epoch": 0.9019607843137255, "step": 1380}, {"loss": 1.6606, "grad_norm": 0.5613595843315125, "learning_rate": 0.0002, "epoch": 0.9084967320261438, "step": 1390}, {"loss": 1.414, "grad_norm": 0.611732006072998, "learning_rate": 0.0002, "epoch": 0.9150326797385621, "step": 1400}, {"loss": 1.5766, "grad_norm": 0.6365187168121338, "learning_rate": 0.0002, "epoch": 0.9215686274509803, "step": 1410}, {"loss": 1.7832, "grad_norm": 0.7810426354408264, "learning_rate": 0.0002, "epoch": 0.9281045751633987, "step": 1420}, {"loss": 1.5377, "grad_norm": 0.593891441822052, "learning_rate": 0.0002, "epoch": 0.934640522875817, "step": 1430}, {"loss": 1.4468, "grad_norm": 0.761585533618927, "learning_rate": 0.0002, "epoch": 0.9411764705882353, "step": 1440}, {"loss": 1.589, "grad_norm": 0.6114464998245239, "learning_rate": 0.0002, "epoch": 0.9477124183006536, "step": 1450}, {"loss": 1.4973, "grad_norm": 0.601044774055481, "learning_rate": 0.0002, "epoch": 0.954248366013072, "step": 1460}, {"loss": 1.4162, "grad_norm": 0.5484876036643982, "learning_rate": 0.0002, "epoch": 0.9607843137254902, "step": 1470}, {"loss": 1.4825, "grad_norm": 0.5383428335189819, "learning_rate": 0.0002, "epoch": 0.9673202614379085, "step": 1480}, {"loss": 1.5543, "grad_norm": 0.648106575012207, "learning_rate": 0.0002, "epoch": 0.9738562091503268, "step": 1490}, {"loss": 1.3638, "grad_norm": 0.6847249865531921, "learning_rate": 0.0002, "epoch": 0.9803921568627451, "step": 1500}, {"loss": 1.4247, "grad_norm": 0.6361058354377747, "learning_rate": 0.0002, "epoch": 0.9869281045751634, "step": 1510}, {"loss": 1.5131, "grad_norm": 0.646392285823822, "learning_rate": 0.0002, "epoch": 0.9934640522875817, "step": 1520}, {"loss": 1.3738, "grad_norm": 0.5391159057617188, "learning_rate": 0.0002, "epoch": 1.0, "step": 1530}, {"eval_loss": 1.4715123176574707, "eval_runtime": 30.5701, "eval_samples_per_second": 14.262, "eval_steps_per_second": 1.799, "epoch": 1.0, "step": 1530}, {"loss": 1.4827, "grad_norm": 0.5468988418579102, "learning_rate": 0.0002, "epoch": 1.0065359477124183, "step": 1540}, {"loss": 1.4342, "grad_norm": 0.629940927028656, "learning_rate": 0.0002, "epoch": 1.0130718954248366, "step": 1550}, {"loss": 1.4259, "grad_norm": 0.6411303281784058, "learning_rate": 0.0002, "epoch": 1.0196078431372548, "step": 1560}, {"loss": 1.3924, "grad_norm": 0.5619024038314819, "learning_rate": 0.0002, "epoch": 1.026143790849673, "step": 1570}, {"loss": 1.6086, "grad_norm": 0.6093462705612183, "learning_rate": 0.0002, "epoch": 1.0326797385620916, "step": 1580}, {"loss": 1.4547, "grad_norm": 0.5543286204338074, "learning_rate": 0.0002, "epoch": 1.0392156862745099, "step": 1590}, {"loss": 1.3738, "grad_norm": 0.6079006195068359, "learning_rate": 0.0002, "epoch": 1.0457516339869282, "step": 1600}, {"loss": 1.4574, "grad_norm": 0.6240813136100769, "learning_rate": 0.0002, "epoch": 1.0522875816993464, "step": 1610}, {"loss": 1.3504, "grad_norm": 0.6141977310180664, "learning_rate": 0.0002, "epoch": 1.0588235294117647, "step": 1620}, {"loss": 1.3668, "grad_norm": 0.5920178294181824, "learning_rate": 0.0002, "epoch": 1.065359477124183, "step": 1630}, {"loss": 1.3204, "grad_norm": 0.47620782256126404, "learning_rate": 0.0002, "epoch": 1.0718954248366013, "step": 1640}, {"loss": 1.3249, "grad_norm": 0.6826292872428894, "learning_rate": 0.0002, "epoch": 1.0784313725490196, "step": 1650}, {"loss": 1.2285, "grad_norm": 0.6182006597518921, "learning_rate": 0.0002, "epoch": 1.0849673202614378, "step": 1660}, {"loss": 1.2907, "grad_norm": 0.57639479637146, "learning_rate": 0.0002, "epoch": 1.091503267973856, "step": 1670}, {"loss": 1.4575, "grad_norm": 0.6696860194206238, "learning_rate": 0.0002, "epoch": 1.0980392156862746, "step": 1680}, {"loss": 1.4104, "grad_norm": 0.699221670627594, "learning_rate": 0.0002, "epoch": 1.1045751633986929, "step": 1690}, {"loss": 1.3667, "grad_norm": 0.7138059139251709, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 1700}, {"loss": 1.3468, "grad_norm": 0.6930422186851501, "learning_rate": 0.0002, "epoch": 1.1176470588235294, "step": 1710}, {"loss": 1.5033, "grad_norm": 0.7484048008918762, "learning_rate": 0.0002, "epoch": 1.1241830065359477, "step": 1720}, {"loss": 1.4582, "grad_norm": 0.5820090174674988, "learning_rate": 0.0002, "epoch": 1.130718954248366, "step": 1730}, {"loss": 1.3704, "grad_norm": 0.7143406867980957, "learning_rate": 0.0002, "epoch": 1.1372549019607843, "step": 1740}, {"loss": 1.277, "grad_norm": 0.5597584247589111, "learning_rate": 0.0002, "epoch": 1.1437908496732025, "step": 1750}, {"loss": 1.5403, "grad_norm": 0.5171173214912415, "learning_rate": 0.0002, "epoch": 1.1503267973856208, "step": 1760}, {"loss": 1.419, "grad_norm": 0.5951920747756958, "learning_rate": 0.0002, "epoch": 1.156862745098039, "step": 1770}, {"loss": 1.2929, "grad_norm": 0.7506247758865356, "learning_rate": 0.0002, "epoch": 1.1633986928104576, "step": 1780}, {"loss": 1.5475, "grad_norm": 0.5936487913131714, "learning_rate": 0.0002, "epoch": 1.1699346405228759, "step": 1790}, {"loss": 1.3567, "grad_norm": 0.688450038433075, "learning_rate": 0.0002, "epoch": 1.1764705882352942, "step": 1800}, {"loss": 1.314, "grad_norm": 0.671623170375824, "learning_rate": 0.0002, "epoch": 1.1830065359477124, "step": 1810}, {"loss": 1.3803, "grad_norm": 0.6911860704421997, "learning_rate": 0.0002, "epoch": 1.1895424836601307, "step": 1820}, {"loss": 1.363, "grad_norm": 0.60726398229599, "learning_rate": 0.0002, "epoch": 1.196078431372549, "step": 1830}, {"loss": 1.5236, "grad_norm": 0.7542088627815247, "learning_rate": 0.0002, "epoch": 1.2026143790849673, "step": 1840}, {"loss": 1.4343, "grad_norm": 0.6810969710350037, "learning_rate": 0.0002, "epoch": 1.2091503267973855, "step": 1850}, {"loss": 1.446, "grad_norm": 0.579741895198822, "learning_rate": 0.0002, "epoch": 1.215686274509804, "step": 1860}, {"loss": 1.4564, "grad_norm": 0.9925695657730103, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 1870}, {"loss": 1.5516, "grad_norm": 0.5919767618179321, "learning_rate": 0.0002, "epoch": 1.2287581699346406, "step": 1880}, {"loss": 1.5015, "grad_norm": 0.7377090454101562, "learning_rate": 0.0002, "epoch": 1.2352941176470589, "step": 1890}, {"loss": 1.4756, "grad_norm": 0.5753688812255859, "learning_rate": 0.0002, "epoch": 1.2418300653594772, "step": 1900}, {"loss": 1.3543, "grad_norm": 0.6362486481666565, "learning_rate": 0.0002, "epoch": 1.2483660130718954, "step": 1910}, {"loss": 1.4153, "grad_norm": 0.5747467875480652, "learning_rate": 0.0002, "epoch": 1.2549019607843137, "step": 1920}, {"loss": 1.5082, "grad_norm": 0.6831939220428467, "learning_rate": 0.0002, "epoch": 1.261437908496732, "step": 1930}, {"loss": 1.3509, "grad_norm": 0.6414040327072144, "learning_rate": 0.0002, "epoch": 1.2679738562091503, "step": 1940}, {"loss": 1.5099, "grad_norm": 0.5613330006599426, "learning_rate": 0.0002, "epoch": 1.2745098039215685, "step": 1950}, {"loss": 1.377, "grad_norm": 0.5838454961776733, "learning_rate": 0.0002, "epoch": 1.2810457516339868, "step": 1960}, {"loss": 1.3548, "grad_norm": 0.5367192029953003, "learning_rate": 0.0002, "epoch": 1.287581699346405, "step": 1970}, {"loss": 1.4602, "grad_norm": 0.5829346776008606, "learning_rate": 0.0002, "epoch": 1.2941176470588236, "step": 1980}, {"loss": 1.3821, "grad_norm": 0.756534218788147, "learning_rate": 0.0002, "epoch": 1.3006535947712419, "step": 1990}, {"loss": 1.389, "grad_norm": 0.48002561926841736, "learning_rate": 0.0002, "epoch": 1.3071895424836601, "step": 2000}, {"loss": 1.256, "grad_norm": 0.5461082458496094, "learning_rate": 0.0002, "epoch": 1.3137254901960784, "step": 2010}, {"loss": 1.6257, "grad_norm": 0.570399284362793, "learning_rate": 0.0002, "epoch": 1.3202614379084967, "step": 2020}, {"loss": 1.4356, "grad_norm": 0.5130975842475891, "learning_rate": 0.0002, "epoch": 1.326797385620915, "step": 2030}, {"loss": 1.3552, "grad_norm": 0.6290071606636047, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 2040}, {"loss": 1.3873, "grad_norm": 0.6165726184844971, "learning_rate": 0.0002, "epoch": 1.3398692810457518, "step": 2050}, {"loss": 1.4376, "grad_norm": 0.5302083492279053, "learning_rate": 0.0002, "epoch": 1.34640522875817, "step": 2060}, {"loss": 1.4722, "grad_norm": 0.6531406044960022, "learning_rate": 0.0002, "epoch": 1.3529411764705883, "step": 2070}, {"loss": 1.3632, "grad_norm": 0.5981236100196838, "learning_rate": 0.0002, "epoch": 1.3594771241830066, "step": 2080}, {"loss": 1.4846, "grad_norm": 0.8534150123596191, "learning_rate": 0.0002, "epoch": 1.3660130718954249, "step": 2090}, {"loss": 1.3249, "grad_norm": 0.695918083190918, "learning_rate": 0.0002, "epoch": 1.3725490196078431, "step": 2100}, {"loss": 1.4989, "grad_norm": 0.5830431580543518, "learning_rate": 0.0002, "epoch": 1.3790849673202614, "step": 2110}, {"loss": 1.5009, "grad_norm": 0.5641306638717651, "learning_rate": 0.0002, "epoch": 1.3856209150326797, "step": 2120}, {"loss": 1.3985, "grad_norm": 0.6354436874389648, "learning_rate": 0.0002, "epoch": 1.392156862745098, "step": 2130}, {"loss": 1.2737, "grad_norm": 0.5707540512084961, "learning_rate": 0.0002, "epoch": 1.3986928104575163, "step": 2140}, {"loss": 1.3815, "grad_norm": 0.7308434844017029, "learning_rate": 0.0002, "epoch": 1.4052287581699345, "step": 2150}, {"loss": 1.3993, "grad_norm": 0.5879750847816467, "learning_rate": 0.0002, "epoch": 1.4117647058823528, "step": 2160}, {"loss": 1.3729, "grad_norm": 0.627909243106842, "learning_rate": 0.0002, "epoch": 1.4183006535947713, "step": 2170}, {"loss": 1.3391, "grad_norm": 0.5228193998336792, "learning_rate": 0.0002, "epoch": 1.4248366013071896, "step": 2180}, {"loss": 1.457, "grad_norm": 0.6162880659103394, "learning_rate": 0.0002, "epoch": 1.4313725490196079, "step": 2190}, {"loss": 1.4052, "grad_norm": 0.751610517501831, "learning_rate": 0.0002, "epoch": 1.4379084967320261, "step": 2200}, {"loss": 1.4105, "grad_norm": 0.5623487234115601, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 2210}, {"loss": 1.3795, "grad_norm": 0.5293187499046326, "learning_rate": 0.0002, "epoch": 1.4509803921568627, "step": 2220}, {"loss": 1.4247, "grad_norm": 0.5903629660606384, "learning_rate": 0.0002, "epoch": 1.457516339869281, "step": 2230}, {"loss": 1.6167, "grad_norm": 0.6084659099578857, "learning_rate": 0.0002, "epoch": 1.4640522875816995, "step": 2240}, {"loss": 1.319, "grad_norm": 0.5289803147315979, "learning_rate": 0.0002, "epoch": 1.4705882352941178, "step": 2250}, {"loss": 1.3106, "grad_norm": 0.49499568343162537, "learning_rate": 0.0002, "epoch": 1.477124183006536, "step": 2260}, {"loss": 1.3586, "grad_norm": 0.7774190306663513, "learning_rate": 0.0002, "epoch": 1.4836601307189543, "step": 2270}, {"loss": 1.3075, "grad_norm": 0.5932538509368896, "learning_rate": 0.0002, "epoch": 1.4901960784313726, "step": 2280}, {"loss": 1.3241, "grad_norm": 0.6009492874145508, "learning_rate": 0.0002, "epoch": 1.4967320261437909, "step": 2290}, {"loss": 1.3728, "grad_norm": 0.5559343099594116, "learning_rate": 0.0002, "epoch": 1.5032679738562091, "step": 2300}, {"loss": 1.2379, "grad_norm": 0.5956196188926697, "learning_rate": 0.0002, "epoch": 1.5098039215686274, "step": 2310}, {"loss": 1.5292, "grad_norm": 0.5624083876609802, "learning_rate": 0.0002, "epoch": 1.5163398692810457, "step": 2320}, {"loss": 1.4779, "grad_norm": 0.7195250391960144, "learning_rate": 0.0002, "epoch": 1.522875816993464, "step": 2330}, {"loss": 1.2938, "grad_norm": 0.6010490655899048, "learning_rate": 0.0002, "epoch": 1.5294117647058822, "step": 2340}, {"loss": 1.4121, "grad_norm": 0.664929211139679, "learning_rate": 0.0002, "epoch": 1.5359477124183005, "step": 2350}, {"loss": 1.4362, "grad_norm": 0.5158776640892029, "learning_rate": 0.0002, "epoch": 1.5424836601307188, "step": 2360}, {"loss": 1.2157, "grad_norm": 0.5147154927253723, "learning_rate": 0.0002, "epoch": 1.5490196078431373, "step": 2370}, {"loss": 1.2643, "grad_norm": 0.6507977843284607, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 2380}, {"loss": 1.2786, "grad_norm": 0.5193192362785339, "learning_rate": 0.0002, "epoch": 1.5620915032679739, "step": 2390}, {"loss": 1.3209, "grad_norm": 0.5982314944267273, "learning_rate": 0.0002, "epoch": 1.5686274509803921, "step": 2400}, {"loss": 1.3585, "grad_norm": 0.49106258153915405, "learning_rate": 0.0002, "epoch": 1.5751633986928104, "step": 2410}, {"loss": 1.3618, "grad_norm": 0.6459611654281616, "learning_rate": 0.0002, "epoch": 1.581699346405229, "step": 2420}, {"loss": 1.3305, "grad_norm": 0.7038363218307495, "learning_rate": 0.0002, "epoch": 1.5882352941176472, "step": 2430}, {"loss": 1.3198, "grad_norm": 0.5245680212974548, "learning_rate": 0.0002, "epoch": 1.5947712418300655, "step": 2440}, {"loss": 1.4756, "grad_norm": 0.6562076210975647, "learning_rate": 0.0002, "epoch": 1.6013071895424837, "step": 2450}, {"loss": 1.5635, "grad_norm": 0.6491968035697937, "learning_rate": 0.0002, "epoch": 1.607843137254902, "step": 2460}, {"loss": 1.3657, "grad_norm": 0.604034960269928, "learning_rate": 0.0002, "epoch": 1.6143790849673203, "step": 2470}, {"loss": 1.2693, "grad_norm": 0.5759671330451965, "learning_rate": 0.0002, "epoch": 1.6209150326797386, "step": 2480}, {"loss": 1.4136, "grad_norm": 0.6157698631286621, "learning_rate": 0.0002, "epoch": 1.6274509803921569, "step": 2490}, {"loss": 1.3929, "grad_norm": 0.6513794660568237, "learning_rate": 0.0002, "epoch": 1.6339869281045751, "step": 2500}, {"loss": 1.4283, "grad_norm": 0.71990966796875, "learning_rate": 0.0002, "epoch": 1.6405228758169934, "step": 2510}, {"loss": 1.4356, "grad_norm": 0.7316617369651794, "learning_rate": 0.0002, "epoch": 1.6470588235294117, "step": 2520}, {"loss": 1.3119, "grad_norm": 0.5475177764892578, "learning_rate": 0.0002, "epoch": 1.65359477124183, "step": 2530}, {"loss": 1.2998, "grad_norm": 0.4911293089389801, "learning_rate": 0.0002, "epoch": 1.6601307189542482, "step": 2540}, {"loss": 1.4198, "grad_norm": 0.6122882962226868, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 2550}, {"loss": 1.3099, "grad_norm": 0.5735281705856323, "learning_rate": 0.0002, "epoch": 1.673202614379085, "step": 2560}, {"loss": 1.2205, "grad_norm": 0.5046352744102478, "learning_rate": 0.0002, "epoch": 1.6797385620915033, "step": 2570}, {"loss": 1.3191, "grad_norm": 0.6043242812156677, "learning_rate": 0.0002, "epoch": 1.6862745098039216, "step": 2580}, {"loss": 1.3079, "grad_norm": 0.5397698283195496, "learning_rate": 0.0002, "epoch": 1.6928104575163399, "step": 2590}, {"loss": 1.4916, "grad_norm": 0.8066475987434387, "learning_rate": 0.0002, "epoch": 1.6993464052287581, "step": 2600}, {"loss": 1.3703, "grad_norm": 0.52901691198349, "learning_rate": 0.0002, "epoch": 1.7058823529411766, "step": 2610}, {"loss": 1.409, "grad_norm": 0.7588503956794739, "learning_rate": 0.0002, "epoch": 1.712418300653595, "step": 2620}, {"loss": 1.3806, "grad_norm": 0.6012966632843018, "learning_rate": 0.0002, "epoch": 1.7189542483660132, "step": 2630}, {"loss": 1.2583, "grad_norm": 0.5927302837371826, "learning_rate": 0.0002, "epoch": 1.7254901960784315, "step": 2640}, {"loss": 1.4523, "grad_norm": 0.5086990594863892, "learning_rate": 0.0002, "epoch": 1.7320261437908497, "step": 2650}, {"loss": 1.5452, "grad_norm": 0.6000628471374512, "learning_rate": 0.0002, "epoch": 1.738562091503268, "step": 2660}, {"loss": 1.3269, "grad_norm": 0.6560431718826294, "learning_rate": 0.0002, "epoch": 1.7450980392156863, "step": 2670}, {"loss": 1.3982, "grad_norm": 0.5738165378570557, "learning_rate": 0.0002, "epoch": 1.7516339869281046, "step": 2680}, {"loss": 1.3766, "grad_norm": 0.5576106905937195, "learning_rate": 0.0002, "epoch": 1.7581699346405228, "step": 2690}, {"loss": 1.3277, "grad_norm": 0.7298802137374878, "learning_rate": 0.0002, "epoch": 1.7647058823529411, "step": 2700}, {"loss": 1.2618, "grad_norm": 0.5751826167106628, "learning_rate": 0.0002, "epoch": 1.7712418300653594, "step": 2710}, {"loss": 1.35, "grad_norm": 0.6069957613945007, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 2720}, {"loss": 1.3492, "grad_norm": 0.7513017654418945, "learning_rate": 0.0002, "epoch": 1.784313725490196, "step": 2730}, {"loss": 1.2979, "grad_norm": 0.6058869957923889, "learning_rate": 0.0002, "epoch": 1.7908496732026142, "step": 2740}, {"loss": 1.299, "grad_norm": 0.6805883049964905, "learning_rate": 0.0002, "epoch": 1.7973856209150327, "step": 2750}, {"loss": 1.4062, "grad_norm": 0.6864324808120728, "learning_rate": 0.0002, "epoch": 1.803921568627451, "step": 2760}, {"loss": 1.355, "grad_norm": 0.6261002421379089, "learning_rate": 0.0002, "epoch": 1.8104575163398693, "step": 2770}, {"loss": 1.5145, "grad_norm": 0.532684862613678, "learning_rate": 0.0002, "epoch": 1.8169934640522876, "step": 2780}, {"loss": 1.3248, "grad_norm": 0.6209020018577576, "learning_rate": 0.0002, "epoch": 1.8235294117647058, "step": 2790}, {"loss": 1.3908, "grad_norm": 0.67111736536026, "learning_rate": 0.0002, "epoch": 1.8300653594771243, "step": 2800}, {"loss": 1.5088, "grad_norm": 0.700467586517334, "learning_rate": 0.0002, "epoch": 1.8366013071895426, "step": 2810}, {"loss": 1.348, "grad_norm": 0.6968029141426086, "learning_rate": 0.0002, "epoch": 1.843137254901961, "step": 2820}, {"loss": 1.3943, "grad_norm": 0.6405863761901855, "learning_rate": 0.0002, "epoch": 1.8496732026143792, "step": 2830}, {"loss": 1.4035, "grad_norm": 0.5192584991455078, "learning_rate": 0.0002, "epoch": 1.8562091503267975, "step": 2840}, {"loss": 1.2745, "grad_norm": 0.4888569414615631, "learning_rate": 0.0002, "epoch": 1.8627450980392157, "step": 2850}, {"loss": 1.4324, "grad_norm": 0.7625455856323242, "learning_rate": 0.0002, "epoch": 1.869281045751634, "step": 2860}, {"loss": 1.4989, "grad_norm": 0.9162808656692505, "learning_rate": 0.0002, "epoch": 1.8758169934640523, "step": 2870}, {"loss": 1.3978, "grad_norm": 0.5472783446311951, "learning_rate": 0.0002, "epoch": 1.8823529411764706, "step": 2880}, {"loss": 1.3026, "grad_norm": 0.5221137404441833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 2890}, {"loss": 1.33, "grad_norm": 0.49258849024772644, "learning_rate": 0.0002, "epoch": 1.8954248366013071, "step": 2900}, {"loss": 1.3503, "grad_norm": 0.5260750651359558, "learning_rate": 0.0002, "epoch": 1.9019607843137254, "step": 2910}, {"loss": 1.3381, "grad_norm": 0.6583314538002014, "learning_rate": 0.0002, "epoch": 1.9084967320261437, "step": 2920}, {"loss": 1.356, "grad_norm": 0.5728915929794312, "learning_rate": 0.0002, "epoch": 1.915032679738562, "step": 2930}, {"loss": 1.3993, "grad_norm": 0.7661453485488892, "learning_rate": 0.0002, "epoch": 1.9215686274509802, "step": 2940}, {"loss": 1.428, "grad_norm": 0.7193911075592041, "learning_rate": 0.0002, "epoch": 1.9281045751633987, "step": 2950}, {"loss": 1.287, "grad_norm": 0.5007768869400024, "learning_rate": 0.0002, "epoch": 1.934640522875817, "step": 2960}, {"loss": 1.372, "grad_norm": 0.626681923866272, "learning_rate": 0.0002, "epoch": 1.9411764705882353, "step": 2970}, {"loss": 1.375, "grad_norm": 0.8692840933799744, "learning_rate": 0.0002, "epoch": 1.9477124183006536, "step": 2980}, {"loss": 1.3292, "grad_norm": 0.6388291120529175, "learning_rate": 0.0002, "epoch": 1.954248366013072, "step": 2990}, {"loss": 1.4593, "grad_norm": 0.7710477113723755, "learning_rate": 0.0002, "epoch": 1.9607843137254903, "step": 3000}, {"loss": 1.5228, "grad_norm": 0.641704261302948, "learning_rate": 0.0002, "epoch": 1.9673202614379086, "step": 3010}, {"loss": 1.3246, "grad_norm": 0.621148943901062, "learning_rate": 0.0002, "epoch": 1.973856209150327, "step": 3020}, {"loss": 1.3017, "grad_norm": 0.5119547247886658, "learning_rate": 0.0002, "epoch": 1.9803921568627452, "step": 3030}, {"loss": 1.4923, "grad_norm": 0.8104137778282166, "learning_rate": 0.0002, "epoch": 1.9869281045751634, "step": 3040}, {"loss": 1.3331, "grad_norm": 0.5856240391731262, "learning_rate": 0.0002, "epoch": 1.9934640522875817, "step": 3050}, {"loss": 1.4346, "grad_norm": 0.5263566374778748, "learning_rate": 0.0002, "epoch": 2.0, "step": 3060}]} +{"epoch": 3.0, "step": 4590, "epoch_duration": 1615.1267035007477, "total_accumulated_duration": 4912.982746362686, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.1748046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-3060", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7451, "grad_norm": 1.5105072259902954, "learning_rate": 0.0002, "epoch": 0.006535947712418301, "step": 10}, {"loss": 3.3158, "grad_norm": 2.1156165599823, "learning_rate": 0.0002, "epoch": 0.013071895424836602, "step": 20}, {"loss": 2.643, "grad_norm": 1.0578808784484863, "learning_rate": 0.0002, "epoch": 0.0196078431372549, "step": 30}, {"loss": 2.3948, "grad_norm": 2.725064516067505, "learning_rate": 0.0002, "epoch": 0.026143790849673203, "step": 40}, {"loss": 2.3134, "grad_norm": 2.9575750827789307, "learning_rate": 0.0002, "epoch": 0.032679738562091505, "step": 50}, {"loss": 2.2778, "grad_norm": 1.2158117294311523, "learning_rate": 0.0002, "epoch": 0.0392156862745098, "step": 60}, {"loss": 1.9742, "grad_norm": 1.0850954055786133, "learning_rate": 0.0002, "epoch": 0.0457516339869281, "step": 70}, {"loss": 1.8872, "grad_norm": 1.299196720123291, "learning_rate": 0.0002, "epoch": 0.05228758169934641, "step": 80}, {"loss": 1.947, "grad_norm": 0.8310191035270691, "learning_rate": 0.0002, "epoch": 0.058823529411764705, "step": 90}, {"loss": 1.9098, "grad_norm": 0.9854435920715332, "learning_rate": 0.0002, "epoch": 0.06535947712418301, "step": 100}, {"loss": 1.7508, "grad_norm": 0.7951157689094543, "learning_rate": 0.0002, "epoch": 0.0718954248366013, "step": 110}, {"loss": 1.9035, "grad_norm": 0.7593062520027161, "learning_rate": 0.0002, "epoch": 0.0784313725490196, "step": 120}, {"loss": 1.8517, "grad_norm": 0.6783032417297363, "learning_rate": 0.0002, "epoch": 0.08496732026143791, "step": 130}, {"loss": 1.6805, "grad_norm": 0.8350756764411926, "learning_rate": 0.0002, "epoch": 0.0915032679738562, "step": 140}, {"loss": 1.6123, "grad_norm": 1.0203173160552979, "learning_rate": 0.0002, "epoch": 0.09803921568627451, "step": 150}, {"loss": 1.7248, "grad_norm": 0.8820539712905884, "learning_rate": 0.0002, "epoch": 0.10457516339869281, "step": 160}, {"loss": 1.6762, "grad_norm": 0.7286128997802734, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 170}, {"loss": 1.8841, "grad_norm": 0.7874041795730591, "learning_rate": 0.0002, "epoch": 0.11764705882352941, "step": 180}, {"loss": 1.5656, "grad_norm": 0.6630475521087646, "learning_rate": 0.0002, "epoch": 0.12418300653594772, "step": 190}, {"loss": 1.6149, "grad_norm": 0.686413586139679, "learning_rate": 0.0002, "epoch": 0.13071895424836602, "step": 200}, {"loss": 1.6227, "grad_norm": 0.7793629765510559, "learning_rate": 0.0002, "epoch": 0.13725490196078433, "step": 210}, {"loss": 1.7223, "grad_norm": 0.6893141865730286, "learning_rate": 0.0002, "epoch": 0.1437908496732026, "step": 220}, {"loss": 1.6808, "grad_norm": 0.5804724097251892, "learning_rate": 0.0002, "epoch": 0.1503267973856209, "step": 230}, {"loss": 1.5578, "grad_norm": 0.6053574085235596, "learning_rate": 0.0002, "epoch": 0.1568627450980392, "step": 240}, {"loss": 1.7394, "grad_norm": 0.7566025853157043, "learning_rate": 0.0002, "epoch": 0.16339869281045752, "step": 250}, {"loss": 1.6216, "grad_norm": 0.6112990975379944, "learning_rate": 0.0002, "epoch": 0.16993464052287582, "step": 260}, {"loss": 1.5564, "grad_norm": 0.6839066743850708, "learning_rate": 0.0002, "epoch": 0.17647058823529413, "step": 270}, {"loss": 1.7129, "grad_norm": 0.6368117928504944, "learning_rate": 0.0002, "epoch": 0.1830065359477124, "step": 280}, {"loss": 1.5646, "grad_norm": 0.6144475936889648, "learning_rate": 0.0002, "epoch": 0.1895424836601307, "step": 290}, {"loss": 1.8383, "grad_norm": 0.6743767261505127, "learning_rate": 0.0002, "epoch": 0.19607843137254902, "step": 300}, {"loss": 1.421, "grad_norm": 0.6807955503463745, "learning_rate": 0.0002, "epoch": 0.20261437908496732, "step": 310}, {"loss": 1.5961, "grad_norm": 0.6717963814735413, "learning_rate": 0.0002, "epoch": 0.20915032679738563, "step": 320}, {"loss": 1.6842, "grad_norm": 0.5917780995368958, "learning_rate": 0.0002, "epoch": 0.21568627450980393, "step": 330}, {"loss": 1.6264, "grad_norm": 0.6783658862113953, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 340}, {"loss": 1.4635, "grad_norm": 0.5820256471633911, "learning_rate": 0.0002, "epoch": 0.22875816993464052, "step": 350}, {"loss": 1.6514, "grad_norm": 0.5345938801765442, "learning_rate": 0.0002, "epoch": 0.23529411764705882, "step": 360}, {"loss": 1.6441, "grad_norm": 0.755929172039032, "learning_rate": 0.0002, "epoch": 0.24183006535947713, "step": 370}, {"loss": 1.5177, "grad_norm": 0.6183189749717712, "learning_rate": 0.0002, "epoch": 0.24836601307189543, "step": 380}, {"loss": 1.5935, "grad_norm": 0.7277782559394836, "learning_rate": 0.0002, "epoch": 0.2549019607843137, "step": 390}, {"loss": 1.6957, "grad_norm": 0.9998756051063538, "learning_rate": 0.0002, "epoch": 0.26143790849673204, "step": 400}, {"loss": 1.5738, "grad_norm": 0.7523853778839111, "learning_rate": 0.0002, "epoch": 0.2679738562091503, "step": 410}, {"loss": 1.5649, "grad_norm": 0.6548714637756348, "learning_rate": 0.0002, "epoch": 0.27450980392156865, "step": 420}, {"loss": 1.4564, "grad_norm": 0.6979796290397644, "learning_rate": 0.0002, "epoch": 0.28104575163398693, "step": 430}, {"loss": 1.5927, "grad_norm": 0.840915322303772, "learning_rate": 0.0002, "epoch": 0.2875816993464052, "step": 440}, {"loss": 1.5199, "grad_norm": 0.6142978072166443, "learning_rate": 0.0002, "epoch": 0.29411764705882354, "step": 450}, {"loss": 1.4903, "grad_norm": 0.9482691884040833, "learning_rate": 0.0002, "epoch": 0.3006535947712418, "step": 460}, {"loss": 1.6553, "grad_norm": 0.7001156806945801, "learning_rate": 0.0002, "epoch": 0.30718954248366015, "step": 470}, {"loss": 1.5957, "grad_norm": 0.6665455102920532, "learning_rate": 0.0002, "epoch": 0.3137254901960784, "step": 480}, {"loss": 1.587, "grad_norm": 0.6012697815895081, "learning_rate": 0.0002, "epoch": 0.3202614379084967, "step": 490}, {"loss": 1.4468, "grad_norm": 0.8770062327384949, "learning_rate": 0.0002, "epoch": 0.32679738562091504, "step": 500}, {"loss": 1.3558, "grad_norm": 0.7029962539672852, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 510}, {"loss": 1.4435, "grad_norm": 0.6682832837104797, "learning_rate": 0.0002, "epoch": 0.33986928104575165, "step": 520}, {"loss": 1.4242, "grad_norm": 0.5548969507217407, "learning_rate": 0.0002, "epoch": 0.3464052287581699, "step": 530}, {"loss": 1.5081, "grad_norm": 0.6640702486038208, "learning_rate": 0.0002, "epoch": 0.35294117647058826, "step": 540}, {"loss": 1.4998, "grad_norm": 0.656292200088501, "learning_rate": 0.0002, "epoch": 0.35947712418300654, "step": 550}, {"loss": 1.5415, "grad_norm": 0.618910551071167, "learning_rate": 0.0002, "epoch": 0.3660130718954248, "step": 560}, {"loss": 1.5178, "grad_norm": 0.644859790802002, "learning_rate": 0.0002, "epoch": 0.37254901960784315, "step": 570}, {"loss": 1.645, "grad_norm": 0.679042398929596, "learning_rate": 0.0002, "epoch": 0.3790849673202614, "step": 580}, {"loss": 1.5193, "grad_norm": 0.980681836605072, "learning_rate": 0.0002, "epoch": 0.38562091503267976, "step": 590}, {"loss": 1.4262, "grad_norm": 0.632219672203064, "learning_rate": 0.0002, "epoch": 0.39215686274509803, "step": 600}, {"loss": 1.5533, "grad_norm": 0.7003744840621948, "learning_rate": 0.0002, "epoch": 0.39869281045751637, "step": 610}, {"loss": 1.7747, "grad_norm": 0.7090577483177185, "learning_rate": 0.0002, "epoch": 0.40522875816993464, "step": 620}, {"loss": 1.7506, "grad_norm": 0.657819926738739, "learning_rate": 0.0002, "epoch": 0.4117647058823529, "step": 630}, {"loss": 1.621, "grad_norm": 0.7034208178520203, "learning_rate": 0.0002, "epoch": 0.41830065359477125, "step": 640}, {"loss": 1.5357, "grad_norm": 0.7274866104125977, "learning_rate": 0.0002, "epoch": 0.42483660130718953, "step": 650}, {"loss": 1.6304, "grad_norm": 0.5876233577728271, "learning_rate": 0.0002, "epoch": 0.43137254901960786, "step": 660}, {"loss": 1.7683, "grad_norm": 0.595494270324707, "learning_rate": 0.0002, "epoch": 0.43790849673202614, "step": 670}, {"loss": 1.5117, "grad_norm": 0.8253804445266724, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 680}, {"loss": 1.5199, "grad_norm": 0.652225911617279, "learning_rate": 0.0002, "epoch": 0.45098039215686275, "step": 690}, {"loss": 1.5419, "grad_norm": 0.6242014169692993, "learning_rate": 0.0002, "epoch": 0.45751633986928103, "step": 700}, {"loss": 1.53, "grad_norm": 0.7283986210823059, "learning_rate": 0.0002, "epoch": 0.46405228758169936, "step": 710}, {"loss": 1.43, "grad_norm": 0.7016081213951111, "learning_rate": 0.0002, "epoch": 0.47058823529411764, "step": 720}, {"loss": 1.4626, "grad_norm": 0.5211893916130066, "learning_rate": 0.0002, "epoch": 0.477124183006536, "step": 730}, {"loss": 1.6885, "grad_norm": 0.6221150159835815, "learning_rate": 0.0002, "epoch": 0.48366013071895425, "step": 740}, {"loss": 1.5677, "grad_norm": 0.76594477891922, "learning_rate": 0.0002, "epoch": 0.49019607843137253, "step": 750}, {"loss": 1.4982, "grad_norm": 0.5777859091758728, "learning_rate": 0.0002, "epoch": 0.49673202614379086, "step": 760}, {"loss": 1.5253, "grad_norm": 0.5793519616127014, "learning_rate": 0.0002, "epoch": 0.5032679738562091, "step": 770}, {"loss": 1.3562, "grad_norm": 0.5425786375999451, "learning_rate": 0.0002, "epoch": 0.5098039215686274, "step": 780}, {"loss": 1.3398, "grad_norm": 0.6004197001457214, "learning_rate": 0.0002, "epoch": 0.5163398692810458, "step": 790}, {"loss": 1.5346, "grad_norm": 0.7167016863822937, "learning_rate": 0.0002, "epoch": 0.5228758169934641, "step": 800}, {"loss": 1.48, "grad_norm": 0.710218071937561, "learning_rate": 0.0002, "epoch": 0.5294117647058824, "step": 810}, {"loss": 1.3943, "grad_norm": 0.699528694152832, "learning_rate": 0.0002, "epoch": 0.5359477124183006, "step": 820}, {"loss": 1.6014, "grad_norm": 0.579629123210907, "learning_rate": 0.0002, "epoch": 0.5424836601307189, "step": 830}, {"loss": 1.3894, "grad_norm": 0.595407247543335, "learning_rate": 0.0002, "epoch": 0.5490196078431373, "step": 840}, {"loss": 1.6394, "grad_norm": 0.544563889503479, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 850}, {"loss": 1.4692, "grad_norm": 0.553166389465332, "learning_rate": 0.0002, "epoch": 0.5620915032679739, "step": 860}, {"loss": 1.5155, "grad_norm": 0.5645018815994263, "learning_rate": 0.0002, "epoch": 0.5686274509803921, "step": 870}, {"loss": 1.7019, "grad_norm": 0.6576932668685913, "learning_rate": 0.0002, "epoch": 0.5751633986928104, "step": 880}, {"loss": 1.5891, "grad_norm": 0.6684197187423706, "learning_rate": 0.0002, "epoch": 0.5816993464052288, "step": 890}, {"loss": 1.5348, "grad_norm": 0.6706975698471069, "learning_rate": 0.0002, "epoch": 0.5882352941176471, "step": 900}, {"loss": 1.4038, "grad_norm": 0.6762327551841736, "learning_rate": 0.0002, "epoch": 0.5947712418300654, "step": 910}, {"loss": 1.61, "grad_norm": 0.764032244682312, "learning_rate": 0.0002, "epoch": 0.6013071895424836, "step": 920}, {"loss": 1.436, "grad_norm": 0.6996400952339172, "learning_rate": 0.0002, "epoch": 0.6078431372549019, "step": 930}, {"loss": 1.6038, "grad_norm": 0.686735987663269, "learning_rate": 0.0002, "epoch": 0.6143790849673203, "step": 940}, {"loss": 1.5194, "grad_norm": 0.6086131930351257, "learning_rate": 0.0002, "epoch": 0.6209150326797386, "step": 950}, {"loss": 1.4457, "grad_norm": 0.5627856850624084, "learning_rate": 0.0002, "epoch": 0.6274509803921569, "step": 960}, {"loss": 1.506, "grad_norm": 0.5781503319740295, "learning_rate": 0.0002, "epoch": 0.6339869281045751, "step": 970}, {"loss": 1.5668, "grad_norm": 0.6347246766090393, "learning_rate": 0.0002, "epoch": 0.6405228758169934, "step": 980}, {"loss": 1.3819, "grad_norm": 0.6581300497055054, "learning_rate": 0.0002, "epoch": 0.6470588235294118, "step": 990}, {"loss": 1.6425, "grad_norm": 0.8343676924705505, "learning_rate": 0.0002, "epoch": 0.6535947712418301, "step": 1000}, {"loss": 1.5188, "grad_norm": 0.5708910226821899, "learning_rate": 0.0002, "epoch": 0.6601307189542484, "step": 1010}, {"loss": 1.3882, "grad_norm": 0.6832585334777832, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 1020}, {"loss": 1.645, "grad_norm": 0.5767837166786194, "learning_rate": 0.0002, "epoch": 0.673202614379085, "step": 1030}, {"loss": 1.4206, "grad_norm": 0.5637745261192322, "learning_rate": 0.0002, "epoch": 0.6797385620915033, "step": 1040}, {"loss": 1.4325, "grad_norm": 0.8193050026893616, "learning_rate": 0.0002, "epoch": 0.6862745098039216, "step": 1050}, {"loss": 1.4196, "grad_norm": 0.6157439351081848, "learning_rate": 0.0002, "epoch": 0.6928104575163399, "step": 1060}, {"loss": 1.5547, "grad_norm": 0.7476664781570435, "learning_rate": 0.0002, "epoch": 0.6993464052287581, "step": 1070}, {"loss": 1.5337, "grad_norm": 0.8569361567497253, "learning_rate": 0.0002, "epoch": 0.7058823529411765, "step": 1080}, {"loss": 1.482, "grad_norm": 0.5671911835670471, "learning_rate": 0.0002, "epoch": 0.7124183006535948, "step": 1090}, {"loss": 1.5398, "grad_norm": 0.5151128768920898, "learning_rate": 0.0002, "epoch": 0.7189542483660131, "step": 1100}, {"loss": 1.4848, "grad_norm": 0.568037211894989, "learning_rate": 0.0002, "epoch": 0.7254901960784313, "step": 1110}, {"loss": 1.4708, "grad_norm": 0.6756396889686584, "learning_rate": 0.0002, "epoch": 0.7320261437908496, "step": 1120}, {"loss": 1.4017, "grad_norm": 0.638975977897644, "learning_rate": 0.0002, "epoch": 0.738562091503268, "step": 1130}, {"loss": 1.6028, "grad_norm": 0.7103341221809387, "learning_rate": 0.0002, "epoch": 0.7450980392156863, "step": 1140}, {"loss": 1.3766, "grad_norm": 0.7403952479362488, "learning_rate": 0.0002, "epoch": 0.7516339869281046, "step": 1150}, {"loss": 1.4757, "grad_norm": 0.6266511082649231, "learning_rate": 0.0002, "epoch": 0.7581699346405228, "step": 1160}, {"loss": 1.4468, "grad_norm": 0.5939070582389832, "learning_rate": 0.0002, "epoch": 0.7647058823529411, "step": 1170}, {"loss": 1.4145, "grad_norm": 0.5735430717468262, "learning_rate": 0.0002, "epoch": 0.7712418300653595, "step": 1180}, {"loss": 1.3891, "grad_norm": 0.5155234932899475, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 1190}, {"loss": 1.4942, "grad_norm": 0.5115423202514648, "learning_rate": 0.0002, "epoch": 0.7843137254901961, "step": 1200}, {"loss": 1.4508, "grad_norm": 0.693588137626648, "learning_rate": 0.0002, "epoch": 0.7908496732026143, "step": 1210}, {"loss": 1.308, "grad_norm": 0.5504693984985352, "learning_rate": 0.0002, "epoch": 0.7973856209150327, "step": 1220}, {"loss": 1.5412, "grad_norm": 0.5555992126464844, "learning_rate": 0.0002, "epoch": 0.803921568627451, "step": 1230}, {"loss": 1.5506, "grad_norm": 0.7211785316467285, "learning_rate": 0.0002, "epoch": 0.8104575163398693, "step": 1240}, {"loss": 1.6163, "grad_norm": 0.735003650188446, "learning_rate": 0.0002, "epoch": 0.8169934640522876, "step": 1250}, {"loss": 1.5836, "grad_norm": 0.5245152711868286, "learning_rate": 0.0002, "epoch": 0.8235294117647058, "step": 1260}, {"loss": 1.4505, "grad_norm": 0.5883445739746094, "learning_rate": 0.0002, "epoch": 0.8300653594771242, "step": 1270}, {"loss": 1.3642, "grad_norm": 0.6835859417915344, "learning_rate": 0.0002, "epoch": 0.8366013071895425, "step": 1280}, {"loss": 1.5526, "grad_norm": 0.6592142581939697, "learning_rate": 0.0002, "epoch": 0.8431372549019608, "step": 1290}, {"loss": 1.52, "grad_norm": 0.6087474226951599, "learning_rate": 0.0002, "epoch": 0.8496732026143791, "step": 1300}, {"loss": 1.3807, "grad_norm": 0.565387487411499, "learning_rate": 0.0002, "epoch": 0.8562091503267973, "step": 1310}, {"loss": 1.4809, "grad_norm": 0.7363151907920837, "learning_rate": 0.0002, "epoch": 0.8627450980392157, "step": 1320}, {"loss": 1.5683, "grad_norm": 0.5964524149894714, "learning_rate": 0.0002, "epoch": 0.869281045751634, "step": 1330}, {"loss": 1.3284, "grad_norm": 0.5169979929924011, "learning_rate": 0.0002, "epoch": 0.8758169934640523, "step": 1340}, {"loss": 1.6279, "grad_norm": 0.7063422799110413, "learning_rate": 0.0002, "epoch": 0.8823529411764706, "step": 1350}, {"loss": 1.3072, "grad_norm": 0.7261926531791687, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 1360}, {"loss": 1.3619, "grad_norm": 0.6759744882583618, "learning_rate": 0.0002, "epoch": 0.8954248366013072, "step": 1370}, {"loss": 1.4079, "grad_norm": 0.675051212310791, "learning_rate": 0.0002, "epoch": 0.9019607843137255, "step": 1380}, {"loss": 1.6606, "grad_norm": 0.5613595843315125, "learning_rate": 0.0002, "epoch": 0.9084967320261438, "step": 1390}, {"loss": 1.414, "grad_norm": 0.611732006072998, "learning_rate": 0.0002, "epoch": 0.9150326797385621, "step": 1400}, {"loss": 1.5766, "grad_norm": 0.6365187168121338, "learning_rate": 0.0002, "epoch": 0.9215686274509803, "step": 1410}, {"loss": 1.7832, "grad_norm": 0.7810426354408264, "learning_rate": 0.0002, "epoch": 0.9281045751633987, "step": 1420}, {"loss": 1.5377, "grad_norm": 0.593891441822052, "learning_rate": 0.0002, "epoch": 0.934640522875817, "step": 1430}, {"loss": 1.4468, "grad_norm": 0.761585533618927, "learning_rate": 0.0002, "epoch": 0.9411764705882353, "step": 1440}, {"loss": 1.589, "grad_norm": 0.6114464998245239, "learning_rate": 0.0002, "epoch": 0.9477124183006536, "step": 1450}, {"loss": 1.4973, "grad_norm": 0.601044774055481, "learning_rate": 0.0002, "epoch": 0.954248366013072, "step": 1460}, {"loss": 1.4162, "grad_norm": 0.5484876036643982, "learning_rate": 0.0002, "epoch": 0.9607843137254902, "step": 1470}, {"loss": 1.4825, "grad_norm": 0.5383428335189819, "learning_rate": 0.0002, "epoch": 0.9673202614379085, "step": 1480}, {"loss": 1.5543, "grad_norm": 0.648106575012207, "learning_rate": 0.0002, "epoch": 0.9738562091503268, "step": 1490}, {"loss": 1.3638, "grad_norm": 0.6847249865531921, "learning_rate": 0.0002, "epoch": 0.9803921568627451, "step": 1500}, {"loss": 1.4247, "grad_norm": 0.6361058354377747, "learning_rate": 0.0002, "epoch": 0.9869281045751634, "step": 1510}, {"loss": 1.5131, "grad_norm": 0.646392285823822, "learning_rate": 0.0002, "epoch": 0.9934640522875817, "step": 1520}, {"loss": 1.3738, "grad_norm": 0.5391159057617188, "learning_rate": 0.0002, "epoch": 1.0, "step": 1530}, {"eval_loss": 1.4715123176574707, "eval_runtime": 30.5701, "eval_samples_per_second": 14.262, "eval_steps_per_second": 1.799, "epoch": 1.0, "step": 1530}, {"loss": 1.4827, "grad_norm": 0.5468988418579102, "learning_rate": 0.0002, "epoch": 1.0065359477124183, "step": 1540}, {"loss": 1.4342, "grad_norm": 0.629940927028656, "learning_rate": 0.0002, "epoch": 1.0130718954248366, "step": 1550}, {"loss": 1.4259, "grad_norm": 0.6411303281784058, "learning_rate": 0.0002, "epoch": 1.0196078431372548, "step": 1560}, {"loss": 1.3924, "grad_norm": 0.5619024038314819, "learning_rate": 0.0002, "epoch": 1.026143790849673, "step": 1570}, {"loss": 1.6086, "grad_norm": 0.6093462705612183, "learning_rate": 0.0002, "epoch": 1.0326797385620916, "step": 1580}, {"loss": 1.4547, "grad_norm": 0.5543286204338074, "learning_rate": 0.0002, "epoch": 1.0392156862745099, "step": 1590}, {"loss": 1.3738, "grad_norm": 0.6079006195068359, "learning_rate": 0.0002, "epoch": 1.0457516339869282, "step": 1600}, {"loss": 1.4574, "grad_norm": 0.6240813136100769, "learning_rate": 0.0002, "epoch": 1.0522875816993464, "step": 1610}, {"loss": 1.3504, "grad_norm": 0.6141977310180664, "learning_rate": 0.0002, "epoch": 1.0588235294117647, "step": 1620}, {"loss": 1.3668, "grad_norm": 0.5920178294181824, "learning_rate": 0.0002, "epoch": 1.065359477124183, "step": 1630}, {"loss": 1.3204, "grad_norm": 0.47620782256126404, "learning_rate": 0.0002, "epoch": 1.0718954248366013, "step": 1640}, {"loss": 1.3249, "grad_norm": 0.6826292872428894, "learning_rate": 0.0002, "epoch": 1.0784313725490196, "step": 1650}, {"loss": 1.2285, "grad_norm": 0.6182006597518921, "learning_rate": 0.0002, "epoch": 1.0849673202614378, "step": 1660}, {"loss": 1.2907, "grad_norm": 0.57639479637146, "learning_rate": 0.0002, "epoch": 1.091503267973856, "step": 1670}, {"loss": 1.4575, "grad_norm": 0.6696860194206238, "learning_rate": 0.0002, "epoch": 1.0980392156862746, "step": 1680}, {"loss": 1.4104, "grad_norm": 0.699221670627594, "learning_rate": 0.0002, "epoch": 1.1045751633986929, "step": 1690}, {"loss": 1.3667, "grad_norm": 0.7138059139251709, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 1700}, {"loss": 1.3468, "grad_norm": 0.6930422186851501, "learning_rate": 0.0002, "epoch": 1.1176470588235294, "step": 1710}, {"loss": 1.5033, "grad_norm": 0.7484048008918762, "learning_rate": 0.0002, "epoch": 1.1241830065359477, "step": 1720}, {"loss": 1.4582, "grad_norm": 0.5820090174674988, "learning_rate": 0.0002, "epoch": 1.130718954248366, "step": 1730}, {"loss": 1.3704, "grad_norm": 0.7143406867980957, "learning_rate": 0.0002, "epoch": 1.1372549019607843, "step": 1740}, {"loss": 1.277, "grad_norm": 0.5597584247589111, "learning_rate": 0.0002, "epoch": 1.1437908496732025, "step": 1750}, {"loss": 1.5403, "grad_norm": 0.5171173214912415, "learning_rate": 0.0002, "epoch": 1.1503267973856208, "step": 1760}, {"loss": 1.419, "grad_norm": 0.5951920747756958, "learning_rate": 0.0002, "epoch": 1.156862745098039, "step": 1770}, {"loss": 1.2929, "grad_norm": 0.7506247758865356, "learning_rate": 0.0002, "epoch": 1.1633986928104576, "step": 1780}, {"loss": 1.5475, "grad_norm": 0.5936487913131714, "learning_rate": 0.0002, "epoch": 1.1699346405228759, "step": 1790}, {"loss": 1.3567, "grad_norm": 0.688450038433075, "learning_rate": 0.0002, "epoch": 1.1764705882352942, "step": 1800}, {"loss": 1.314, "grad_norm": 0.671623170375824, "learning_rate": 0.0002, "epoch": 1.1830065359477124, "step": 1810}, {"loss": 1.3803, "grad_norm": 0.6911860704421997, "learning_rate": 0.0002, "epoch": 1.1895424836601307, "step": 1820}, {"loss": 1.363, "grad_norm": 0.60726398229599, "learning_rate": 0.0002, "epoch": 1.196078431372549, "step": 1830}, {"loss": 1.5236, "grad_norm": 0.7542088627815247, "learning_rate": 0.0002, "epoch": 1.2026143790849673, "step": 1840}, {"loss": 1.4343, "grad_norm": 0.6810969710350037, "learning_rate": 0.0002, "epoch": 1.2091503267973855, "step": 1850}, {"loss": 1.446, "grad_norm": 0.579741895198822, "learning_rate": 0.0002, "epoch": 1.215686274509804, "step": 1860}, {"loss": 1.4564, "grad_norm": 0.9925695657730103, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 1870}, {"loss": 1.5516, "grad_norm": 0.5919767618179321, "learning_rate": 0.0002, "epoch": 1.2287581699346406, "step": 1880}, {"loss": 1.5015, "grad_norm": 0.7377090454101562, "learning_rate": 0.0002, "epoch": 1.2352941176470589, "step": 1890}, {"loss": 1.4756, "grad_norm": 0.5753688812255859, "learning_rate": 0.0002, "epoch": 1.2418300653594772, "step": 1900}, {"loss": 1.3543, "grad_norm": 0.6362486481666565, "learning_rate": 0.0002, "epoch": 1.2483660130718954, "step": 1910}, {"loss": 1.4153, "grad_norm": 0.5747467875480652, "learning_rate": 0.0002, "epoch": 1.2549019607843137, "step": 1920}, {"loss": 1.5082, "grad_norm": 0.6831939220428467, "learning_rate": 0.0002, "epoch": 1.261437908496732, "step": 1930}, {"loss": 1.3509, "grad_norm": 0.6414040327072144, "learning_rate": 0.0002, "epoch": 1.2679738562091503, "step": 1940}, {"loss": 1.5099, "grad_norm": 0.5613330006599426, "learning_rate": 0.0002, "epoch": 1.2745098039215685, "step": 1950}, {"loss": 1.377, "grad_norm": 0.5838454961776733, "learning_rate": 0.0002, "epoch": 1.2810457516339868, "step": 1960}, {"loss": 1.3548, "grad_norm": 0.5367192029953003, "learning_rate": 0.0002, "epoch": 1.287581699346405, "step": 1970}, {"loss": 1.4602, "grad_norm": 0.5829346776008606, "learning_rate": 0.0002, "epoch": 1.2941176470588236, "step": 1980}, {"loss": 1.3821, "grad_norm": 0.756534218788147, "learning_rate": 0.0002, "epoch": 1.3006535947712419, "step": 1990}, {"loss": 1.389, "grad_norm": 0.48002561926841736, "learning_rate": 0.0002, "epoch": 1.3071895424836601, "step": 2000}, {"loss": 1.256, "grad_norm": 0.5461082458496094, "learning_rate": 0.0002, "epoch": 1.3137254901960784, "step": 2010}, {"loss": 1.6257, "grad_norm": 0.570399284362793, "learning_rate": 0.0002, "epoch": 1.3202614379084967, "step": 2020}, {"loss": 1.4356, "grad_norm": 0.5130975842475891, "learning_rate": 0.0002, "epoch": 1.326797385620915, "step": 2030}, {"loss": 1.3552, "grad_norm": 0.6290071606636047, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 2040}, {"loss": 1.3873, "grad_norm": 0.6165726184844971, "learning_rate": 0.0002, "epoch": 1.3398692810457518, "step": 2050}, {"loss": 1.4376, "grad_norm": 0.5302083492279053, "learning_rate": 0.0002, "epoch": 1.34640522875817, "step": 2060}, {"loss": 1.4722, "grad_norm": 0.6531406044960022, "learning_rate": 0.0002, "epoch": 1.3529411764705883, "step": 2070}, {"loss": 1.3632, "grad_norm": 0.5981236100196838, "learning_rate": 0.0002, "epoch": 1.3594771241830066, "step": 2080}, {"loss": 1.4846, "grad_norm": 0.8534150123596191, "learning_rate": 0.0002, "epoch": 1.3660130718954249, "step": 2090}, {"loss": 1.3249, "grad_norm": 0.695918083190918, "learning_rate": 0.0002, "epoch": 1.3725490196078431, "step": 2100}, {"loss": 1.4989, "grad_norm": 0.5830431580543518, "learning_rate": 0.0002, "epoch": 1.3790849673202614, "step": 2110}, {"loss": 1.5009, "grad_norm": 0.5641306638717651, "learning_rate": 0.0002, "epoch": 1.3856209150326797, "step": 2120}, {"loss": 1.3985, "grad_norm": 0.6354436874389648, "learning_rate": 0.0002, "epoch": 1.392156862745098, "step": 2130}, {"loss": 1.2737, "grad_norm": 0.5707540512084961, "learning_rate": 0.0002, "epoch": 1.3986928104575163, "step": 2140}, {"loss": 1.3815, "grad_norm": 0.7308434844017029, "learning_rate": 0.0002, "epoch": 1.4052287581699345, "step": 2150}, {"loss": 1.3993, "grad_norm": 0.5879750847816467, "learning_rate": 0.0002, "epoch": 1.4117647058823528, "step": 2160}, {"loss": 1.3729, "grad_norm": 0.627909243106842, "learning_rate": 0.0002, "epoch": 1.4183006535947713, "step": 2170}, {"loss": 1.3391, "grad_norm": 0.5228193998336792, "learning_rate": 0.0002, "epoch": 1.4248366013071896, "step": 2180}, {"loss": 1.457, "grad_norm": 0.6162880659103394, "learning_rate": 0.0002, "epoch": 1.4313725490196079, "step": 2190}, {"loss": 1.4052, "grad_norm": 0.751610517501831, "learning_rate": 0.0002, "epoch": 1.4379084967320261, "step": 2200}, {"loss": 1.4105, "grad_norm": 0.5623487234115601, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 2210}, {"loss": 1.3795, "grad_norm": 0.5293187499046326, "learning_rate": 0.0002, "epoch": 1.4509803921568627, "step": 2220}, {"loss": 1.4247, "grad_norm": 0.5903629660606384, "learning_rate": 0.0002, "epoch": 1.457516339869281, "step": 2230}, {"loss": 1.6167, "grad_norm": 0.6084659099578857, "learning_rate": 0.0002, "epoch": 1.4640522875816995, "step": 2240}, {"loss": 1.319, "grad_norm": 0.5289803147315979, "learning_rate": 0.0002, "epoch": 1.4705882352941178, "step": 2250}, {"loss": 1.3106, "grad_norm": 0.49499568343162537, "learning_rate": 0.0002, "epoch": 1.477124183006536, "step": 2260}, {"loss": 1.3586, "grad_norm": 0.7774190306663513, "learning_rate": 0.0002, "epoch": 1.4836601307189543, "step": 2270}, {"loss": 1.3075, "grad_norm": 0.5932538509368896, "learning_rate": 0.0002, "epoch": 1.4901960784313726, "step": 2280}, {"loss": 1.3241, "grad_norm": 0.6009492874145508, "learning_rate": 0.0002, "epoch": 1.4967320261437909, "step": 2290}, {"loss": 1.3728, "grad_norm": 0.5559343099594116, "learning_rate": 0.0002, "epoch": 1.5032679738562091, "step": 2300}, {"loss": 1.2379, "grad_norm": 0.5956196188926697, "learning_rate": 0.0002, "epoch": 1.5098039215686274, "step": 2310}, {"loss": 1.5292, "grad_norm": 0.5624083876609802, "learning_rate": 0.0002, "epoch": 1.5163398692810457, "step": 2320}, {"loss": 1.4779, "grad_norm": 0.7195250391960144, "learning_rate": 0.0002, "epoch": 1.522875816993464, "step": 2330}, {"loss": 1.2938, "grad_norm": 0.6010490655899048, "learning_rate": 0.0002, "epoch": 1.5294117647058822, "step": 2340}, {"loss": 1.4121, "grad_norm": 0.664929211139679, "learning_rate": 0.0002, "epoch": 1.5359477124183005, "step": 2350}, {"loss": 1.4362, "grad_norm": 0.5158776640892029, "learning_rate": 0.0002, "epoch": 1.5424836601307188, "step": 2360}, {"loss": 1.2157, "grad_norm": 0.5147154927253723, "learning_rate": 0.0002, "epoch": 1.5490196078431373, "step": 2370}, {"loss": 1.2643, "grad_norm": 0.6507977843284607, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 2380}, {"loss": 1.2786, "grad_norm": 0.5193192362785339, "learning_rate": 0.0002, "epoch": 1.5620915032679739, "step": 2390}, {"loss": 1.3209, "grad_norm": 0.5982314944267273, "learning_rate": 0.0002, "epoch": 1.5686274509803921, "step": 2400}, {"loss": 1.3585, "grad_norm": 0.49106258153915405, "learning_rate": 0.0002, "epoch": 1.5751633986928104, "step": 2410}, {"loss": 1.3618, "grad_norm": 0.6459611654281616, "learning_rate": 0.0002, "epoch": 1.581699346405229, "step": 2420}, {"loss": 1.3305, "grad_norm": 0.7038363218307495, "learning_rate": 0.0002, "epoch": 1.5882352941176472, "step": 2430}, {"loss": 1.3198, "grad_norm": 0.5245680212974548, "learning_rate": 0.0002, "epoch": 1.5947712418300655, "step": 2440}, {"loss": 1.4756, "grad_norm": 0.6562076210975647, "learning_rate": 0.0002, "epoch": 1.6013071895424837, "step": 2450}, {"loss": 1.5635, "grad_norm": 0.6491968035697937, "learning_rate": 0.0002, "epoch": 1.607843137254902, "step": 2460}, {"loss": 1.3657, "grad_norm": 0.604034960269928, "learning_rate": 0.0002, "epoch": 1.6143790849673203, "step": 2470}, {"loss": 1.2693, "grad_norm": 0.5759671330451965, "learning_rate": 0.0002, "epoch": 1.6209150326797386, "step": 2480}, {"loss": 1.4136, "grad_norm": 0.6157698631286621, "learning_rate": 0.0002, "epoch": 1.6274509803921569, "step": 2490}, {"loss": 1.3929, "grad_norm": 0.6513794660568237, "learning_rate": 0.0002, "epoch": 1.6339869281045751, "step": 2500}, {"loss": 1.4283, "grad_norm": 0.71990966796875, "learning_rate": 0.0002, "epoch": 1.6405228758169934, "step": 2510}, {"loss": 1.4356, "grad_norm": 0.7316617369651794, "learning_rate": 0.0002, "epoch": 1.6470588235294117, "step": 2520}, {"loss": 1.3119, "grad_norm": 0.5475177764892578, "learning_rate": 0.0002, "epoch": 1.65359477124183, "step": 2530}, {"loss": 1.2998, "grad_norm": 0.4911293089389801, "learning_rate": 0.0002, "epoch": 1.6601307189542482, "step": 2540}, {"loss": 1.4198, "grad_norm": 0.6122882962226868, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 2550}, {"loss": 1.3099, "grad_norm": 0.5735281705856323, "learning_rate": 0.0002, "epoch": 1.673202614379085, "step": 2560}, {"loss": 1.2205, "grad_norm": 0.5046352744102478, "learning_rate": 0.0002, "epoch": 1.6797385620915033, "step": 2570}, {"loss": 1.3191, "grad_norm": 0.6043242812156677, "learning_rate": 0.0002, "epoch": 1.6862745098039216, "step": 2580}, {"loss": 1.3079, "grad_norm": 0.5397698283195496, "learning_rate": 0.0002, "epoch": 1.6928104575163399, "step": 2590}, {"loss": 1.4916, "grad_norm": 0.8066475987434387, "learning_rate": 0.0002, "epoch": 1.6993464052287581, "step": 2600}, {"loss": 1.3703, "grad_norm": 0.52901691198349, "learning_rate": 0.0002, "epoch": 1.7058823529411766, "step": 2610}, {"loss": 1.409, "grad_norm": 0.7588503956794739, "learning_rate": 0.0002, "epoch": 1.712418300653595, "step": 2620}, {"loss": 1.3806, "grad_norm": 0.6012966632843018, "learning_rate": 0.0002, "epoch": 1.7189542483660132, "step": 2630}, {"loss": 1.2583, "grad_norm": 0.5927302837371826, "learning_rate": 0.0002, "epoch": 1.7254901960784315, "step": 2640}, {"loss": 1.4523, "grad_norm": 0.5086990594863892, "learning_rate": 0.0002, "epoch": 1.7320261437908497, "step": 2650}, {"loss": 1.5452, "grad_norm": 0.6000628471374512, "learning_rate": 0.0002, "epoch": 1.738562091503268, "step": 2660}, {"loss": 1.3269, "grad_norm": 0.6560431718826294, "learning_rate": 0.0002, "epoch": 1.7450980392156863, "step": 2670}, {"loss": 1.3982, "grad_norm": 0.5738165378570557, "learning_rate": 0.0002, "epoch": 1.7516339869281046, "step": 2680}, {"loss": 1.3766, "grad_norm": 0.5576106905937195, "learning_rate": 0.0002, "epoch": 1.7581699346405228, "step": 2690}, {"loss": 1.3277, "grad_norm": 0.7298802137374878, "learning_rate": 0.0002, "epoch": 1.7647058823529411, "step": 2700}, {"loss": 1.2618, "grad_norm": 0.5751826167106628, "learning_rate": 0.0002, "epoch": 1.7712418300653594, "step": 2710}, {"loss": 1.35, "grad_norm": 0.6069957613945007, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 2720}, {"loss": 1.3492, "grad_norm": 0.7513017654418945, "learning_rate": 0.0002, "epoch": 1.784313725490196, "step": 2730}, {"loss": 1.2979, "grad_norm": 0.6058869957923889, "learning_rate": 0.0002, "epoch": 1.7908496732026142, "step": 2740}, {"loss": 1.299, "grad_norm": 0.6805883049964905, "learning_rate": 0.0002, "epoch": 1.7973856209150327, "step": 2750}, {"loss": 1.4062, "grad_norm": 0.6864324808120728, "learning_rate": 0.0002, "epoch": 1.803921568627451, "step": 2760}, {"loss": 1.355, "grad_norm": 0.6261002421379089, "learning_rate": 0.0002, "epoch": 1.8104575163398693, "step": 2770}, {"loss": 1.5145, "grad_norm": 0.532684862613678, "learning_rate": 0.0002, "epoch": 1.8169934640522876, "step": 2780}, {"loss": 1.3248, "grad_norm": 0.6209020018577576, "learning_rate": 0.0002, "epoch": 1.8235294117647058, "step": 2790}, {"loss": 1.3908, "grad_norm": 0.67111736536026, "learning_rate": 0.0002, "epoch": 1.8300653594771243, "step": 2800}, {"loss": 1.5088, "grad_norm": 0.700467586517334, "learning_rate": 0.0002, "epoch": 1.8366013071895426, "step": 2810}, {"loss": 1.348, "grad_norm": 0.6968029141426086, "learning_rate": 0.0002, "epoch": 1.843137254901961, "step": 2820}, {"loss": 1.3943, "grad_norm": 0.6405863761901855, "learning_rate": 0.0002, "epoch": 1.8496732026143792, "step": 2830}, {"loss": 1.4035, "grad_norm": 0.5192584991455078, "learning_rate": 0.0002, "epoch": 1.8562091503267975, "step": 2840}, {"loss": 1.2745, "grad_norm": 0.4888569414615631, "learning_rate": 0.0002, "epoch": 1.8627450980392157, "step": 2850}, {"loss": 1.4324, "grad_norm": 0.7625455856323242, "learning_rate": 0.0002, "epoch": 1.869281045751634, "step": 2860}, {"loss": 1.4989, "grad_norm": 0.9162808656692505, "learning_rate": 0.0002, "epoch": 1.8758169934640523, "step": 2870}, {"loss": 1.3978, "grad_norm": 0.5472783446311951, "learning_rate": 0.0002, "epoch": 1.8823529411764706, "step": 2880}, {"loss": 1.3026, "grad_norm": 0.5221137404441833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 2890}, {"loss": 1.33, "grad_norm": 0.49258849024772644, "learning_rate": 0.0002, "epoch": 1.8954248366013071, "step": 2900}, {"loss": 1.3503, "grad_norm": 0.5260750651359558, "learning_rate": 0.0002, "epoch": 1.9019607843137254, "step": 2910}, {"loss": 1.3381, "grad_norm": 0.6583314538002014, "learning_rate": 0.0002, "epoch": 1.9084967320261437, "step": 2920}, {"loss": 1.356, "grad_norm": 0.5728915929794312, "learning_rate": 0.0002, "epoch": 1.915032679738562, "step": 2930}, {"loss": 1.3993, "grad_norm": 0.7661453485488892, "learning_rate": 0.0002, "epoch": 1.9215686274509802, "step": 2940}, {"loss": 1.428, "grad_norm": 0.7193911075592041, "learning_rate": 0.0002, "epoch": 1.9281045751633987, "step": 2950}, {"loss": 1.287, "grad_norm": 0.5007768869400024, "learning_rate": 0.0002, "epoch": 1.934640522875817, "step": 2960}, {"loss": 1.372, "grad_norm": 0.626681923866272, "learning_rate": 0.0002, "epoch": 1.9411764705882353, "step": 2970}, {"loss": 1.375, "grad_norm": 0.8692840933799744, "learning_rate": 0.0002, "epoch": 1.9477124183006536, "step": 2980}, {"loss": 1.3292, "grad_norm": 0.6388291120529175, "learning_rate": 0.0002, "epoch": 1.954248366013072, "step": 2990}, {"loss": 1.4593, "grad_norm": 0.7710477113723755, "learning_rate": 0.0002, "epoch": 1.9607843137254903, "step": 3000}, {"loss": 1.5228, "grad_norm": 0.641704261302948, "learning_rate": 0.0002, "epoch": 1.9673202614379086, "step": 3010}, {"loss": 1.3246, "grad_norm": 0.621148943901062, "learning_rate": 0.0002, "epoch": 1.973856209150327, "step": 3020}, {"loss": 1.3017, "grad_norm": 0.5119547247886658, "learning_rate": 0.0002, "epoch": 1.9803921568627452, "step": 3030}, {"loss": 1.4923, "grad_norm": 0.8104137778282166, "learning_rate": 0.0002, "epoch": 1.9869281045751634, "step": 3040}, {"loss": 1.3331, "grad_norm": 0.5856240391731262, "learning_rate": 0.0002, "epoch": 1.9934640522875817, "step": 3050}, {"loss": 1.4346, "grad_norm": 0.5263566374778748, "learning_rate": 0.0002, "epoch": 2.0, "step": 3060}, {"eval_loss": 1.4276371002197266, "eval_runtime": 30.5759, "eval_samples_per_second": 14.26, "eval_steps_per_second": 1.799, "epoch": 2.0, "step": 3060}, {"loss": 1.1636, "grad_norm": 0.5143898725509644, "learning_rate": 0.0002, "epoch": 2.0065359477124183, "step": 3070}, {"loss": 1.3335, "grad_norm": 0.5749367475509644, "learning_rate": 0.0002, "epoch": 2.0130718954248366, "step": 3080}, {"loss": 1.2784, "grad_norm": 0.5784284472465515, "learning_rate": 0.0002, "epoch": 2.019607843137255, "step": 3090}, {"loss": 1.2463, "grad_norm": 0.5933429598808289, "learning_rate": 0.0002, "epoch": 2.026143790849673, "step": 3100}, {"loss": 1.2984, "grad_norm": 0.6748974919319153, "learning_rate": 0.0002, "epoch": 2.0326797385620914, "step": 3110}, {"loss": 1.2307, "grad_norm": 0.626399576663971, "learning_rate": 0.0002, "epoch": 2.0392156862745097, "step": 3120}, {"loss": 1.299, "grad_norm": 0.6173238754272461, "learning_rate": 0.0002, "epoch": 2.045751633986928, "step": 3130}, {"loss": 1.4144, "grad_norm": 0.807790219783783, "learning_rate": 0.0002, "epoch": 2.052287581699346, "step": 3140}, {"loss": 1.1953, "grad_norm": 0.6222215890884399, "learning_rate": 0.0002, "epoch": 2.0588235294117645, "step": 3150}, {"loss": 1.4059, "grad_norm": 0.5859580636024475, "learning_rate": 0.0002, "epoch": 2.065359477124183, "step": 3160}, {"loss": 1.3607, "grad_norm": 0.581304132938385, "learning_rate": 0.0002, "epoch": 2.0718954248366015, "step": 3170}, {"loss": 1.1212, "grad_norm": 0.9814971089363098, "learning_rate": 0.0002, "epoch": 2.0784313725490198, "step": 3180}, {"loss": 1.1962, "grad_norm": 0.6491848230361938, "learning_rate": 0.0002, "epoch": 2.084967320261438, "step": 3190}, {"loss": 1.3711, "grad_norm": 0.613680362701416, "learning_rate": 0.0002, "epoch": 2.0915032679738563, "step": 3200}, {"loss": 1.2994, "grad_norm": 0.7318086624145508, "learning_rate": 0.0002, "epoch": 2.0980392156862746, "step": 3210}, {"loss": 1.2502, "grad_norm": 0.6025661826133728, "learning_rate": 0.0002, "epoch": 2.104575163398693, "step": 3220}, {"loss": 1.1374, "grad_norm": 0.6744484305381775, "learning_rate": 0.0002, "epoch": 2.111111111111111, "step": 3230}, {"loss": 1.3273, "grad_norm": 0.6062554121017456, "learning_rate": 0.0002, "epoch": 2.1176470588235294, "step": 3240}, {"loss": 1.3404, "grad_norm": 0.6801803112030029, "learning_rate": 0.0002, "epoch": 2.1241830065359477, "step": 3250}, {"loss": 1.4084, "grad_norm": 0.5218925476074219, "learning_rate": 0.0002, "epoch": 2.130718954248366, "step": 3260}, {"loss": 1.2867, "grad_norm": 0.7494263648986816, "learning_rate": 0.0002, "epoch": 2.1372549019607843, "step": 3270}, {"loss": 1.3059, "grad_norm": 0.7858565449714661, "learning_rate": 0.0002, "epoch": 2.1437908496732025, "step": 3280}, {"loss": 1.3214, "grad_norm": 0.6836692690849304, "learning_rate": 0.0002, "epoch": 2.150326797385621, "step": 3290}, {"loss": 1.1605, "grad_norm": 0.619848370552063, "learning_rate": 0.0002, "epoch": 2.156862745098039, "step": 3300}, {"loss": 1.3095, "grad_norm": 0.5761294364929199, "learning_rate": 0.0002, "epoch": 2.1633986928104574, "step": 3310}, {"loss": 1.2883, "grad_norm": 0.4713786542415619, "learning_rate": 0.0002, "epoch": 2.1699346405228757, "step": 3320}, {"loss": 1.3817, "grad_norm": 0.7613773345947266, "learning_rate": 0.0002, "epoch": 2.176470588235294, "step": 3330}, {"loss": 1.2354, "grad_norm": 0.6642718315124512, "learning_rate": 0.0002, "epoch": 2.183006535947712, "step": 3340}, {"loss": 1.2048, "grad_norm": 0.7162188291549683, "learning_rate": 0.0002, "epoch": 2.189542483660131, "step": 3350}, {"loss": 1.3886, "grad_norm": 0.6916783452033997, "learning_rate": 0.0002, "epoch": 2.196078431372549, "step": 3360}, {"loss": 1.3788, "grad_norm": 0.7205567955970764, "learning_rate": 0.0002, "epoch": 2.2026143790849675, "step": 3370}, {"loss": 1.2528, "grad_norm": 0.6038199067115784, "learning_rate": 0.0002, "epoch": 2.2091503267973858, "step": 3380}, {"loss": 1.2079, "grad_norm": 0.6284233927726746, "learning_rate": 0.0002, "epoch": 2.215686274509804, "step": 3390}, {"loss": 1.3057, "grad_norm": 0.7450672388076782, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3400}, {"loss": 1.3034, "grad_norm": 0.7755052447319031, "learning_rate": 0.0002, "epoch": 2.2287581699346406, "step": 3410}, {"loss": 1.2953, "grad_norm": 0.9066099524497986, "learning_rate": 0.0002, "epoch": 2.235294117647059, "step": 3420}, {"loss": 1.3072, "grad_norm": 0.8578207492828369, "learning_rate": 0.0002, "epoch": 2.241830065359477, "step": 3430}, {"loss": 1.3278, "grad_norm": 0.5900213718414307, "learning_rate": 0.0002, "epoch": 2.2483660130718954, "step": 3440}, {"loss": 1.3645, "grad_norm": 0.7821717262268066, "learning_rate": 0.0002, "epoch": 2.2549019607843137, "step": 3450}, {"loss": 1.183, "grad_norm": 0.6263150572776794, "learning_rate": 0.0002, "epoch": 2.261437908496732, "step": 3460}, {"loss": 1.178, "grad_norm": 0.591799259185791, "learning_rate": 0.0002, "epoch": 2.2679738562091503, "step": 3470}, {"loss": 1.2198, "grad_norm": 0.5999799966812134, "learning_rate": 0.0002, "epoch": 2.2745098039215685, "step": 3480}, {"loss": 1.2724, "grad_norm": 0.6227319240570068, "learning_rate": 0.0002, "epoch": 2.281045751633987, "step": 3490}, {"loss": 1.3865, "grad_norm": 0.719412624835968, "learning_rate": 0.0002, "epoch": 2.287581699346405, "step": 3500}, {"loss": 1.3275, "grad_norm": 1.0361769199371338, "learning_rate": 0.0002, "epoch": 2.2941176470588234, "step": 3510}, {"loss": 1.4834, "grad_norm": 0.5506668090820312, "learning_rate": 0.0002, "epoch": 2.3006535947712417, "step": 3520}, {"loss": 1.2273, "grad_norm": 0.6886829733848572, "learning_rate": 0.0002, "epoch": 2.30718954248366, "step": 3530}, {"loss": 1.2296, "grad_norm": 0.6226346492767334, "learning_rate": 0.0002, "epoch": 2.313725490196078, "step": 3540}, {"loss": 1.3087, "grad_norm": 0.8109908103942871, "learning_rate": 0.0002, "epoch": 2.3202614379084965, "step": 3550}, {"loss": 1.3311, "grad_norm": 0.8505511283874512, "learning_rate": 0.0002, "epoch": 2.326797385620915, "step": 3560}, {"loss": 1.2526, "grad_norm": 0.5763760209083557, "learning_rate": 0.0002, "epoch": 2.3333333333333335, "step": 3570}, {"loss": 1.4135, "grad_norm": 0.6460059881210327, "learning_rate": 0.0002, "epoch": 2.3398692810457518, "step": 3580}, {"loss": 1.2701, "grad_norm": 0.7175343036651611, "learning_rate": 0.0002, "epoch": 2.34640522875817, "step": 3590}, {"loss": 1.2645, "grad_norm": 0.6012630462646484, "learning_rate": 0.0002, "epoch": 2.3529411764705883, "step": 3600}, {"loss": 1.3214, "grad_norm": 0.6513685584068298, "learning_rate": 0.0002, "epoch": 2.3594771241830066, "step": 3610}, {"loss": 1.3271, "grad_norm": 0.7465183734893799, "learning_rate": 0.0002, "epoch": 2.366013071895425, "step": 3620}, {"loss": 1.3671, "grad_norm": 0.6413124203681946, "learning_rate": 0.0002, "epoch": 2.372549019607843, "step": 3630}, {"loss": 1.4026, "grad_norm": 0.7209562063217163, "learning_rate": 0.0002, "epoch": 2.3790849673202614, "step": 3640}, {"loss": 1.1616, "grad_norm": 0.6427558660507202, "learning_rate": 0.0002, "epoch": 2.3856209150326797, "step": 3650}, {"loss": 1.313, "grad_norm": 0.593958854675293, "learning_rate": 0.0002, "epoch": 2.392156862745098, "step": 3660}, {"loss": 1.2802, "grad_norm": 0.5944608449935913, "learning_rate": 0.0002, "epoch": 2.3986928104575163, "step": 3670}, {"loss": 1.3542, "grad_norm": 0.6606248617172241, "learning_rate": 0.0002, "epoch": 2.4052287581699345, "step": 3680}, {"loss": 1.2977, "grad_norm": 0.5632851719856262, "learning_rate": 0.0002, "epoch": 2.411764705882353, "step": 3690}, {"loss": 1.2032, "grad_norm": 0.4976513385772705, "learning_rate": 0.0002, "epoch": 2.418300653594771, "step": 3700}, {"loss": 1.1404, "grad_norm": 0.6318528056144714, "learning_rate": 0.0002, "epoch": 2.4248366013071894, "step": 3710}, {"loss": 1.1705, "grad_norm": 0.6306707859039307, "learning_rate": 0.0002, "epoch": 2.431372549019608, "step": 3720}, {"loss": 1.3524, "grad_norm": 0.6362553238868713, "learning_rate": 0.0002, "epoch": 2.4379084967320264, "step": 3730}, {"loss": 1.2345, "grad_norm": 0.634368896484375, "learning_rate": 0.0002, "epoch": 2.4444444444444446, "step": 3740}, {"loss": 1.2515, "grad_norm": 0.6623591184616089, "learning_rate": 0.0002, "epoch": 2.450980392156863, "step": 3750}, {"loss": 1.3246, "grad_norm": 0.6150440573692322, "learning_rate": 0.0002, "epoch": 2.457516339869281, "step": 3760}, {"loss": 1.2666, "grad_norm": 0.588935911655426, "learning_rate": 0.0002, "epoch": 2.4640522875816995, "step": 3770}, {"loss": 1.3918, "grad_norm": 0.7388206124305725, "learning_rate": 0.0002, "epoch": 2.4705882352941178, "step": 3780}, {"loss": 1.2512, "grad_norm": 0.621825098991394, "learning_rate": 0.0002, "epoch": 2.477124183006536, "step": 3790}, {"loss": 1.359, "grad_norm": 0.7691677212715149, "learning_rate": 0.0002, "epoch": 2.4836601307189543, "step": 3800}, {"loss": 1.3399, "grad_norm": 1.1661969423294067, "learning_rate": 0.0002, "epoch": 2.4901960784313726, "step": 3810}, {"loss": 1.461, "grad_norm": 0.6837884187698364, "learning_rate": 0.0002, "epoch": 2.496732026143791, "step": 3820}, {"loss": 1.2823, "grad_norm": 0.6978904008865356, "learning_rate": 0.0002, "epoch": 2.503267973856209, "step": 3830}, {"loss": 1.3688, "grad_norm": 0.6121411323547363, "learning_rate": 0.0002, "epoch": 2.5098039215686274, "step": 3840}, {"loss": 1.2587, "grad_norm": 0.7813326120376587, "learning_rate": 0.0002, "epoch": 2.5163398692810457, "step": 3850}, {"loss": 1.1543, "grad_norm": 0.5390260219573975, "learning_rate": 0.0002, "epoch": 2.522875816993464, "step": 3860}, {"loss": 1.2032, "grad_norm": 0.8283252716064453, "learning_rate": 0.0002, "epoch": 2.5294117647058822, "step": 3870}, {"loss": 1.3112, "grad_norm": 0.8527186512947083, "learning_rate": 0.0002, "epoch": 2.5359477124183005, "step": 3880}, {"loss": 1.3469, "grad_norm": 0.8405382633209229, "learning_rate": 0.0002, "epoch": 2.542483660130719, "step": 3890}, {"loss": 1.1801, "grad_norm": 0.5650738477706909, "learning_rate": 0.0002, "epoch": 2.549019607843137, "step": 3900}, {"loss": 1.2917, "grad_norm": 0.620121955871582, "learning_rate": 0.0002, "epoch": 2.5555555555555554, "step": 3910}, {"loss": 1.2524, "grad_norm": 0.5983527898788452, "learning_rate": 0.0002, "epoch": 2.5620915032679736, "step": 3920}, {"loss": 1.4408, "grad_norm": 0.686623215675354, "learning_rate": 0.0002, "epoch": 2.568627450980392, "step": 3930}, {"loss": 1.186, "grad_norm": 0.6805831789970398, "learning_rate": 0.0002, "epoch": 2.57516339869281, "step": 3940}, {"loss": 1.367, "grad_norm": 0.6994825601577759, "learning_rate": 0.0002, "epoch": 2.581699346405229, "step": 3950}, {"loss": 1.3446, "grad_norm": 0.728549599647522, "learning_rate": 0.0002, "epoch": 2.588235294117647, "step": 3960}, {"loss": 1.4039, "grad_norm": 0.775236964225769, "learning_rate": 0.0002, "epoch": 2.5947712418300655, "step": 3970}, {"loss": 1.2742, "grad_norm": 0.5057447552680969, "learning_rate": 0.0002, "epoch": 2.6013071895424837, "step": 3980}, {"loss": 1.2764, "grad_norm": 0.6564450263977051, "learning_rate": 0.0002, "epoch": 2.607843137254902, "step": 3990}, {"loss": 1.3269, "grad_norm": 0.5342249870300293, "learning_rate": 0.0002, "epoch": 2.6143790849673203, "step": 4000}, {"loss": 1.3102, "grad_norm": 0.5508961081504822, "learning_rate": 0.0002, "epoch": 2.6209150326797386, "step": 4010}, {"loss": 1.3636, "grad_norm": 0.5716235637664795, "learning_rate": 0.0002, "epoch": 2.627450980392157, "step": 4020}, {"loss": 1.3465, "grad_norm": 0.8049232363700867, "learning_rate": 0.0002, "epoch": 2.633986928104575, "step": 4030}, {"loss": 1.2342, "grad_norm": 0.5574354529380798, "learning_rate": 0.0002, "epoch": 2.6405228758169934, "step": 4040}, {"loss": 1.2419, "grad_norm": 0.6302093863487244, "learning_rate": 0.0002, "epoch": 2.6470588235294117, "step": 4050}, {"loss": 1.2565, "grad_norm": 1.1868736743927002, "learning_rate": 0.0002, "epoch": 2.65359477124183, "step": 4060}, {"loss": 1.1382, "grad_norm": 0.6738120317459106, "learning_rate": 0.0002, "epoch": 2.6601307189542482, "step": 4070}, {"loss": 1.2456, "grad_norm": 0.6614423990249634, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 4080}, {"loss": 1.2958, "grad_norm": 0.7297604084014893, "learning_rate": 0.0002, "epoch": 2.6732026143790852, "step": 4090}, {"loss": 1.1596, "grad_norm": 0.9421682357788086, "learning_rate": 0.0002, "epoch": 2.6797385620915035, "step": 4100}, {"loss": 1.3002, "grad_norm": 0.5286222696304321, "learning_rate": 0.0002, "epoch": 2.686274509803922, "step": 4110}, {"loss": 1.3936, "grad_norm": 0.6849271655082703, "learning_rate": 0.0002, "epoch": 2.69281045751634, "step": 4120}, {"loss": 1.2721, "grad_norm": 0.6811320185661316, "learning_rate": 0.0002, "epoch": 2.6993464052287583, "step": 4130}, {"loss": 1.2897, "grad_norm": 0.4968419373035431, "learning_rate": 0.0002, "epoch": 2.7058823529411766, "step": 4140}, {"loss": 1.3322, "grad_norm": 0.8074267506599426, "learning_rate": 0.0002, "epoch": 2.712418300653595, "step": 4150}, {"loss": 1.1759, "grad_norm": 0.6756376028060913, "learning_rate": 0.0002, "epoch": 2.718954248366013, "step": 4160}, {"loss": 1.2444, "grad_norm": 0.6921583414077759, "learning_rate": 0.0002, "epoch": 2.7254901960784315, "step": 4170}, {"loss": 1.3413, "grad_norm": 0.7049834132194519, "learning_rate": 0.0002, "epoch": 2.7320261437908497, "step": 4180}, {"loss": 1.1965, "grad_norm": 0.7011390328407288, "learning_rate": 0.0002, "epoch": 2.738562091503268, "step": 4190}, {"loss": 1.2364, "grad_norm": 0.6977843642234802, "learning_rate": 0.0002, "epoch": 2.7450980392156863, "step": 4200}, {"loss": 1.2533, "grad_norm": 0.6717000603675842, "learning_rate": 0.0002, "epoch": 2.7516339869281046, "step": 4210}, {"loss": 1.392, "grad_norm": 1.0223724842071533, "learning_rate": 0.0002, "epoch": 2.758169934640523, "step": 4220}, {"loss": 1.2451, "grad_norm": 0.6573330760002136, "learning_rate": 0.0002, "epoch": 2.764705882352941, "step": 4230}, {"loss": 1.4219, "grad_norm": 0.6684938073158264, "learning_rate": 0.0002, "epoch": 2.7712418300653594, "step": 4240}, {"loss": 1.2505, "grad_norm": 0.7426793575286865, "learning_rate": 0.0002, "epoch": 2.7777777777777777, "step": 4250}, {"loss": 1.2904, "grad_norm": 0.557826578617096, "learning_rate": 0.0002, "epoch": 2.784313725490196, "step": 4260}, {"loss": 1.3262, "grad_norm": 0.6669870018959045, "learning_rate": 0.0002, "epoch": 2.7908496732026142, "step": 4270}, {"loss": 1.2369, "grad_norm": 0.5349969267845154, "learning_rate": 0.0002, "epoch": 2.7973856209150325, "step": 4280}, {"loss": 1.3769, "grad_norm": 0.7262802124023438, "learning_rate": 0.0002, "epoch": 2.803921568627451, "step": 4290}, {"loss": 1.3373, "grad_norm": 0.768211841583252, "learning_rate": 0.0002, "epoch": 2.810457516339869, "step": 4300}, {"loss": 1.2444, "grad_norm": 0.5958252549171448, "learning_rate": 0.0002, "epoch": 2.8169934640522873, "step": 4310}, {"loss": 1.4113, "grad_norm": 0.8451310396194458, "learning_rate": 0.0002, "epoch": 2.8235294117647056, "step": 4320}, {"loss": 1.2454, "grad_norm": 0.6544435024261475, "learning_rate": 0.0002, "epoch": 2.8300653594771243, "step": 4330}, {"loss": 1.2777, "grad_norm": 0.6177433133125305, "learning_rate": 0.0002, "epoch": 2.8366013071895426, "step": 4340}, {"loss": 1.2562, "grad_norm": 0.6324988007545471, "learning_rate": 0.0002, "epoch": 2.843137254901961, "step": 4350}, {"loss": 1.4117, "grad_norm": 0.6884300708770752, "learning_rate": 0.0002, "epoch": 2.849673202614379, "step": 4360}, {"loss": 1.2391, "grad_norm": 0.8952897191047668, "learning_rate": 0.0002, "epoch": 2.8562091503267975, "step": 4370}, {"loss": 1.2814, "grad_norm": 1.0260103940963745, "learning_rate": 0.0002, "epoch": 2.8627450980392157, "step": 4380}, {"loss": 1.2893, "grad_norm": 0.9134647250175476, "learning_rate": 0.0002, "epoch": 2.869281045751634, "step": 4390}, {"loss": 1.171, "grad_norm": 0.5637717843055725, "learning_rate": 0.0002, "epoch": 2.8758169934640523, "step": 4400}, {"loss": 1.3422, "grad_norm": 0.7530393004417419, "learning_rate": 0.0002, "epoch": 2.8823529411764706, "step": 4410}, {"loss": 1.29, "grad_norm": 0.7202680706977844, "learning_rate": 0.0002, "epoch": 2.888888888888889, "step": 4420}, {"loss": 1.2913, "grad_norm": 0.7177144885063171, "learning_rate": 0.0002, "epoch": 2.895424836601307, "step": 4430}, {"loss": 1.1922, "grad_norm": 0.5996816754341125, "learning_rate": 0.0002, "epoch": 2.9019607843137254, "step": 4440}, {"loss": 1.4816, "grad_norm": 0.6542447209358215, "learning_rate": 0.0002, "epoch": 2.9084967320261437, "step": 4450}, {"loss": 1.503, "grad_norm": 1.0753740072250366, "learning_rate": 0.0002, "epoch": 2.915032679738562, "step": 4460}, {"loss": 1.3193, "grad_norm": 0.6956136226654053, "learning_rate": 0.0002, "epoch": 2.9215686274509802, "step": 4470}, {"loss": 1.2486, "grad_norm": 0.7702530026435852, "learning_rate": 0.0002, "epoch": 2.928104575163399, "step": 4480}, {"loss": 1.3371, "grad_norm": 0.7763232588768005, "learning_rate": 0.0002, "epoch": 2.9346405228758172, "step": 4490}, {"loss": 1.1647, "grad_norm": 0.6393085718154907, "learning_rate": 0.0002, "epoch": 2.9411764705882355, "step": 4500}, {"loss": 1.211, "grad_norm": 0.987770676612854, "learning_rate": 0.0002, "epoch": 2.947712418300654, "step": 4510}, {"loss": 1.1529, "grad_norm": 0.5995016098022461, "learning_rate": 0.0002, "epoch": 2.954248366013072, "step": 4520}, {"loss": 1.2358, "grad_norm": 0.745650053024292, "learning_rate": 0.0002, "epoch": 2.9607843137254903, "step": 4530}, {"loss": 1.2115, "grad_norm": 0.7429282069206238, "learning_rate": 0.0002, "epoch": 2.9673202614379086, "step": 4540}, {"loss": 1.2262, "grad_norm": 0.5927486419677734, "learning_rate": 0.0002, "epoch": 2.973856209150327, "step": 4550}, {"loss": 1.3173, "grad_norm": 0.6775153875350952, "learning_rate": 0.0002, "epoch": 2.980392156862745, "step": 4560}, {"loss": 1.279, "grad_norm": 0.7128435373306274, "learning_rate": 0.0002, "epoch": 2.9869281045751634, "step": 4570}, {"loss": 1.2451, "grad_norm": 0.7470937967300415, "learning_rate": 0.0002, "epoch": 2.9934640522875817, "step": 4580}, {"loss": 1.2701, "grad_norm": 0.9295375943183899, "learning_rate": 0.0002, "epoch": 3.0, "step": 4590}]} +{"epoch": 4.0, "step": 6120, "epoch_duration": 1762.4439125061035, "total_accumulated_duration": 6675.42665886879, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.1748046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-4590", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7451, "grad_norm": 1.5105072259902954, "learning_rate": 0.0002, "epoch": 0.006535947712418301, "step": 10}, {"loss": 3.3158, "grad_norm": 2.1156165599823, "learning_rate": 0.0002, "epoch": 0.013071895424836602, "step": 20}, {"loss": 2.643, "grad_norm": 1.0578808784484863, "learning_rate": 0.0002, "epoch": 0.0196078431372549, "step": 30}, {"loss": 2.3948, "grad_norm": 2.725064516067505, "learning_rate": 0.0002, "epoch": 0.026143790849673203, "step": 40}, {"loss": 2.3134, "grad_norm": 2.9575750827789307, "learning_rate": 0.0002, "epoch": 0.032679738562091505, "step": 50}, {"loss": 2.2778, "grad_norm": 1.2158117294311523, "learning_rate": 0.0002, "epoch": 0.0392156862745098, "step": 60}, {"loss": 1.9742, "grad_norm": 1.0850954055786133, "learning_rate": 0.0002, "epoch": 0.0457516339869281, "step": 70}, {"loss": 1.8872, "grad_norm": 1.299196720123291, "learning_rate": 0.0002, "epoch": 0.05228758169934641, "step": 80}, {"loss": 1.947, "grad_norm": 0.8310191035270691, "learning_rate": 0.0002, "epoch": 0.058823529411764705, "step": 90}, {"loss": 1.9098, "grad_norm": 0.9854435920715332, "learning_rate": 0.0002, "epoch": 0.06535947712418301, "step": 100}, {"loss": 1.7508, "grad_norm": 0.7951157689094543, "learning_rate": 0.0002, "epoch": 0.0718954248366013, "step": 110}, {"loss": 1.9035, "grad_norm": 0.7593062520027161, "learning_rate": 0.0002, "epoch": 0.0784313725490196, "step": 120}, {"loss": 1.8517, "grad_norm": 0.6783032417297363, "learning_rate": 0.0002, "epoch": 0.08496732026143791, "step": 130}, {"loss": 1.6805, "grad_norm": 0.8350756764411926, "learning_rate": 0.0002, "epoch": 0.0915032679738562, "step": 140}, {"loss": 1.6123, "grad_norm": 1.0203173160552979, "learning_rate": 0.0002, "epoch": 0.09803921568627451, "step": 150}, {"loss": 1.7248, "grad_norm": 0.8820539712905884, "learning_rate": 0.0002, "epoch": 0.10457516339869281, "step": 160}, {"loss": 1.6762, "grad_norm": 0.7286128997802734, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 170}, {"loss": 1.8841, "grad_norm": 0.7874041795730591, "learning_rate": 0.0002, "epoch": 0.11764705882352941, "step": 180}, {"loss": 1.5656, "grad_norm": 0.6630475521087646, "learning_rate": 0.0002, "epoch": 0.12418300653594772, "step": 190}, {"loss": 1.6149, "grad_norm": 0.686413586139679, "learning_rate": 0.0002, "epoch": 0.13071895424836602, "step": 200}, {"loss": 1.6227, "grad_norm": 0.7793629765510559, "learning_rate": 0.0002, "epoch": 0.13725490196078433, "step": 210}, {"loss": 1.7223, "grad_norm": 0.6893141865730286, "learning_rate": 0.0002, "epoch": 0.1437908496732026, "step": 220}, {"loss": 1.6808, "grad_norm": 0.5804724097251892, "learning_rate": 0.0002, "epoch": 0.1503267973856209, "step": 230}, {"loss": 1.5578, "grad_norm": 0.6053574085235596, "learning_rate": 0.0002, "epoch": 0.1568627450980392, "step": 240}, {"loss": 1.7394, "grad_norm": 0.7566025853157043, "learning_rate": 0.0002, "epoch": 0.16339869281045752, "step": 250}, {"loss": 1.6216, "grad_norm": 0.6112990975379944, "learning_rate": 0.0002, "epoch": 0.16993464052287582, "step": 260}, {"loss": 1.5564, "grad_norm": 0.6839066743850708, "learning_rate": 0.0002, "epoch": 0.17647058823529413, "step": 270}, {"loss": 1.7129, "grad_norm": 0.6368117928504944, "learning_rate": 0.0002, "epoch": 0.1830065359477124, "step": 280}, {"loss": 1.5646, "grad_norm": 0.6144475936889648, "learning_rate": 0.0002, "epoch": 0.1895424836601307, "step": 290}, {"loss": 1.8383, "grad_norm": 0.6743767261505127, "learning_rate": 0.0002, "epoch": 0.19607843137254902, "step": 300}, {"loss": 1.421, "grad_norm": 0.6807955503463745, "learning_rate": 0.0002, "epoch": 0.20261437908496732, "step": 310}, {"loss": 1.5961, "grad_norm": 0.6717963814735413, "learning_rate": 0.0002, "epoch": 0.20915032679738563, "step": 320}, {"loss": 1.6842, "grad_norm": 0.5917780995368958, "learning_rate": 0.0002, "epoch": 0.21568627450980393, "step": 330}, {"loss": 1.6264, "grad_norm": 0.6783658862113953, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 340}, {"loss": 1.4635, "grad_norm": 0.5820256471633911, "learning_rate": 0.0002, "epoch": 0.22875816993464052, "step": 350}, {"loss": 1.6514, "grad_norm": 0.5345938801765442, "learning_rate": 0.0002, "epoch": 0.23529411764705882, "step": 360}, {"loss": 1.6441, "grad_norm": 0.755929172039032, "learning_rate": 0.0002, "epoch": 0.24183006535947713, "step": 370}, {"loss": 1.5177, "grad_norm": 0.6183189749717712, "learning_rate": 0.0002, "epoch": 0.24836601307189543, "step": 380}, {"loss": 1.5935, "grad_norm": 0.7277782559394836, "learning_rate": 0.0002, "epoch": 0.2549019607843137, "step": 390}, {"loss": 1.6957, "grad_norm": 0.9998756051063538, "learning_rate": 0.0002, "epoch": 0.26143790849673204, "step": 400}, {"loss": 1.5738, "grad_norm": 0.7523853778839111, "learning_rate": 0.0002, "epoch": 0.2679738562091503, "step": 410}, {"loss": 1.5649, "grad_norm": 0.6548714637756348, "learning_rate": 0.0002, "epoch": 0.27450980392156865, "step": 420}, {"loss": 1.4564, "grad_norm": 0.6979796290397644, "learning_rate": 0.0002, "epoch": 0.28104575163398693, "step": 430}, {"loss": 1.5927, "grad_norm": 0.840915322303772, "learning_rate": 0.0002, "epoch": 0.2875816993464052, "step": 440}, {"loss": 1.5199, "grad_norm": 0.6142978072166443, "learning_rate": 0.0002, "epoch": 0.29411764705882354, "step": 450}, {"loss": 1.4903, "grad_norm": 0.9482691884040833, "learning_rate": 0.0002, "epoch": 0.3006535947712418, "step": 460}, {"loss": 1.6553, "grad_norm": 0.7001156806945801, "learning_rate": 0.0002, "epoch": 0.30718954248366015, "step": 470}, {"loss": 1.5957, "grad_norm": 0.6665455102920532, "learning_rate": 0.0002, "epoch": 0.3137254901960784, "step": 480}, {"loss": 1.587, "grad_norm": 0.6012697815895081, "learning_rate": 0.0002, "epoch": 0.3202614379084967, "step": 490}, {"loss": 1.4468, "grad_norm": 0.8770062327384949, "learning_rate": 0.0002, "epoch": 0.32679738562091504, "step": 500}, {"loss": 1.3558, "grad_norm": 0.7029962539672852, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 510}, {"loss": 1.4435, "grad_norm": 0.6682832837104797, "learning_rate": 0.0002, "epoch": 0.33986928104575165, "step": 520}, {"loss": 1.4242, "grad_norm": 0.5548969507217407, "learning_rate": 0.0002, "epoch": 0.3464052287581699, "step": 530}, {"loss": 1.5081, "grad_norm": 0.6640702486038208, "learning_rate": 0.0002, "epoch": 0.35294117647058826, "step": 540}, {"loss": 1.4998, "grad_norm": 0.656292200088501, "learning_rate": 0.0002, "epoch": 0.35947712418300654, "step": 550}, {"loss": 1.5415, "grad_norm": 0.618910551071167, "learning_rate": 0.0002, "epoch": 0.3660130718954248, "step": 560}, {"loss": 1.5178, "grad_norm": 0.644859790802002, "learning_rate": 0.0002, "epoch": 0.37254901960784315, "step": 570}, {"loss": 1.645, "grad_norm": 0.679042398929596, "learning_rate": 0.0002, "epoch": 0.3790849673202614, "step": 580}, {"loss": 1.5193, "grad_norm": 0.980681836605072, "learning_rate": 0.0002, "epoch": 0.38562091503267976, "step": 590}, {"loss": 1.4262, "grad_norm": 0.632219672203064, "learning_rate": 0.0002, "epoch": 0.39215686274509803, "step": 600}, {"loss": 1.5533, "grad_norm": 0.7003744840621948, "learning_rate": 0.0002, "epoch": 0.39869281045751637, "step": 610}, {"loss": 1.7747, "grad_norm": 0.7090577483177185, "learning_rate": 0.0002, "epoch": 0.40522875816993464, "step": 620}, {"loss": 1.7506, "grad_norm": 0.657819926738739, "learning_rate": 0.0002, "epoch": 0.4117647058823529, "step": 630}, {"loss": 1.621, "grad_norm": 0.7034208178520203, "learning_rate": 0.0002, "epoch": 0.41830065359477125, "step": 640}, {"loss": 1.5357, "grad_norm": 0.7274866104125977, "learning_rate": 0.0002, "epoch": 0.42483660130718953, "step": 650}, {"loss": 1.6304, "grad_norm": 0.5876233577728271, "learning_rate": 0.0002, "epoch": 0.43137254901960786, "step": 660}, {"loss": 1.7683, "grad_norm": 0.595494270324707, "learning_rate": 0.0002, "epoch": 0.43790849673202614, "step": 670}, {"loss": 1.5117, "grad_norm": 0.8253804445266724, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 680}, {"loss": 1.5199, "grad_norm": 0.652225911617279, "learning_rate": 0.0002, "epoch": 0.45098039215686275, "step": 690}, {"loss": 1.5419, "grad_norm": 0.6242014169692993, "learning_rate": 0.0002, "epoch": 0.45751633986928103, "step": 700}, {"loss": 1.53, "grad_norm": 0.7283986210823059, "learning_rate": 0.0002, "epoch": 0.46405228758169936, "step": 710}, {"loss": 1.43, "grad_norm": 0.7016081213951111, "learning_rate": 0.0002, "epoch": 0.47058823529411764, "step": 720}, {"loss": 1.4626, "grad_norm": 0.5211893916130066, "learning_rate": 0.0002, "epoch": 0.477124183006536, "step": 730}, {"loss": 1.6885, "grad_norm": 0.6221150159835815, "learning_rate": 0.0002, "epoch": 0.48366013071895425, "step": 740}, {"loss": 1.5677, "grad_norm": 0.76594477891922, "learning_rate": 0.0002, "epoch": 0.49019607843137253, "step": 750}, {"loss": 1.4982, "grad_norm": 0.5777859091758728, "learning_rate": 0.0002, "epoch": 0.49673202614379086, "step": 760}, {"loss": 1.5253, "grad_norm": 0.5793519616127014, "learning_rate": 0.0002, "epoch": 0.5032679738562091, "step": 770}, {"loss": 1.3562, "grad_norm": 0.5425786375999451, "learning_rate": 0.0002, "epoch": 0.5098039215686274, "step": 780}, {"loss": 1.3398, "grad_norm": 0.6004197001457214, "learning_rate": 0.0002, "epoch": 0.5163398692810458, "step": 790}, {"loss": 1.5346, "grad_norm": 0.7167016863822937, "learning_rate": 0.0002, "epoch": 0.5228758169934641, "step": 800}, {"loss": 1.48, "grad_norm": 0.710218071937561, "learning_rate": 0.0002, "epoch": 0.5294117647058824, "step": 810}, {"loss": 1.3943, "grad_norm": 0.699528694152832, "learning_rate": 0.0002, "epoch": 0.5359477124183006, "step": 820}, {"loss": 1.6014, "grad_norm": 0.579629123210907, "learning_rate": 0.0002, "epoch": 0.5424836601307189, "step": 830}, {"loss": 1.3894, "grad_norm": 0.595407247543335, "learning_rate": 0.0002, "epoch": 0.5490196078431373, "step": 840}, {"loss": 1.6394, "grad_norm": 0.544563889503479, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 850}, {"loss": 1.4692, "grad_norm": 0.553166389465332, "learning_rate": 0.0002, "epoch": 0.5620915032679739, "step": 860}, {"loss": 1.5155, "grad_norm": 0.5645018815994263, "learning_rate": 0.0002, "epoch": 0.5686274509803921, "step": 870}, {"loss": 1.7019, "grad_norm": 0.6576932668685913, "learning_rate": 0.0002, "epoch": 0.5751633986928104, "step": 880}, {"loss": 1.5891, "grad_norm": 0.6684197187423706, "learning_rate": 0.0002, "epoch": 0.5816993464052288, "step": 890}, {"loss": 1.5348, "grad_norm": 0.6706975698471069, "learning_rate": 0.0002, "epoch": 0.5882352941176471, "step": 900}, {"loss": 1.4038, "grad_norm": 0.6762327551841736, "learning_rate": 0.0002, "epoch": 0.5947712418300654, "step": 910}, {"loss": 1.61, "grad_norm": 0.764032244682312, "learning_rate": 0.0002, "epoch": 0.6013071895424836, "step": 920}, {"loss": 1.436, "grad_norm": 0.6996400952339172, "learning_rate": 0.0002, "epoch": 0.6078431372549019, "step": 930}, {"loss": 1.6038, "grad_norm": 0.686735987663269, "learning_rate": 0.0002, "epoch": 0.6143790849673203, "step": 940}, {"loss": 1.5194, "grad_norm": 0.6086131930351257, "learning_rate": 0.0002, "epoch": 0.6209150326797386, "step": 950}, {"loss": 1.4457, "grad_norm": 0.5627856850624084, "learning_rate": 0.0002, "epoch": 0.6274509803921569, "step": 960}, {"loss": 1.506, "grad_norm": 0.5781503319740295, "learning_rate": 0.0002, "epoch": 0.6339869281045751, "step": 970}, {"loss": 1.5668, "grad_norm": 0.6347246766090393, "learning_rate": 0.0002, "epoch": 0.6405228758169934, "step": 980}, {"loss": 1.3819, "grad_norm": 0.6581300497055054, "learning_rate": 0.0002, "epoch": 0.6470588235294118, "step": 990}, {"loss": 1.6425, "grad_norm": 0.8343676924705505, "learning_rate": 0.0002, "epoch": 0.6535947712418301, "step": 1000}, {"loss": 1.5188, "grad_norm": 0.5708910226821899, "learning_rate": 0.0002, "epoch": 0.6601307189542484, "step": 1010}, {"loss": 1.3882, "grad_norm": 0.6832585334777832, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 1020}, {"loss": 1.645, "grad_norm": 0.5767837166786194, "learning_rate": 0.0002, "epoch": 0.673202614379085, "step": 1030}, {"loss": 1.4206, "grad_norm": 0.5637745261192322, "learning_rate": 0.0002, "epoch": 0.6797385620915033, "step": 1040}, {"loss": 1.4325, "grad_norm": 0.8193050026893616, "learning_rate": 0.0002, "epoch": 0.6862745098039216, "step": 1050}, {"loss": 1.4196, "grad_norm": 0.6157439351081848, "learning_rate": 0.0002, "epoch": 0.6928104575163399, "step": 1060}, {"loss": 1.5547, "grad_norm": 0.7476664781570435, "learning_rate": 0.0002, "epoch": 0.6993464052287581, "step": 1070}, {"loss": 1.5337, "grad_norm": 0.8569361567497253, "learning_rate": 0.0002, "epoch": 0.7058823529411765, "step": 1080}, {"loss": 1.482, "grad_norm": 0.5671911835670471, "learning_rate": 0.0002, "epoch": 0.7124183006535948, "step": 1090}, {"loss": 1.5398, "grad_norm": 0.5151128768920898, "learning_rate": 0.0002, "epoch": 0.7189542483660131, "step": 1100}, {"loss": 1.4848, "grad_norm": 0.568037211894989, "learning_rate": 0.0002, "epoch": 0.7254901960784313, "step": 1110}, {"loss": 1.4708, "grad_norm": 0.6756396889686584, "learning_rate": 0.0002, "epoch": 0.7320261437908496, "step": 1120}, {"loss": 1.4017, "grad_norm": 0.638975977897644, "learning_rate": 0.0002, "epoch": 0.738562091503268, "step": 1130}, {"loss": 1.6028, "grad_norm": 0.7103341221809387, "learning_rate": 0.0002, "epoch": 0.7450980392156863, "step": 1140}, {"loss": 1.3766, "grad_norm": 0.7403952479362488, "learning_rate": 0.0002, "epoch": 0.7516339869281046, "step": 1150}, {"loss": 1.4757, "grad_norm": 0.6266511082649231, "learning_rate": 0.0002, "epoch": 0.7581699346405228, "step": 1160}, {"loss": 1.4468, "grad_norm": 0.5939070582389832, "learning_rate": 0.0002, "epoch": 0.7647058823529411, "step": 1170}, {"loss": 1.4145, "grad_norm": 0.5735430717468262, "learning_rate": 0.0002, "epoch": 0.7712418300653595, "step": 1180}, {"loss": 1.3891, "grad_norm": 0.5155234932899475, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 1190}, {"loss": 1.4942, "grad_norm": 0.5115423202514648, "learning_rate": 0.0002, "epoch": 0.7843137254901961, "step": 1200}, {"loss": 1.4508, "grad_norm": 0.693588137626648, "learning_rate": 0.0002, "epoch": 0.7908496732026143, "step": 1210}, {"loss": 1.308, "grad_norm": 0.5504693984985352, "learning_rate": 0.0002, "epoch": 0.7973856209150327, "step": 1220}, {"loss": 1.5412, "grad_norm": 0.5555992126464844, "learning_rate": 0.0002, "epoch": 0.803921568627451, "step": 1230}, {"loss": 1.5506, "grad_norm": 0.7211785316467285, "learning_rate": 0.0002, "epoch": 0.8104575163398693, "step": 1240}, {"loss": 1.6163, "grad_norm": 0.735003650188446, "learning_rate": 0.0002, "epoch": 0.8169934640522876, "step": 1250}, {"loss": 1.5836, "grad_norm": 0.5245152711868286, "learning_rate": 0.0002, "epoch": 0.8235294117647058, "step": 1260}, {"loss": 1.4505, "grad_norm": 0.5883445739746094, "learning_rate": 0.0002, "epoch": 0.8300653594771242, "step": 1270}, {"loss": 1.3642, "grad_norm": 0.6835859417915344, "learning_rate": 0.0002, "epoch": 0.8366013071895425, "step": 1280}, {"loss": 1.5526, "grad_norm": 0.6592142581939697, "learning_rate": 0.0002, "epoch": 0.8431372549019608, "step": 1290}, {"loss": 1.52, "grad_norm": 0.6087474226951599, "learning_rate": 0.0002, "epoch": 0.8496732026143791, "step": 1300}, {"loss": 1.3807, "grad_norm": 0.565387487411499, "learning_rate": 0.0002, "epoch": 0.8562091503267973, "step": 1310}, {"loss": 1.4809, "grad_norm": 0.7363151907920837, "learning_rate": 0.0002, "epoch": 0.8627450980392157, "step": 1320}, {"loss": 1.5683, "grad_norm": 0.5964524149894714, "learning_rate": 0.0002, "epoch": 0.869281045751634, "step": 1330}, {"loss": 1.3284, "grad_norm": 0.5169979929924011, "learning_rate": 0.0002, "epoch": 0.8758169934640523, "step": 1340}, {"loss": 1.6279, "grad_norm": 0.7063422799110413, "learning_rate": 0.0002, "epoch": 0.8823529411764706, "step": 1350}, {"loss": 1.3072, "grad_norm": 0.7261926531791687, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 1360}, {"loss": 1.3619, "grad_norm": 0.6759744882583618, "learning_rate": 0.0002, "epoch": 0.8954248366013072, "step": 1370}, {"loss": 1.4079, "grad_norm": 0.675051212310791, "learning_rate": 0.0002, "epoch": 0.9019607843137255, "step": 1380}, {"loss": 1.6606, "grad_norm": 0.5613595843315125, "learning_rate": 0.0002, "epoch": 0.9084967320261438, "step": 1390}, {"loss": 1.414, "grad_norm": 0.611732006072998, "learning_rate": 0.0002, "epoch": 0.9150326797385621, "step": 1400}, {"loss": 1.5766, "grad_norm": 0.6365187168121338, "learning_rate": 0.0002, "epoch": 0.9215686274509803, "step": 1410}, {"loss": 1.7832, "grad_norm": 0.7810426354408264, "learning_rate": 0.0002, "epoch": 0.9281045751633987, "step": 1420}, {"loss": 1.5377, "grad_norm": 0.593891441822052, "learning_rate": 0.0002, "epoch": 0.934640522875817, "step": 1430}, {"loss": 1.4468, "grad_norm": 0.761585533618927, "learning_rate": 0.0002, "epoch": 0.9411764705882353, "step": 1440}, {"loss": 1.589, "grad_norm": 0.6114464998245239, "learning_rate": 0.0002, "epoch": 0.9477124183006536, "step": 1450}, {"loss": 1.4973, "grad_norm": 0.601044774055481, "learning_rate": 0.0002, "epoch": 0.954248366013072, "step": 1460}, {"loss": 1.4162, "grad_norm": 0.5484876036643982, "learning_rate": 0.0002, "epoch": 0.9607843137254902, "step": 1470}, {"loss": 1.4825, "grad_norm": 0.5383428335189819, "learning_rate": 0.0002, "epoch": 0.9673202614379085, "step": 1480}, {"loss": 1.5543, "grad_norm": 0.648106575012207, "learning_rate": 0.0002, "epoch": 0.9738562091503268, "step": 1490}, {"loss": 1.3638, "grad_norm": 0.6847249865531921, "learning_rate": 0.0002, "epoch": 0.9803921568627451, "step": 1500}, {"loss": 1.4247, "grad_norm": 0.6361058354377747, "learning_rate": 0.0002, "epoch": 0.9869281045751634, "step": 1510}, {"loss": 1.5131, "grad_norm": 0.646392285823822, "learning_rate": 0.0002, "epoch": 0.9934640522875817, "step": 1520}, {"loss": 1.3738, "grad_norm": 0.5391159057617188, "learning_rate": 0.0002, "epoch": 1.0, "step": 1530}, {"eval_loss": 1.4715123176574707, "eval_runtime": 30.5701, "eval_samples_per_second": 14.262, "eval_steps_per_second": 1.799, "epoch": 1.0, "step": 1530}, {"loss": 1.4827, "grad_norm": 0.5468988418579102, "learning_rate": 0.0002, "epoch": 1.0065359477124183, "step": 1540}, {"loss": 1.4342, "grad_norm": 0.629940927028656, "learning_rate": 0.0002, "epoch": 1.0130718954248366, "step": 1550}, {"loss": 1.4259, "grad_norm": 0.6411303281784058, "learning_rate": 0.0002, "epoch": 1.0196078431372548, "step": 1560}, {"loss": 1.3924, "grad_norm": 0.5619024038314819, "learning_rate": 0.0002, "epoch": 1.026143790849673, "step": 1570}, {"loss": 1.6086, "grad_norm": 0.6093462705612183, "learning_rate": 0.0002, "epoch": 1.0326797385620916, "step": 1580}, {"loss": 1.4547, "grad_norm": 0.5543286204338074, "learning_rate": 0.0002, "epoch": 1.0392156862745099, "step": 1590}, {"loss": 1.3738, "grad_norm": 0.6079006195068359, "learning_rate": 0.0002, "epoch": 1.0457516339869282, "step": 1600}, {"loss": 1.4574, "grad_norm": 0.6240813136100769, "learning_rate": 0.0002, "epoch": 1.0522875816993464, "step": 1610}, {"loss": 1.3504, "grad_norm": 0.6141977310180664, "learning_rate": 0.0002, "epoch": 1.0588235294117647, "step": 1620}, {"loss": 1.3668, "grad_norm": 0.5920178294181824, "learning_rate": 0.0002, "epoch": 1.065359477124183, "step": 1630}, {"loss": 1.3204, "grad_norm": 0.47620782256126404, "learning_rate": 0.0002, "epoch": 1.0718954248366013, "step": 1640}, {"loss": 1.3249, "grad_norm": 0.6826292872428894, "learning_rate": 0.0002, "epoch": 1.0784313725490196, "step": 1650}, {"loss": 1.2285, "grad_norm": 0.6182006597518921, "learning_rate": 0.0002, "epoch": 1.0849673202614378, "step": 1660}, {"loss": 1.2907, "grad_norm": 0.57639479637146, "learning_rate": 0.0002, "epoch": 1.091503267973856, "step": 1670}, {"loss": 1.4575, "grad_norm": 0.6696860194206238, "learning_rate": 0.0002, "epoch": 1.0980392156862746, "step": 1680}, {"loss": 1.4104, "grad_norm": 0.699221670627594, "learning_rate": 0.0002, "epoch": 1.1045751633986929, "step": 1690}, {"loss": 1.3667, "grad_norm": 0.7138059139251709, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 1700}, {"loss": 1.3468, "grad_norm": 0.6930422186851501, "learning_rate": 0.0002, "epoch": 1.1176470588235294, "step": 1710}, {"loss": 1.5033, "grad_norm": 0.7484048008918762, "learning_rate": 0.0002, "epoch": 1.1241830065359477, "step": 1720}, {"loss": 1.4582, "grad_norm": 0.5820090174674988, "learning_rate": 0.0002, "epoch": 1.130718954248366, "step": 1730}, {"loss": 1.3704, "grad_norm": 0.7143406867980957, "learning_rate": 0.0002, "epoch": 1.1372549019607843, "step": 1740}, {"loss": 1.277, "grad_norm": 0.5597584247589111, "learning_rate": 0.0002, "epoch": 1.1437908496732025, "step": 1750}, {"loss": 1.5403, "grad_norm": 0.5171173214912415, "learning_rate": 0.0002, "epoch": 1.1503267973856208, "step": 1760}, {"loss": 1.419, "grad_norm": 0.5951920747756958, "learning_rate": 0.0002, "epoch": 1.156862745098039, "step": 1770}, {"loss": 1.2929, "grad_norm": 0.7506247758865356, "learning_rate": 0.0002, "epoch": 1.1633986928104576, "step": 1780}, {"loss": 1.5475, "grad_norm": 0.5936487913131714, "learning_rate": 0.0002, "epoch": 1.1699346405228759, "step": 1790}, {"loss": 1.3567, "grad_norm": 0.688450038433075, "learning_rate": 0.0002, "epoch": 1.1764705882352942, "step": 1800}, {"loss": 1.314, "grad_norm": 0.671623170375824, "learning_rate": 0.0002, "epoch": 1.1830065359477124, "step": 1810}, {"loss": 1.3803, "grad_norm": 0.6911860704421997, "learning_rate": 0.0002, "epoch": 1.1895424836601307, "step": 1820}, {"loss": 1.363, "grad_norm": 0.60726398229599, "learning_rate": 0.0002, "epoch": 1.196078431372549, "step": 1830}, {"loss": 1.5236, "grad_norm": 0.7542088627815247, "learning_rate": 0.0002, "epoch": 1.2026143790849673, "step": 1840}, {"loss": 1.4343, "grad_norm": 0.6810969710350037, "learning_rate": 0.0002, "epoch": 1.2091503267973855, "step": 1850}, {"loss": 1.446, "grad_norm": 0.579741895198822, "learning_rate": 0.0002, "epoch": 1.215686274509804, "step": 1860}, {"loss": 1.4564, "grad_norm": 0.9925695657730103, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 1870}, {"loss": 1.5516, "grad_norm": 0.5919767618179321, "learning_rate": 0.0002, "epoch": 1.2287581699346406, "step": 1880}, {"loss": 1.5015, "grad_norm": 0.7377090454101562, "learning_rate": 0.0002, "epoch": 1.2352941176470589, "step": 1890}, {"loss": 1.4756, "grad_norm": 0.5753688812255859, "learning_rate": 0.0002, "epoch": 1.2418300653594772, "step": 1900}, {"loss": 1.3543, "grad_norm": 0.6362486481666565, "learning_rate": 0.0002, "epoch": 1.2483660130718954, "step": 1910}, {"loss": 1.4153, "grad_norm": 0.5747467875480652, "learning_rate": 0.0002, "epoch": 1.2549019607843137, "step": 1920}, {"loss": 1.5082, "grad_norm": 0.6831939220428467, "learning_rate": 0.0002, "epoch": 1.261437908496732, "step": 1930}, {"loss": 1.3509, "grad_norm": 0.6414040327072144, "learning_rate": 0.0002, "epoch": 1.2679738562091503, "step": 1940}, {"loss": 1.5099, "grad_norm": 0.5613330006599426, "learning_rate": 0.0002, "epoch": 1.2745098039215685, "step": 1950}, {"loss": 1.377, "grad_norm": 0.5838454961776733, "learning_rate": 0.0002, "epoch": 1.2810457516339868, "step": 1960}, {"loss": 1.3548, "grad_norm": 0.5367192029953003, "learning_rate": 0.0002, "epoch": 1.287581699346405, "step": 1970}, {"loss": 1.4602, "grad_norm": 0.5829346776008606, "learning_rate": 0.0002, "epoch": 1.2941176470588236, "step": 1980}, {"loss": 1.3821, "grad_norm": 0.756534218788147, "learning_rate": 0.0002, "epoch": 1.3006535947712419, "step": 1990}, {"loss": 1.389, "grad_norm": 0.48002561926841736, "learning_rate": 0.0002, "epoch": 1.3071895424836601, "step": 2000}, {"loss": 1.256, "grad_norm": 0.5461082458496094, "learning_rate": 0.0002, "epoch": 1.3137254901960784, "step": 2010}, {"loss": 1.6257, "grad_norm": 0.570399284362793, "learning_rate": 0.0002, "epoch": 1.3202614379084967, "step": 2020}, {"loss": 1.4356, "grad_norm": 0.5130975842475891, "learning_rate": 0.0002, "epoch": 1.326797385620915, "step": 2030}, {"loss": 1.3552, "grad_norm": 0.6290071606636047, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 2040}, {"loss": 1.3873, "grad_norm": 0.6165726184844971, "learning_rate": 0.0002, "epoch": 1.3398692810457518, "step": 2050}, {"loss": 1.4376, "grad_norm": 0.5302083492279053, "learning_rate": 0.0002, "epoch": 1.34640522875817, "step": 2060}, {"loss": 1.4722, "grad_norm": 0.6531406044960022, "learning_rate": 0.0002, "epoch": 1.3529411764705883, "step": 2070}, {"loss": 1.3632, "grad_norm": 0.5981236100196838, "learning_rate": 0.0002, "epoch": 1.3594771241830066, "step": 2080}, {"loss": 1.4846, "grad_norm": 0.8534150123596191, "learning_rate": 0.0002, "epoch": 1.3660130718954249, "step": 2090}, {"loss": 1.3249, "grad_norm": 0.695918083190918, "learning_rate": 0.0002, "epoch": 1.3725490196078431, "step": 2100}, {"loss": 1.4989, "grad_norm": 0.5830431580543518, "learning_rate": 0.0002, "epoch": 1.3790849673202614, "step": 2110}, {"loss": 1.5009, "grad_norm": 0.5641306638717651, "learning_rate": 0.0002, "epoch": 1.3856209150326797, "step": 2120}, {"loss": 1.3985, "grad_norm": 0.6354436874389648, "learning_rate": 0.0002, "epoch": 1.392156862745098, "step": 2130}, {"loss": 1.2737, "grad_norm": 0.5707540512084961, "learning_rate": 0.0002, "epoch": 1.3986928104575163, "step": 2140}, {"loss": 1.3815, "grad_norm": 0.7308434844017029, "learning_rate": 0.0002, "epoch": 1.4052287581699345, "step": 2150}, {"loss": 1.3993, "grad_norm": 0.5879750847816467, "learning_rate": 0.0002, "epoch": 1.4117647058823528, "step": 2160}, {"loss": 1.3729, "grad_norm": 0.627909243106842, "learning_rate": 0.0002, "epoch": 1.4183006535947713, "step": 2170}, {"loss": 1.3391, "grad_norm": 0.5228193998336792, "learning_rate": 0.0002, "epoch": 1.4248366013071896, "step": 2180}, {"loss": 1.457, "grad_norm": 0.6162880659103394, "learning_rate": 0.0002, "epoch": 1.4313725490196079, "step": 2190}, {"loss": 1.4052, "grad_norm": 0.751610517501831, "learning_rate": 0.0002, "epoch": 1.4379084967320261, "step": 2200}, {"loss": 1.4105, "grad_norm": 0.5623487234115601, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 2210}, {"loss": 1.3795, "grad_norm": 0.5293187499046326, "learning_rate": 0.0002, "epoch": 1.4509803921568627, "step": 2220}, {"loss": 1.4247, "grad_norm": 0.5903629660606384, "learning_rate": 0.0002, "epoch": 1.457516339869281, "step": 2230}, {"loss": 1.6167, "grad_norm": 0.6084659099578857, "learning_rate": 0.0002, "epoch": 1.4640522875816995, "step": 2240}, {"loss": 1.319, "grad_norm": 0.5289803147315979, "learning_rate": 0.0002, "epoch": 1.4705882352941178, "step": 2250}, {"loss": 1.3106, "grad_norm": 0.49499568343162537, "learning_rate": 0.0002, "epoch": 1.477124183006536, "step": 2260}, {"loss": 1.3586, "grad_norm": 0.7774190306663513, "learning_rate": 0.0002, "epoch": 1.4836601307189543, "step": 2270}, {"loss": 1.3075, "grad_norm": 0.5932538509368896, "learning_rate": 0.0002, "epoch": 1.4901960784313726, "step": 2280}, {"loss": 1.3241, "grad_norm": 0.6009492874145508, "learning_rate": 0.0002, "epoch": 1.4967320261437909, "step": 2290}, {"loss": 1.3728, "grad_norm": 0.5559343099594116, "learning_rate": 0.0002, "epoch": 1.5032679738562091, "step": 2300}, {"loss": 1.2379, "grad_norm": 0.5956196188926697, "learning_rate": 0.0002, "epoch": 1.5098039215686274, "step": 2310}, {"loss": 1.5292, "grad_norm": 0.5624083876609802, "learning_rate": 0.0002, "epoch": 1.5163398692810457, "step": 2320}, {"loss": 1.4779, "grad_norm": 0.7195250391960144, "learning_rate": 0.0002, "epoch": 1.522875816993464, "step": 2330}, {"loss": 1.2938, "grad_norm": 0.6010490655899048, "learning_rate": 0.0002, "epoch": 1.5294117647058822, "step": 2340}, {"loss": 1.4121, "grad_norm": 0.664929211139679, "learning_rate": 0.0002, "epoch": 1.5359477124183005, "step": 2350}, {"loss": 1.4362, "grad_norm": 0.5158776640892029, "learning_rate": 0.0002, "epoch": 1.5424836601307188, "step": 2360}, {"loss": 1.2157, "grad_norm": 0.5147154927253723, "learning_rate": 0.0002, "epoch": 1.5490196078431373, "step": 2370}, {"loss": 1.2643, "grad_norm": 0.6507977843284607, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 2380}, {"loss": 1.2786, "grad_norm": 0.5193192362785339, "learning_rate": 0.0002, "epoch": 1.5620915032679739, "step": 2390}, {"loss": 1.3209, "grad_norm": 0.5982314944267273, "learning_rate": 0.0002, "epoch": 1.5686274509803921, "step": 2400}, {"loss": 1.3585, "grad_norm": 0.49106258153915405, "learning_rate": 0.0002, "epoch": 1.5751633986928104, "step": 2410}, {"loss": 1.3618, "grad_norm": 0.6459611654281616, "learning_rate": 0.0002, "epoch": 1.581699346405229, "step": 2420}, {"loss": 1.3305, "grad_norm": 0.7038363218307495, "learning_rate": 0.0002, "epoch": 1.5882352941176472, "step": 2430}, {"loss": 1.3198, "grad_norm": 0.5245680212974548, "learning_rate": 0.0002, "epoch": 1.5947712418300655, "step": 2440}, {"loss": 1.4756, "grad_norm": 0.6562076210975647, "learning_rate": 0.0002, "epoch": 1.6013071895424837, "step": 2450}, {"loss": 1.5635, "grad_norm": 0.6491968035697937, "learning_rate": 0.0002, "epoch": 1.607843137254902, "step": 2460}, {"loss": 1.3657, "grad_norm": 0.604034960269928, "learning_rate": 0.0002, "epoch": 1.6143790849673203, "step": 2470}, {"loss": 1.2693, "grad_norm": 0.5759671330451965, "learning_rate": 0.0002, "epoch": 1.6209150326797386, "step": 2480}, {"loss": 1.4136, "grad_norm": 0.6157698631286621, "learning_rate": 0.0002, "epoch": 1.6274509803921569, "step": 2490}, {"loss": 1.3929, "grad_norm": 0.6513794660568237, "learning_rate": 0.0002, "epoch": 1.6339869281045751, "step": 2500}, {"loss": 1.4283, "grad_norm": 0.71990966796875, "learning_rate": 0.0002, "epoch": 1.6405228758169934, "step": 2510}, {"loss": 1.4356, "grad_norm": 0.7316617369651794, "learning_rate": 0.0002, "epoch": 1.6470588235294117, "step": 2520}, {"loss": 1.3119, "grad_norm": 0.5475177764892578, "learning_rate": 0.0002, "epoch": 1.65359477124183, "step": 2530}, {"loss": 1.2998, "grad_norm": 0.4911293089389801, "learning_rate": 0.0002, "epoch": 1.6601307189542482, "step": 2540}, {"loss": 1.4198, "grad_norm": 0.6122882962226868, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 2550}, {"loss": 1.3099, "grad_norm": 0.5735281705856323, "learning_rate": 0.0002, "epoch": 1.673202614379085, "step": 2560}, {"loss": 1.2205, "grad_norm": 0.5046352744102478, "learning_rate": 0.0002, "epoch": 1.6797385620915033, "step": 2570}, {"loss": 1.3191, "grad_norm": 0.6043242812156677, "learning_rate": 0.0002, "epoch": 1.6862745098039216, "step": 2580}, {"loss": 1.3079, "grad_norm": 0.5397698283195496, "learning_rate": 0.0002, "epoch": 1.6928104575163399, "step": 2590}, {"loss": 1.4916, "grad_norm": 0.8066475987434387, "learning_rate": 0.0002, "epoch": 1.6993464052287581, "step": 2600}, {"loss": 1.3703, "grad_norm": 0.52901691198349, "learning_rate": 0.0002, "epoch": 1.7058823529411766, "step": 2610}, {"loss": 1.409, "grad_norm": 0.7588503956794739, "learning_rate": 0.0002, "epoch": 1.712418300653595, "step": 2620}, {"loss": 1.3806, "grad_norm": 0.6012966632843018, "learning_rate": 0.0002, "epoch": 1.7189542483660132, "step": 2630}, {"loss": 1.2583, "grad_norm": 0.5927302837371826, "learning_rate": 0.0002, "epoch": 1.7254901960784315, "step": 2640}, {"loss": 1.4523, "grad_norm": 0.5086990594863892, "learning_rate": 0.0002, "epoch": 1.7320261437908497, "step": 2650}, {"loss": 1.5452, "grad_norm": 0.6000628471374512, "learning_rate": 0.0002, "epoch": 1.738562091503268, "step": 2660}, {"loss": 1.3269, "grad_norm": 0.6560431718826294, "learning_rate": 0.0002, "epoch": 1.7450980392156863, "step": 2670}, {"loss": 1.3982, "grad_norm": 0.5738165378570557, "learning_rate": 0.0002, "epoch": 1.7516339869281046, "step": 2680}, {"loss": 1.3766, "grad_norm": 0.5576106905937195, "learning_rate": 0.0002, "epoch": 1.7581699346405228, "step": 2690}, {"loss": 1.3277, "grad_norm": 0.7298802137374878, "learning_rate": 0.0002, "epoch": 1.7647058823529411, "step": 2700}, {"loss": 1.2618, "grad_norm": 0.5751826167106628, "learning_rate": 0.0002, "epoch": 1.7712418300653594, "step": 2710}, {"loss": 1.35, "grad_norm": 0.6069957613945007, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 2720}, {"loss": 1.3492, "grad_norm": 0.7513017654418945, "learning_rate": 0.0002, "epoch": 1.784313725490196, "step": 2730}, {"loss": 1.2979, "grad_norm": 0.6058869957923889, "learning_rate": 0.0002, "epoch": 1.7908496732026142, "step": 2740}, {"loss": 1.299, "grad_norm": 0.6805883049964905, "learning_rate": 0.0002, "epoch": 1.7973856209150327, "step": 2750}, {"loss": 1.4062, "grad_norm": 0.6864324808120728, "learning_rate": 0.0002, "epoch": 1.803921568627451, "step": 2760}, {"loss": 1.355, "grad_norm": 0.6261002421379089, "learning_rate": 0.0002, "epoch": 1.8104575163398693, "step": 2770}, {"loss": 1.5145, "grad_norm": 0.532684862613678, "learning_rate": 0.0002, "epoch": 1.8169934640522876, "step": 2780}, {"loss": 1.3248, "grad_norm": 0.6209020018577576, "learning_rate": 0.0002, "epoch": 1.8235294117647058, "step": 2790}, {"loss": 1.3908, "grad_norm": 0.67111736536026, "learning_rate": 0.0002, "epoch": 1.8300653594771243, "step": 2800}, {"loss": 1.5088, "grad_norm": 0.700467586517334, "learning_rate": 0.0002, "epoch": 1.8366013071895426, "step": 2810}, {"loss": 1.348, "grad_norm": 0.6968029141426086, "learning_rate": 0.0002, "epoch": 1.843137254901961, "step": 2820}, {"loss": 1.3943, "grad_norm": 0.6405863761901855, "learning_rate": 0.0002, "epoch": 1.8496732026143792, "step": 2830}, {"loss": 1.4035, "grad_norm": 0.5192584991455078, "learning_rate": 0.0002, "epoch": 1.8562091503267975, "step": 2840}, {"loss": 1.2745, "grad_norm": 0.4888569414615631, "learning_rate": 0.0002, "epoch": 1.8627450980392157, "step": 2850}, {"loss": 1.4324, "grad_norm": 0.7625455856323242, "learning_rate": 0.0002, "epoch": 1.869281045751634, "step": 2860}, {"loss": 1.4989, "grad_norm": 0.9162808656692505, "learning_rate": 0.0002, "epoch": 1.8758169934640523, "step": 2870}, {"loss": 1.3978, "grad_norm": 0.5472783446311951, "learning_rate": 0.0002, "epoch": 1.8823529411764706, "step": 2880}, {"loss": 1.3026, "grad_norm": 0.5221137404441833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 2890}, {"loss": 1.33, "grad_norm": 0.49258849024772644, "learning_rate": 0.0002, "epoch": 1.8954248366013071, "step": 2900}, {"loss": 1.3503, "grad_norm": 0.5260750651359558, "learning_rate": 0.0002, "epoch": 1.9019607843137254, "step": 2910}, {"loss": 1.3381, "grad_norm": 0.6583314538002014, "learning_rate": 0.0002, "epoch": 1.9084967320261437, "step": 2920}, {"loss": 1.356, "grad_norm": 0.5728915929794312, "learning_rate": 0.0002, "epoch": 1.915032679738562, "step": 2930}, {"loss": 1.3993, "grad_norm": 0.7661453485488892, "learning_rate": 0.0002, "epoch": 1.9215686274509802, "step": 2940}, {"loss": 1.428, "grad_norm": 0.7193911075592041, "learning_rate": 0.0002, "epoch": 1.9281045751633987, "step": 2950}, {"loss": 1.287, "grad_norm": 0.5007768869400024, "learning_rate": 0.0002, "epoch": 1.934640522875817, "step": 2960}, {"loss": 1.372, "grad_norm": 0.626681923866272, "learning_rate": 0.0002, "epoch": 1.9411764705882353, "step": 2970}, {"loss": 1.375, "grad_norm": 0.8692840933799744, "learning_rate": 0.0002, "epoch": 1.9477124183006536, "step": 2980}, {"loss": 1.3292, "grad_norm": 0.6388291120529175, "learning_rate": 0.0002, "epoch": 1.954248366013072, "step": 2990}, {"loss": 1.4593, "grad_norm": 0.7710477113723755, "learning_rate": 0.0002, "epoch": 1.9607843137254903, "step": 3000}, {"loss": 1.5228, "grad_norm": 0.641704261302948, "learning_rate": 0.0002, "epoch": 1.9673202614379086, "step": 3010}, {"loss": 1.3246, "grad_norm": 0.621148943901062, "learning_rate": 0.0002, "epoch": 1.973856209150327, "step": 3020}, {"loss": 1.3017, "grad_norm": 0.5119547247886658, "learning_rate": 0.0002, "epoch": 1.9803921568627452, "step": 3030}, {"loss": 1.4923, "grad_norm": 0.8104137778282166, "learning_rate": 0.0002, "epoch": 1.9869281045751634, "step": 3040}, {"loss": 1.3331, "grad_norm": 0.5856240391731262, "learning_rate": 0.0002, "epoch": 1.9934640522875817, "step": 3050}, {"loss": 1.4346, "grad_norm": 0.5263566374778748, "learning_rate": 0.0002, "epoch": 2.0, "step": 3060}, {"eval_loss": 1.4276371002197266, "eval_runtime": 30.5759, "eval_samples_per_second": 14.26, "eval_steps_per_second": 1.799, "epoch": 2.0, "step": 3060}, {"loss": 1.1636, "grad_norm": 0.5143898725509644, "learning_rate": 0.0002, "epoch": 2.0065359477124183, "step": 3070}, {"loss": 1.3335, "grad_norm": 0.5749367475509644, "learning_rate": 0.0002, "epoch": 2.0130718954248366, "step": 3080}, {"loss": 1.2784, "grad_norm": 0.5784284472465515, "learning_rate": 0.0002, "epoch": 2.019607843137255, "step": 3090}, {"loss": 1.2463, "grad_norm": 0.5933429598808289, "learning_rate": 0.0002, "epoch": 2.026143790849673, "step": 3100}, {"loss": 1.2984, "grad_norm": 0.6748974919319153, "learning_rate": 0.0002, "epoch": 2.0326797385620914, "step": 3110}, {"loss": 1.2307, "grad_norm": 0.626399576663971, "learning_rate": 0.0002, "epoch": 2.0392156862745097, "step": 3120}, {"loss": 1.299, "grad_norm": 0.6173238754272461, "learning_rate": 0.0002, "epoch": 2.045751633986928, "step": 3130}, {"loss": 1.4144, "grad_norm": 0.807790219783783, "learning_rate": 0.0002, "epoch": 2.052287581699346, "step": 3140}, {"loss": 1.1953, "grad_norm": 0.6222215890884399, "learning_rate": 0.0002, "epoch": 2.0588235294117645, "step": 3150}, {"loss": 1.4059, "grad_norm": 0.5859580636024475, "learning_rate": 0.0002, "epoch": 2.065359477124183, "step": 3160}, {"loss": 1.3607, "grad_norm": 0.581304132938385, "learning_rate": 0.0002, "epoch": 2.0718954248366015, "step": 3170}, {"loss": 1.1212, "grad_norm": 0.9814971089363098, "learning_rate": 0.0002, "epoch": 2.0784313725490198, "step": 3180}, {"loss": 1.1962, "grad_norm": 0.6491848230361938, "learning_rate": 0.0002, "epoch": 2.084967320261438, "step": 3190}, {"loss": 1.3711, "grad_norm": 0.613680362701416, "learning_rate": 0.0002, "epoch": 2.0915032679738563, "step": 3200}, {"loss": 1.2994, "grad_norm": 0.7318086624145508, "learning_rate": 0.0002, "epoch": 2.0980392156862746, "step": 3210}, {"loss": 1.2502, "grad_norm": 0.6025661826133728, "learning_rate": 0.0002, "epoch": 2.104575163398693, "step": 3220}, {"loss": 1.1374, "grad_norm": 0.6744484305381775, "learning_rate": 0.0002, "epoch": 2.111111111111111, "step": 3230}, {"loss": 1.3273, "grad_norm": 0.6062554121017456, "learning_rate": 0.0002, "epoch": 2.1176470588235294, "step": 3240}, {"loss": 1.3404, "grad_norm": 0.6801803112030029, "learning_rate": 0.0002, "epoch": 2.1241830065359477, "step": 3250}, {"loss": 1.4084, "grad_norm": 0.5218925476074219, "learning_rate": 0.0002, "epoch": 2.130718954248366, "step": 3260}, {"loss": 1.2867, "grad_norm": 0.7494263648986816, "learning_rate": 0.0002, "epoch": 2.1372549019607843, "step": 3270}, {"loss": 1.3059, "grad_norm": 0.7858565449714661, "learning_rate": 0.0002, "epoch": 2.1437908496732025, "step": 3280}, {"loss": 1.3214, "grad_norm": 0.6836692690849304, "learning_rate": 0.0002, "epoch": 2.150326797385621, "step": 3290}, {"loss": 1.1605, "grad_norm": 0.619848370552063, "learning_rate": 0.0002, "epoch": 2.156862745098039, "step": 3300}, {"loss": 1.3095, "grad_norm": 0.5761294364929199, "learning_rate": 0.0002, "epoch": 2.1633986928104574, "step": 3310}, {"loss": 1.2883, "grad_norm": 0.4713786542415619, "learning_rate": 0.0002, "epoch": 2.1699346405228757, "step": 3320}, {"loss": 1.3817, "grad_norm": 0.7613773345947266, "learning_rate": 0.0002, "epoch": 2.176470588235294, "step": 3330}, {"loss": 1.2354, "grad_norm": 0.6642718315124512, "learning_rate": 0.0002, "epoch": 2.183006535947712, "step": 3340}, {"loss": 1.2048, "grad_norm": 0.7162188291549683, "learning_rate": 0.0002, "epoch": 2.189542483660131, "step": 3350}, {"loss": 1.3886, "grad_norm": 0.6916783452033997, "learning_rate": 0.0002, "epoch": 2.196078431372549, "step": 3360}, {"loss": 1.3788, "grad_norm": 0.7205567955970764, "learning_rate": 0.0002, "epoch": 2.2026143790849675, "step": 3370}, {"loss": 1.2528, "grad_norm": 0.6038199067115784, "learning_rate": 0.0002, "epoch": 2.2091503267973858, "step": 3380}, {"loss": 1.2079, "grad_norm": 0.6284233927726746, "learning_rate": 0.0002, "epoch": 2.215686274509804, "step": 3390}, {"loss": 1.3057, "grad_norm": 0.7450672388076782, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3400}, {"loss": 1.3034, "grad_norm": 0.7755052447319031, "learning_rate": 0.0002, "epoch": 2.2287581699346406, "step": 3410}, {"loss": 1.2953, "grad_norm": 0.9066099524497986, "learning_rate": 0.0002, "epoch": 2.235294117647059, "step": 3420}, {"loss": 1.3072, "grad_norm": 0.8578207492828369, "learning_rate": 0.0002, "epoch": 2.241830065359477, "step": 3430}, {"loss": 1.3278, "grad_norm": 0.5900213718414307, "learning_rate": 0.0002, "epoch": 2.2483660130718954, "step": 3440}, {"loss": 1.3645, "grad_norm": 0.7821717262268066, "learning_rate": 0.0002, "epoch": 2.2549019607843137, "step": 3450}, {"loss": 1.183, "grad_norm": 0.6263150572776794, "learning_rate": 0.0002, "epoch": 2.261437908496732, "step": 3460}, {"loss": 1.178, "grad_norm": 0.591799259185791, "learning_rate": 0.0002, "epoch": 2.2679738562091503, "step": 3470}, {"loss": 1.2198, "grad_norm": 0.5999799966812134, "learning_rate": 0.0002, "epoch": 2.2745098039215685, "step": 3480}, {"loss": 1.2724, "grad_norm": 0.6227319240570068, "learning_rate": 0.0002, "epoch": 2.281045751633987, "step": 3490}, {"loss": 1.3865, "grad_norm": 0.719412624835968, "learning_rate": 0.0002, "epoch": 2.287581699346405, "step": 3500}, {"loss": 1.3275, "grad_norm": 1.0361769199371338, "learning_rate": 0.0002, "epoch": 2.2941176470588234, "step": 3510}, {"loss": 1.4834, "grad_norm": 0.5506668090820312, "learning_rate": 0.0002, "epoch": 2.3006535947712417, "step": 3520}, {"loss": 1.2273, "grad_norm": 0.6886829733848572, "learning_rate": 0.0002, "epoch": 2.30718954248366, "step": 3530}, {"loss": 1.2296, "grad_norm": 0.6226346492767334, "learning_rate": 0.0002, "epoch": 2.313725490196078, "step": 3540}, {"loss": 1.3087, "grad_norm": 0.8109908103942871, "learning_rate": 0.0002, "epoch": 2.3202614379084965, "step": 3550}, {"loss": 1.3311, "grad_norm": 0.8505511283874512, "learning_rate": 0.0002, "epoch": 2.326797385620915, "step": 3560}, {"loss": 1.2526, "grad_norm": 0.5763760209083557, "learning_rate": 0.0002, "epoch": 2.3333333333333335, "step": 3570}, {"loss": 1.4135, "grad_norm": 0.6460059881210327, "learning_rate": 0.0002, "epoch": 2.3398692810457518, "step": 3580}, {"loss": 1.2701, "grad_norm": 0.7175343036651611, "learning_rate": 0.0002, "epoch": 2.34640522875817, "step": 3590}, {"loss": 1.2645, "grad_norm": 0.6012630462646484, "learning_rate": 0.0002, "epoch": 2.3529411764705883, "step": 3600}, {"loss": 1.3214, "grad_norm": 0.6513685584068298, "learning_rate": 0.0002, "epoch": 2.3594771241830066, "step": 3610}, {"loss": 1.3271, "grad_norm": 0.7465183734893799, "learning_rate": 0.0002, "epoch": 2.366013071895425, "step": 3620}, {"loss": 1.3671, "grad_norm": 0.6413124203681946, "learning_rate": 0.0002, "epoch": 2.372549019607843, "step": 3630}, {"loss": 1.4026, "grad_norm": 0.7209562063217163, "learning_rate": 0.0002, "epoch": 2.3790849673202614, "step": 3640}, {"loss": 1.1616, "grad_norm": 0.6427558660507202, "learning_rate": 0.0002, "epoch": 2.3856209150326797, "step": 3650}, {"loss": 1.313, "grad_norm": 0.593958854675293, "learning_rate": 0.0002, "epoch": 2.392156862745098, "step": 3660}, {"loss": 1.2802, "grad_norm": 0.5944608449935913, "learning_rate": 0.0002, "epoch": 2.3986928104575163, "step": 3670}, {"loss": 1.3542, "grad_norm": 0.6606248617172241, "learning_rate": 0.0002, "epoch": 2.4052287581699345, "step": 3680}, {"loss": 1.2977, "grad_norm": 0.5632851719856262, "learning_rate": 0.0002, "epoch": 2.411764705882353, "step": 3690}, {"loss": 1.2032, "grad_norm": 0.4976513385772705, "learning_rate": 0.0002, "epoch": 2.418300653594771, "step": 3700}, {"loss": 1.1404, "grad_norm": 0.6318528056144714, "learning_rate": 0.0002, "epoch": 2.4248366013071894, "step": 3710}, {"loss": 1.1705, "grad_norm": 0.6306707859039307, "learning_rate": 0.0002, "epoch": 2.431372549019608, "step": 3720}, {"loss": 1.3524, "grad_norm": 0.6362553238868713, "learning_rate": 0.0002, "epoch": 2.4379084967320264, "step": 3730}, {"loss": 1.2345, "grad_norm": 0.634368896484375, "learning_rate": 0.0002, "epoch": 2.4444444444444446, "step": 3740}, {"loss": 1.2515, "grad_norm": 0.6623591184616089, "learning_rate": 0.0002, "epoch": 2.450980392156863, "step": 3750}, {"loss": 1.3246, "grad_norm": 0.6150440573692322, "learning_rate": 0.0002, "epoch": 2.457516339869281, "step": 3760}, {"loss": 1.2666, "grad_norm": 0.588935911655426, "learning_rate": 0.0002, "epoch": 2.4640522875816995, "step": 3770}, {"loss": 1.3918, "grad_norm": 0.7388206124305725, "learning_rate": 0.0002, "epoch": 2.4705882352941178, "step": 3780}, {"loss": 1.2512, "grad_norm": 0.621825098991394, "learning_rate": 0.0002, "epoch": 2.477124183006536, "step": 3790}, {"loss": 1.359, "grad_norm": 0.7691677212715149, "learning_rate": 0.0002, "epoch": 2.4836601307189543, "step": 3800}, {"loss": 1.3399, "grad_norm": 1.1661969423294067, "learning_rate": 0.0002, "epoch": 2.4901960784313726, "step": 3810}, {"loss": 1.461, "grad_norm": 0.6837884187698364, "learning_rate": 0.0002, "epoch": 2.496732026143791, "step": 3820}, {"loss": 1.2823, "grad_norm": 0.6978904008865356, "learning_rate": 0.0002, "epoch": 2.503267973856209, "step": 3830}, {"loss": 1.3688, "grad_norm": 0.6121411323547363, "learning_rate": 0.0002, "epoch": 2.5098039215686274, "step": 3840}, {"loss": 1.2587, "grad_norm": 0.7813326120376587, "learning_rate": 0.0002, "epoch": 2.5163398692810457, "step": 3850}, {"loss": 1.1543, "grad_norm": 0.5390260219573975, "learning_rate": 0.0002, "epoch": 2.522875816993464, "step": 3860}, {"loss": 1.2032, "grad_norm": 0.8283252716064453, "learning_rate": 0.0002, "epoch": 2.5294117647058822, "step": 3870}, {"loss": 1.3112, "grad_norm": 0.8527186512947083, "learning_rate": 0.0002, "epoch": 2.5359477124183005, "step": 3880}, {"loss": 1.3469, "grad_norm": 0.8405382633209229, "learning_rate": 0.0002, "epoch": 2.542483660130719, "step": 3890}, {"loss": 1.1801, "grad_norm": 0.5650738477706909, "learning_rate": 0.0002, "epoch": 2.549019607843137, "step": 3900}, {"loss": 1.2917, "grad_norm": 0.620121955871582, "learning_rate": 0.0002, "epoch": 2.5555555555555554, "step": 3910}, {"loss": 1.2524, "grad_norm": 0.5983527898788452, "learning_rate": 0.0002, "epoch": 2.5620915032679736, "step": 3920}, {"loss": 1.4408, "grad_norm": 0.686623215675354, "learning_rate": 0.0002, "epoch": 2.568627450980392, "step": 3930}, {"loss": 1.186, "grad_norm": 0.6805831789970398, "learning_rate": 0.0002, "epoch": 2.57516339869281, "step": 3940}, {"loss": 1.367, "grad_norm": 0.6994825601577759, "learning_rate": 0.0002, "epoch": 2.581699346405229, "step": 3950}, {"loss": 1.3446, "grad_norm": 0.728549599647522, "learning_rate": 0.0002, "epoch": 2.588235294117647, "step": 3960}, {"loss": 1.4039, "grad_norm": 0.775236964225769, "learning_rate": 0.0002, "epoch": 2.5947712418300655, "step": 3970}, {"loss": 1.2742, "grad_norm": 0.5057447552680969, "learning_rate": 0.0002, "epoch": 2.6013071895424837, "step": 3980}, {"loss": 1.2764, "grad_norm": 0.6564450263977051, "learning_rate": 0.0002, "epoch": 2.607843137254902, "step": 3990}, {"loss": 1.3269, "grad_norm": 0.5342249870300293, "learning_rate": 0.0002, "epoch": 2.6143790849673203, "step": 4000}, {"loss": 1.3102, "grad_norm": 0.5508961081504822, "learning_rate": 0.0002, "epoch": 2.6209150326797386, "step": 4010}, {"loss": 1.3636, "grad_norm": 0.5716235637664795, "learning_rate": 0.0002, "epoch": 2.627450980392157, "step": 4020}, {"loss": 1.3465, "grad_norm": 0.8049232363700867, "learning_rate": 0.0002, "epoch": 2.633986928104575, "step": 4030}, {"loss": 1.2342, "grad_norm": 0.5574354529380798, "learning_rate": 0.0002, "epoch": 2.6405228758169934, "step": 4040}, {"loss": 1.2419, "grad_norm": 0.6302093863487244, "learning_rate": 0.0002, "epoch": 2.6470588235294117, "step": 4050}, {"loss": 1.2565, "grad_norm": 1.1868736743927002, "learning_rate": 0.0002, "epoch": 2.65359477124183, "step": 4060}, {"loss": 1.1382, "grad_norm": 0.6738120317459106, "learning_rate": 0.0002, "epoch": 2.6601307189542482, "step": 4070}, {"loss": 1.2456, "grad_norm": 0.6614423990249634, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 4080}, {"loss": 1.2958, "grad_norm": 0.7297604084014893, "learning_rate": 0.0002, "epoch": 2.6732026143790852, "step": 4090}, {"loss": 1.1596, "grad_norm": 0.9421682357788086, "learning_rate": 0.0002, "epoch": 2.6797385620915035, "step": 4100}, {"loss": 1.3002, "grad_norm": 0.5286222696304321, "learning_rate": 0.0002, "epoch": 2.686274509803922, "step": 4110}, {"loss": 1.3936, "grad_norm": 0.6849271655082703, "learning_rate": 0.0002, "epoch": 2.69281045751634, "step": 4120}, {"loss": 1.2721, "grad_norm": 0.6811320185661316, "learning_rate": 0.0002, "epoch": 2.6993464052287583, "step": 4130}, {"loss": 1.2897, "grad_norm": 0.4968419373035431, "learning_rate": 0.0002, "epoch": 2.7058823529411766, "step": 4140}, {"loss": 1.3322, "grad_norm": 0.8074267506599426, "learning_rate": 0.0002, "epoch": 2.712418300653595, "step": 4150}, {"loss": 1.1759, "grad_norm": 0.6756376028060913, "learning_rate": 0.0002, "epoch": 2.718954248366013, "step": 4160}, {"loss": 1.2444, "grad_norm": 0.6921583414077759, "learning_rate": 0.0002, "epoch": 2.7254901960784315, "step": 4170}, {"loss": 1.3413, "grad_norm": 0.7049834132194519, "learning_rate": 0.0002, "epoch": 2.7320261437908497, "step": 4180}, {"loss": 1.1965, "grad_norm": 0.7011390328407288, "learning_rate": 0.0002, "epoch": 2.738562091503268, "step": 4190}, {"loss": 1.2364, "grad_norm": 0.6977843642234802, "learning_rate": 0.0002, "epoch": 2.7450980392156863, "step": 4200}, {"loss": 1.2533, "grad_norm": 0.6717000603675842, "learning_rate": 0.0002, "epoch": 2.7516339869281046, "step": 4210}, {"loss": 1.392, "grad_norm": 1.0223724842071533, "learning_rate": 0.0002, "epoch": 2.758169934640523, "step": 4220}, {"loss": 1.2451, "grad_norm": 0.6573330760002136, "learning_rate": 0.0002, "epoch": 2.764705882352941, "step": 4230}, {"loss": 1.4219, "grad_norm": 0.6684938073158264, "learning_rate": 0.0002, "epoch": 2.7712418300653594, "step": 4240}, {"loss": 1.2505, "grad_norm": 0.7426793575286865, "learning_rate": 0.0002, "epoch": 2.7777777777777777, "step": 4250}, {"loss": 1.2904, "grad_norm": 0.557826578617096, "learning_rate": 0.0002, "epoch": 2.784313725490196, "step": 4260}, {"loss": 1.3262, "grad_norm": 0.6669870018959045, "learning_rate": 0.0002, "epoch": 2.7908496732026142, "step": 4270}, {"loss": 1.2369, "grad_norm": 0.5349969267845154, "learning_rate": 0.0002, "epoch": 2.7973856209150325, "step": 4280}, {"loss": 1.3769, "grad_norm": 0.7262802124023438, "learning_rate": 0.0002, "epoch": 2.803921568627451, "step": 4290}, {"loss": 1.3373, "grad_norm": 0.768211841583252, "learning_rate": 0.0002, "epoch": 2.810457516339869, "step": 4300}, {"loss": 1.2444, "grad_norm": 0.5958252549171448, "learning_rate": 0.0002, "epoch": 2.8169934640522873, "step": 4310}, {"loss": 1.4113, "grad_norm": 0.8451310396194458, "learning_rate": 0.0002, "epoch": 2.8235294117647056, "step": 4320}, {"loss": 1.2454, "grad_norm": 0.6544435024261475, "learning_rate": 0.0002, "epoch": 2.8300653594771243, "step": 4330}, {"loss": 1.2777, "grad_norm": 0.6177433133125305, "learning_rate": 0.0002, "epoch": 2.8366013071895426, "step": 4340}, {"loss": 1.2562, "grad_norm": 0.6324988007545471, "learning_rate": 0.0002, "epoch": 2.843137254901961, "step": 4350}, {"loss": 1.4117, "grad_norm": 0.6884300708770752, "learning_rate": 0.0002, "epoch": 2.849673202614379, "step": 4360}, {"loss": 1.2391, "grad_norm": 0.8952897191047668, "learning_rate": 0.0002, "epoch": 2.8562091503267975, "step": 4370}, {"loss": 1.2814, "grad_norm": 1.0260103940963745, "learning_rate": 0.0002, "epoch": 2.8627450980392157, "step": 4380}, {"loss": 1.2893, "grad_norm": 0.9134647250175476, "learning_rate": 0.0002, "epoch": 2.869281045751634, "step": 4390}, {"loss": 1.171, "grad_norm": 0.5637717843055725, "learning_rate": 0.0002, "epoch": 2.8758169934640523, "step": 4400}, {"loss": 1.3422, "grad_norm": 0.7530393004417419, "learning_rate": 0.0002, "epoch": 2.8823529411764706, "step": 4410}, {"loss": 1.29, "grad_norm": 0.7202680706977844, "learning_rate": 0.0002, "epoch": 2.888888888888889, "step": 4420}, {"loss": 1.2913, "grad_norm": 0.7177144885063171, "learning_rate": 0.0002, "epoch": 2.895424836601307, "step": 4430}, {"loss": 1.1922, "grad_norm": 0.5996816754341125, "learning_rate": 0.0002, "epoch": 2.9019607843137254, "step": 4440}, {"loss": 1.4816, "grad_norm": 0.6542447209358215, "learning_rate": 0.0002, "epoch": 2.9084967320261437, "step": 4450}, {"loss": 1.503, "grad_norm": 1.0753740072250366, "learning_rate": 0.0002, "epoch": 2.915032679738562, "step": 4460}, {"loss": 1.3193, "grad_norm": 0.6956136226654053, "learning_rate": 0.0002, "epoch": 2.9215686274509802, "step": 4470}, {"loss": 1.2486, "grad_norm": 0.7702530026435852, "learning_rate": 0.0002, "epoch": 2.928104575163399, "step": 4480}, {"loss": 1.3371, "grad_norm": 0.7763232588768005, "learning_rate": 0.0002, "epoch": 2.9346405228758172, "step": 4490}, {"loss": 1.1647, "grad_norm": 0.6393085718154907, "learning_rate": 0.0002, "epoch": 2.9411764705882355, "step": 4500}, {"loss": 1.211, "grad_norm": 0.987770676612854, "learning_rate": 0.0002, "epoch": 2.947712418300654, "step": 4510}, {"loss": 1.1529, "grad_norm": 0.5995016098022461, "learning_rate": 0.0002, "epoch": 2.954248366013072, "step": 4520}, {"loss": 1.2358, "grad_norm": 0.745650053024292, "learning_rate": 0.0002, "epoch": 2.9607843137254903, "step": 4530}, {"loss": 1.2115, "grad_norm": 0.7429282069206238, "learning_rate": 0.0002, "epoch": 2.9673202614379086, "step": 4540}, {"loss": 1.2262, "grad_norm": 0.5927486419677734, "learning_rate": 0.0002, "epoch": 2.973856209150327, "step": 4550}, {"loss": 1.3173, "grad_norm": 0.6775153875350952, "learning_rate": 0.0002, "epoch": 2.980392156862745, "step": 4560}, {"loss": 1.279, "grad_norm": 0.7128435373306274, "learning_rate": 0.0002, "epoch": 2.9869281045751634, "step": 4570}, {"loss": 1.2451, "grad_norm": 0.7470937967300415, "learning_rate": 0.0002, "epoch": 2.9934640522875817, "step": 4580}, {"loss": 1.2701, "grad_norm": 0.9295375943183899, "learning_rate": 0.0002, "epoch": 3.0, "step": 4590}, {"eval_loss": 1.4131312370300293, "eval_runtime": 31.8967, "eval_samples_per_second": 13.669, "eval_steps_per_second": 1.724, "epoch": 3.0, "step": 4590}, {"loss": 1.1283, "grad_norm": 0.6926420331001282, "learning_rate": 0.0002, "epoch": 3.0065359477124183, "step": 4600}, {"loss": 1.1537, "grad_norm": 0.6656355857849121, "learning_rate": 0.0002, "epoch": 3.0130718954248366, "step": 4610}, {"loss": 1.308, "grad_norm": 0.9901936650276184, "learning_rate": 0.0002, "epoch": 3.019607843137255, "step": 4620}, {"loss": 1.22, "grad_norm": 0.6713474988937378, "learning_rate": 0.0002, "epoch": 3.026143790849673, "step": 4630}, {"loss": 1.2249, "grad_norm": 0.6199324131011963, "learning_rate": 0.0002, "epoch": 3.0326797385620914, "step": 4640}, {"loss": 1.242, "grad_norm": 0.7180785536766052, "learning_rate": 0.0002, "epoch": 3.0392156862745097, "step": 4650}, {"loss": 1.1349, "grad_norm": 0.8256588578224182, "learning_rate": 0.0002, "epoch": 3.045751633986928, "step": 4660}, {"loss": 1.1431, "grad_norm": 0.6637389063835144, "learning_rate": 0.0002, "epoch": 3.052287581699346, "step": 4670}, {"loss": 1.1096, "grad_norm": 0.6980698108673096, "learning_rate": 0.0002, "epoch": 3.0588235294117645, "step": 4680}, {"loss": 1.196, "grad_norm": 0.8091534972190857, "learning_rate": 0.0002, "epoch": 3.065359477124183, "step": 4690}, {"loss": 1.1652, "grad_norm": 0.5715174078941345, "learning_rate": 0.0002, "epoch": 3.0718954248366015, "step": 4700}, {"loss": 1.1427, "grad_norm": 0.735639750957489, "learning_rate": 0.0002, "epoch": 3.0784313725490198, "step": 4710}, {"loss": 1.1522, "grad_norm": 0.7619708180427551, "learning_rate": 0.0002, "epoch": 3.084967320261438, "step": 4720}, {"loss": 1.0853, "grad_norm": 1.263566017150879, "learning_rate": 0.0002, "epoch": 3.0915032679738563, "step": 4730}, {"loss": 1.1348, "grad_norm": 0.6600871682167053, "learning_rate": 0.0002, "epoch": 3.0980392156862746, "step": 4740}, {"loss": 1.1766, "grad_norm": 0.717792809009552, "learning_rate": 0.0002, "epoch": 3.104575163398693, "step": 4750}, {"loss": 1.088, "grad_norm": 0.853714644908905, "learning_rate": 0.0002, "epoch": 3.111111111111111, "step": 4760}, {"loss": 1.2031, "grad_norm": 1.1004153490066528, "learning_rate": 0.0002, "epoch": 3.1176470588235294, "step": 4770}, {"loss": 1.3295, "grad_norm": 0.8566235899925232, "learning_rate": 0.0002, "epoch": 3.1241830065359477, "step": 4780}, {"loss": 1.2436, "grad_norm": 0.8315296173095703, "learning_rate": 0.0002, "epoch": 3.130718954248366, "step": 4790}, {"loss": 1.32, "grad_norm": 0.8020524978637695, "learning_rate": 0.0002, "epoch": 3.1372549019607843, "step": 4800}, {"loss": 1.1238, "grad_norm": 0.7564275860786438, "learning_rate": 0.0002, "epoch": 3.1437908496732025, "step": 4810}, {"loss": 1.1244, "grad_norm": 0.9077776670455933, "learning_rate": 0.0002, "epoch": 3.150326797385621, "step": 4820}, {"loss": 1.1399, "grad_norm": 0.6323099732398987, "learning_rate": 0.0002, "epoch": 3.156862745098039, "step": 4830}, {"loss": 1.1983, "grad_norm": 0.6625368595123291, "learning_rate": 0.0002, "epoch": 3.1633986928104574, "step": 4840}, {"loss": 1.066, "grad_norm": 0.8119261860847473, "learning_rate": 0.0002, "epoch": 3.1699346405228757, "step": 4850}, {"loss": 1.0224, "grad_norm": 0.6399450898170471, "learning_rate": 0.0002, "epoch": 3.176470588235294, "step": 4860}, {"loss": 1.2181, "grad_norm": 1.0659016370773315, "learning_rate": 0.0002, "epoch": 3.183006535947712, "step": 4870}, {"loss": 1.2914, "grad_norm": 0.8040369749069214, "learning_rate": 0.0002, "epoch": 3.189542483660131, "step": 4880}, {"loss": 1.1996, "grad_norm": 0.7784733176231384, "learning_rate": 0.0002, "epoch": 3.196078431372549, "step": 4890}, {"loss": 1.2051, "grad_norm": 0.9660294651985168, "learning_rate": 0.0002, "epoch": 3.2026143790849675, "step": 4900}, {"loss": 1.0419, "grad_norm": 1.0676977634429932, "learning_rate": 0.0002, "epoch": 3.2091503267973858, "step": 4910}, {"loss": 1.0083, "grad_norm": 0.5877565741539001, "learning_rate": 0.0002, "epoch": 3.215686274509804, "step": 4920}, {"loss": 1.1046, "grad_norm": 0.6164032816886902, "learning_rate": 0.0002, "epoch": 3.2222222222222223, "step": 4930}, {"loss": 1.1079, "grad_norm": 0.7627606987953186, "learning_rate": 0.0002, "epoch": 3.2287581699346406, "step": 4940}, {"loss": 1.2453, "grad_norm": 0.7442803978919983, "learning_rate": 0.0002, "epoch": 3.235294117647059, "step": 4950}, {"loss": 1.1087, "grad_norm": 0.7277812361717224, "learning_rate": 0.0002, "epoch": 3.241830065359477, "step": 4960}, {"loss": 1.2237, "grad_norm": 1.0301902294158936, "learning_rate": 0.0002, "epoch": 3.2483660130718954, "step": 4970}, {"loss": 1.1466, "grad_norm": 0.7798232436180115, "learning_rate": 0.0002, "epoch": 3.2549019607843137, "step": 4980}, {"loss": 1.2142, "grad_norm": 1.210265874862671, "learning_rate": 0.0002, "epoch": 3.261437908496732, "step": 4990}, {"loss": 1.1557, "grad_norm": 0.6677713990211487, "learning_rate": 0.0002, "epoch": 3.2679738562091503, "step": 5000}, {"loss": 1.3294, "grad_norm": 1.0524500608444214, "learning_rate": 0.0002, "epoch": 3.2745098039215685, "step": 5010}, {"loss": 1.1939, "grad_norm": 0.7091745734214783, "learning_rate": 0.0002, "epoch": 3.281045751633987, "step": 5020}, {"loss": 1.1891, "grad_norm": 0.8523224592208862, "learning_rate": 0.0002, "epoch": 3.287581699346405, "step": 5030}, {"loss": 1.1925, "grad_norm": 0.6120608448982239, "learning_rate": 0.0002, "epoch": 3.2941176470588234, "step": 5040}, {"loss": 1.0603, "grad_norm": 0.7437472939491272, "learning_rate": 0.0002, "epoch": 3.3006535947712417, "step": 5050}, {"loss": 1.1295, "grad_norm": 0.7611715197563171, "learning_rate": 0.0002, "epoch": 3.30718954248366, "step": 5060}, {"loss": 1.0531, "grad_norm": 0.7249704003334045, "learning_rate": 0.0002, "epoch": 3.313725490196078, "step": 5070}, {"loss": 1.2292, "grad_norm": 0.7316247820854187, "learning_rate": 0.0002, "epoch": 3.3202614379084965, "step": 5080}, {"loss": 1.1974, "grad_norm": 0.562412440776825, "learning_rate": 0.0002, "epoch": 3.326797385620915, "step": 5090}, {"loss": 1.0736, "grad_norm": 0.7052176594734192, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 5100}, {"loss": 1.122, "grad_norm": 0.7714211344718933, "learning_rate": 0.0002, "epoch": 3.3398692810457518, "step": 5110}, {"loss": 1.1684, "grad_norm": 1.0436055660247803, "learning_rate": 0.0002, "epoch": 3.34640522875817, "step": 5120}, {"loss": 1.0945, "grad_norm": 0.8867271542549133, "learning_rate": 0.0002, "epoch": 3.3529411764705883, "step": 5130}, {"loss": 1.159, "grad_norm": 0.8371267914772034, "learning_rate": 0.0002, "epoch": 3.3594771241830066, "step": 5140}, {"loss": 1.1073, "grad_norm": 0.7257837057113647, "learning_rate": 0.0002, "epoch": 3.366013071895425, "step": 5150}, {"loss": 1.1162, "grad_norm": 0.7102002501487732, "learning_rate": 0.0002, "epoch": 3.372549019607843, "step": 5160}, {"loss": 1.2056, "grad_norm": 0.7636350393295288, "learning_rate": 0.0002, "epoch": 3.3790849673202614, "step": 5170}, {"loss": 1.0708, "grad_norm": 0.6887359619140625, "learning_rate": 0.0002, "epoch": 3.3856209150326797, "step": 5180}, {"loss": 1.3807, "grad_norm": 0.8141424655914307, "learning_rate": 0.0002, "epoch": 3.392156862745098, "step": 5190}, {"loss": 1.1986, "grad_norm": 0.694423496723175, "learning_rate": 0.0002, "epoch": 3.3986928104575163, "step": 5200}, {"loss": 1.2945, "grad_norm": 0.914013683795929, "learning_rate": 0.0002, "epoch": 3.4052287581699345, "step": 5210}, {"loss": 1.1413, "grad_norm": 0.8503239750862122, "learning_rate": 0.0002, "epoch": 3.411764705882353, "step": 5220}, {"loss": 1.2696, "grad_norm": 0.6196836233139038, "learning_rate": 0.0002, "epoch": 3.418300653594771, "step": 5230}, {"loss": 1.2431, "grad_norm": 1.0760811567306519, "learning_rate": 0.0002, "epoch": 3.4248366013071894, "step": 5240}, {"loss": 1.1686, "grad_norm": 0.6524698138237, "learning_rate": 0.0002, "epoch": 3.431372549019608, "step": 5250}, {"loss": 1.2012, "grad_norm": 0.674467921257019, "learning_rate": 0.0002, "epoch": 3.4379084967320264, "step": 5260}, {"loss": 1.1015, "grad_norm": 0.7690372467041016, "learning_rate": 0.0002, "epoch": 3.4444444444444446, "step": 5270}, {"loss": 1.2511, "grad_norm": 0.8751813769340515, "learning_rate": 0.0002, "epoch": 3.450980392156863, "step": 5280}, {"loss": 1.1841, "grad_norm": 0.750407874584198, "learning_rate": 0.0002, "epoch": 3.457516339869281, "step": 5290}, {"loss": 1.0605, "grad_norm": 0.5991823077201843, "learning_rate": 0.0002, "epoch": 3.4640522875816995, "step": 5300}, {"loss": 1.2347, "grad_norm": 1.0164772272109985, "learning_rate": 0.0002, "epoch": 3.4705882352941178, "step": 5310}, {"loss": 1.2354, "grad_norm": 0.8704105019569397, "learning_rate": 0.0002, "epoch": 3.477124183006536, "step": 5320}, {"loss": 1.2169, "grad_norm": 0.709102213382721, "learning_rate": 0.0002, "epoch": 3.4836601307189543, "step": 5330}, {"loss": 1.2425, "grad_norm": 0.6273632049560547, "learning_rate": 0.0002, "epoch": 3.4901960784313726, "step": 5340}, {"loss": 1.1585, "grad_norm": 0.6807359457015991, "learning_rate": 0.0002, "epoch": 3.496732026143791, "step": 5350}, {"loss": 1.131, "grad_norm": 0.7085188627243042, "learning_rate": 0.0002, "epoch": 3.503267973856209, "step": 5360}, {"loss": 1.1159, "grad_norm": 0.6938307881355286, "learning_rate": 0.0002, "epoch": 3.5098039215686274, "step": 5370}, {"loss": 1.1397, "grad_norm": 0.8544146418571472, "learning_rate": 0.0002, "epoch": 3.5163398692810457, "step": 5380}, {"loss": 1.2181, "grad_norm": 0.7889642119407654, "learning_rate": 0.0002, "epoch": 3.522875816993464, "step": 5390}, {"loss": 1.1691, "grad_norm": 0.7858421206474304, "learning_rate": 0.0002, "epoch": 3.5294117647058822, "step": 5400}, {"loss": 1.2374, "grad_norm": 0.8547123074531555, "learning_rate": 0.0002, "epoch": 3.5359477124183005, "step": 5410}, {"loss": 1.196, "grad_norm": 0.8218181133270264, "learning_rate": 0.0002, "epoch": 3.542483660130719, "step": 5420}, {"loss": 1.1961, "grad_norm": 1.153623342514038, "learning_rate": 0.0002, "epoch": 3.549019607843137, "step": 5430}, {"loss": 1.156, "grad_norm": 1.1321099996566772, "learning_rate": 0.0002, "epoch": 3.5555555555555554, "step": 5440}, {"loss": 1.2224, "grad_norm": 0.9495334029197693, "learning_rate": 0.0002, "epoch": 3.5620915032679736, "step": 5450}, {"loss": 1.2869, "grad_norm": 0.8743821978569031, "learning_rate": 0.0002, "epoch": 3.568627450980392, "step": 5460}, {"loss": 1.1018, "grad_norm": 0.7513086795806885, "learning_rate": 0.0002, "epoch": 3.57516339869281, "step": 5470}, {"loss": 1.1082, "grad_norm": 1.0139480829238892, "learning_rate": 0.0002, "epoch": 3.581699346405229, "step": 5480}, {"loss": 1.1706, "grad_norm": 0.6615135073661804, "learning_rate": 0.0002, "epoch": 3.588235294117647, "step": 5490}, {"loss": 1.3906, "grad_norm": 1.180798888206482, "learning_rate": 0.0002, "epoch": 3.5947712418300655, "step": 5500}, {"loss": 1.2391, "grad_norm": 0.7085279226303101, "learning_rate": 0.0002, "epoch": 3.6013071895424837, "step": 5510}, {"loss": 1.1623, "grad_norm": 0.540268063545227, "learning_rate": 0.0002, "epoch": 3.607843137254902, "step": 5520}, {"loss": 1.2132, "grad_norm": 0.7905671000480652, "learning_rate": 0.0002, "epoch": 3.6143790849673203, "step": 5530}, {"loss": 1.2731, "grad_norm": 0.8457717299461365, "learning_rate": 0.0002, "epoch": 3.6209150326797386, "step": 5540}, {"loss": 1.1799, "grad_norm": 0.7102677822113037, "learning_rate": 0.0002, "epoch": 3.627450980392157, "step": 5550}, {"loss": 1.2394, "grad_norm": 0.7179514765739441, "learning_rate": 0.0002, "epoch": 3.633986928104575, "step": 5560}, {"loss": 1.2019, "grad_norm": 1.0854148864746094, "learning_rate": 0.0002, "epoch": 3.6405228758169934, "step": 5570}, {"loss": 1.1986, "grad_norm": 0.8209951519966125, "learning_rate": 0.0002, "epoch": 3.6470588235294117, "step": 5580}, {"loss": 1.2289, "grad_norm": 0.6944138407707214, "learning_rate": 0.0002, "epoch": 3.65359477124183, "step": 5590}, {"loss": 1.3226, "grad_norm": 0.7675473093986511, "learning_rate": 0.0002, "epoch": 3.6601307189542482, "step": 5600}, {"loss": 1.2866, "grad_norm": 0.6683364510536194, "learning_rate": 0.0002, "epoch": 3.6666666666666665, "step": 5610}, {"loss": 1.1099, "grad_norm": 0.7920727133750916, "learning_rate": 0.0002, "epoch": 3.6732026143790852, "step": 5620}, {"loss": 1.2287, "grad_norm": 0.9440218806266785, "learning_rate": 0.0002, "epoch": 3.6797385620915035, "step": 5630}, {"loss": 1.2444, "grad_norm": 0.6600824594497681, "learning_rate": 0.0002, "epoch": 3.686274509803922, "step": 5640}, {"loss": 1.191, "grad_norm": 0.6860619187355042, "learning_rate": 0.0002, "epoch": 3.69281045751634, "step": 5650}, {"loss": 1.1914, "grad_norm": 0.6579713225364685, "learning_rate": 0.0002, "epoch": 3.6993464052287583, "step": 5660}, {"loss": 1.1464, "grad_norm": 0.661081075668335, "learning_rate": 0.0002, "epoch": 3.7058823529411766, "step": 5670}, {"loss": 1.289, "grad_norm": 1.0968825817108154, "learning_rate": 0.0002, "epoch": 3.712418300653595, "step": 5680}, {"loss": 1.192, "grad_norm": 0.8066844940185547, "learning_rate": 0.0002, "epoch": 3.718954248366013, "step": 5690}, {"loss": 1.2322, "grad_norm": 0.8341682553291321, "learning_rate": 0.0002, "epoch": 3.7254901960784315, "step": 5700}, {"loss": 1.1473, "grad_norm": 0.6682852506637573, "learning_rate": 0.0002, "epoch": 3.7320261437908497, "step": 5710}, {"loss": 1.1566, "grad_norm": 0.898595929145813, "learning_rate": 0.0002, "epoch": 3.738562091503268, "step": 5720}, {"loss": 1.0919, "grad_norm": 0.6876054406166077, "learning_rate": 0.0002, "epoch": 3.7450980392156863, "step": 5730}, {"loss": 1.2302, "grad_norm": 0.7817103266716003, "learning_rate": 0.0002, "epoch": 3.7516339869281046, "step": 5740}, {"loss": 1.2439, "grad_norm": 0.5840168595314026, "learning_rate": 0.0002, "epoch": 3.758169934640523, "step": 5750}, {"loss": 1.1279, "grad_norm": 0.6263918876647949, "learning_rate": 0.0002, "epoch": 3.764705882352941, "step": 5760}, {"loss": 1.2023, "grad_norm": 0.7948952317237854, "learning_rate": 0.0002, "epoch": 3.7712418300653594, "step": 5770}, {"loss": 1.149, "grad_norm": 0.6700998544692993, "learning_rate": 0.0002, "epoch": 3.7777777777777777, "step": 5780}, {"loss": 1.3207, "grad_norm": 1.1169519424438477, "learning_rate": 0.0002, "epoch": 3.784313725490196, "step": 5790}, {"loss": 1.064, "grad_norm": 0.8354471325874329, "learning_rate": 0.0002, "epoch": 3.7908496732026142, "step": 5800}, {"loss": 1.2104, "grad_norm": 0.6304181814193726, "learning_rate": 0.0002, "epoch": 3.7973856209150325, "step": 5810}, {"loss": 1.2059, "grad_norm": 0.6919655799865723, "learning_rate": 0.0002, "epoch": 3.803921568627451, "step": 5820}, {"loss": 1.217, "grad_norm": 0.600385844707489, "learning_rate": 0.0002, "epoch": 3.810457516339869, "step": 5830}, {"loss": 1.2324, "grad_norm": 0.8406319618225098, "learning_rate": 0.0002, "epoch": 3.8169934640522873, "step": 5840}, {"loss": 1.2418, "grad_norm": 0.7594282031059265, "learning_rate": 0.0002, "epoch": 3.8235294117647056, "step": 5850}, {"loss": 1.1903, "grad_norm": 0.8179879784584045, "learning_rate": 0.0002, "epoch": 3.8300653594771243, "step": 5860}, {"loss": 1.255, "grad_norm": 1.141430377960205, "learning_rate": 0.0002, "epoch": 3.8366013071895426, "step": 5870}, {"loss": 1.1467, "grad_norm": 0.6595550775527954, "learning_rate": 0.0002, "epoch": 3.843137254901961, "step": 5880}, {"loss": 1.2378, "grad_norm": 0.7499435544013977, "learning_rate": 0.0002, "epoch": 3.849673202614379, "step": 5890}, {"loss": 1.217, "grad_norm": 0.7851517200469971, "learning_rate": 0.0002, "epoch": 3.8562091503267975, "step": 5900}, {"loss": 1.162, "grad_norm": 1.0533545017242432, "learning_rate": 0.0002, "epoch": 3.8627450980392157, "step": 5910}, {"loss": 1.3576, "grad_norm": 0.960086464881897, "learning_rate": 0.0002, "epoch": 3.869281045751634, "step": 5920}, {"loss": 1.151, "grad_norm": 0.9952049851417542, "learning_rate": 0.0002, "epoch": 3.8758169934640523, "step": 5930}, {"loss": 1.2027, "grad_norm": 0.7884191274642944, "learning_rate": 0.0002, "epoch": 3.8823529411764706, "step": 5940}, {"loss": 1.1796, "grad_norm": 0.7461766600608826, "learning_rate": 0.0002, "epoch": 3.888888888888889, "step": 5950}, {"loss": 1.2251, "grad_norm": 0.9594355821609497, "learning_rate": 0.0002, "epoch": 3.895424836601307, "step": 5960}, {"loss": 1.1164, "grad_norm": 0.8179471492767334, "learning_rate": 0.0002, "epoch": 3.9019607843137254, "step": 5970}, {"loss": 1.2421, "grad_norm": 0.8240267634391785, "learning_rate": 0.0002, "epoch": 3.9084967320261437, "step": 5980}, {"loss": 1.3076, "grad_norm": 0.7462618350982666, "learning_rate": 0.0002, "epoch": 3.915032679738562, "step": 5990}, {"loss": 1.2124, "grad_norm": 0.711207389831543, "learning_rate": 0.0002, "epoch": 3.9215686274509802, "step": 6000}, {"loss": 1.2119, "grad_norm": 0.6910956501960754, "learning_rate": 0.0002, "epoch": 3.928104575163399, "step": 6010}, {"loss": 1.2127, "grad_norm": 0.749093770980835, "learning_rate": 0.0002, "epoch": 3.9346405228758172, "step": 6020}, {"loss": 1.1542, "grad_norm": 1.3332762718200684, "learning_rate": 0.0002, "epoch": 3.9411764705882355, "step": 6030}, {"loss": 1.1442, "grad_norm": 0.71457439661026, "learning_rate": 0.0002, "epoch": 3.947712418300654, "step": 6040}, {"loss": 1.339, "grad_norm": 1.1205238103866577, "learning_rate": 0.0002, "epoch": 3.954248366013072, "step": 6050}, {"loss": 1.2962, "grad_norm": 0.6958928108215332, "learning_rate": 0.0002, "epoch": 3.9607843137254903, "step": 6060}, {"loss": 1.1802, "grad_norm": 0.7518056035041809, "learning_rate": 0.0002, "epoch": 3.9673202614379086, "step": 6070}, {"loss": 1.1179, "grad_norm": 0.8010755777359009, "learning_rate": 0.0002, "epoch": 3.973856209150327, "step": 6080}, {"loss": 1.2867, "grad_norm": 0.7492658495903015, "learning_rate": 0.0002, "epoch": 3.980392156862745, "step": 6090}, {"loss": 1.2113, "grad_norm": 0.900704562664032, "learning_rate": 0.0002, "epoch": 3.9869281045751634, "step": 6100}, {"loss": 1.1106, "grad_norm": 0.7997331619262695, "learning_rate": 0.0002, "epoch": 3.9934640522875817, "step": 6110}, {"loss": 1.1244, "grad_norm": 0.7163209319114685, "learning_rate": 0.0002, "epoch": 4.0, "step": 6120}]} +{"epoch": 5.0, "step": 7650, "epoch_duration": 1809.9564287662506, "total_accumulated_duration": 8485.38308763504, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.1748046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7451, "grad_norm": 1.5105072259902954, "learning_rate": 0.0002, "epoch": 0.006535947712418301, "step": 10}, {"loss": 3.3158, "grad_norm": 2.1156165599823, "learning_rate": 0.0002, "epoch": 0.013071895424836602, "step": 20}, {"loss": 2.643, "grad_norm": 1.0578808784484863, "learning_rate": 0.0002, "epoch": 0.0196078431372549, "step": 30}, {"loss": 2.3948, "grad_norm": 2.725064516067505, "learning_rate": 0.0002, "epoch": 0.026143790849673203, "step": 40}, {"loss": 2.3134, "grad_norm": 2.9575750827789307, "learning_rate": 0.0002, "epoch": 0.032679738562091505, "step": 50}, {"loss": 2.2778, "grad_norm": 1.2158117294311523, "learning_rate": 0.0002, "epoch": 0.0392156862745098, "step": 60}, {"loss": 1.9742, "grad_norm": 1.0850954055786133, "learning_rate": 0.0002, "epoch": 0.0457516339869281, "step": 70}, {"loss": 1.8872, "grad_norm": 1.299196720123291, "learning_rate": 0.0002, "epoch": 0.05228758169934641, "step": 80}, {"loss": 1.947, "grad_norm": 0.8310191035270691, "learning_rate": 0.0002, "epoch": 0.058823529411764705, "step": 90}, {"loss": 1.9098, "grad_norm": 0.9854435920715332, "learning_rate": 0.0002, "epoch": 0.06535947712418301, "step": 100}, {"loss": 1.7508, "grad_norm": 0.7951157689094543, "learning_rate": 0.0002, "epoch": 0.0718954248366013, "step": 110}, {"loss": 1.9035, "grad_norm": 0.7593062520027161, "learning_rate": 0.0002, "epoch": 0.0784313725490196, "step": 120}, {"loss": 1.8517, "grad_norm": 0.6783032417297363, "learning_rate": 0.0002, "epoch": 0.08496732026143791, "step": 130}, {"loss": 1.6805, "grad_norm": 0.8350756764411926, "learning_rate": 0.0002, "epoch": 0.0915032679738562, "step": 140}, {"loss": 1.6123, "grad_norm": 1.0203173160552979, "learning_rate": 0.0002, "epoch": 0.09803921568627451, "step": 150}, {"loss": 1.7248, "grad_norm": 0.8820539712905884, "learning_rate": 0.0002, "epoch": 0.10457516339869281, "step": 160}, {"loss": 1.6762, "grad_norm": 0.7286128997802734, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 170}, {"loss": 1.8841, "grad_norm": 0.7874041795730591, "learning_rate": 0.0002, "epoch": 0.11764705882352941, "step": 180}, {"loss": 1.5656, "grad_norm": 0.6630475521087646, "learning_rate": 0.0002, "epoch": 0.12418300653594772, "step": 190}, {"loss": 1.6149, "grad_norm": 0.686413586139679, "learning_rate": 0.0002, "epoch": 0.13071895424836602, "step": 200}, {"loss": 1.6227, "grad_norm": 0.7793629765510559, "learning_rate": 0.0002, "epoch": 0.13725490196078433, "step": 210}, {"loss": 1.7223, "grad_norm": 0.6893141865730286, "learning_rate": 0.0002, "epoch": 0.1437908496732026, "step": 220}, {"loss": 1.6808, "grad_norm": 0.5804724097251892, "learning_rate": 0.0002, "epoch": 0.1503267973856209, "step": 230}, {"loss": 1.5578, "grad_norm": 0.6053574085235596, "learning_rate": 0.0002, "epoch": 0.1568627450980392, "step": 240}, {"loss": 1.7394, "grad_norm": 0.7566025853157043, "learning_rate": 0.0002, "epoch": 0.16339869281045752, "step": 250}, {"loss": 1.6216, "grad_norm": 0.6112990975379944, "learning_rate": 0.0002, "epoch": 0.16993464052287582, "step": 260}, {"loss": 1.5564, "grad_norm": 0.6839066743850708, "learning_rate": 0.0002, "epoch": 0.17647058823529413, "step": 270}, {"loss": 1.7129, "grad_norm": 0.6368117928504944, "learning_rate": 0.0002, "epoch": 0.1830065359477124, "step": 280}, {"loss": 1.5646, "grad_norm": 0.6144475936889648, "learning_rate": 0.0002, "epoch": 0.1895424836601307, "step": 290}, {"loss": 1.8383, "grad_norm": 0.6743767261505127, "learning_rate": 0.0002, "epoch": 0.19607843137254902, "step": 300}, {"loss": 1.421, "grad_norm": 0.6807955503463745, "learning_rate": 0.0002, "epoch": 0.20261437908496732, "step": 310}, {"loss": 1.5961, "grad_norm": 0.6717963814735413, "learning_rate": 0.0002, "epoch": 0.20915032679738563, "step": 320}, {"loss": 1.6842, "grad_norm": 0.5917780995368958, "learning_rate": 0.0002, "epoch": 0.21568627450980393, "step": 330}, {"loss": 1.6264, "grad_norm": 0.6783658862113953, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 340}, {"loss": 1.4635, "grad_norm": 0.5820256471633911, "learning_rate": 0.0002, "epoch": 0.22875816993464052, "step": 350}, {"loss": 1.6514, "grad_norm": 0.5345938801765442, "learning_rate": 0.0002, "epoch": 0.23529411764705882, "step": 360}, {"loss": 1.6441, "grad_norm": 0.755929172039032, "learning_rate": 0.0002, "epoch": 0.24183006535947713, "step": 370}, {"loss": 1.5177, "grad_norm": 0.6183189749717712, "learning_rate": 0.0002, "epoch": 0.24836601307189543, "step": 380}, {"loss": 1.5935, "grad_norm": 0.7277782559394836, "learning_rate": 0.0002, "epoch": 0.2549019607843137, "step": 390}, {"loss": 1.6957, "grad_norm": 0.9998756051063538, "learning_rate": 0.0002, "epoch": 0.26143790849673204, "step": 400}, {"loss": 1.5738, "grad_norm": 0.7523853778839111, "learning_rate": 0.0002, "epoch": 0.2679738562091503, "step": 410}, {"loss": 1.5649, "grad_norm": 0.6548714637756348, "learning_rate": 0.0002, "epoch": 0.27450980392156865, "step": 420}, {"loss": 1.4564, "grad_norm": 0.6979796290397644, "learning_rate": 0.0002, "epoch": 0.28104575163398693, "step": 430}, {"loss": 1.5927, "grad_norm": 0.840915322303772, "learning_rate": 0.0002, "epoch": 0.2875816993464052, "step": 440}, {"loss": 1.5199, "grad_norm": 0.6142978072166443, "learning_rate": 0.0002, "epoch": 0.29411764705882354, "step": 450}, {"loss": 1.4903, "grad_norm": 0.9482691884040833, "learning_rate": 0.0002, "epoch": 0.3006535947712418, "step": 460}, {"loss": 1.6553, "grad_norm": 0.7001156806945801, "learning_rate": 0.0002, "epoch": 0.30718954248366015, "step": 470}, {"loss": 1.5957, "grad_norm": 0.6665455102920532, "learning_rate": 0.0002, "epoch": 0.3137254901960784, "step": 480}, {"loss": 1.587, "grad_norm": 0.6012697815895081, "learning_rate": 0.0002, "epoch": 0.3202614379084967, "step": 490}, {"loss": 1.4468, "grad_norm": 0.8770062327384949, "learning_rate": 0.0002, "epoch": 0.32679738562091504, "step": 500}, {"loss": 1.3558, "grad_norm": 0.7029962539672852, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 510}, {"loss": 1.4435, "grad_norm": 0.6682832837104797, "learning_rate": 0.0002, "epoch": 0.33986928104575165, "step": 520}, {"loss": 1.4242, "grad_norm": 0.5548969507217407, "learning_rate": 0.0002, "epoch": 0.3464052287581699, "step": 530}, {"loss": 1.5081, "grad_norm": 0.6640702486038208, "learning_rate": 0.0002, "epoch": 0.35294117647058826, "step": 540}, {"loss": 1.4998, "grad_norm": 0.656292200088501, "learning_rate": 0.0002, "epoch": 0.35947712418300654, "step": 550}, {"loss": 1.5415, "grad_norm": 0.618910551071167, "learning_rate": 0.0002, "epoch": 0.3660130718954248, "step": 560}, {"loss": 1.5178, "grad_norm": 0.644859790802002, "learning_rate": 0.0002, "epoch": 0.37254901960784315, "step": 570}, {"loss": 1.645, "grad_norm": 0.679042398929596, "learning_rate": 0.0002, "epoch": 0.3790849673202614, "step": 580}, {"loss": 1.5193, "grad_norm": 0.980681836605072, "learning_rate": 0.0002, "epoch": 0.38562091503267976, "step": 590}, {"loss": 1.4262, "grad_norm": 0.632219672203064, "learning_rate": 0.0002, "epoch": 0.39215686274509803, "step": 600}, {"loss": 1.5533, "grad_norm": 0.7003744840621948, "learning_rate": 0.0002, "epoch": 0.39869281045751637, "step": 610}, {"loss": 1.7747, "grad_norm": 0.7090577483177185, "learning_rate": 0.0002, "epoch": 0.40522875816993464, "step": 620}, {"loss": 1.7506, "grad_norm": 0.657819926738739, "learning_rate": 0.0002, "epoch": 0.4117647058823529, "step": 630}, {"loss": 1.621, "grad_norm": 0.7034208178520203, "learning_rate": 0.0002, "epoch": 0.41830065359477125, "step": 640}, {"loss": 1.5357, "grad_norm": 0.7274866104125977, "learning_rate": 0.0002, "epoch": 0.42483660130718953, "step": 650}, {"loss": 1.6304, "grad_norm": 0.5876233577728271, "learning_rate": 0.0002, "epoch": 0.43137254901960786, "step": 660}, {"loss": 1.7683, "grad_norm": 0.595494270324707, "learning_rate": 0.0002, "epoch": 0.43790849673202614, "step": 670}, {"loss": 1.5117, "grad_norm": 0.8253804445266724, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 680}, {"loss": 1.5199, "grad_norm": 0.652225911617279, "learning_rate": 0.0002, "epoch": 0.45098039215686275, "step": 690}, {"loss": 1.5419, "grad_norm": 0.6242014169692993, "learning_rate": 0.0002, "epoch": 0.45751633986928103, "step": 700}, {"loss": 1.53, "grad_norm": 0.7283986210823059, "learning_rate": 0.0002, "epoch": 0.46405228758169936, "step": 710}, {"loss": 1.43, "grad_norm": 0.7016081213951111, "learning_rate": 0.0002, "epoch": 0.47058823529411764, "step": 720}, {"loss": 1.4626, "grad_norm": 0.5211893916130066, "learning_rate": 0.0002, "epoch": 0.477124183006536, "step": 730}, {"loss": 1.6885, "grad_norm": 0.6221150159835815, "learning_rate": 0.0002, "epoch": 0.48366013071895425, "step": 740}, {"loss": 1.5677, "grad_norm": 0.76594477891922, "learning_rate": 0.0002, "epoch": 0.49019607843137253, "step": 750}, {"loss": 1.4982, "grad_norm": 0.5777859091758728, "learning_rate": 0.0002, "epoch": 0.49673202614379086, "step": 760}, {"loss": 1.5253, "grad_norm": 0.5793519616127014, "learning_rate": 0.0002, "epoch": 0.5032679738562091, "step": 770}, {"loss": 1.3562, "grad_norm": 0.5425786375999451, "learning_rate": 0.0002, "epoch": 0.5098039215686274, "step": 780}, {"loss": 1.3398, "grad_norm": 0.6004197001457214, "learning_rate": 0.0002, "epoch": 0.5163398692810458, "step": 790}, {"loss": 1.5346, "grad_norm": 0.7167016863822937, "learning_rate": 0.0002, "epoch": 0.5228758169934641, "step": 800}, {"loss": 1.48, "grad_norm": 0.710218071937561, "learning_rate": 0.0002, "epoch": 0.5294117647058824, "step": 810}, {"loss": 1.3943, "grad_norm": 0.699528694152832, "learning_rate": 0.0002, "epoch": 0.5359477124183006, "step": 820}, {"loss": 1.6014, "grad_norm": 0.579629123210907, "learning_rate": 0.0002, "epoch": 0.5424836601307189, "step": 830}, {"loss": 1.3894, "grad_norm": 0.595407247543335, "learning_rate": 0.0002, "epoch": 0.5490196078431373, "step": 840}, {"loss": 1.6394, "grad_norm": 0.544563889503479, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 850}, {"loss": 1.4692, "grad_norm": 0.553166389465332, "learning_rate": 0.0002, "epoch": 0.5620915032679739, "step": 860}, {"loss": 1.5155, "grad_norm": 0.5645018815994263, "learning_rate": 0.0002, "epoch": 0.5686274509803921, "step": 870}, {"loss": 1.7019, "grad_norm": 0.6576932668685913, "learning_rate": 0.0002, "epoch": 0.5751633986928104, "step": 880}, {"loss": 1.5891, "grad_norm": 0.6684197187423706, "learning_rate": 0.0002, "epoch": 0.5816993464052288, "step": 890}, {"loss": 1.5348, "grad_norm": 0.6706975698471069, "learning_rate": 0.0002, "epoch": 0.5882352941176471, "step": 900}, {"loss": 1.4038, "grad_norm": 0.6762327551841736, "learning_rate": 0.0002, "epoch": 0.5947712418300654, "step": 910}, {"loss": 1.61, "grad_norm": 0.764032244682312, "learning_rate": 0.0002, "epoch": 0.6013071895424836, "step": 920}, {"loss": 1.436, "grad_norm": 0.6996400952339172, "learning_rate": 0.0002, "epoch": 0.6078431372549019, "step": 930}, {"loss": 1.6038, "grad_norm": 0.686735987663269, "learning_rate": 0.0002, "epoch": 0.6143790849673203, "step": 940}, {"loss": 1.5194, "grad_norm": 0.6086131930351257, "learning_rate": 0.0002, "epoch": 0.6209150326797386, "step": 950}, {"loss": 1.4457, "grad_norm": 0.5627856850624084, "learning_rate": 0.0002, "epoch": 0.6274509803921569, "step": 960}, {"loss": 1.506, "grad_norm": 0.5781503319740295, "learning_rate": 0.0002, "epoch": 0.6339869281045751, "step": 970}, {"loss": 1.5668, "grad_norm": 0.6347246766090393, "learning_rate": 0.0002, "epoch": 0.6405228758169934, "step": 980}, {"loss": 1.3819, "grad_norm": 0.6581300497055054, "learning_rate": 0.0002, "epoch": 0.6470588235294118, "step": 990}, {"loss": 1.6425, "grad_norm": 0.8343676924705505, "learning_rate": 0.0002, "epoch": 0.6535947712418301, "step": 1000}, {"loss": 1.5188, "grad_norm": 0.5708910226821899, "learning_rate": 0.0002, "epoch": 0.6601307189542484, "step": 1010}, {"loss": 1.3882, "grad_norm": 0.6832585334777832, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 1020}, {"loss": 1.645, "grad_norm": 0.5767837166786194, "learning_rate": 0.0002, "epoch": 0.673202614379085, "step": 1030}, {"loss": 1.4206, "grad_norm": 0.5637745261192322, "learning_rate": 0.0002, "epoch": 0.6797385620915033, "step": 1040}, {"loss": 1.4325, "grad_norm": 0.8193050026893616, "learning_rate": 0.0002, "epoch": 0.6862745098039216, "step": 1050}, {"loss": 1.4196, "grad_norm": 0.6157439351081848, "learning_rate": 0.0002, "epoch": 0.6928104575163399, "step": 1060}, {"loss": 1.5547, "grad_norm": 0.7476664781570435, "learning_rate": 0.0002, "epoch": 0.6993464052287581, "step": 1070}, {"loss": 1.5337, "grad_norm": 0.8569361567497253, "learning_rate": 0.0002, "epoch": 0.7058823529411765, "step": 1080}, {"loss": 1.482, "grad_norm": 0.5671911835670471, "learning_rate": 0.0002, "epoch": 0.7124183006535948, "step": 1090}, {"loss": 1.5398, "grad_norm": 0.5151128768920898, "learning_rate": 0.0002, "epoch": 0.7189542483660131, "step": 1100}, {"loss": 1.4848, "grad_norm": 0.568037211894989, "learning_rate": 0.0002, "epoch": 0.7254901960784313, "step": 1110}, {"loss": 1.4708, "grad_norm": 0.6756396889686584, "learning_rate": 0.0002, "epoch": 0.7320261437908496, "step": 1120}, {"loss": 1.4017, "grad_norm": 0.638975977897644, "learning_rate": 0.0002, "epoch": 0.738562091503268, "step": 1130}, {"loss": 1.6028, "grad_norm": 0.7103341221809387, "learning_rate": 0.0002, "epoch": 0.7450980392156863, "step": 1140}, {"loss": 1.3766, "grad_norm": 0.7403952479362488, "learning_rate": 0.0002, "epoch": 0.7516339869281046, "step": 1150}, {"loss": 1.4757, "grad_norm": 0.6266511082649231, "learning_rate": 0.0002, "epoch": 0.7581699346405228, "step": 1160}, {"loss": 1.4468, "grad_norm": 0.5939070582389832, "learning_rate": 0.0002, "epoch": 0.7647058823529411, "step": 1170}, {"loss": 1.4145, "grad_norm": 0.5735430717468262, "learning_rate": 0.0002, "epoch": 0.7712418300653595, "step": 1180}, {"loss": 1.3891, "grad_norm": 0.5155234932899475, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 1190}, {"loss": 1.4942, "grad_norm": 0.5115423202514648, "learning_rate": 0.0002, "epoch": 0.7843137254901961, "step": 1200}, {"loss": 1.4508, "grad_norm": 0.693588137626648, "learning_rate": 0.0002, "epoch": 0.7908496732026143, "step": 1210}, {"loss": 1.308, "grad_norm": 0.5504693984985352, "learning_rate": 0.0002, "epoch": 0.7973856209150327, "step": 1220}, {"loss": 1.5412, "grad_norm": 0.5555992126464844, "learning_rate": 0.0002, "epoch": 0.803921568627451, "step": 1230}, {"loss": 1.5506, "grad_norm": 0.7211785316467285, "learning_rate": 0.0002, "epoch": 0.8104575163398693, "step": 1240}, {"loss": 1.6163, "grad_norm": 0.735003650188446, "learning_rate": 0.0002, "epoch": 0.8169934640522876, "step": 1250}, {"loss": 1.5836, "grad_norm": 0.5245152711868286, "learning_rate": 0.0002, "epoch": 0.8235294117647058, "step": 1260}, {"loss": 1.4505, "grad_norm": 0.5883445739746094, "learning_rate": 0.0002, "epoch": 0.8300653594771242, "step": 1270}, {"loss": 1.3642, "grad_norm": 0.6835859417915344, "learning_rate": 0.0002, "epoch": 0.8366013071895425, "step": 1280}, {"loss": 1.5526, "grad_norm": 0.6592142581939697, "learning_rate": 0.0002, "epoch": 0.8431372549019608, "step": 1290}, {"loss": 1.52, "grad_norm": 0.6087474226951599, "learning_rate": 0.0002, "epoch": 0.8496732026143791, "step": 1300}, {"loss": 1.3807, "grad_norm": 0.565387487411499, "learning_rate": 0.0002, "epoch": 0.8562091503267973, "step": 1310}, {"loss": 1.4809, "grad_norm": 0.7363151907920837, "learning_rate": 0.0002, "epoch": 0.8627450980392157, "step": 1320}, {"loss": 1.5683, "grad_norm": 0.5964524149894714, "learning_rate": 0.0002, "epoch": 0.869281045751634, "step": 1330}, {"loss": 1.3284, "grad_norm": 0.5169979929924011, "learning_rate": 0.0002, "epoch": 0.8758169934640523, "step": 1340}, {"loss": 1.6279, "grad_norm": 0.7063422799110413, "learning_rate": 0.0002, "epoch": 0.8823529411764706, "step": 1350}, {"loss": 1.3072, "grad_norm": 0.7261926531791687, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 1360}, {"loss": 1.3619, "grad_norm": 0.6759744882583618, "learning_rate": 0.0002, "epoch": 0.8954248366013072, "step": 1370}, {"loss": 1.4079, "grad_norm": 0.675051212310791, "learning_rate": 0.0002, "epoch": 0.9019607843137255, "step": 1380}, {"loss": 1.6606, "grad_norm": 0.5613595843315125, "learning_rate": 0.0002, "epoch": 0.9084967320261438, "step": 1390}, {"loss": 1.414, "grad_norm": 0.611732006072998, "learning_rate": 0.0002, "epoch": 0.9150326797385621, "step": 1400}, {"loss": 1.5766, "grad_norm": 0.6365187168121338, "learning_rate": 0.0002, "epoch": 0.9215686274509803, "step": 1410}, {"loss": 1.7832, "grad_norm": 0.7810426354408264, "learning_rate": 0.0002, "epoch": 0.9281045751633987, "step": 1420}, {"loss": 1.5377, "grad_norm": 0.593891441822052, "learning_rate": 0.0002, "epoch": 0.934640522875817, "step": 1430}, {"loss": 1.4468, "grad_norm": 0.761585533618927, "learning_rate": 0.0002, "epoch": 0.9411764705882353, "step": 1440}, {"loss": 1.589, "grad_norm": 0.6114464998245239, "learning_rate": 0.0002, "epoch": 0.9477124183006536, "step": 1450}, {"loss": 1.4973, "grad_norm": 0.601044774055481, "learning_rate": 0.0002, "epoch": 0.954248366013072, "step": 1460}, {"loss": 1.4162, "grad_norm": 0.5484876036643982, "learning_rate": 0.0002, "epoch": 0.9607843137254902, "step": 1470}, {"loss": 1.4825, "grad_norm": 0.5383428335189819, "learning_rate": 0.0002, "epoch": 0.9673202614379085, "step": 1480}, {"loss": 1.5543, "grad_norm": 0.648106575012207, "learning_rate": 0.0002, "epoch": 0.9738562091503268, "step": 1490}, {"loss": 1.3638, "grad_norm": 0.6847249865531921, "learning_rate": 0.0002, "epoch": 0.9803921568627451, "step": 1500}, {"loss": 1.4247, "grad_norm": 0.6361058354377747, "learning_rate": 0.0002, "epoch": 0.9869281045751634, "step": 1510}, {"loss": 1.5131, "grad_norm": 0.646392285823822, "learning_rate": 0.0002, "epoch": 0.9934640522875817, "step": 1520}, {"loss": 1.3738, "grad_norm": 0.5391159057617188, "learning_rate": 0.0002, "epoch": 1.0, "step": 1530}, {"eval_loss": 1.4715123176574707, "eval_runtime": 30.5701, "eval_samples_per_second": 14.262, "eval_steps_per_second": 1.799, "epoch": 1.0, "step": 1530}, {"loss": 1.4827, "grad_norm": 0.5468988418579102, "learning_rate": 0.0002, "epoch": 1.0065359477124183, "step": 1540}, {"loss": 1.4342, "grad_norm": 0.629940927028656, "learning_rate": 0.0002, "epoch": 1.0130718954248366, "step": 1550}, {"loss": 1.4259, "grad_norm": 0.6411303281784058, "learning_rate": 0.0002, "epoch": 1.0196078431372548, "step": 1560}, {"loss": 1.3924, "grad_norm": 0.5619024038314819, "learning_rate": 0.0002, "epoch": 1.026143790849673, "step": 1570}, {"loss": 1.6086, "grad_norm": 0.6093462705612183, "learning_rate": 0.0002, "epoch": 1.0326797385620916, "step": 1580}, {"loss": 1.4547, "grad_norm": 0.5543286204338074, "learning_rate": 0.0002, "epoch": 1.0392156862745099, "step": 1590}, {"loss": 1.3738, "grad_norm": 0.6079006195068359, "learning_rate": 0.0002, "epoch": 1.0457516339869282, "step": 1600}, {"loss": 1.4574, "grad_norm": 0.6240813136100769, "learning_rate": 0.0002, "epoch": 1.0522875816993464, "step": 1610}, {"loss": 1.3504, "grad_norm": 0.6141977310180664, "learning_rate": 0.0002, "epoch": 1.0588235294117647, "step": 1620}, {"loss": 1.3668, "grad_norm": 0.5920178294181824, "learning_rate": 0.0002, "epoch": 1.065359477124183, "step": 1630}, {"loss": 1.3204, "grad_norm": 0.47620782256126404, "learning_rate": 0.0002, "epoch": 1.0718954248366013, "step": 1640}, {"loss": 1.3249, "grad_norm": 0.6826292872428894, "learning_rate": 0.0002, "epoch": 1.0784313725490196, "step": 1650}, {"loss": 1.2285, "grad_norm": 0.6182006597518921, "learning_rate": 0.0002, "epoch": 1.0849673202614378, "step": 1660}, {"loss": 1.2907, "grad_norm": 0.57639479637146, "learning_rate": 0.0002, "epoch": 1.091503267973856, "step": 1670}, {"loss": 1.4575, "grad_norm": 0.6696860194206238, "learning_rate": 0.0002, "epoch": 1.0980392156862746, "step": 1680}, {"loss": 1.4104, "grad_norm": 0.699221670627594, "learning_rate": 0.0002, "epoch": 1.1045751633986929, "step": 1690}, {"loss": 1.3667, "grad_norm": 0.7138059139251709, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 1700}, {"loss": 1.3468, "grad_norm": 0.6930422186851501, "learning_rate": 0.0002, "epoch": 1.1176470588235294, "step": 1710}, {"loss": 1.5033, "grad_norm": 0.7484048008918762, "learning_rate": 0.0002, "epoch": 1.1241830065359477, "step": 1720}, {"loss": 1.4582, "grad_norm": 0.5820090174674988, "learning_rate": 0.0002, "epoch": 1.130718954248366, "step": 1730}, {"loss": 1.3704, "grad_norm": 0.7143406867980957, "learning_rate": 0.0002, "epoch": 1.1372549019607843, "step": 1740}, {"loss": 1.277, "grad_norm": 0.5597584247589111, "learning_rate": 0.0002, "epoch": 1.1437908496732025, "step": 1750}, {"loss": 1.5403, "grad_norm": 0.5171173214912415, "learning_rate": 0.0002, "epoch": 1.1503267973856208, "step": 1760}, {"loss": 1.419, "grad_norm": 0.5951920747756958, "learning_rate": 0.0002, "epoch": 1.156862745098039, "step": 1770}, {"loss": 1.2929, "grad_norm": 0.7506247758865356, "learning_rate": 0.0002, "epoch": 1.1633986928104576, "step": 1780}, {"loss": 1.5475, "grad_norm": 0.5936487913131714, "learning_rate": 0.0002, "epoch": 1.1699346405228759, "step": 1790}, {"loss": 1.3567, "grad_norm": 0.688450038433075, "learning_rate": 0.0002, "epoch": 1.1764705882352942, "step": 1800}, {"loss": 1.314, "grad_norm": 0.671623170375824, "learning_rate": 0.0002, "epoch": 1.1830065359477124, "step": 1810}, {"loss": 1.3803, "grad_norm": 0.6911860704421997, "learning_rate": 0.0002, "epoch": 1.1895424836601307, "step": 1820}, {"loss": 1.363, "grad_norm": 0.60726398229599, "learning_rate": 0.0002, "epoch": 1.196078431372549, "step": 1830}, {"loss": 1.5236, "grad_norm": 0.7542088627815247, "learning_rate": 0.0002, "epoch": 1.2026143790849673, "step": 1840}, {"loss": 1.4343, "grad_norm": 0.6810969710350037, "learning_rate": 0.0002, "epoch": 1.2091503267973855, "step": 1850}, {"loss": 1.446, "grad_norm": 0.579741895198822, "learning_rate": 0.0002, "epoch": 1.215686274509804, "step": 1860}, {"loss": 1.4564, "grad_norm": 0.9925695657730103, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 1870}, {"loss": 1.5516, "grad_norm": 0.5919767618179321, "learning_rate": 0.0002, "epoch": 1.2287581699346406, "step": 1880}, {"loss": 1.5015, "grad_norm": 0.7377090454101562, "learning_rate": 0.0002, "epoch": 1.2352941176470589, "step": 1890}, {"loss": 1.4756, "grad_norm": 0.5753688812255859, "learning_rate": 0.0002, "epoch": 1.2418300653594772, "step": 1900}, {"loss": 1.3543, "grad_norm": 0.6362486481666565, "learning_rate": 0.0002, "epoch": 1.2483660130718954, "step": 1910}, {"loss": 1.4153, "grad_norm": 0.5747467875480652, "learning_rate": 0.0002, "epoch": 1.2549019607843137, "step": 1920}, {"loss": 1.5082, "grad_norm": 0.6831939220428467, "learning_rate": 0.0002, "epoch": 1.261437908496732, "step": 1930}, {"loss": 1.3509, "grad_norm": 0.6414040327072144, "learning_rate": 0.0002, "epoch": 1.2679738562091503, "step": 1940}, {"loss": 1.5099, "grad_norm": 0.5613330006599426, "learning_rate": 0.0002, "epoch": 1.2745098039215685, "step": 1950}, {"loss": 1.377, "grad_norm": 0.5838454961776733, "learning_rate": 0.0002, "epoch": 1.2810457516339868, "step": 1960}, {"loss": 1.3548, "grad_norm": 0.5367192029953003, "learning_rate": 0.0002, "epoch": 1.287581699346405, "step": 1970}, {"loss": 1.4602, "grad_norm": 0.5829346776008606, "learning_rate": 0.0002, "epoch": 1.2941176470588236, "step": 1980}, {"loss": 1.3821, "grad_norm": 0.756534218788147, "learning_rate": 0.0002, "epoch": 1.3006535947712419, "step": 1990}, {"loss": 1.389, "grad_norm": 0.48002561926841736, "learning_rate": 0.0002, "epoch": 1.3071895424836601, "step": 2000}, {"loss": 1.256, "grad_norm": 0.5461082458496094, "learning_rate": 0.0002, "epoch": 1.3137254901960784, "step": 2010}, {"loss": 1.6257, "grad_norm": 0.570399284362793, "learning_rate": 0.0002, "epoch": 1.3202614379084967, "step": 2020}, {"loss": 1.4356, "grad_norm": 0.5130975842475891, "learning_rate": 0.0002, "epoch": 1.326797385620915, "step": 2030}, {"loss": 1.3552, "grad_norm": 0.6290071606636047, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 2040}, {"loss": 1.3873, "grad_norm": 0.6165726184844971, "learning_rate": 0.0002, "epoch": 1.3398692810457518, "step": 2050}, {"loss": 1.4376, "grad_norm": 0.5302083492279053, "learning_rate": 0.0002, "epoch": 1.34640522875817, "step": 2060}, {"loss": 1.4722, "grad_norm": 0.6531406044960022, "learning_rate": 0.0002, "epoch": 1.3529411764705883, "step": 2070}, {"loss": 1.3632, "grad_norm": 0.5981236100196838, "learning_rate": 0.0002, "epoch": 1.3594771241830066, "step": 2080}, {"loss": 1.4846, "grad_norm": 0.8534150123596191, "learning_rate": 0.0002, "epoch": 1.3660130718954249, "step": 2090}, {"loss": 1.3249, "grad_norm": 0.695918083190918, "learning_rate": 0.0002, "epoch": 1.3725490196078431, "step": 2100}, {"loss": 1.4989, "grad_norm": 0.5830431580543518, "learning_rate": 0.0002, "epoch": 1.3790849673202614, "step": 2110}, {"loss": 1.5009, "grad_norm": 0.5641306638717651, "learning_rate": 0.0002, "epoch": 1.3856209150326797, "step": 2120}, {"loss": 1.3985, "grad_norm": 0.6354436874389648, "learning_rate": 0.0002, "epoch": 1.392156862745098, "step": 2130}, {"loss": 1.2737, "grad_norm": 0.5707540512084961, "learning_rate": 0.0002, "epoch": 1.3986928104575163, "step": 2140}, {"loss": 1.3815, "grad_norm": 0.7308434844017029, "learning_rate": 0.0002, "epoch": 1.4052287581699345, "step": 2150}, {"loss": 1.3993, "grad_norm": 0.5879750847816467, "learning_rate": 0.0002, "epoch": 1.4117647058823528, "step": 2160}, {"loss": 1.3729, "grad_norm": 0.627909243106842, "learning_rate": 0.0002, "epoch": 1.4183006535947713, "step": 2170}, {"loss": 1.3391, "grad_norm": 0.5228193998336792, "learning_rate": 0.0002, "epoch": 1.4248366013071896, "step": 2180}, {"loss": 1.457, "grad_norm": 0.6162880659103394, "learning_rate": 0.0002, "epoch": 1.4313725490196079, "step": 2190}, {"loss": 1.4052, "grad_norm": 0.751610517501831, "learning_rate": 0.0002, "epoch": 1.4379084967320261, "step": 2200}, {"loss": 1.4105, "grad_norm": 0.5623487234115601, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 2210}, {"loss": 1.3795, "grad_norm": 0.5293187499046326, "learning_rate": 0.0002, "epoch": 1.4509803921568627, "step": 2220}, {"loss": 1.4247, "grad_norm": 0.5903629660606384, "learning_rate": 0.0002, "epoch": 1.457516339869281, "step": 2230}, {"loss": 1.6167, "grad_norm": 0.6084659099578857, "learning_rate": 0.0002, "epoch": 1.4640522875816995, "step": 2240}, {"loss": 1.319, "grad_norm": 0.5289803147315979, "learning_rate": 0.0002, "epoch": 1.4705882352941178, "step": 2250}, {"loss": 1.3106, "grad_norm": 0.49499568343162537, "learning_rate": 0.0002, "epoch": 1.477124183006536, "step": 2260}, {"loss": 1.3586, "grad_norm": 0.7774190306663513, "learning_rate": 0.0002, "epoch": 1.4836601307189543, "step": 2270}, {"loss": 1.3075, "grad_norm": 0.5932538509368896, "learning_rate": 0.0002, "epoch": 1.4901960784313726, "step": 2280}, {"loss": 1.3241, "grad_norm": 0.6009492874145508, "learning_rate": 0.0002, "epoch": 1.4967320261437909, "step": 2290}, {"loss": 1.3728, "grad_norm": 0.5559343099594116, "learning_rate": 0.0002, "epoch": 1.5032679738562091, "step": 2300}, {"loss": 1.2379, "grad_norm": 0.5956196188926697, "learning_rate": 0.0002, "epoch": 1.5098039215686274, "step": 2310}, {"loss": 1.5292, "grad_norm": 0.5624083876609802, "learning_rate": 0.0002, "epoch": 1.5163398692810457, "step": 2320}, {"loss": 1.4779, "grad_norm": 0.7195250391960144, "learning_rate": 0.0002, "epoch": 1.522875816993464, "step": 2330}, {"loss": 1.2938, "grad_norm": 0.6010490655899048, "learning_rate": 0.0002, "epoch": 1.5294117647058822, "step": 2340}, {"loss": 1.4121, "grad_norm": 0.664929211139679, "learning_rate": 0.0002, "epoch": 1.5359477124183005, "step": 2350}, {"loss": 1.4362, "grad_norm": 0.5158776640892029, "learning_rate": 0.0002, "epoch": 1.5424836601307188, "step": 2360}, {"loss": 1.2157, "grad_norm": 0.5147154927253723, "learning_rate": 0.0002, "epoch": 1.5490196078431373, "step": 2370}, {"loss": 1.2643, "grad_norm": 0.6507977843284607, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 2380}, {"loss": 1.2786, "grad_norm": 0.5193192362785339, "learning_rate": 0.0002, "epoch": 1.5620915032679739, "step": 2390}, {"loss": 1.3209, "grad_norm": 0.5982314944267273, "learning_rate": 0.0002, "epoch": 1.5686274509803921, "step": 2400}, {"loss": 1.3585, "grad_norm": 0.49106258153915405, "learning_rate": 0.0002, "epoch": 1.5751633986928104, "step": 2410}, {"loss": 1.3618, "grad_norm": 0.6459611654281616, "learning_rate": 0.0002, "epoch": 1.581699346405229, "step": 2420}, {"loss": 1.3305, "grad_norm": 0.7038363218307495, "learning_rate": 0.0002, "epoch": 1.5882352941176472, "step": 2430}, {"loss": 1.3198, "grad_norm": 0.5245680212974548, "learning_rate": 0.0002, "epoch": 1.5947712418300655, "step": 2440}, {"loss": 1.4756, "grad_norm": 0.6562076210975647, "learning_rate": 0.0002, "epoch": 1.6013071895424837, "step": 2450}, {"loss": 1.5635, "grad_norm": 0.6491968035697937, "learning_rate": 0.0002, "epoch": 1.607843137254902, "step": 2460}, {"loss": 1.3657, "grad_norm": 0.604034960269928, "learning_rate": 0.0002, "epoch": 1.6143790849673203, "step": 2470}, {"loss": 1.2693, "grad_norm": 0.5759671330451965, "learning_rate": 0.0002, "epoch": 1.6209150326797386, "step": 2480}, {"loss": 1.4136, "grad_norm": 0.6157698631286621, "learning_rate": 0.0002, "epoch": 1.6274509803921569, "step": 2490}, {"loss": 1.3929, "grad_norm": 0.6513794660568237, "learning_rate": 0.0002, "epoch": 1.6339869281045751, "step": 2500}, {"loss": 1.4283, "grad_norm": 0.71990966796875, "learning_rate": 0.0002, "epoch": 1.6405228758169934, "step": 2510}, {"loss": 1.4356, "grad_norm": 0.7316617369651794, "learning_rate": 0.0002, "epoch": 1.6470588235294117, "step": 2520}, {"loss": 1.3119, "grad_norm": 0.5475177764892578, "learning_rate": 0.0002, "epoch": 1.65359477124183, "step": 2530}, {"loss": 1.2998, "grad_norm": 0.4911293089389801, "learning_rate": 0.0002, "epoch": 1.6601307189542482, "step": 2540}, {"loss": 1.4198, "grad_norm": 0.6122882962226868, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 2550}, {"loss": 1.3099, "grad_norm": 0.5735281705856323, "learning_rate": 0.0002, "epoch": 1.673202614379085, "step": 2560}, {"loss": 1.2205, "grad_norm": 0.5046352744102478, "learning_rate": 0.0002, "epoch": 1.6797385620915033, "step": 2570}, {"loss": 1.3191, "grad_norm": 0.6043242812156677, "learning_rate": 0.0002, "epoch": 1.6862745098039216, "step": 2580}, {"loss": 1.3079, "grad_norm": 0.5397698283195496, "learning_rate": 0.0002, "epoch": 1.6928104575163399, "step": 2590}, {"loss": 1.4916, "grad_norm": 0.8066475987434387, "learning_rate": 0.0002, "epoch": 1.6993464052287581, "step": 2600}, {"loss": 1.3703, "grad_norm": 0.52901691198349, "learning_rate": 0.0002, "epoch": 1.7058823529411766, "step": 2610}, {"loss": 1.409, "grad_norm": 0.7588503956794739, "learning_rate": 0.0002, "epoch": 1.712418300653595, "step": 2620}, {"loss": 1.3806, "grad_norm": 0.6012966632843018, "learning_rate": 0.0002, "epoch": 1.7189542483660132, "step": 2630}, {"loss": 1.2583, "grad_norm": 0.5927302837371826, "learning_rate": 0.0002, "epoch": 1.7254901960784315, "step": 2640}, {"loss": 1.4523, "grad_norm": 0.5086990594863892, "learning_rate": 0.0002, "epoch": 1.7320261437908497, "step": 2650}, {"loss": 1.5452, "grad_norm": 0.6000628471374512, "learning_rate": 0.0002, "epoch": 1.738562091503268, "step": 2660}, {"loss": 1.3269, "grad_norm": 0.6560431718826294, "learning_rate": 0.0002, "epoch": 1.7450980392156863, "step": 2670}, {"loss": 1.3982, "grad_norm": 0.5738165378570557, "learning_rate": 0.0002, "epoch": 1.7516339869281046, "step": 2680}, {"loss": 1.3766, "grad_norm": 0.5576106905937195, "learning_rate": 0.0002, "epoch": 1.7581699346405228, "step": 2690}, {"loss": 1.3277, "grad_norm": 0.7298802137374878, "learning_rate": 0.0002, "epoch": 1.7647058823529411, "step": 2700}, {"loss": 1.2618, "grad_norm": 0.5751826167106628, "learning_rate": 0.0002, "epoch": 1.7712418300653594, "step": 2710}, {"loss": 1.35, "grad_norm": 0.6069957613945007, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 2720}, {"loss": 1.3492, "grad_norm": 0.7513017654418945, "learning_rate": 0.0002, "epoch": 1.784313725490196, "step": 2730}, {"loss": 1.2979, "grad_norm": 0.6058869957923889, "learning_rate": 0.0002, "epoch": 1.7908496732026142, "step": 2740}, {"loss": 1.299, "grad_norm": 0.6805883049964905, "learning_rate": 0.0002, "epoch": 1.7973856209150327, "step": 2750}, {"loss": 1.4062, "grad_norm": 0.6864324808120728, "learning_rate": 0.0002, "epoch": 1.803921568627451, "step": 2760}, {"loss": 1.355, "grad_norm": 0.6261002421379089, "learning_rate": 0.0002, "epoch": 1.8104575163398693, "step": 2770}, {"loss": 1.5145, "grad_norm": 0.532684862613678, "learning_rate": 0.0002, "epoch": 1.8169934640522876, "step": 2780}, {"loss": 1.3248, "grad_norm": 0.6209020018577576, "learning_rate": 0.0002, "epoch": 1.8235294117647058, "step": 2790}, {"loss": 1.3908, "grad_norm": 0.67111736536026, "learning_rate": 0.0002, "epoch": 1.8300653594771243, "step": 2800}, {"loss": 1.5088, "grad_norm": 0.700467586517334, "learning_rate": 0.0002, "epoch": 1.8366013071895426, "step": 2810}, {"loss": 1.348, "grad_norm": 0.6968029141426086, "learning_rate": 0.0002, "epoch": 1.843137254901961, "step": 2820}, {"loss": 1.3943, "grad_norm": 0.6405863761901855, "learning_rate": 0.0002, "epoch": 1.8496732026143792, "step": 2830}, {"loss": 1.4035, "grad_norm": 0.5192584991455078, "learning_rate": 0.0002, "epoch": 1.8562091503267975, "step": 2840}, {"loss": 1.2745, "grad_norm": 0.4888569414615631, "learning_rate": 0.0002, "epoch": 1.8627450980392157, "step": 2850}, {"loss": 1.4324, "grad_norm": 0.7625455856323242, "learning_rate": 0.0002, "epoch": 1.869281045751634, "step": 2860}, {"loss": 1.4989, "grad_norm": 0.9162808656692505, "learning_rate": 0.0002, "epoch": 1.8758169934640523, "step": 2870}, {"loss": 1.3978, "grad_norm": 0.5472783446311951, "learning_rate": 0.0002, "epoch": 1.8823529411764706, "step": 2880}, {"loss": 1.3026, "grad_norm": 0.5221137404441833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 2890}, {"loss": 1.33, "grad_norm": 0.49258849024772644, "learning_rate": 0.0002, "epoch": 1.8954248366013071, "step": 2900}, {"loss": 1.3503, "grad_norm": 0.5260750651359558, "learning_rate": 0.0002, "epoch": 1.9019607843137254, "step": 2910}, {"loss": 1.3381, "grad_norm": 0.6583314538002014, "learning_rate": 0.0002, "epoch": 1.9084967320261437, "step": 2920}, {"loss": 1.356, "grad_norm": 0.5728915929794312, "learning_rate": 0.0002, "epoch": 1.915032679738562, "step": 2930}, {"loss": 1.3993, "grad_norm": 0.7661453485488892, "learning_rate": 0.0002, "epoch": 1.9215686274509802, "step": 2940}, {"loss": 1.428, "grad_norm": 0.7193911075592041, "learning_rate": 0.0002, "epoch": 1.9281045751633987, "step": 2950}, {"loss": 1.287, "grad_norm": 0.5007768869400024, "learning_rate": 0.0002, "epoch": 1.934640522875817, "step": 2960}, {"loss": 1.372, "grad_norm": 0.626681923866272, "learning_rate": 0.0002, "epoch": 1.9411764705882353, "step": 2970}, {"loss": 1.375, "grad_norm": 0.8692840933799744, "learning_rate": 0.0002, "epoch": 1.9477124183006536, "step": 2980}, {"loss": 1.3292, "grad_norm": 0.6388291120529175, "learning_rate": 0.0002, "epoch": 1.954248366013072, "step": 2990}, {"loss": 1.4593, "grad_norm": 0.7710477113723755, "learning_rate": 0.0002, "epoch": 1.9607843137254903, "step": 3000}, {"loss": 1.5228, "grad_norm": 0.641704261302948, "learning_rate": 0.0002, "epoch": 1.9673202614379086, "step": 3010}, {"loss": 1.3246, "grad_norm": 0.621148943901062, "learning_rate": 0.0002, "epoch": 1.973856209150327, "step": 3020}, {"loss": 1.3017, "grad_norm": 0.5119547247886658, "learning_rate": 0.0002, "epoch": 1.9803921568627452, "step": 3030}, {"loss": 1.4923, "grad_norm": 0.8104137778282166, "learning_rate": 0.0002, "epoch": 1.9869281045751634, "step": 3040}, {"loss": 1.3331, "grad_norm": 0.5856240391731262, "learning_rate": 0.0002, "epoch": 1.9934640522875817, "step": 3050}, {"loss": 1.4346, "grad_norm": 0.5263566374778748, "learning_rate": 0.0002, "epoch": 2.0, "step": 3060}, {"eval_loss": 1.4276371002197266, "eval_runtime": 30.5759, "eval_samples_per_second": 14.26, "eval_steps_per_second": 1.799, "epoch": 2.0, "step": 3060}, {"loss": 1.1636, "grad_norm": 0.5143898725509644, "learning_rate": 0.0002, "epoch": 2.0065359477124183, "step": 3070}, {"loss": 1.3335, "grad_norm": 0.5749367475509644, "learning_rate": 0.0002, "epoch": 2.0130718954248366, "step": 3080}, {"loss": 1.2784, "grad_norm": 0.5784284472465515, "learning_rate": 0.0002, "epoch": 2.019607843137255, "step": 3090}, {"loss": 1.2463, "grad_norm": 0.5933429598808289, "learning_rate": 0.0002, "epoch": 2.026143790849673, "step": 3100}, {"loss": 1.2984, "grad_norm": 0.6748974919319153, "learning_rate": 0.0002, "epoch": 2.0326797385620914, "step": 3110}, {"loss": 1.2307, "grad_norm": 0.626399576663971, "learning_rate": 0.0002, "epoch": 2.0392156862745097, "step": 3120}, {"loss": 1.299, "grad_norm": 0.6173238754272461, "learning_rate": 0.0002, "epoch": 2.045751633986928, "step": 3130}, {"loss": 1.4144, "grad_norm": 0.807790219783783, "learning_rate": 0.0002, "epoch": 2.052287581699346, "step": 3140}, {"loss": 1.1953, "grad_norm": 0.6222215890884399, "learning_rate": 0.0002, "epoch": 2.0588235294117645, "step": 3150}, {"loss": 1.4059, "grad_norm": 0.5859580636024475, "learning_rate": 0.0002, "epoch": 2.065359477124183, "step": 3160}, {"loss": 1.3607, "grad_norm": 0.581304132938385, "learning_rate": 0.0002, "epoch": 2.0718954248366015, "step": 3170}, {"loss": 1.1212, "grad_norm": 0.9814971089363098, "learning_rate": 0.0002, "epoch": 2.0784313725490198, "step": 3180}, {"loss": 1.1962, "grad_norm": 0.6491848230361938, "learning_rate": 0.0002, "epoch": 2.084967320261438, "step": 3190}, {"loss": 1.3711, "grad_norm": 0.613680362701416, "learning_rate": 0.0002, "epoch": 2.0915032679738563, "step": 3200}, {"loss": 1.2994, "grad_norm": 0.7318086624145508, "learning_rate": 0.0002, "epoch": 2.0980392156862746, "step": 3210}, {"loss": 1.2502, "grad_norm": 0.6025661826133728, "learning_rate": 0.0002, "epoch": 2.104575163398693, "step": 3220}, {"loss": 1.1374, "grad_norm": 0.6744484305381775, "learning_rate": 0.0002, "epoch": 2.111111111111111, "step": 3230}, {"loss": 1.3273, "grad_norm": 0.6062554121017456, "learning_rate": 0.0002, "epoch": 2.1176470588235294, "step": 3240}, {"loss": 1.3404, "grad_norm": 0.6801803112030029, "learning_rate": 0.0002, "epoch": 2.1241830065359477, "step": 3250}, {"loss": 1.4084, "grad_norm": 0.5218925476074219, "learning_rate": 0.0002, "epoch": 2.130718954248366, "step": 3260}, {"loss": 1.2867, "grad_norm": 0.7494263648986816, "learning_rate": 0.0002, "epoch": 2.1372549019607843, "step": 3270}, {"loss": 1.3059, "grad_norm": 0.7858565449714661, "learning_rate": 0.0002, "epoch": 2.1437908496732025, "step": 3280}, {"loss": 1.3214, "grad_norm": 0.6836692690849304, "learning_rate": 0.0002, "epoch": 2.150326797385621, "step": 3290}, {"loss": 1.1605, "grad_norm": 0.619848370552063, "learning_rate": 0.0002, "epoch": 2.156862745098039, "step": 3300}, {"loss": 1.3095, "grad_norm": 0.5761294364929199, "learning_rate": 0.0002, "epoch": 2.1633986928104574, "step": 3310}, {"loss": 1.2883, "grad_norm": 0.4713786542415619, "learning_rate": 0.0002, "epoch": 2.1699346405228757, "step": 3320}, {"loss": 1.3817, "grad_norm": 0.7613773345947266, "learning_rate": 0.0002, "epoch": 2.176470588235294, "step": 3330}, {"loss": 1.2354, "grad_norm": 0.6642718315124512, "learning_rate": 0.0002, "epoch": 2.183006535947712, "step": 3340}, {"loss": 1.2048, "grad_norm": 0.7162188291549683, "learning_rate": 0.0002, "epoch": 2.189542483660131, "step": 3350}, {"loss": 1.3886, "grad_norm": 0.6916783452033997, "learning_rate": 0.0002, "epoch": 2.196078431372549, "step": 3360}, {"loss": 1.3788, "grad_norm": 0.7205567955970764, "learning_rate": 0.0002, "epoch": 2.2026143790849675, "step": 3370}, {"loss": 1.2528, "grad_norm": 0.6038199067115784, "learning_rate": 0.0002, "epoch": 2.2091503267973858, "step": 3380}, {"loss": 1.2079, "grad_norm": 0.6284233927726746, "learning_rate": 0.0002, "epoch": 2.215686274509804, "step": 3390}, {"loss": 1.3057, "grad_norm": 0.7450672388076782, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3400}, {"loss": 1.3034, "grad_norm": 0.7755052447319031, "learning_rate": 0.0002, "epoch": 2.2287581699346406, "step": 3410}, {"loss": 1.2953, "grad_norm": 0.9066099524497986, "learning_rate": 0.0002, "epoch": 2.235294117647059, "step": 3420}, {"loss": 1.3072, "grad_norm": 0.8578207492828369, "learning_rate": 0.0002, "epoch": 2.241830065359477, "step": 3430}, {"loss": 1.3278, "grad_norm": 0.5900213718414307, "learning_rate": 0.0002, "epoch": 2.2483660130718954, "step": 3440}, {"loss": 1.3645, "grad_norm": 0.7821717262268066, "learning_rate": 0.0002, "epoch": 2.2549019607843137, "step": 3450}, {"loss": 1.183, "grad_norm": 0.6263150572776794, "learning_rate": 0.0002, "epoch": 2.261437908496732, "step": 3460}, {"loss": 1.178, "grad_norm": 0.591799259185791, "learning_rate": 0.0002, "epoch": 2.2679738562091503, "step": 3470}, {"loss": 1.2198, "grad_norm": 0.5999799966812134, "learning_rate": 0.0002, "epoch": 2.2745098039215685, "step": 3480}, {"loss": 1.2724, "grad_norm": 0.6227319240570068, "learning_rate": 0.0002, "epoch": 2.281045751633987, "step": 3490}, {"loss": 1.3865, "grad_norm": 0.719412624835968, "learning_rate": 0.0002, "epoch": 2.287581699346405, "step": 3500}, {"loss": 1.3275, "grad_norm": 1.0361769199371338, "learning_rate": 0.0002, "epoch": 2.2941176470588234, "step": 3510}, {"loss": 1.4834, "grad_norm": 0.5506668090820312, "learning_rate": 0.0002, "epoch": 2.3006535947712417, "step": 3520}, {"loss": 1.2273, "grad_norm": 0.6886829733848572, "learning_rate": 0.0002, "epoch": 2.30718954248366, "step": 3530}, {"loss": 1.2296, "grad_norm": 0.6226346492767334, "learning_rate": 0.0002, "epoch": 2.313725490196078, "step": 3540}, {"loss": 1.3087, "grad_norm": 0.8109908103942871, "learning_rate": 0.0002, "epoch": 2.3202614379084965, "step": 3550}, {"loss": 1.3311, "grad_norm": 0.8505511283874512, "learning_rate": 0.0002, "epoch": 2.326797385620915, "step": 3560}, {"loss": 1.2526, "grad_norm": 0.5763760209083557, "learning_rate": 0.0002, "epoch": 2.3333333333333335, "step": 3570}, {"loss": 1.4135, "grad_norm": 0.6460059881210327, "learning_rate": 0.0002, "epoch": 2.3398692810457518, "step": 3580}, {"loss": 1.2701, "grad_norm": 0.7175343036651611, "learning_rate": 0.0002, "epoch": 2.34640522875817, "step": 3590}, {"loss": 1.2645, "grad_norm": 0.6012630462646484, "learning_rate": 0.0002, "epoch": 2.3529411764705883, "step": 3600}, {"loss": 1.3214, "grad_norm": 0.6513685584068298, "learning_rate": 0.0002, "epoch": 2.3594771241830066, "step": 3610}, {"loss": 1.3271, "grad_norm": 0.7465183734893799, "learning_rate": 0.0002, "epoch": 2.366013071895425, "step": 3620}, {"loss": 1.3671, "grad_norm": 0.6413124203681946, "learning_rate": 0.0002, "epoch": 2.372549019607843, "step": 3630}, {"loss": 1.4026, "grad_norm": 0.7209562063217163, "learning_rate": 0.0002, "epoch": 2.3790849673202614, "step": 3640}, {"loss": 1.1616, "grad_norm": 0.6427558660507202, "learning_rate": 0.0002, "epoch": 2.3856209150326797, "step": 3650}, {"loss": 1.313, "grad_norm": 0.593958854675293, "learning_rate": 0.0002, "epoch": 2.392156862745098, "step": 3660}, {"loss": 1.2802, "grad_norm": 0.5944608449935913, "learning_rate": 0.0002, "epoch": 2.3986928104575163, "step": 3670}, {"loss": 1.3542, "grad_norm": 0.6606248617172241, "learning_rate": 0.0002, "epoch": 2.4052287581699345, "step": 3680}, {"loss": 1.2977, "grad_norm": 0.5632851719856262, "learning_rate": 0.0002, "epoch": 2.411764705882353, "step": 3690}, {"loss": 1.2032, "grad_norm": 0.4976513385772705, "learning_rate": 0.0002, "epoch": 2.418300653594771, "step": 3700}, {"loss": 1.1404, "grad_norm": 0.6318528056144714, "learning_rate": 0.0002, "epoch": 2.4248366013071894, "step": 3710}, {"loss": 1.1705, "grad_norm": 0.6306707859039307, "learning_rate": 0.0002, "epoch": 2.431372549019608, "step": 3720}, {"loss": 1.3524, "grad_norm": 0.6362553238868713, "learning_rate": 0.0002, "epoch": 2.4379084967320264, "step": 3730}, {"loss": 1.2345, "grad_norm": 0.634368896484375, "learning_rate": 0.0002, "epoch": 2.4444444444444446, "step": 3740}, {"loss": 1.2515, "grad_norm": 0.6623591184616089, "learning_rate": 0.0002, "epoch": 2.450980392156863, "step": 3750}, {"loss": 1.3246, "grad_norm": 0.6150440573692322, "learning_rate": 0.0002, "epoch": 2.457516339869281, "step": 3760}, {"loss": 1.2666, "grad_norm": 0.588935911655426, "learning_rate": 0.0002, "epoch": 2.4640522875816995, "step": 3770}, {"loss": 1.3918, "grad_norm": 0.7388206124305725, "learning_rate": 0.0002, "epoch": 2.4705882352941178, "step": 3780}, {"loss": 1.2512, "grad_norm": 0.621825098991394, "learning_rate": 0.0002, "epoch": 2.477124183006536, "step": 3790}, {"loss": 1.359, "grad_norm": 0.7691677212715149, "learning_rate": 0.0002, "epoch": 2.4836601307189543, "step": 3800}, {"loss": 1.3399, "grad_norm": 1.1661969423294067, "learning_rate": 0.0002, "epoch": 2.4901960784313726, "step": 3810}, {"loss": 1.461, "grad_norm": 0.6837884187698364, "learning_rate": 0.0002, "epoch": 2.496732026143791, "step": 3820}, {"loss": 1.2823, "grad_norm": 0.6978904008865356, "learning_rate": 0.0002, "epoch": 2.503267973856209, "step": 3830}, {"loss": 1.3688, "grad_norm": 0.6121411323547363, "learning_rate": 0.0002, "epoch": 2.5098039215686274, "step": 3840}, {"loss": 1.2587, "grad_norm": 0.7813326120376587, "learning_rate": 0.0002, "epoch": 2.5163398692810457, "step": 3850}, {"loss": 1.1543, "grad_norm": 0.5390260219573975, "learning_rate": 0.0002, "epoch": 2.522875816993464, "step": 3860}, {"loss": 1.2032, "grad_norm": 0.8283252716064453, "learning_rate": 0.0002, "epoch": 2.5294117647058822, "step": 3870}, {"loss": 1.3112, "grad_norm": 0.8527186512947083, "learning_rate": 0.0002, "epoch": 2.5359477124183005, "step": 3880}, {"loss": 1.3469, "grad_norm": 0.8405382633209229, "learning_rate": 0.0002, "epoch": 2.542483660130719, "step": 3890}, {"loss": 1.1801, "grad_norm": 0.5650738477706909, "learning_rate": 0.0002, "epoch": 2.549019607843137, "step": 3900}, {"loss": 1.2917, "grad_norm": 0.620121955871582, "learning_rate": 0.0002, "epoch": 2.5555555555555554, "step": 3910}, {"loss": 1.2524, "grad_norm": 0.5983527898788452, "learning_rate": 0.0002, "epoch": 2.5620915032679736, "step": 3920}, {"loss": 1.4408, "grad_norm": 0.686623215675354, "learning_rate": 0.0002, "epoch": 2.568627450980392, "step": 3930}, {"loss": 1.186, "grad_norm": 0.6805831789970398, "learning_rate": 0.0002, "epoch": 2.57516339869281, "step": 3940}, {"loss": 1.367, "grad_norm": 0.6994825601577759, "learning_rate": 0.0002, "epoch": 2.581699346405229, "step": 3950}, {"loss": 1.3446, "grad_norm": 0.728549599647522, "learning_rate": 0.0002, "epoch": 2.588235294117647, "step": 3960}, {"loss": 1.4039, "grad_norm": 0.775236964225769, "learning_rate": 0.0002, "epoch": 2.5947712418300655, "step": 3970}, {"loss": 1.2742, "grad_norm": 0.5057447552680969, "learning_rate": 0.0002, "epoch": 2.6013071895424837, "step": 3980}, {"loss": 1.2764, "grad_norm": 0.6564450263977051, "learning_rate": 0.0002, "epoch": 2.607843137254902, "step": 3990}, {"loss": 1.3269, "grad_norm": 0.5342249870300293, "learning_rate": 0.0002, "epoch": 2.6143790849673203, "step": 4000}, {"loss": 1.3102, "grad_norm": 0.5508961081504822, "learning_rate": 0.0002, "epoch": 2.6209150326797386, "step": 4010}, {"loss": 1.3636, "grad_norm": 0.5716235637664795, "learning_rate": 0.0002, "epoch": 2.627450980392157, "step": 4020}, {"loss": 1.3465, "grad_norm": 0.8049232363700867, "learning_rate": 0.0002, "epoch": 2.633986928104575, "step": 4030}, {"loss": 1.2342, "grad_norm": 0.5574354529380798, "learning_rate": 0.0002, "epoch": 2.6405228758169934, "step": 4040}, {"loss": 1.2419, "grad_norm": 0.6302093863487244, "learning_rate": 0.0002, "epoch": 2.6470588235294117, "step": 4050}, {"loss": 1.2565, "grad_norm": 1.1868736743927002, "learning_rate": 0.0002, "epoch": 2.65359477124183, "step": 4060}, {"loss": 1.1382, "grad_norm": 0.6738120317459106, "learning_rate": 0.0002, "epoch": 2.6601307189542482, "step": 4070}, {"loss": 1.2456, "grad_norm": 0.6614423990249634, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 4080}, {"loss": 1.2958, "grad_norm": 0.7297604084014893, "learning_rate": 0.0002, "epoch": 2.6732026143790852, "step": 4090}, {"loss": 1.1596, "grad_norm": 0.9421682357788086, "learning_rate": 0.0002, "epoch": 2.6797385620915035, "step": 4100}, {"loss": 1.3002, "grad_norm": 0.5286222696304321, "learning_rate": 0.0002, "epoch": 2.686274509803922, "step": 4110}, {"loss": 1.3936, "grad_norm": 0.6849271655082703, "learning_rate": 0.0002, "epoch": 2.69281045751634, "step": 4120}, {"loss": 1.2721, "grad_norm": 0.6811320185661316, "learning_rate": 0.0002, "epoch": 2.6993464052287583, "step": 4130}, {"loss": 1.2897, "grad_norm": 0.4968419373035431, "learning_rate": 0.0002, "epoch": 2.7058823529411766, "step": 4140}, {"loss": 1.3322, "grad_norm": 0.8074267506599426, "learning_rate": 0.0002, "epoch": 2.712418300653595, "step": 4150}, {"loss": 1.1759, "grad_norm": 0.6756376028060913, "learning_rate": 0.0002, "epoch": 2.718954248366013, "step": 4160}, {"loss": 1.2444, "grad_norm": 0.6921583414077759, "learning_rate": 0.0002, "epoch": 2.7254901960784315, "step": 4170}, {"loss": 1.3413, "grad_norm": 0.7049834132194519, "learning_rate": 0.0002, "epoch": 2.7320261437908497, "step": 4180}, {"loss": 1.1965, "grad_norm": 0.7011390328407288, "learning_rate": 0.0002, "epoch": 2.738562091503268, "step": 4190}, {"loss": 1.2364, "grad_norm": 0.6977843642234802, "learning_rate": 0.0002, "epoch": 2.7450980392156863, "step": 4200}, {"loss": 1.2533, "grad_norm": 0.6717000603675842, "learning_rate": 0.0002, "epoch": 2.7516339869281046, "step": 4210}, {"loss": 1.392, "grad_norm": 1.0223724842071533, "learning_rate": 0.0002, "epoch": 2.758169934640523, "step": 4220}, {"loss": 1.2451, "grad_norm": 0.6573330760002136, "learning_rate": 0.0002, "epoch": 2.764705882352941, "step": 4230}, {"loss": 1.4219, "grad_norm": 0.6684938073158264, "learning_rate": 0.0002, "epoch": 2.7712418300653594, "step": 4240}, {"loss": 1.2505, "grad_norm": 0.7426793575286865, "learning_rate": 0.0002, "epoch": 2.7777777777777777, "step": 4250}, {"loss": 1.2904, "grad_norm": 0.557826578617096, "learning_rate": 0.0002, "epoch": 2.784313725490196, "step": 4260}, {"loss": 1.3262, "grad_norm": 0.6669870018959045, "learning_rate": 0.0002, "epoch": 2.7908496732026142, "step": 4270}, {"loss": 1.2369, "grad_norm": 0.5349969267845154, "learning_rate": 0.0002, "epoch": 2.7973856209150325, "step": 4280}, {"loss": 1.3769, "grad_norm": 0.7262802124023438, "learning_rate": 0.0002, "epoch": 2.803921568627451, "step": 4290}, {"loss": 1.3373, "grad_norm": 0.768211841583252, "learning_rate": 0.0002, "epoch": 2.810457516339869, "step": 4300}, {"loss": 1.2444, "grad_norm": 0.5958252549171448, "learning_rate": 0.0002, "epoch": 2.8169934640522873, "step": 4310}, {"loss": 1.4113, "grad_norm": 0.8451310396194458, "learning_rate": 0.0002, "epoch": 2.8235294117647056, "step": 4320}, {"loss": 1.2454, "grad_norm": 0.6544435024261475, "learning_rate": 0.0002, "epoch": 2.8300653594771243, "step": 4330}, {"loss": 1.2777, "grad_norm": 0.6177433133125305, "learning_rate": 0.0002, "epoch": 2.8366013071895426, "step": 4340}, {"loss": 1.2562, "grad_norm": 0.6324988007545471, "learning_rate": 0.0002, "epoch": 2.843137254901961, "step": 4350}, {"loss": 1.4117, "grad_norm": 0.6884300708770752, "learning_rate": 0.0002, "epoch": 2.849673202614379, "step": 4360}, {"loss": 1.2391, "grad_norm": 0.8952897191047668, "learning_rate": 0.0002, "epoch": 2.8562091503267975, "step": 4370}, {"loss": 1.2814, "grad_norm": 1.0260103940963745, "learning_rate": 0.0002, "epoch": 2.8627450980392157, "step": 4380}, {"loss": 1.2893, "grad_norm": 0.9134647250175476, "learning_rate": 0.0002, "epoch": 2.869281045751634, "step": 4390}, {"loss": 1.171, "grad_norm": 0.5637717843055725, "learning_rate": 0.0002, "epoch": 2.8758169934640523, "step": 4400}, {"loss": 1.3422, "grad_norm": 0.7530393004417419, "learning_rate": 0.0002, "epoch": 2.8823529411764706, "step": 4410}, {"loss": 1.29, "grad_norm": 0.7202680706977844, "learning_rate": 0.0002, "epoch": 2.888888888888889, "step": 4420}, {"loss": 1.2913, "grad_norm": 0.7177144885063171, "learning_rate": 0.0002, "epoch": 2.895424836601307, "step": 4430}, {"loss": 1.1922, "grad_norm": 0.5996816754341125, "learning_rate": 0.0002, "epoch": 2.9019607843137254, "step": 4440}, {"loss": 1.4816, "grad_norm": 0.6542447209358215, "learning_rate": 0.0002, "epoch": 2.9084967320261437, "step": 4450}, {"loss": 1.503, "grad_norm": 1.0753740072250366, "learning_rate": 0.0002, "epoch": 2.915032679738562, "step": 4460}, {"loss": 1.3193, "grad_norm": 0.6956136226654053, "learning_rate": 0.0002, "epoch": 2.9215686274509802, "step": 4470}, {"loss": 1.2486, "grad_norm": 0.7702530026435852, "learning_rate": 0.0002, "epoch": 2.928104575163399, "step": 4480}, {"loss": 1.3371, "grad_norm": 0.7763232588768005, "learning_rate": 0.0002, "epoch": 2.9346405228758172, "step": 4490}, {"loss": 1.1647, "grad_norm": 0.6393085718154907, "learning_rate": 0.0002, "epoch": 2.9411764705882355, "step": 4500}, {"loss": 1.211, "grad_norm": 0.987770676612854, "learning_rate": 0.0002, "epoch": 2.947712418300654, "step": 4510}, {"loss": 1.1529, "grad_norm": 0.5995016098022461, "learning_rate": 0.0002, "epoch": 2.954248366013072, "step": 4520}, {"loss": 1.2358, "grad_norm": 0.745650053024292, "learning_rate": 0.0002, "epoch": 2.9607843137254903, "step": 4530}, {"loss": 1.2115, "grad_norm": 0.7429282069206238, "learning_rate": 0.0002, "epoch": 2.9673202614379086, "step": 4540}, {"loss": 1.2262, "grad_norm": 0.5927486419677734, "learning_rate": 0.0002, "epoch": 2.973856209150327, "step": 4550}, {"loss": 1.3173, "grad_norm": 0.6775153875350952, "learning_rate": 0.0002, "epoch": 2.980392156862745, "step": 4560}, {"loss": 1.279, "grad_norm": 0.7128435373306274, "learning_rate": 0.0002, "epoch": 2.9869281045751634, "step": 4570}, {"loss": 1.2451, "grad_norm": 0.7470937967300415, "learning_rate": 0.0002, "epoch": 2.9934640522875817, "step": 4580}, {"loss": 1.2701, "grad_norm": 0.9295375943183899, "learning_rate": 0.0002, "epoch": 3.0, "step": 4590}, {"eval_loss": 1.4131312370300293, "eval_runtime": 31.8967, "eval_samples_per_second": 13.669, "eval_steps_per_second": 1.724, "epoch": 3.0, "step": 4590}, {"loss": 1.1283, "grad_norm": 0.6926420331001282, "learning_rate": 0.0002, "epoch": 3.0065359477124183, "step": 4600}, {"loss": 1.1537, "grad_norm": 0.6656355857849121, "learning_rate": 0.0002, "epoch": 3.0130718954248366, "step": 4610}, {"loss": 1.308, "grad_norm": 0.9901936650276184, "learning_rate": 0.0002, "epoch": 3.019607843137255, "step": 4620}, {"loss": 1.22, "grad_norm": 0.6713474988937378, "learning_rate": 0.0002, "epoch": 3.026143790849673, "step": 4630}, {"loss": 1.2249, "grad_norm": 0.6199324131011963, "learning_rate": 0.0002, "epoch": 3.0326797385620914, "step": 4640}, {"loss": 1.242, "grad_norm": 0.7180785536766052, "learning_rate": 0.0002, "epoch": 3.0392156862745097, "step": 4650}, {"loss": 1.1349, "grad_norm": 0.8256588578224182, "learning_rate": 0.0002, "epoch": 3.045751633986928, "step": 4660}, {"loss": 1.1431, "grad_norm": 0.6637389063835144, "learning_rate": 0.0002, "epoch": 3.052287581699346, "step": 4670}, {"loss": 1.1096, "grad_norm": 0.6980698108673096, "learning_rate": 0.0002, "epoch": 3.0588235294117645, "step": 4680}, {"loss": 1.196, "grad_norm": 0.8091534972190857, "learning_rate": 0.0002, "epoch": 3.065359477124183, "step": 4690}, {"loss": 1.1652, "grad_norm": 0.5715174078941345, "learning_rate": 0.0002, "epoch": 3.0718954248366015, "step": 4700}, {"loss": 1.1427, "grad_norm": 0.735639750957489, "learning_rate": 0.0002, "epoch": 3.0784313725490198, "step": 4710}, {"loss": 1.1522, "grad_norm": 0.7619708180427551, "learning_rate": 0.0002, "epoch": 3.084967320261438, "step": 4720}, {"loss": 1.0853, "grad_norm": 1.263566017150879, "learning_rate": 0.0002, "epoch": 3.0915032679738563, "step": 4730}, {"loss": 1.1348, "grad_norm": 0.6600871682167053, "learning_rate": 0.0002, "epoch": 3.0980392156862746, "step": 4740}, {"loss": 1.1766, "grad_norm": 0.717792809009552, "learning_rate": 0.0002, "epoch": 3.104575163398693, "step": 4750}, {"loss": 1.088, "grad_norm": 0.853714644908905, "learning_rate": 0.0002, "epoch": 3.111111111111111, "step": 4760}, {"loss": 1.2031, "grad_norm": 1.1004153490066528, "learning_rate": 0.0002, "epoch": 3.1176470588235294, "step": 4770}, {"loss": 1.3295, "grad_norm": 0.8566235899925232, "learning_rate": 0.0002, "epoch": 3.1241830065359477, "step": 4780}, {"loss": 1.2436, "grad_norm": 0.8315296173095703, "learning_rate": 0.0002, "epoch": 3.130718954248366, "step": 4790}, {"loss": 1.32, "grad_norm": 0.8020524978637695, "learning_rate": 0.0002, "epoch": 3.1372549019607843, "step": 4800}, {"loss": 1.1238, "grad_norm": 0.7564275860786438, "learning_rate": 0.0002, "epoch": 3.1437908496732025, "step": 4810}, {"loss": 1.1244, "grad_norm": 0.9077776670455933, "learning_rate": 0.0002, "epoch": 3.150326797385621, "step": 4820}, {"loss": 1.1399, "grad_norm": 0.6323099732398987, "learning_rate": 0.0002, "epoch": 3.156862745098039, "step": 4830}, {"loss": 1.1983, "grad_norm": 0.6625368595123291, "learning_rate": 0.0002, "epoch": 3.1633986928104574, "step": 4840}, {"loss": 1.066, "grad_norm": 0.8119261860847473, "learning_rate": 0.0002, "epoch": 3.1699346405228757, "step": 4850}, {"loss": 1.0224, "grad_norm": 0.6399450898170471, "learning_rate": 0.0002, "epoch": 3.176470588235294, "step": 4860}, {"loss": 1.2181, "grad_norm": 1.0659016370773315, "learning_rate": 0.0002, "epoch": 3.183006535947712, "step": 4870}, {"loss": 1.2914, "grad_norm": 0.8040369749069214, "learning_rate": 0.0002, "epoch": 3.189542483660131, "step": 4880}, {"loss": 1.1996, "grad_norm": 0.7784733176231384, "learning_rate": 0.0002, "epoch": 3.196078431372549, "step": 4890}, {"loss": 1.2051, "grad_norm": 0.9660294651985168, "learning_rate": 0.0002, "epoch": 3.2026143790849675, "step": 4900}, {"loss": 1.0419, "grad_norm": 1.0676977634429932, "learning_rate": 0.0002, "epoch": 3.2091503267973858, "step": 4910}, {"loss": 1.0083, "grad_norm": 0.5877565741539001, "learning_rate": 0.0002, "epoch": 3.215686274509804, "step": 4920}, {"loss": 1.1046, "grad_norm": 0.6164032816886902, "learning_rate": 0.0002, "epoch": 3.2222222222222223, "step": 4930}, {"loss": 1.1079, "grad_norm": 0.7627606987953186, "learning_rate": 0.0002, "epoch": 3.2287581699346406, "step": 4940}, {"loss": 1.2453, "grad_norm": 0.7442803978919983, "learning_rate": 0.0002, "epoch": 3.235294117647059, "step": 4950}, {"loss": 1.1087, "grad_norm": 0.7277812361717224, "learning_rate": 0.0002, "epoch": 3.241830065359477, "step": 4960}, {"loss": 1.2237, "grad_norm": 1.0301902294158936, "learning_rate": 0.0002, "epoch": 3.2483660130718954, "step": 4970}, {"loss": 1.1466, "grad_norm": 0.7798232436180115, "learning_rate": 0.0002, "epoch": 3.2549019607843137, "step": 4980}, {"loss": 1.2142, "grad_norm": 1.210265874862671, "learning_rate": 0.0002, "epoch": 3.261437908496732, "step": 4990}, {"loss": 1.1557, "grad_norm": 0.6677713990211487, "learning_rate": 0.0002, "epoch": 3.2679738562091503, "step": 5000}, {"loss": 1.3294, "grad_norm": 1.0524500608444214, "learning_rate": 0.0002, "epoch": 3.2745098039215685, "step": 5010}, {"loss": 1.1939, "grad_norm": 0.7091745734214783, "learning_rate": 0.0002, "epoch": 3.281045751633987, "step": 5020}, {"loss": 1.1891, "grad_norm": 0.8523224592208862, "learning_rate": 0.0002, "epoch": 3.287581699346405, "step": 5030}, {"loss": 1.1925, "grad_norm": 0.6120608448982239, "learning_rate": 0.0002, "epoch": 3.2941176470588234, "step": 5040}, {"loss": 1.0603, "grad_norm": 0.7437472939491272, "learning_rate": 0.0002, "epoch": 3.3006535947712417, "step": 5050}, {"loss": 1.1295, "grad_norm": 0.7611715197563171, "learning_rate": 0.0002, "epoch": 3.30718954248366, "step": 5060}, {"loss": 1.0531, "grad_norm": 0.7249704003334045, "learning_rate": 0.0002, "epoch": 3.313725490196078, "step": 5070}, {"loss": 1.2292, "grad_norm": 0.7316247820854187, "learning_rate": 0.0002, "epoch": 3.3202614379084965, "step": 5080}, {"loss": 1.1974, "grad_norm": 0.562412440776825, "learning_rate": 0.0002, "epoch": 3.326797385620915, "step": 5090}, {"loss": 1.0736, "grad_norm": 0.7052176594734192, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 5100}, {"loss": 1.122, "grad_norm": 0.7714211344718933, "learning_rate": 0.0002, "epoch": 3.3398692810457518, "step": 5110}, {"loss": 1.1684, "grad_norm": 1.0436055660247803, "learning_rate": 0.0002, "epoch": 3.34640522875817, "step": 5120}, {"loss": 1.0945, "grad_norm": 0.8867271542549133, "learning_rate": 0.0002, "epoch": 3.3529411764705883, "step": 5130}, {"loss": 1.159, "grad_norm": 0.8371267914772034, "learning_rate": 0.0002, "epoch": 3.3594771241830066, "step": 5140}, {"loss": 1.1073, "grad_norm": 0.7257837057113647, "learning_rate": 0.0002, "epoch": 3.366013071895425, "step": 5150}, {"loss": 1.1162, "grad_norm": 0.7102002501487732, "learning_rate": 0.0002, "epoch": 3.372549019607843, "step": 5160}, {"loss": 1.2056, "grad_norm": 0.7636350393295288, "learning_rate": 0.0002, "epoch": 3.3790849673202614, "step": 5170}, {"loss": 1.0708, "grad_norm": 0.6887359619140625, "learning_rate": 0.0002, "epoch": 3.3856209150326797, "step": 5180}, {"loss": 1.3807, "grad_norm": 0.8141424655914307, "learning_rate": 0.0002, "epoch": 3.392156862745098, "step": 5190}, {"loss": 1.1986, "grad_norm": 0.694423496723175, "learning_rate": 0.0002, "epoch": 3.3986928104575163, "step": 5200}, {"loss": 1.2945, "grad_norm": 0.914013683795929, "learning_rate": 0.0002, "epoch": 3.4052287581699345, "step": 5210}, {"loss": 1.1413, "grad_norm": 0.8503239750862122, "learning_rate": 0.0002, "epoch": 3.411764705882353, "step": 5220}, {"loss": 1.2696, "grad_norm": 0.6196836233139038, "learning_rate": 0.0002, "epoch": 3.418300653594771, "step": 5230}, {"loss": 1.2431, "grad_norm": 1.0760811567306519, "learning_rate": 0.0002, "epoch": 3.4248366013071894, "step": 5240}, {"loss": 1.1686, "grad_norm": 0.6524698138237, "learning_rate": 0.0002, "epoch": 3.431372549019608, "step": 5250}, {"loss": 1.2012, "grad_norm": 0.674467921257019, "learning_rate": 0.0002, "epoch": 3.4379084967320264, "step": 5260}, {"loss": 1.1015, "grad_norm": 0.7690372467041016, "learning_rate": 0.0002, "epoch": 3.4444444444444446, "step": 5270}, {"loss": 1.2511, "grad_norm": 0.8751813769340515, "learning_rate": 0.0002, "epoch": 3.450980392156863, "step": 5280}, {"loss": 1.1841, "grad_norm": 0.750407874584198, "learning_rate": 0.0002, "epoch": 3.457516339869281, "step": 5290}, {"loss": 1.0605, "grad_norm": 0.5991823077201843, "learning_rate": 0.0002, "epoch": 3.4640522875816995, "step": 5300}, {"loss": 1.2347, "grad_norm": 1.0164772272109985, "learning_rate": 0.0002, "epoch": 3.4705882352941178, "step": 5310}, {"loss": 1.2354, "grad_norm": 0.8704105019569397, "learning_rate": 0.0002, "epoch": 3.477124183006536, "step": 5320}, {"loss": 1.2169, "grad_norm": 0.709102213382721, "learning_rate": 0.0002, "epoch": 3.4836601307189543, "step": 5330}, {"loss": 1.2425, "grad_norm": 0.6273632049560547, "learning_rate": 0.0002, "epoch": 3.4901960784313726, "step": 5340}, {"loss": 1.1585, "grad_norm": 0.6807359457015991, "learning_rate": 0.0002, "epoch": 3.496732026143791, "step": 5350}, {"loss": 1.131, "grad_norm": 0.7085188627243042, "learning_rate": 0.0002, "epoch": 3.503267973856209, "step": 5360}, {"loss": 1.1159, "grad_norm": 0.6938307881355286, "learning_rate": 0.0002, "epoch": 3.5098039215686274, "step": 5370}, {"loss": 1.1397, "grad_norm": 0.8544146418571472, "learning_rate": 0.0002, "epoch": 3.5163398692810457, "step": 5380}, {"loss": 1.2181, "grad_norm": 0.7889642119407654, "learning_rate": 0.0002, "epoch": 3.522875816993464, "step": 5390}, {"loss": 1.1691, "grad_norm": 0.7858421206474304, "learning_rate": 0.0002, "epoch": 3.5294117647058822, "step": 5400}, {"loss": 1.2374, "grad_norm": 0.8547123074531555, "learning_rate": 0.0002, "epoch": 3.5359477124183005, "step": 5410}, {"loss": 1.196, "grad_norm": 0.8218181133270264, "learning_rate": 0.0002, "epoch": 3.542483660130719, "step": 5420}, {"loss": 1.1961, "grad_norm": 1.153623342514038, "learning_rate": 0.0002, "epoch": 3.549019607843137, "step": 5430}, {"loss": 1.156, "grad_norm": 1.1321099996566772, "learning_rate": 0.0002, "epoch": 3.5555555555555554, "step": 5440}, {"loss": 1.2224, "grad_norm": 0.9495334029197693, "learning_rate": 0.0002, "epoch": 3.5620915032679736, "step": 5450}, {"loss": 1.2869, "grad_norm": 0.8743821978569031, "learning_rate": 0.0002, "epoch": 3.568627450980392, "step": 5460}, {"loss": 1.1018, "grad_norm": 0.7513086795806885, "learning_rate": 0.0002, "epoch": 3.57516339869281, "step": 5470}, {"loss": 1.1082, "grad_norm": 1.0139480829238892, "learning_rate": 0.0002, "epoch": 3.581699346405229, "step": 5480}, {"loss": 1.1706, "grad_norm": 0.6615135073661804, "learning_rate": 0.0002, "epoch": 3.588235294117647, "step": 5490}, {"loss": 1.3906, "grad_norm": 1.180798888206482, "learning_rate": 0.0002, "epoch": 3.5947712418300655, "step": 5500}, {"loss": 1.2391, "grad_norm": 0.7085279226303101, "learning_rate": 0.0002, "epoch": 3.6013071895424837, "step": 5510}, {"loss": 1.1623, "grad_norm": 0.540268063545227, "learning_rate": 0.0002, "epoch": 3.607843137254902, "step": 5520}, {"loss": 1.2132, "grad_norm": 0.7905671000480652, "learning_rate": 0.0002, "epoch": 3.6143790849673203, "step": 5530}, {"loss": 1.2731, "grad_norm": 0.8457717299461365, "learning_rate": 0.0002, "epoch": 3.6209150326797386, "step": 5540}, {"loss": 1.1799, "grad_norm": 0.7102677822113037, "learning_rate": 0.0002, "epoch": 3.627450980392157, "step": 5550}, {"loss": 1.2394, "grad_norm": 0.7179514765739441, "learning_rate": 0.0002, "epoch": 3.633986928104575, "step": 5560}, {"loss": 1.2019, "grad_norm": 1.0854148864746094, "learning_rate": 0.0002, "epoch": 3.6405228758169934, "step": 5570}, {"loss": 1.1986, "grad_norm": 0.8209951519966125, "learning_rate": 0.0002, "epoch": 3.6470588235294117, "step": 5580}, {"loss": 1.2289, "grad_norm": 0.6944138407707214, "learning_rate": 0.0002, "epoch": 3.65359477124183, "step": 5590}, {"loss": 1.3226, "grad_norm": 0.7675473093986511, "learning_rate": 0.0002, "epoch": 3.6601307189542482, "step": 5600}, {"loss": 1.2866, "grad_norm": 0.6683364510536194, "learning_rate": 0.0002, "epoch": 3.6666666666666665, "step": 5610}, {"loss": 1.1099, "grad_norm": 0.7920727133750916, "learning_rate": 0.0002, "epoch": 3.6732026143790852, "step": 5620}, {"loss": 1.2287, "grad_norm": 0.9440218806266785, "learning_rate": 0.0002, "epoch": 3.6797385620915035, "step": 5630}, {"loss": 1.2444, "grad_norm": 0.6600824594497681, "learning_rate": 0.0002, "epoch": 3.686274509803922, "step": 5640}, {"loss": 1.191, "grad_norm": 0.6860619187355042, "learning_rate": 0.0002, "epoch": 3.69281045751634, "step": 5650}, {"loss": 1.1914, "grad_norm": 0.6579713225364685, "learning_rate": 0.0002, "epoch": 3.6993464052287583, "step": 5660}, {"loss": 1.1464, "grad_norm": 0.661081075668335, "learning_rate": 0.0002, "epoch": 3.7058823529411766, "step": 5670}, {"loss": 1.289, "grad_norm": 1.0968825817108154, "learning_rate": 0.0002, "epoch": 3.712418300653595, "step": 5680}, {"loss": 1.192, "grad_norm": 0.8066844940185547, "learning_rate": 0.0002, "epoch": 3.718954248366013, "step": 5690}, {"loss": 1.2322, "grad_norm": 0.8341682553291321, "learning_rate": 0.0002, "epoch": 3.7254901960784315, "step": 5700}, {"loss": 1.1473, "grad_norm": 0.6682852506637573, "learning_rate": 0.0002, "epoch": 3.7320261437908497, "step": 5710}, {"loss": 1.1566, "grad_norm": 0.898595929145813, "learning_rate": 0.0002, "epoch": 3.738562091503268, "step": 5720}, {"loss": 1.0919, "grad_norm": 0.6876054406166077, "learning_rate": 0.0002, "epoch": 3.7450980392156863, "step": 5730}, {"loss": 1.2302, "grad_norm": 0.7817103266716003, "learning_rate": 0.0002, "epoch": 3.7516339869281046, "step": 5740}, {"loss": 1.2439, "grad_norm": 0.5840168595314026, "learning_rate": 0.0002, "epoch": 3.758169934640523, "step": 5750}, {"loss": 1.1279, "grad_norm": 0.6263918876647949, "learning_rate": 0.0002, "epoch": 3.764705882352941, "step": 5760}, {"loss": 1.2023, "grad_norm": 0.7948952317237854, "learning_rate": 0.0002, "epoch": 3.7712418300653594, "step": 5770}, {"loss": 1.149, "grad_norm": 0.6700998544692993, "learning_rate": 0.0002, "epoch": 3.7777777777777777, "step": 5780}, {"loss": 1.3207, "grad_norm": 1.1169519424438477, "learning_rate": 0.0002, "epoch": 3.784313725490196, "step": 5790}, {"loss": 1.064, "grad_norm": 0.8354471325874329, "learning_rate": 0.0002, "epoch": 3.7908496732026142, "step": 5800}, {"loss": 1.2104, "grad_norm": 0.6304181814193726, "learning_rate": 0.0002, "epoch": 3.7973856209150325, "step": 5810}, {"loss": 1.2059, "grad_norm": 0.6919655799865723, "learning_rate": 0.0002, "epoch": 3.803921568627451, "step": 5820}, {"loss": 1.217, "grad_norm": 0.600385844707489, "learning_rate": 0.0002, "epoch": 3.810457516339869, "step": 5830}, {"loss": 1.2324, "grad_norm": 0.8406319618225098, "learning_rate": 0.0002, "epoch": 3.8169934640522873, "step": 5840}, {"loss": 1.2418, "grad_norm": 0.7594282031059265, "learning_rate": 0.0002, "epoch": 3.8235294117647056, "step": 5850}, {"loss": 1.1903, "grad_norm": 0.8179879784584045, "learning_rate": 0.0002, "epoch": 3.8300653594771243, "step": 5860}, {"loss": 1.255, "grad_norm": 1.141430377960205, "learning_rate": 0.0002, "epoch": 3.8366013071895426, "step": 5870}, {"loss": 1.1467, "grad_norm": 0.6595550775527954, "learning_rate": 0.0002, "epoch": 3.843137254901961, "step": 5880}, {"loss": 1.2378, "grad_norm": 0.7499435544013977, "learning_rate": 0.0002, "epoch": 3.849673202614379, "step": 5890}, {"loss": 1.217, "grad_norm": 0.7851517200469971, "learning_rate": 0.0002, "epoch": 3.8562091503267975, "step": 5900}, {"loss": 1.162, "grad_norm": 1.0533545017242432, "learning_rate": 0.0002, "epoch": 3.8627450980392157, "step": 5910}, {"loss": 1.3576, "grad_norm": 0.960086464881897, "learning_rate": 0.0002, "epoch": 3.869281045751634, "step": 5920}, {"loss": 1.151, "grad_norm": 0.9952049851417542, "learning_rate": 0.0002, "epoch": 3.8758169934640523, "step": 5930}, {"loss": 1.2027, "grad_norm": 0.7884191274642944, "learning_rate": 0.0002, "epoch": 3.8823529411764706, "step": 5940}, {"loss": 1.1796, "grad_norm": 0.7461766600608826, "learning_rate": 0.0002, "epoch": 3.888888888888889, "step": 5950}, {"loss": 1.2251, "grad_norm": 0.9594355821609497, "learning_rate": 0.0002, "epoch": 3.895424836601307, "step": 5960}, {"loss": 1.1164, "grad_norm": 0.8179471492767334, "learning_rate": 0.0002, "epoch": 3.9019607843137254, "step": 5970}, {"loss": 1.2421, "grad_norm": 0.8240267634391785, "learning_rate": 0.0002, "epoch": 3.9084967320261437, "step": 5980}, {"loss": 1.3076, "grad_norm": 0.7462618350982666, "learning_rate": 0.0002, "epoch": 3.915032679738562, "step": 5990}, {"loss": 1.2124, "grad_norm": 0.711207389831543, "learning_rate": 0.0002, "epoch": 3.9215686274509802, "step": 6000}, {"loss": 1.2119, "grad_norm": 0.6910956501960754, "learning_rate": 0.0002, "epoch": 3.928104575163399, "step": 6010}, {"loss": 1.2127, "grad_norm": 0.749093770980835, "learning_rate": 0.0002, "epoch": 3.9346405228758172, "step": 6020}, {"loss": 1.1542, "grad_norm": 1.3332762718200684, "learning_rate": 0.0002, "epoch": 3.9411764705882355, "step": 6030}, {"loss": 1.1442, "grad_norm": 0.71457439661026, "learning_rate": 0.0002, "epoch": 3.947712418300654, "step": 6040}, {"loss": 1.339, "grad_norm": 1.1205238103866577, "learning_rate": 0.0002, "epoch": 3.954248366013072, "step": 6050}, {"loss": 1.2962, "grad_norm": 0.6958928108215332, "learning_rate": 0.0002, "epoch": 3.9607843137254903, "step": 6060}, {"loss": 1.1802, "grad_norm": 0.7518056035041809, "learning_rate": 0.0002, "epoch": 3.9673202614379086, "step": 6070}, {"loss": 1.1179, "grad_norm": 0.8010755777359009, "learning_rate": 0.0002, "epoch": 3.973856209150327, "step": 6080}, {"loss": 1.2867, "grad_norm": 0.7492658495903015, "learning_rate": 0.0002, "epoch": 3.980392156862745, "step": 6090}, {"loss": 1.2113, "grad_norm": 0.900704562664032, "learning_rate": 0.0002, "epoch": 3.9869281045751634, "step": 6100}, {"loss": 1.1106, "grad_norm": 0.7997331619262695, "learning_rate": 0.0002, "epoch": 3.9934640522875817, "step": 6110}, {"loss": 1.1244, "grad_norm": 0.7163209319114685, "learning_rate": 0.0002, "epoch": 4.0, "step": 6120}, {"eval_loss": 1.4113320112228394, "eval_runtime": 33.7199, "eval_samples_per_second": 12.93, "eval_steps_per_second": 1.631, "epoch": 4.0, "step": 6120}, {"loss": 1.0423, "grad_norm": 0.9527022838592529, "learning_rate": 0.0002, "epoch": 4.006535947712418, "step": 6130}, {"loss": 1.101, "grad_norm": 0.7603210210800171, "learning_rate": 0.0002, "epoch": 4.0130718954248366, "step": 6140}, {"loss": 1.1834, "grad_norm": 1.127387523651123, "learning_rate": 0.0002, "epoch": 4.019607843137255, "step": 6150}, {"loss": 1.0734, "grad_norm": 0.8290133476257324, "learning_rate": 0.0002, "epoch": 4.026143790849673, "step": 6160}, {"loss": 1.0785, "grad_norm": 0.9912241101264954, "learning_rate": 0.0002, "epoch": 4.032679738562091, "step": 6170}, {"loss": 1.0719, "grad_norm": 0.947005033493042, "learning_rate": 0.0002, "epoch": 4.03921568627451, "step": 6180}, {"loss": 1.0835, "grad_norm": 0.707466185092926, "learning_rate": 0.0002, "epoch": 4.045751633986928, "step": 6190}, {"loss": 1.1079, "grad_norm": 1.0604327917099, "learning_rate": 0.0002, "epoch": 4.052287581699346, "step": 6200}, {"loss": 1.0375, "grad_norm": 0.7848685383796692, "learning_rate": 0.0002, "epoch": 4.0588235294117645, "step": 6210}, {"loss": 1.1167, "grad_norm": 0.8475256562232971, "learning_rate": 0.0002, "epoch": 4.065359477124183, "step": 6220}, {"loss": 1.1104, "grad_norm": 0.9759448766708374, "learning_rate": 0.0002, "epoch": 4.071895424836601, "step": 6230}, {"loss": 1.1538, "grad_norm": 0.9324519038200378, "learning_rate": 0.0002, "epoch": 4.078431372549019, "step": 6240}, {"loss": 1.0817, "grad_norm": 0.8723901510238647, "learning_rate": 0.0002, "epoch": 4.084967320261438, "step": 6250}, {"loss": 1.0977, "grad_norm": 0.8343415856361389, "learning_rate": 0.0002, "epoch": 4.091503267973856, "step": 6260}, {"loss": 0.9887, "grad_norm": 0.7490310072898865, "learning_rate": 0.0002, "epoch": 4.098039215686274, "step": 6270}, {"loss": 1.2084, "grad_norm": 0.8961182832717896, "learning_rate": 0.0002, "epoch": 4.104575163398692, "step": 6280}, {"loss": 1.1349, "grad_norm": 0.7124854922294617, "learning_rate": 0.0002, "epoch": 4.111111111111111, "step": 6290}, {"loss": 1.0081, "grad_norm": 0.8338138461112976, "learning_rate": 0.0002, "epoch": 4.117647058823529, "step": 6300}, {"loss": 1.1091, "grad_norm": 0.8075833320617676, "learning_rate": 0.0002, "epoch": 4.124183006535947, "step": 6310}, {"loss": 1.0193, "grad_norm": 0.8069391846656799, "learning_rate": 0.0002, "epoch": 4.130718954248366, "step": 6320}, {"loss": 0.948, "grad_norm": 0.9567893147468567, "learning_rate": 0.0002, "epoch": 4.137254901960785, "step": 6330}, {"loss": 1.0241, "grad_norm": 1.2184662818908691, "learning_rate": 0.0002, "epoch": 4.143790849673203, "step": 6340}, {"loss": 1.0756, "grad_norm": 1.030976414680481, "learning_rate": 0.0002, "epoch": 4.150326797385621, "step": 6350}, {"loss": 1.1124, "grad_norm": 0.9749957323074341, "learning_rate": 0.0002, "epoch": 4.1568627450980395, "step": 6360}, {"loss": 1.1038, "grad_norm": 0.7089483141899109, "learning_rate": 0.0002, "epoch": 4.163398692810458, "step": 6370}, {"loss": 1.2175, "grad_norm": 1.1084946393966675, "learning_rate": 0.0002, "epoch": 4.169934640522876, "step": 6380}, {"loss": 1.0274, "grad_norm": 0.7998497486114502, "learning_rate": 0.0002, "epoch": 4.176470588235294, "step": 6390}, {"loss": 1.005, "grad_norm": 0.8997811675071716, "learning_rate": 0.0002, "epoch": 4.183006535947713, "step": 6400}, {"loss": 1.0704, "grad_norm": 0.8359479904174805, "learning_rate": 0.0002, "epoch": 4.189542483660131, "step": 6410}, {"loss": 1.1056, "grad_norm": 0.9087472558021545, "learning_rate": 0.0002, "epoch": 4.196078431372549, "step": 6420}, {"loss": 1.0657, "grad_norm": 1.1100451946258545, "learning_rate": 0.0002, "epoch": 4.2026143790849675, "step": 6430}, {"loss": 1.1443, "grad_norm": 0.9376999735832214, "learning_rate": 0.0002, "epoch": 4.209150326797386, "step": 6440}, {"loss": 1.0862, "grad_norm": 0.8179266452789307, "learning_rate": 0.0002, "epoch": 4.215686274509804, "step": 6450}, {"loss": 1.0679, "grad_norm": 0.9953271746635437, "learning_rate": 0.0002, "epoch": 4.222222222222222, "step": 6460}, {"loss": 1.1034, "grad_norm": 0.8476650714874268, "learning_rate": 0.0002, "epoch": 4.228758169934641, "step": 6470}, {"loss": 1.2512, "grad_norm": 0.8406323194503784, "learning_rate": 0.0002, "epoch": 4.235294117647059, "step": 6480}, {"loss": 1.057, "grad_norm": 0.819134533405304, "learning_rate": 0.0002, "epoch": 4.241830065359477, "step": 6490}, {"loss": 1.1082, "grad_norm": 0.7764983773231506, "learning_rate": 0.0002, "epoch": 4.248366013071895, "step": 6500}, {"loss": 1.1593, "grad_norm": 0.8252112865447998, "learning_rate": 0.0002, "epoch": 4.254901960784314, "step": 6510}, {"loss": 1.1369, "grad_norm": 0.7941019535064697, "learning_rate": 0.0002, "epoch": 4.261437908496732, "step": 6520}, {"loss": 1.0296, "grad_norm": 0.7673905491828918, "learning_rate": 0.0002, "epoch": 4.26797385620915, "step": 6530}, {"loss": 1.1387, "grad_norm": 0.8749890327453613, "learning_rate": 0.0002, "epoch": 4.2745098039215685, "step": 6540}, {"loss": 1.0595, "grad_norm": 0.7343207597732544, "learning_rate": 0.0002, "epoch": 4.281045751633987, "step": 6550}, {"loss": 1.1715, "grad_norm": 1.2786651849746704, "learning_rate": 0.0002, "epoch": 4.287581699346405, "step": 6560}, {"loss": 1.0514, "grad_norm": 1.316875696182251, "learning_rate": 0.0002, "epoch": 4.294117647058823, "step": 6570}, {"loss": 1.1125, "grad_norm": 0.8349189162254333, "learning_rate": 0.0002, "epoch": 4.300653594771242, "step": 6580}, {"loss": 1.0732, "grad_norm": 0.7510647177696228, "learning_rate": 0.0002, "epoch": 4.30718954248366, "step": 6590}, {"loss": 1.1387, "grad_norm": 0.932420551776886, "learning_rate": 0.0002, "epoch": 4.313725490196078, "step": 6600}, {"loss": 1.1115, "grad_norm": 0.8510616421699524, "learning_rate": 0.0002, "epoch": 4.3202614379084965, "step": 6610}, {"loss": 1.0957, "grad_norm": 0.7661547064781189, "learning_rate": 0.0002, "epoch": 4.326797385620915, "step": 6620}, {"loss": 1.2064, "grad_norm": 1.0370930433273315, "learning_rate": 0.0002, "epoch": 4.333333333333333, "step": 6630}, {"loss": 1.1064, "grad_norm": 0.9302158951759338, "learning_rate": 0.0002, "epoch": 4.339869281045751, "step": 6640}, {"loss": 0.968, "grad_norm": 0.9203811883926392, "learning_rate": 0.0002, "epoch": 4.34640522875817, "step": 6650}, {"loss": 1.0123, "grad_norm": 0.9986332654953003, "learning_rate": 0.0002, "epoch": 4.352941176470588, "step": 6660}, {"loss": 1.1079, "grad_norm": 0.8001713156700134, "learning_rate": 0.0002, "epoch": 4.359477124183006, "step": 6670}, {"loss": 1.0248, "grad_norm": 0.829714298248291, "learning_rate": 0.0002, "epoch": 4.366013071895424, "step": 6680}, {"loss": 1.0389, "grad_norm": 0.8253079056739807, "learning_rate": 0.0002, "epoch": 4.372549019607844, "step": 6690}, {"loss": 1.1087, "grad_norm": 0.824666440486908, "learning_rate": 0.0002, "epoch": 4.379084967320262, "step": 6700}, {"loss": 1.1968, "grad_norm": 0.8872972130775452, "learning_rate": 0.0002, "epoch": 4.38562091503268, "step": 6710}, {"loss": 1.0474, "grad_norm": 0.8729761838912964, "learning_rate": 0.0002, "epoch": 4.392156862745098, "step": 6720}, {"loss": 1.0961, "grad_norm": 1.1367264986038208, "learning_rate": 0.0002, "epoch": 4.398692810457517, "step": 6730}, {"loss": 1.0184, "grad_norm": 0.9699058532714844, "learning_rate": 0.0002, "epoch": 4.405228758169935, "step": 6740}, {"loss": 1.006, "grad_norm": 0.8266763687133789, "learning_rate": 0.0002, "epoch": 4.411764705882353, "step": 6750}, {"loss": 1.0735, "grad_norm": 1.0249767303466797, "learning_rate": 0.0002, "epoch": 4.4183006535947715, "step": 6760}, {"loss": 1.1726, "grad_norm": 0.73606938123703, "learning_rate": 0.0002, "epoch": 4.42483660130719, "step": 6770}, {"loss": 1.1037, "grad_norm": 1.4050679206848145, "learning_rate": 0.0002, "epoch": 4.431372549019608, "step": 6780}, {"loss": 1.1418, "grad_norm": 1.1114081144332886, "learning_rate": 0.0002, "epoch": 4.437908496732026, "step": 6790}, {"loss": 0.9682, "grad_norm": 0.8031067848205566, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 6800}, {"loss": 1.0753, "grad_norm": 0.8513566851615906, "learning_rate": 0.0002, "epoch": 4.450980392156863, "step": 6810}, {"loss": 1.1852, "grad_norm": 1.332741379737854, "learning_rate": 0.0002, "epoch": 4.457516339869281, "step": 6820}, {"loss": 1.0966, "grad_norm": 1.5032578706741333, "learning_rate": 0.0002, "epoch": 4.4640522875816995, "step": 6830}, {"loss": 1.1124, "grad_norm": 0.7677283883094788, "learning_rate": 0.0002, "epoch": 4.470588235294118, "step": 6840}, {"loss": 1.1501, "grad_norm": 0.989148736000061, "learning_rate": 0.0002, "epoch": 4.477124183006536, "step": 6850}, {"loss": 1.2239, "grad_norm": 1.5316275358200073, "learning_rate": 0.0002, "epoch": 4.483660130718954, "step": 6860}, {"loss": 1.1171, "grad_norm": 0.9427124261856079, "learning_rate": 0.0002, "epoch": 4.490196078431373, "step": 6870}, {"loss": 1.1314, "grad_norm": 1.215287685394287, "learning_rate": 0.0002, "epoch": 4.496732026143791, "step": 6880}, {"loss": 1.0809, "grad_norm": 0.7286760210990906, "learning_rate": 0.0002, "epoch": 4.503267973856209, "step": 6890}, {"loss": 1.0179, "grad_norm": 0.874829888343811, "learning_rate": 0.0002, "epoch": 4.509803921568627, "step": 6900}, {"loss": 1.0233, "grad_norm": 0.8058359622955322, "learning_rate": 0.0002, "epoch": 4.516339869281046, "step": 6910}, {"loss": 1.0463, "grad_norm": 1.248195767402649, "learning_rate": 0.0002, "epoch": 4.522875816993464, "step": 6920}, {"loss": 1.0347, "grad_norm": 0.8033645749092102, "learning_rate": 0.0002, "epoch": 4.529411764705882, "step": 6930}, {"loss": 1.1068, "grad_norm": 1.7361950874328613, "learning_rate": 0.0002, "epoch": 4.5359477124183005, "step": 6940}, {"loss": 0.9856, "grad_norm": 0.8058095574378967, "learning_rate": 0.0002, "epoch": 4.542483660130719, "step": 6950}, {"loss": 1.0057, "grad_norm": 1.254089593887329, "learning_rate": 0.0002, "epoch": 4.549019607843137, "step": 6960}, {"loss": 1.1723, "grad_norm": 0.9180455803871155, "learning_rate": 0.0002, "epoch": 4.555555555555555, "step": 6970}, {"loss": 1.0559, "grad_norm": 0.6677682399749756, "learning_rate": 0.0002, "epoch": 4.562091503267974, "step": 6980}, {"loss": 1.0453, "grad_norm": 0.8127354383468628, "learning_rate": 0.0002, "epoch": 4.568627450980392, "step": 6990}, {"loss": 1.0828, "grad_norm": 1.0263001918792725, "learning_rate": 0.0002, "epoch": 4.57516339869281, "step": 7000}, {"loss": 1.0703, "grad_norm": 0.9641909003257751, "learning_rate": 0.0002, "epoch": 4.5816993464052285, "step": 7010}, {"loss": 1.179, "grad_norm": 0.9440861344337463, "learning_rate": 0.0002, "epoch": 4.588235294117647, "step": 7020}, {"loss": 1.0931, "grad_norm": 0.9539011716842651, "learning_rate": 0.0002, "epoch": 4.594771241830065, "step": 7030}, {"loss": 1.0963, "grad_norm": 1.0449910163879395, "learning_rate": 0.0002, "epoch": 4.601307189542483, "step": 7040}, {"loss": 0.9944, "grad_norm": 0.8766893744468689, "learning_rate": 0.0002, "epoch": 4.607843137254902, "step": 7050}, {"loss": 1.0169, "grad_norm": 0.6983462572097778, "learning_rate": 0.0002, "epoch": 4.61437908496732, "step": 7060}, {"loss": 1.1778, "grad_norm": 0.9505505561828613, "learning_rate": 0.0002, "epoch": 4.620915032679738, "step": 7070}, {"loss": 1.121, "grad_norm": 1.2506657838821411, "learning_rate": 0.0002, "epoch": 4.627450980392156, "step": 7080}, {"loss": 1.1329, "grad_norm": 0.9602801203727722, "learning_rate": 0.0002, "epoch": 4.633986928104575, "step": 7090}, {"loss": 1.1499, "grad_norm": 0.7398977875709534, "learning_rate": 0.0002, "epoch": 4.640522875816993, "step": 7100}, {"loss": 1.0769, "grad_norm": 1.3862425088882446, "learning_rate": 0.0002, "epoch": 4.647058823529412, "step": 7110}, {"loss": 1.0571, "grad_norm": 1.1451990604400635, "learning_rate": 0.0002, "epoch": 4.65359477124183, "step": 7120}, {"loss": 1.1271, "grad_norm": 0.9010422229766846, "learning_rate": 0.0002, "epoch": 4.660130718954249, "step": 7130}, {"loss": 1.0165, "grad_norm": 0.7102518081665039, "learning_rate": 0.0002, "epoch": 4.666666666666667, "step": 7140}, {"loss": 1.0819, "grad_norm": 0.7963796257972717, "learning_rate": 0.0002, "epoch": 4.673202614379085, "step": 7150}, {"loss": 1.1114, "grad_norm": 0.7726007699966431, "learning_rate": 0.0002, "epoch": 4.6797385620915035, "step": 7160}, {"loss": 1.2088, "grad_norm": 0.8097564578056335, "learning_rate": 0.0002, "epoch": 4.686274509803922, "step": 7170}, {"loss": 1.1386, "grad_norm": 0.9070925116539001, "learning_rate": 0.0002, "epoch": 4.69281045751634, "step": 7180}, {"loss": 1.0315, "grad_norm": 0.7543528079986572, "learning_rate": 0.0002, "epoch": 4.699346405228758, "step": 7190}, {"loss": 1.0984, "grad_norm": 0.9900904893875122, "learning_rate": 0.0002, "epoch": 4.705882352941177, "step": 7200}, {"loss": 1.1552, "grad_norm": 0.8033412098884583, "learning_rate": 0.0002, "epoch": 4.712418300653595, "step": 7210}, {"loss": 1.1773, "grad_norm": 0.8440839052200317, "learning_rate": 0.0002, "epoch": 4.718954248366013, "step": 7220}, {"loss": 1.1258, "grad_norm": 0.9325555562973022, "learning_rate": 0.0002, "epoch": 4.7254901960784315, "step": 7230}, {"loss": 1.1384, "grad_norm": 0.7881146669387817, "learning_rate": 0.0002, "epoch": 4.73202614379085, "step": 7240}, {"loss": 1.1219, "grad_norm": 0.884453296661377, "learning_rate": 0.0002, "epoch": 4.738562091503268, "step": 7250}, {"loss": 1.1036, "grad_norm": 0.9274539351463318, "learning_rate": 0.0002, "epoch": 4.745098039215686, "step": 7260}, {"loss": 1.0906, "grad_norm": 1.2367479801177979, "learning_rate": 0.0002, "epoch": 4.751633986928105, "step": 7270}, {"loss": 1.0741, "grad_norm": 0.9499821066856384, "learning_rate": 0.0002, "epoch": 4.758169934640523, "step": 7280}, {"loss": 1.1625, "grad_norm": 2.1918580532073975, "learning_rate": 0.0002, "epoch": 4.764705882352941, "step": 7290}, {"loss": 0.954, "grad_norm": 0.8221880793571472, "learning_rate": 0.0002, "epoch": 4.771241830065359, "step": 7300}, {"loss": 1.1358, "grad_norm": 0.871972918510437, "learning_rate": 0.0002, "epoch": 4.777777777777778, "step": 7310}, {"loss": 1.0599, "grad_norm": 0.8034510612487793, "learning_rate": 0.0002, "epoch": 4.784313725490196, "step": 7320}, {"loss": 1.1059, "grad_norm": 0.8959605693817139, "learning_rate": 0.0002, "epoch": 4.790849673202614, "step": 7330}, {"loss": 1.0176, "grad_norm": 1.2326215505599976, "learning_rate": 0.0002, "epoch": 4.7973856209150325, "step": 7340}, {"loss": 1.1095, "grad_norm": 0.9725791811943054, "learning_rate": 0.0002, "epoch": 4.803921568627451, "step": 7350}, {"loss": 1.1229, "grad_norm": 0.7240816354751587, "learning_rate": 0.0002, "epoch": 4.810457516339869, "step": 7360}, {"loss": 1.0669, "grad_norm": 0.8265769481658936, "learning_rate": 0.0002, "epoch": 4.816993464052287, "step": 7370}, {"loss": 1.042, "grad_norm": 0.8888696432113647, "learning_rate": 0.0002, "epoch": 4.823529411764706, "step": 7380}, {"loss": 1.0981, "grad_norm": 0.7776556015014648, "learning_rate": 0.0002, "epoch": 4.830065359477124, "step": 7390}, {"loss": 1.0819, "grad_norm": 0.8772371411323547, "learning_rate": 0.0002, "epoch": 4.836601307189542, "step": 7400}, {"loss": 1.0819, "grad_norm": 0.9786531925201416, "learning_rate": 0.0002, "epoch": 4.8431372549019605, "step": 7410}, {"loss": 1.1358, "grad_norm": 0.9059745073318481, "learning_rate": 0.0002, "epoch": 4.849673202614379, "step": 7420}, {"loss": 1.0324, "grad_norm": 0.7422552108764648, "learning_rate": 0.0002, "epoch": 4.856209150326797, "step": 7430}, {"loss": 1.0423, "grad_norm": 1.3040380477905273, "learning_rate": 0.0002, "epoch": 4.862745098039216, "step": 7440}, {"loss": 1.1161, "grad_norm": 1.3278473615646362, "learning_rate": 0.0002, "epoch": 4.8692810457516345, "step": 7450}, {"loss": 1.0713, "grad_norm": 1.2705849409103394, "learning_rate": 0.0002, "epoch": 4.875816993464053, "step": 7460}, {"loss": 1.0034, "grad_norm": 0.8837892413139343, "learning_rate": 0.0002, "epoch": 4.882352941176471, "step": 7470}, {"loss": 1.1716, "grad_norm": 0.8670691251754761, "learning_rate": 0.0002, "epoch": 4.888888888888889, "step": 7480}, {"loss": 1.1723, "grad_norm": 0.9662758111953735, "learning_rate": 0.0002, "epoch": 4.895424836601308, "step": 7490}, {"loss": 1.1056, "grad_norm": 0.8188302516937256, "learning_rate": 0.0002, "epoch": 4.901960784313726, "step": 7500}, {"loss": 1.0419, "grad_norm": 0.769442617893219, "learning_rate": 0.0002, "epoch": 4.908496732026144, "step": 7510}, {"loss": 1.1671, "grad_norm": 1.1465084552764893, "learning_rate": 0.0002, "epoch": 4.915032679738562, "step": 7520}, {"loss": 1.0768, "grad_norm": 1.253214955329895, "learning_rate": 0.0002, "epoch": 4.921568627450981, "step": 7530}, {"loss": 1.011, "grad_norm": 0.7922375202178955, "learning_rate": 0.0002, "epoch": 4.928104575163399, "step": 7540}, {"loss": 1.1256, "grad_norm": 0.8306851387023926, "learning_rate": 0.0002, "epoch": 4.934640522875817, "step": 7550}, {"loss": 1.206, "grad_norm": 0.8486151099205017, "learning_rate": 0.0002, "epoch": 4.9411764705882355, "step": 7560}, {"loss": 1.0161, "grad_norm": 1.2601467370986938, "learning_rate": 0.0002, "epoch": 4.947712418300654, "step": 7570}, {"loss": 1.1078, "grad_norm": 0.7980747818946838, "learning_rate": 0.0002, "epoch": 4.954248366013072, "step": 7580}, {"loss": 1.0607, "grad_norm": 0.8653254508972168, "learning_rate": 0.0002, "epoch": 4.96078431372549, "step": 7590}, {"loss": 1.0292, "grad_norm": 0.9680571556091309, "learning_rate": 0.0002, "epoch": 4.967320261437909, "step": 7600}, {"loss": 1.1795, "grad_norm": 0.9554466605186462, "learning_rate": 0.0002, "epoch": 4.973856209150327, "step": 7610}, {"loss": 1.0935, "grad_norm": 1.3693897724151611, "learning_rate": 0.0002, "epoch": 4.980392156862745, "step": 7620}, {"loss": 1.0838, "grad_norm": 0.7809282541275024, "learning_rate": 0.0002, "epoch": 4.9869281045751634, "step": 7630}, {"loss": 1.0844, "grad_norm": 0.7528006434440613, "learning_rate": 0.0002, "epoch": 4.993464052287582, "step": 7640}, {"loss": 0.9951, "grad_norm": 1.7491309642791748, "learning_rate": 0.0002, "epoch": 5.0, "step": 7650}]} +{"epoch": 6.0, "step": 9180, "epoch_duration": 1806.2337534427643, "total_accumulated_duration": 10291.616841077805, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.1748046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7451, "grad_norm": 1.5105072259902954, "learning_rate": 0.0002, "epoch": 0.006535947712418301, "step": 10}, {"loss": 3.3158, "grad_norm": 2.1156165599823, "learning_rate": 0.0002, "epoch": 0.013071895424836602, "step": 20}, {"loss": 2.643, "grad_norm": 1.0578808784484863, "learning_rate": 0.0002, "epoch": 0.0196078431372549, "step": 30}, {"loss": 2.3948, "grad_norm": 2.725064516067505, "learning_rate": 0.0002, "epoch": 0.026143790849673203, "step": 40}, {"loss": 2.3134, "grad_norm": 2.9575750827789307, "learning_rate": 0.0002, "epoch": 0.032679738562091505, "step": 50}, {"loss": 2.2778, "grad_norm": 1.2158117294311523, "learning_rate": 0.0002, "epoch": 0.0392156862745098, "step": 60}, {"loss": 1.9742, "grad_norm": 1.0850954055786133, "learning_rate": 0.0002, "epoch": 0.0457516339869281, "step": 70}, {"loss": 1.8872, "grad_norm": 1.299196720123291, "learning_rate": 0.0002, "epoch": 0.05228758169934641, "step": 80}, {"loss": 1.947, "grad_norm": 0.8310191035270691, "learning_rate": 0.0002, "epoch": 0.058823529411764705, "step": 90}, {"loss": 1.9098, "grad_norm": 0.9854435920715332, "learning_rate": 0.0002, "epoch": 0.06535947712418301, "step": 100}, {"loss": 1.7508, "grad_norm": 0.7951157689094543, "learning_rate": 0.0002, "epoch": 0.0718954248366013, "step": 110}, {"loss": 1.9035, "grad_norm": 0.7593062520027161, "learning_rate": 0.0002, "epoch": 0.0784313725490196, "step": 120}, {"loss": 1.8517, "grad_norm": 0.6783032417297363, "learning_rate": 0.0002, "epoch": 0.08496732026143791, "step": 130}, {"loss": 1.6805, "grad_norm": 0.8350756764411926, "learning_rate": 0.0002, "epoch": 0.0915032679738562, "step": 140}, {"loss": 1.6123, "grad_norm": 1.0203173160552979, "learning_rate": 0.0002, "epoch": 0.09803921568627451, "step": 150}, {"loss": 1.7248, "grad_norm": 0.8820539712905884, "learning_rate": 0.0002, "epoch": 0.10457516339869281, "step": 160}, {"loss": 1.6762, "grad_norm": 0.7286128997802734, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 170}, {"loss": 1.8841, "grad_norm": 0.7874041795730591, "learning_rate": 0.0002, "epoch": 0.11764705882352941, "step": 180}, {"loss": 1.5656, "grad_norm": 0.6630475521087646, "learning_rate": 0.0002, "epoch": 0.12418300653594772, "step": 190}, {"loss": 1.6149, "grad_norm": 0.686413586139679, "learning_rate": 0.0002, "epoch": 0.13071895424836602, "step": 200}, {"loss": 1.6227, "grad_norm": 0.7793629765510559, "learning_rate": 0.0002, "epoch": 0.13725490196078433, "step": 210}, {"loss": 1.7223, "grad_norm": 0.6893141865730286, "learning_rate": 0.0002, "epoch": 0.1437908496732026, "step": 220}, {"loss": 1.6808, "grad_norm": 0.5804724097251892, "learning_rate": 0.0002, "epoch": 0.1503267973856209, "step": 230}, {"loss": 1.5578, "grad_norm": 0.6053574085235596, "learning_rate": 0.0002, "epoch": 0.1568627450980392, "step": 240}, {"loss": 1.7394, "grad_norm": 0.7566025853157043, "learning_rate": 0.0002, "epoch": 0.16339869281045752, "step": 250}, {"loss": 1.6216, "grad_norm": 0.6112990975379944, "learning_rate": 0.0002, "epoch": 0.16993464052287582, "step": 260}, {"loss": 1.5564, "grad_norm": 0.6839066743850708, "learning_rate": 0.0002, "epoch": 0.17647058823529413, "step": 270}, {"loss": 1.7129, "grad_norm": 0.6368117928504944, "learning_rate": 0.0002, "epoch": 0.1830065359477124, "step": 280}, {"loss": 1.5646, "grad_norm": 0.6144475936889648, "learning_rate": 0.0002, "epoch": 0.1895424836601307, "step": 290}, {"loss": 1.8383, "grad_norm": 0.6743767261505127, "learning_rate": 0.0002, "epoch": 0.19607843137254902, "step": 300}, {"loss": 1.421, "grad_norm": 0.6807955503463745, "learning_rate": 0.0002, "epoch": 0.20261437908496732, "step": 310}, {"loss": 1.5961, "grad_norm": 0.6717963814735413, "learning_rate": 0.0002, "epoch": 0.20915032679738563, "step": 320}, {"loss": 1.6842, "grad_norm": 0.5917780995368958, "learning_rate": 0.0002, "epoch": 0.21568627450980393, "step": 330}, {"loss": 1.6264, "grad_norm": 0.6783658862113953, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 340}, {"loss": 1.4635, "grad_norm": 0.5820256471633911, "learning_rate": 0.0002, "epoch": 0.22875816993464052, "step": 350}, {"loss": 1.6514, "grad_norm": 0.5345938801765442, "learning_rate": 0.0002, "epoch": 0.23529411764705882, "step": 360}, {"loss": 1.6441, "grad_norm": 0.755929172039032, "learning_rate": 0.0002, "epoch": 0.24183006535947713, "step": 370}, {"loss": 1.5177, "grad_norm": 0.6183189749717712, "learning_rate": 0.0002, "epoch": 0.24836601307189543, "step": 380}, {"loss": 1.5935, "grad_norm": 0.7277782559394836, "learning_rate": 0.0002, "epoch": 0.2549019607843137, "step": 390}, {"loss": 1.6957, "grad_norm": 0.9998756051063538, "learning_rate": 0.0002, "epoch": 0.26143790849673204, "step": 400}, {"loss": 1.5738, "grad_norm": 0.7523853778839111, "learning_rate": 0.0002, "epoch": 0.2679738562091503, "step": 410}, {"loss": 1.5649, "grad_norm": 0.6548714637756348, "learning_rate": 0.0002, "epoch": 0.27450980392156865, "step": 420}, {"loss": 1.4564, "grad_norm": 0.6979796290397644, "learning_rate": 0.0002, "epoch": 0.28104575163398693, "step": 430}, {"loss": 1.5927, "grad_norm": 0.840915322303772, "learning_rate": 0.0002, "epoch": 0.2875816993464052, "step": 440}, {"loss": 1.5199, "grad_norm": 0.6142978072166443, "learning_rate": 0.0002, "epoch": 0.29411764705882354, "step": 450}, {"loss": 1.4903, "grad_norm": 0.9482691884040833, "learning_rate": 0.0002, "epoch": 0.3006535947712418, "step": 460}, {"loss": 1.6553, "grad_norm": 0.7001156806945801, "learning_rate": 0.0002, "epoch": 0.30718954248366015, "step": 470}, {"loss": 1.5957, "grad_norm": 0.6665455102920532, "learning_rate": 0.0002, "epoch": 0.3137254901960784, "step": 480}, {"loss": 1.587, "grad_norm": 0.6012697815895081, "learning_rate": 0.0002, "epoch": 0.3202614379084967, "step": 490}, {"loss": 1.4468, "grad_norm": 0.8770062327384949, "learning_rate": 0.0002, "epoch": 0.32679738562091504, "step": 500}, {"loss": 1.3558, "grad_norm": 0.7029962539672852, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 510}, {"loss": 1.4435, "grad_norm": 0.6682832837104797, "learning_rate": 0.0002, "epoch": 0.33986928104575165, "step": 520}, {"loss": 1.4242, "grad_norm": 0.5548969507217407, "learning_rate": 0.0002, "epoch": 0.3464052287581699, "step": 530}, {"loss": 1.5081, "grad_norm": 0.6640702486038208, "learning_rate": 0.0002, "epoch": 0.35294117647058826, "step": 540}, {"loss": 1.4998, "grad_norm": 0.656292200088501, "learning_rate": 0.0002, "epoch": 0.35947712418300654, "step": 550}, {"loss": 1.5415, "grad_norm": 0.618910551071167, "learning_rate": 0.0002, "epoch": 0.3660130718954248, "step": 560}, {"loss": 1.5178, "grad_norm": 0.644859790802002, "learning_rate": 0.0002, "epoch": 0.37254901960784315, "step": 570}, {"loss": 1.645, "grad_norm": 0.679042398929596, "learning_rate": 0.0002, "epoch": 0.3790849673202614, "step": 580}, {"loss": 1.5193, "grad_norm": 0.980681836605072, "learning_rate": 0.0002, "epoch": 0.38562091503267976, "step": 590}, {"loss": 1.4262, "grad_norm": 0.632219672203064, "learning_rate": 0.0002, "epoch": 0.39215686274509803, "step": 600}, {"loss": 1.5533, "grad_norm": 0.7003744840621948, "learning_rate": 0.0002, "epoch": 0.39869281045751637, "step": 610}, {"loss": 1.7747, "grad_norm": 0.7090577483177185, "learning_rate": 0.0002, "epoch": 0.40522875816993464, "step": 620}, {"loss": 1.7506, "grad_norm": 0.657819926738739, "learning_rate": 0.0002, "epoch": 0.4117647058823529, "step": 630}, {"loss": 1.621, "grad_norm": 0.7034208178520203, "learning_rate": 0.0002, "epoch": 0.41830065359477125, "step": 640}, {"loss": 1.5357, "grad_norm": 0.7274866104125977, "learning_rate": 0.0002, "epoch": 0.42483660130718953, "step": 650}, {"loss": 1.6304, "grad_norm": 0.5876233577728271, "learning_rate": 0.0002, "epoch": 0.43137254901960786, "step": 660}, {"loss": 1.7683, "grad_norm": 0.595494270324707, "learning_rate": 0.0002, "epoch": 0.43790849673202614, "step": 670}, {"loss": 1.5117, "grad_norm": 0.8253804445266724, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 680}, {"loss": 1.5199, "grad_norm": 0.652225911617279, "learning_rate": 0.0002, "epoch": 0.45098039215686275, "step": 690}, {"loss": 1.5419, "grad_norm": 0.6242014169692993, "learning_rate": 0.0002, "epoch": 0.45751633986928103, "step": 700}, {"loss": 1.53, "grad_norm": 0.7283986210823059, "learning_rate": 0.0002, "epoch": 0.46405228758169936, "step": 710}, {"loss": 1.43, "grad_norm": 0.7016081213951111, "learning_rate": 0.0002, "epoch": 0.47058823529411764, "step": 720}, {"loss": 1.4626, "grad_norm": 0.5211893916130066, "learning_rate": 0.0002, "epoch": 0.477124183006536, "step": 730}, {"loss": 1.6885, "grad_norm": 0.6221150159835815, "learning_rate": 0.0002, "epoch": 0.48366013071895425, "step": 740}, {"loss": 1.5677, "grad_norm": 0.76594477891922, "learning_rate": 0.0002, "epoch": 0.49019607843137253, "step": 750}, {"loss": 1.4982, "grad_norm": 0.5777859091758728, "learning_rate": 0.0002, "epoch": 0.49673202614379086, "step": 760}, {"loss": 1.5253, "grad_norm": 0.5793519616127014, "learning_rate": 0.0002, "epoch": 0.5032679738562091, "step": 770}, {"loss": 1.3562, "grad_norm": 0.5425786375999451, "learning_rate": 0.0002, "epoch": 0.5098039215686274, "step": 780}, {"loss": 1.3398, "grad_norm": 0.6004197001457214, "learning_rate": 0.0002, "epoch": 0.5163398692810458, "step": 790}, {"loss": 1.5346, "grad_norm": 0.7167016863822937, "learning_rate": 0.0002, "epoch": 0.5228758169934641, "step": 800}, {"loss": 1.48, "grad_norm": 0.710218071937561, "learning_rate": 0.0002, "epoch": 0.5294117647058824, "step": 810}, {"loss": 1.3943, "grad_norm": 0.699528694152832, "learning_rate": 0.0002, "epoch": 0.5359477124183006, "step": 820}, {"loss": 1.6014, "grad_norm": 0.579629123210907, "learning_rate": 0.0002, "epoch": 0.5424836601307189, "step": 830}, {"loss": 1.3894, "grad_norm": 0.595407247543335, "learning_rate": 0.0002, "epoch": 0.5490196078431373, "step": 840}, {"loss": 1.6394, "grad_norm": 0.544563889503479, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 850}, {"loss": 1.4692, "grad_norm": 0.553166389465332, "learning_rate": 0.0002, "epoch": 0.5620915032679739, "step": 860}, {"loss": 1.5155, "grad_norm": 0.5645018815994263, "learning_rate": 0.0002, "epoch": 0.5686274509803921, "step": 870}, {"loss": 1.7019, "grad_norm": 0.6576932668685913, "learning_rate": 0.0002, "epoch": 0.5751633986928104, "step": 880}, {"loss": 1.5891, "grad_norm": 0.6684197187423706, "learning_rate": 0.0002, "epoch": 0.5816993464052288, "step": 890}, {"loss": 1.5348, "grad_norm": 0.6706975698471069, "learning_rate": 0.0002, "epoch": 0.5882352941176471, "step": 900}, {"loss": 1.4038, "grad_norm": 0.6762327551841736, "learning_rate": 0.0002, "epoch": 0.5947712418300654, "step": 910}, {"loss": 1.61, "grad_norm": 0.764032244682312, "learning_rate": 0.0002, "epoch": 0.6013071895424836, "step": 920}, {"loss": 1.436, "grad_norm": 0.6996400952339172, "learning_rate": 0.0002, "epoch": 0.6078431372549019, "step": 930}, {"loss": 1.6038, "grad_norm": 0.686735987663269, "learning_rate": 0.0002, "epoch": 0.6143790849673203, "step": 940}, {"loss": 1.5194, "grad_norm": 0.6086131930351257, "learning_rate": 0.0002, "epoch": 0.6209150326797386, "step": 950}, {"loss": 1.4457, "grad_norm": 0.5627856850624084, "learning_rate": 0.0002, "epoch": 0.6274509803921569, "step": 960}, {"loss": 1.506, "grad_norm": 0.5781503319740295, "learning_rate": 0.0002, "epoch": 0.6339869281045751, "step": 970}, {"loss": 1.5668, "grad_norm": 0.6347246766090393, "learning_rate": 0.0002, "epoch": 0.6405228758169934, "step": 980}, {"loss": 1.3819, "grad_norm": 0.6581300497055054, "learning_rate": 0.0002, "epoch": 0.6470588235294118, "step": 990}, {"loss": 1.6425, "grad_norm": 0.8343676924705505, "learning_rate": 0.0002, "epoch": 0.6535947712418301, "step": 1000}, {"loss": 1.5188, "grad_norm": 0.5708910226821899, "learning_rate": 0.0002, "epoch": 0.6601307189542484, "step": 1010}, {"loss": 1.3882, "grad_norm": 0.6832585334777832, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 1020}, {"loss": 1.645, "grad_norm": 0.5767837166786194, "learning_rate": 0.0002, "epoch": 0.673202614379085, "step": 1030}, {"loss": 1.4206, "grad_norm": 0.5637745261192322, "learning_rate": 0.0002, "epoch": 0.6797385620915033, "step": 1040}, {"loss": 1.4325, "grad_norm": 0.8193050026893616, "learning_rate": 0.0002, "epoch": 0.6862745098039216, "step": 1050}, {"loss": 1.4196, "grad_norm": 0.6157439351081848, "learning_rate": 0.0002, "epoch": 0.6928104575163399, "step": 1060}, {"loss": 1.5547, "grad_norm": 0.7476664781570435, "learning_rate": 0.0002, "epoch": 0.6993464052287581, "step": 1070}, {"loss": 1.5337, "grad_norm": 0.8569361567497253, "learning_rate": 0.0002, "epoch": 0.7058823529411765, "step": 1080}, {"loss": 1.482, "grad_norm": 0.5671911835670471, "learning_rate": 0.0002, "epoch": 0.7124183006535948, "step": 1090}, {"loss": 1.5398, "grad_norm": 0.5151128768920898, "learning_rate": 0.0002, "epoch": 0.7189542483660131, "step": 1100}, {"loss": 1.4848, "grad_norm": 0.568037211894989, "learning_rate": 0.0002, "epoch": 0.7254901960784313, "step": 1110}, {"loss": 1.4708, "grad_norm": 0.6756396889686584, "learning_rate": 0.0002, "epoch": 0.7320261437908496, "step": 1120}, {"loss": 1.4017, "grad_norm": 0.638975977897644, "learning_rate": 0.0002, "epoch": 0.738562091503268, "step": 1130}, {"loss": 1.6028, "grad_norm": 0.7103341221809387, "learning_rate": 0.0002, "epoch": 0.7450980392156863, "step": 1140}, {"loss": 1.3766, "grad_norm": 0.7403952479362488, "learning_rate": 0.0002, "epoch": 0.7516339869281046, "step": 1150}, {"loss": 1.4757, "grad_norm": 0.6266511082649231, "learning_rate": 0.0002, "epoch": 0.7581699346405228, "step": 1160}, {"loss": 1.4468, "grad_norm": 0.5939070582389832, "learning_rate": 0.0002, "epoch": 0.7647058823529411, "step": 1170}, {"loss": 1.4145, "grad_norm": 0.5735430717468262, "learning_rate": 0.0002, "epoch": 0.7712418300653595, "step": 1180}, {"loss": 1.3891, "grad_norm": 0.5155234932899475, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 1190}, {"loss": 1.4942, "grad_norm": 0.5115423202514648, "learning_rate": 0.0002, "epoch": 0.7843137254901961, "step": 1200}, {"loss": 1.4508, "grad_norm": 0.693588137626648, "learning_rate": 0.0002, "epoch": 0.7908496732026143, "step": 1210}, {"loss": 1.308, "grad_norm": 0.5504693984985352, "learning_rate": 0.0002, "epoch": 0.7973856209150327, "step": 1220}, {"loss": 1.5412, "grad_norm": 0.5555992126464844, "learning_rate": 0.0002, "epoch": 0.803921568627451, "step": 1230}, {"loss": 1.5506, "grad_norm": 0.7211785316467285, "learning_rate": 0.0002, "epoch": 0.8104575163398693, "step": 1240}, {"loss": 1.6163, "grad_norm": 0.735003650188446, "learning_rate": 0.0002, "epoch": 0.8169934640522876, "step": 1250}, {"loss": 1.5836, "grad_norm": 0.5245152711868286, "learning_rate": 0.0002, "epoch": 0.8235294117647058, "step": 1260}, {"loss": 1.4505, "grad_norm": 0.5883445739746094, "learning_rate": 0.0002, "epoch": 0.8300653594771242, "step": 1270}, {"loss": 1.3642, "grad_norm": 0.6835859417915344, "learning_rate": 0.0002, "epoch": 0.8366013071895425, "step": 1280}, {"loss": 1.5526, "grad_norm": 0.6592142581939697, "learning_rate": 0.0002, "epoch": 0.8431372549019608, "step": 1290}, {"loss": 1.52, "grad_norm": 0.6087474226951599, "learning_rate": 0.0002, "epoch": 0.8496732026143791, "step": 1300}, {"loss": 1.3807, "grad_norm": 0.565387487411499, "learning_rate": 0.0002, "epoch": 0.8562091503267973, "step": 1310}, {"loss": 1.4809, "grad_norm": 0.7363151907920837, "learning_rate": 0.0002, "epoch": 0.8627450980392157, "step": 1320}, {"loss": 1.5683, "grad_norm": 0.5964524149894714, "learning_rate": 0.0002, "epoch": 0.869281045751634, "step": 1330}, {"loss": 1.3284, "grad_norm": 0.5169979929924011, "learning_rate": 0.0002, "epoch": 0.8758169934640523, "step": 1340}, {"loss": 1.6279, "grad_norm": 0.7063422799110413, "learning_rate": 0.0002, "epoch": 0.8823529411764706, "step": 1350}, {"loss": 1.3072, "grad_norm": 0.7261926531791687, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 1360}, {"loss": 1.3619, "grad_norm": 0.6759744882583618, "learning_rate": 0.0002, "epoch": 0.8954248366013072, "step": 1370}, {"loss": 1.4079, "grad_norm": 0.675051212310791, "learning_rate": 0.0002, "epoch": 0.9019607843137255, "step": 1380}, {"loss": 1.6606, "grad_norm": 0.5613595843315125, "learning_rate": 0.0002, "epoch": 0.9084967320261438, "step": 1390}, {"loss": 1.414, "grad_norm": 0.611732006072998, "learning_rate": 0.0002, "epoch": 0.9150326797385621, "step": 1400}, {"loss": 1.5766, "grad_norm": 0.6365187168121338, "learning_rate": 0.0002, "epoch": 0.9215686274509803, "step": 1410}, {"loss": 1.7832, "grad_norm": 0.7810426354408264, "learning_rate": 0.0002, "epoch": 0.9281045751633987, "step": 1420}, {"loss": 1.5377, "grad_norm": 0.593891441822052, "learning_rate": 0.0002, "epoch": 0.934640522875817, "step": 1430}, {"loss": 1.4468, "grad_norm": 0.761585533618927, "learning_rate": 0.0002, "epoch": 0.9411764705882353, "step": 1440}, {"loss": 1.589, "grad_norm": 0.6114464998245239, "learning_rate": 0.0002, "epoch": 0.9477124183006536, "step": 1450}, {"loss": 1.4973, "grad_norm": 0.601044774055481, "learning_rate": 0.0002, "epoch": 0.954248366013072, "step": 1460}, {"loss": 1.4162, "grad_norm": 0.5484876036643982, "learning_rate": 0.0002, "epoch": 0.9607843137254902, "step": 1470}, {"loss": 1.4825, "grad_norm": 0.5383428335189819, "learning_rate": 0.0002, "epoch": 0.9673202614379085, "step": 1480}, {"loss": 1.5543, "grad_norm": 0.648106575012207, "learning_rate": 0.0002, "epoch": 0.9738562091503268, "step": 1490}, {"loss": 1.3638, "grad_norm": 0.6847249865531921, "learning_rate": 0.0002, "epoch": 0.9803921568627451, "step": 1500}, {"loss": 1.4247, "grad_norm": 0.6361058354377747, "learning_rate": 0.0002, "epoch": 0.9869281045751634, "step": 1510}, {"loss": 1.5131, "grad_norm": 0.646392285823822, "learning_rate": 0.0002, "epoch": 0.9934640522875817, "step": 1520}, {"loss": 1.3738, "grad_norm": 0.5391159057617188, "learning_rate": 0.0002, "epoch": 1.0, "step": 1530}, {"eval_loss": 1.4715123176574707, "eval_runtime": 30.5701, "eval_samples_per_second": 14.262, "eval_steps_per_second": 1.799, "epoch": 1.0, "step": 1530}, {"loss": 1.4827, "grad_norm": 0.5468988418579102, "learning_rate": 0.0002, "epoch": 1.0065359477124183, "step": 1540}, {"loss": 1.4342, "grad_norm": 0.629940927028656, "learning_rate": 0.0002, "epoch": 1.0130718954248366, "step": 1550}, {"loss": 1.4259, "grad_norm": 0.6411303281784058, "learning_rate": 0.0002, "epoch": 1.0196078431372548, "step": 1560}, {"loss": 1.3924, "grad_norm": 0.5619024038314819, "learning_rate": 0.0002, "epoch": 1.026143790849673, "step": 1570}, {"loss": 1.6086, "grad_norm": 0.6093462705612183, "learning_rate": 0.0002, "epoch": 1.0326797385620916, "step": 1580}, {"loss": 1.4547, "grad_norm": 0.5543286204338074, "learning_rate": 0.0002, "epoch": 1.0392156862745099, "step": 1590}, {"loss": 1.3738, "grad_norm": 0.6079006195068359, "learning_rate": 0.0002, "epoch": 1.0457516339869282, "step": 1600}, {"loss": 1.4574, "grad_norm": 0.6240813136100769, "learning_rate": 0.0002, "epoch": 1.0522875816993464, "step": 1610}, {"loss": 1.3504, "grad_norm": 0.6141977310180664, "learning_rate": 0.0002, "epoch": 1.0588235294117647, "step": 1620}, {"loss": 1.3668, "grad_norm": 0.5920178294181824, "learning_rate": 0.0002, "epoch": 1.065359477124183, "step": 1630}, {"loss": 1.3204, "grad_norm": 0.47620782256126404, "learning_rate": 0.0002, "epoch": 1.0718954248366013, "step": 1640}, {"loss": 1.3249, "grad_norm": 0.6826292872428894, "learning_rate": 0.0002, "epoch": 1.0784313725490196, "step": 1650}, {"loss": 1.2285, "grad_norm": 0.6182006597518921, "learning_rate": 0.0002, "epoch": 1.0849673202614378, "step": 1660}, {"loss": 1.2907, "grad_norm": 0.57639479637146, "learning_rate": 0.0002, "epoch": 1.091503267973856, "step": 1670}, {"loss": 1.4575, "grad_norm": 0.6696860194206238, "learning_rate": 0.0002, "epoch": 1.0980392156862746, "step": 1680}, {"loss": 1.4104, "grad_norm": 0.699221670627594, "learning_rate": 0.0002, "epoch": 1.1045751633986929, "step": 1690}, {"loss": 1.3667, "grad_norm": 0.7138059139251709, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 1700}, {"loss": 1.3468, "grad_norm": 0.6930422186851501, "learning_rate": 0.0002, "epoch": 1.1176470588235294, "step": 1710}, {"loss": 1.5033, "grad_norm": 0.7484048008918762, "learning_rate": 0.0002, "epoch": 1.1241830065359477, "step": 1720}, {"loss": 1.4582, "grad_norm": 0.5820090174674988, "learning_rate": 0.0002, "epoch": 1.130718954248366, "step": 1730}, {"loss": 1.3704, "grad_norm": 0.7143406867980957, "learning_rate": 0.0002, "epoch": 1.1372549019607843, "step": 1740}, {"loss": 1.277, "grad_norm": 0.5597584247589111, "learning_rate": 0.0002, "epoch": 1.1437908496732025, "step": 1750}, {"loss": 1.5403, "grad_norm": 0.5171173214912415, "learning_rate": 0.0002, "epoch": 1.1503267973856208, "step": 1760}, {"loss": 1.419, "grad_norm": 0.5951920747756958, "learning_rate": 0.0002, "epoch": 1.156862745098039, "step": 1770}, {"loss": 1.2929, "grad_norm": 0.7506247758865356, "learning_rate": 0.0002, "epoch": 1.1633986928104576, "step": 1780}, {"loss": 1.5475, "grad_norm": 0.5936487913131714, "learning_rate": 0.0002, "epoch": 1.1699346405228759, "step": 1790}, {"loss": 1.3567, "grad_norm": 0.688450038433075, "learning_rate": 0.0002, "epoch": 1.1764705882352942, "step": 1800}, {"loss": 1.314, "grad_norm": 0.671623170375824, "learning_rate": 0.0002, "epoch": 1.1830065359477124, "step": 1810}, {"loss": 1.3803, "grad_norm": 0.6911860704421997, "learning_rate": 0.0002, "epoch": 1.1895424836601307, "step": 1820}, {"loss": 1.363, "grad_norm": 0.60726398229599, "learning_rate": 0.0002, "epoch": 1.196078431372549, "step": 1830}, {"loss": 1.5236, "grad_norm": 0.7542088627815247, "learning_rate": 0.0002, "epoch": 1.2026143790849673, "step": 1840}, {"loss": 1.4343, "grad_norm": 0.6810969710350037, "learning_rate": 0.0002, "epoch": 1.2091503267973855, "step": 1850}, {"loss": 1.446, "grad_norm": 0.579741895198822, "learning_rate": 0.0002, "epoch": 1.215686274509804, "step": 1860}, {"loss": 1.4564, "grad_norm": 0.9925695657730103, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 1870}, {"loss": 1.5516, "grad_norm": 0.5919767618179321, "learning_rate": 0.0002, "epoch": 1.2287581699346406, "step": 1880}, {"loss": 1.5015, "grad_norm": 0.7377090454101562, "learning_rate": 0.0002, "epoch": 1.2352941176470589, "step": 1890}, {"loss": 1.4756, "grad_norm": 0.5753688812255859, "learning_rate": 0.0002, "epoch": 1.2418300653594772, "step": 1900}, {"loss": 1.3543, "grad_norm": 0.6362486481666565, "learning_rate": 0.0002, "epoch": 1.2483660130718954, "step": 1910}, {"loss": 1.4153, "grad_norm": 0.5747467875480652, "learning_rate": 0.0002, "epoch": 1.2549019607843137, "step": 1920}, {"loss": 1.5082, "grad_norm": 0.6831939220428467, "learning_rate": 0.0002, "epoch": 1.261437908496732, "step": 1930}, {"loss": 1.3509, "grad_norm": 0.6414040327072144, "learning_rate": 0.0002, "epoch": 1.2679738562091503, "step": 1940}, {"loss": 1.5099, "grad_norm": 0.5613330006599426, "learning_rate": 0.0002, "epoch": 1.2745098039215685, "step": 1950}, {"loss": 1.377, "grad_norm": 0.5838454961776733, "learning_rate": 0.0002, "epoch": 1.2810457516339868, "step": 1960}, {"loss": 1.3548, "grad_norm": 0.5367192029953003, "learning_rate": 0.0002, "epoch": 1.287581699346405, "step": 1970}, {"loss": 1.4602, "grad_norm": 0.5829346776008606, "learning_rate": 0.0002, "epoch": 1.2941176470588236, "step": 1980}, {"loss": 1.3821, "grad_norm": 0.756534218788147, "learning_rate": 0.0002, "epoch": 1.3006535947712419, "step": 1990}, {"loss": 1.389, "grad_norm": 0.48002561926841736, "learning_rate": 0.0002, "epoch": 1.3071895424836601, "step": 2000}, {"loss": 1.256, "grad_norm": 0.5461082458496094, "learning_rate": 0.0002, "epoch": 1.3137254901960784, "step": 2010}, {"loss": 1.6257, "grad_norm": 0.570399284362793, "learning_rate": 0.0002, "epoch": 1.3202614379084967, "step": 2020}, {"loss": 1.4356, "grad_norm": 0.5130975842475891, "learning_rate": 0.0002, "epoch": 1.326797385620915, "step": 2030}, {"loss": 1.3552, "grad_norm": 0.6290071606636047, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 2040}, {"loss": 1.3873, "grad_norm": 0.6165726184844971, "learning_rate": 0.0002, "epoch": 1.3398692810457518, "step": 2050}, {"loss": 1.4376, "grad_norm": 0.5302083492279053, "learning_rate": 0.0002, "epoch": 1.34640522875817, "step": 2060}, {"loss": 1.4722, "grad_norm": 0.6531406044960022, "learning_rate": 0.0002, "epoch": 1.3529411764705883, "step": 2070}, {"loss": 1.3632, "grad_norm": 0.5981236100196838, "learning_rate": 0.0002, "epoch": 1.3594771241830066, "step": 2080}, {"loss": 1.4846, "grad_norm": 0.8534150123596191, "learning_rate": 0.0002, "epoch": 1.3660130718954249, "step": 2090}, {"loss": 1.3249, "grad_norm": 0.695918083190918, "learning_rate": 0.0002, "epoch": 1.3725490196078431, "step": 2100}, {"loss": 1.4989, "grad_norm": 0.5830431580543518, "learning_rate": 0.0002, "epoch": 1.3790849673202614, "step": 2110}, {"loss": 1.5009, "grad_norm": 0.5641306638717651, "learning_rate": 0.0002, "epoch": 1.3856209150326797, "step": 2120}, {"loss": 1.3985, "grad_norm": 0.6354436874389648, "learning_rate": 0.0002, "epoch": 1.392156862745098, "step": 2130}, {"loss": 1.2737, "grad_norm": 0.5707540512084961, "learning_rate": 0.0002, "epoch": 1.3986928104575163, "step": 2140}, {"loss": 1.3815, "grad_norm": 0.7308434844017029, "learning_rate": 0.0002, "epoch": 1.4052287581699345, "step": 2150}, {"loss": 1.3993, "grad_norm": 0.5879750847816467, "learning_rate": 0.0002, "epoch": 1.4117647058823528, "step": 2160}, {"loss": 1.3729, "grad_norm": 0.627909243106842, "learning_rate": 0.0002, "epoch": 1.4183006535947713, "step": 2170}, {"loss": 1.3391, "grad_norm": 0.5228193998336792, "learning_rate": 0.0002, "epoch": 1.4248366013071896, "step": 2180}, {"loss": 1.457, "grad_norm": 0.6162880659103394, "learning_rate": 0.0002, "epoch": 1.4313725490196079, "step": 2190}, {"loss": 1.4052, "grad_norm": 0.751610517501831, "learning_rate": 0.0002, "epoch": 1.4379084967320261, "step": 2200}, {"loss": 1.4105, "grad_norm": 0.5623487234115601, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 2210}, {"loss": 1.3795, "grad_norm": 0.5293187499046326, "learning_rate": 0.0002, "epoch": 1.4509803921568627, "step": 2220}, {"loss": 1.4247, "grad_norm": 0.5903629660606384, "learning_rate": 0.0002, "epoch": 1.457516339869281, "step": 2230}, {"loss": 1.6167, "grad_norm": 0.6084659099578857, "learning_rate": 0.0002, "epoch": 1.4640522875816995, "step": 2240}, {"loss": 1.319, "grad_norm": 0.5289803147315979, "learning_rate": 0.0002, "epoch": 1.4705882352941178, "step": 2250}, {"loss": 1.3106, "grad_norm": 0.49499568343162537, "learning_rate": 0.0002, "epoch": 1.477124183006536, "step": 2260}, {"loss": 1.3586, "grad_norm": 0.7774190306663513, "learning_rate": 0.0002, "epoch": 1.4836601307189543, "step": 2270}, {"loss": 1.3075, "grad_norm": 0.5932538509368896, "learning_rate": 0.0002, "epoch": 1.4901960784313726, "step": 2280}, {"loss": 1.3241, "grad_norm": 0.6009492874145508, "learning_rate": 0.0002, "epoch": 1.4967320261437909, "step": 2290}, {"loss": 1.3728, "grad_norm": 0.5559343099594116, "learning_rate": 0.0002, "epoch": 1.5032679738562091, "step": 2300}, {"loss": 1.2379, "grad_norm": 0.5956196188926697, "learning_rate": 0.0002, "epoch": 1.5098039215686274, "step": 2310}, {"loss": 1.5292, "grad_norm": 0.5624083876609802, "learning_rate": 0.0002, "epoch": 1.5163398692810457, "step": 2320}, {"loss": 1.4779, "grad_norm": 0.7195250391960144, "learning_rate": 0.0002, "epoch": 1.522875816993464, "step": 2330}, {"loss": 1.2938, "grad_norm": 0.6010490655899048, "learning_rate": 0.0002, "epoch": 1.5294117647058822, "step": 2340}, {"loss": 1.4121, "grad_norm": 0.664929211139679, "learning_rate": 0.0002, "epoch": 1.5359477124183005, "step": 2350}, {"loss": 1.4362, "grad_norm": 0.5158776640892029, "learning_rate": 0.0002, "epoch": 1.5424836601307188, "step": 2360}, {"loss": 1.2157, "grad_norm": 0.5147154927253723, "learning_rate": 0.0002, "epoch": 1.5490196078431373, "step": 2370}, {"loss": 1.2643, "grad_norm": 0.6507977843284607, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 2380}, {"loss": 1.2786, "grad_norm": 0.5193192362785339, "learning_rate": 0.0002, "epoch": 1.5620915032679739, "step": 2390}, {"loss": 1.3209, "grad_norm": 0.5982314944267273, "learning_rate": 0.0002, "epoch": 1.5686274509803921, "step": 2400}, {"loss": 1.3585, "grad_norm": 0.49106258153915405, "learning_rate": 0.0002, "epoch": 1.5751633986928104, "step": 2410}, {"loss": 1.3618, "grad_norm": 0.6459611654281616, "learning_rate": 0.0002, "epoch": 1.581699346405229, "step": 2420}, {"loss": 1.3305, "grad_norm": 0.7038363218307495, "learning_rate": 0.0002, "epoch": 1.5882352941176472, "step": 2430}, {"loss": 1.3198, "grad_norm": 0.5245680212974548, "learning_rate": 0.0002, "epoch": 1.5947712418300655, "step": 2440}, {"loss": 1.4756, "grad_norm": 0.6562076210975647, "learning_rate": 0.0002, "epoch": 1.6013071895424837, "step": 2450}, {"loss": 1.5635, "grad_norm": 0.6491968035697937, "learning_rate": 0.0002, "epoch": 1.607843137254902, "step": 2460}, {"loss": 1.3657, "grad_norm": 0.604034960269928, "learning_rate": 0.0002, "epoch": 1.6143790849673203, "step": 2470}, {"loss": 1.2693, "grad_norm": 0.5759671330451965, "learning_rate": 0.0002, "epoch": 1.6209150326797386, "step": 2480}, {"loss": 1.4136, "grad_norm": 0.6157698631286621, "learning_rate": 0.0002, "epoch": 1.6274509803921569, "step": 2490}, {"loss": 1.3929, "grad_norm": 0.6513794660568237, "learning_rate": 0.0002, "epoch": 1.6339869281045751, "step": 2500}, {"loss": 1.4283, "grad_norm": 0.71990966796875, "learning_rate": 0.0002, "epoch": 1.6405228758169934, "step": 2510}, {"loss": 1.4356, "grad_norm": 0.7316617369651794, "learning_rate": 0.0002, "epoch": 1.6470588235294117, "step": 2520}, {"loss": 1.3119, "grad_norm": 0.5475177764892578, "learning_rate": 0.0002, "epoch": 1.65359477124183, "step": 2530}, {"loss": 1.2998, "grad_norm": 0.4911293089389801, "learning_rate": 0.0002, "epoch": 1.6601307189542482, "step": 2540}, {"loss": 1.4198, "grad_norm": 0.6122882962226868, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 2550}, {"loss": 1.3099, "grad_norm": 0.5735281705856323, "learning_rate": 0.0002, "epoch": 1.673202614379085, "step": 2560}, {"loss": 1.2205, "grad_norm": 0.5046352744102478, "learning_rate": 0.0002, "epoch": 1.6797385620915033, "step": 2570}, {"loss": 1.3191, "grad_norm": 0.6043242812156677, "learning_rate": 0.0002, "epoch": 1.6862745098039216, "step": 2580}, {"loss": 1.3079, "grad_norm": 0.5397698283195496, "learning_rate": 0.0002, "epoch": 1.6928104575163399, "step": 2590}, {"loss": 1.4916, "grad_norm": 0.8066475987434387, "learning_rate": 0.0002, "epoch": 1.6993464052287581, "step": 2600}, {"loss": 1.3703, "grad_norm": 0.52901691198349, "learning_rate": 0.0002, "epoch": 1.7058823529411766, "step": 2610}, {"loss": 1.409, "grad_norm": 0.7588503956794739, "learning_rate": 0.0002, "epoch": 1.712418300653595, "step": 2620}, {"loss": 1.3806, "grad_norm": 0.6012966632843018, "learning_rate": 0.0002, "epoch": 1.7189542483660132, "step": 2630}, {"loss": 1.2583, "grad_norm": 0.5927302837371826, "learning_rate": 0.0002, "epoch": 1.7254901960784315, "step": 2640}, {"loss": 1.4523, "grad_norm": 0.5086990594863892, "learning_rate": 0.0002, "epoch": 1.7320261437908497, "step": 2650}, {"loss": 1.5452, "grad_norm": 0.6000628471374512, "learning_rate": 0.0002, "epoch": 1.738562091503268, "step": 2660}, {"loss": 1.3269, "grad_norm": 0.6560431718826294, "learning_rate": 0.0002, "epoch": 1.7450980392156863, "step": 2670}, {"loss": 1.3982, "grad_norm": 0.5738165378570557, "learning_rate": 0.0002, "epoch": 1.7516339869281046, "step": 2680}, {"loss": 1.3766, "grad_norm": 0.5576106905937195, "learning_rate": 0.0002, "epoch": 1.7581699346405228, "step": 2690}, {"loss": 1.3277, "grad_norm": 0.7298802137374878, "learning_rate": 0.0002, "epoch": 1.7647058823529411, "step": 2700}, {"loss": 1.2618, "grad_norm": 0.5751826167106628, "learning_rate": 0.0002, "epoch": 1.7712418300653594, "step": 2710}, {"loss": 1.35, "grad_norm": 0.6069957613945007, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 2720}, {"loss": 1.3492, "grad_norm": 0.7513017654418945, "learning_rate": 0.0002, "epoch": 1.784313725490196, "step": 2730}, {"loss": 1.2979, "grad_norm": 0.6058869957923889, "learning_rate": 0.0002, "epoch": 1.7908496732026142, "step": 2740}, {"loss": 1.299, "grad_norm": 0.6805883049964905, "learning_rate": 0.0002, "epoch": 1.7973856209150327, "step": 2750}, {"loss": 1.4062, "grad_norm": 0.6864324808120728, "learning_rate": 0.0002, "epoch": 1.803921568627451, "step": 2760}, {"loss": 1.355, "grad_norm": 0.6261002421379089, "learning_rate": 0.0002, "epoch": 1.8104575163398693, "step": 2770}, {"loss": 1.5145, "grad_norm": 0.532684862613678, "learning_rate": 0.0002, "epoch": 1.8169934640522876, "step": 2780}, {"loss": 1.3248, "grad_norm": 0.6209020018577576, "learning_rate": 0.0002, "epoch": 1.8235294117647058, "step": 2790}, {"loss": 1.3908, "grad_norm": 0.67111736536026, "learning_rate": 0.0002, "epoch": 1.8300653594771243, "step": 2800}, {"loss": 1.5088, "grad_norm": 0.700467586517334, "learning_rate": 0.0002, "epoch": 1.8366013071895426, "step": 2810}, {"loss": 1.348, "grad_norm": 0.6968029141426086, "learning_rate": 0.0002, "epoch": 1.843137254901961, "step": 2820}, {"loss": 1.3943, "grad_norm": 0.6405863761901855, "learning_rate": 0.0002, "epoch": 1.8496732026143792, "step": 2830}, {"loss": 1.4035, "grad_norm": 0.5192584991455078, "learning_rate": 0.0002, "epoch": 1.8562091503267975, "step": 2840}, {"loss": 1.2745, "grad_norm": 0.4888569414615631, "learning_rate": 0.0002, "epoch": 1.8627450980392157, "step": 2850}, {"loss": 1.4324, "grad_norm": 0.7625455856323242, "learning_rate": 0.0002, "epoch": 1.869281045751634, "step": 2860}, {"loss": 1.4989, "grad_norm": 0.9162808656692505, "learning_rate": 0.0002, "epoch": 1.8758169934640523, "step": 2870}, {"loss": 1.3978, "grad_norm": 0.5472783446311951, "learning_rate": 0.0002, "epoch": 1.8823529411764706, "step": 2880}, {"loss": 1.3026, "grad_norm": 0.5221137404441833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 2890}, {"loss": 1.33, "grad_norm": 0.49258849024772644, "learning_rate": 0.0002, "epoch": 1.8954248366013071, "step": 2900}, {"loss": 1.3503, "grad_norm": 0.5260750651359558, "learning_rate": 0.0002, "epoch": 1.9019607843137254, "step": 2910}, {"loss": 1.3381, "grad_norm": 0.6583314538002014, "learning_rate": 0.0002, "epoch": 1.9084967320261437, "step": 2920}, {"loss": 1.356, "grad_norm": 0.5728915929794312, "learning_rate": 0.0002, "epoch": 1.915032679738562, "step": 2930}, {"loss": 1.3993, "grad_norm": 0.7661453485488892, "learning_rate": 0.0002, "epoch": 1.9215686274509802, "step": 2940}, {"loss": 1.428, "grad_norm": 0.7193911075592041, "learning_rate": 0.0002, "epoch": 1.9281045751633987, "step": 2950}, {"loss": 1.287, "grad_norm": 0.5007768869400024, "learning_rate": 0.0002, "epoch": 1.934640522875817, "step": 2960}, {"loss": 1.372, "grad_norm": 0.626681923866272, "learning_rate": 0.0002, "epoch": 1.9411764705882353, "step": 2970}, {"loss": 1.375, "grad_norm": 0.8692840933799744, "learning_rate": 0.0002, "epoch": 1.9477124183006536, "step": 2980}, {"loss": 1.3292, "grad_norm": 0.6388291120529175, "learning_rate": 0.0002, "epoch": 1.954248366013072, "step": 2990}, {"loss": 1.4593, "grad_norm": 0.7710477113723755, "learning_rate": 0.0002, "epoch": 1.9607843137254903, "step": 3000}, {"loss": 1.5228, "grad_norm": 0.641704261302948, "learning_rate": 0.0002, "epoch": 1.9673202614379086, "step": 3010}, {"loss": 1.3246, "grad_norm": 0.621148943901062, "learning_rate": 0.0002, "epoch": 1.973856209150327, "step": 3020}, {"loss": 1.3017, "grad_norm": 0.5119547247886658, "learning_rate": 0.0002, "epoch": 1.9803921568627452, "step": 3030}, {"loss": 1.4923, "grad_norm": 0.8104137778282166, "learning_rate": 0.0002, "epoch": 1.9869281045751634, "step": 3040}, {"loss": 1.3331, "grad_norm": 0.5856240391731262, "learning_rate": 0.0002, "epoch": 1.9934640522875817, "step": 3050}, {"loss": 1.4346, "grad_norm": 0.5263566374778748, "learning_rate": 0.0002, "epoch": 2.0, "step": 3060}, {"eval_loss": 1.4276371002197266, "eval_runtime": 30.5759, "eval_samples_per_second": 14.26, "eval_steps_per_second": 1.799, "epoch": 2.0, "step": 3060}, {"loss": 1.1636, "grad_norm": 0.5143898725509644, "learning_rate": 0.0002, "epoch": 2.0065359477124183, "step": 3070}, {"loss": 1.3335, "grad_norm": 0.5749367475509644, "learning_rate": 0.0002, "epoch": 2.0130718954248366, "step": 3080}, {"loss": 1.2784, "grad_norm": 0.5784284472465515, "learning_rate": 0.0002, "epoch": 2.019607843137255, "step": 3090}, {"loss": 1.2463, "grad_norm": 0.5933429598808289, "learning_rate": 0.0002, "epoch": 2.026143790849673, "step": 3100}, {"loss": 1.2984, "grad_norm": 0.6748974919319153, "learning_rate": 0.0002, "epoch": 2.0326797385620914, "step": 3110}, {"loss": 1.2307, "grad_norm": 0.626399576663971, "learning_rate": 0.0002, "epoch": 2.0392156862745097, "step": 3120}, {"loss": 1.299, "grad_norm": 0.6173238754272461, "learning_rate": 0.0002, "epoch": 2.045751633986928, "step": 3130}, {"loss": 1.4144, "grad_norm": 0.807790219783783, "learning_rate": 0.0002, "epoch": 2.052287581699346, "step": 3140}, {"loss": 1.1953, "grad_norm": 0.6222215890884399, "learning_rate": 0.0002, "epoch": 2.0588235294117645, "step": 3150}, {"loss": 1.4059, "grad_norm": 0.5859580636024475, "learning_rate": 0.0002, "epoch": 2.065359477124183, "step": 3160}, {"loss": 1.3607, "grad_norm": 0.581304132938385, "learning_rate": 0.0002, "epoch": 2.0718954248366015, "step": 3170}, {"loss": 1.1212, "grad_norm": 0.9814971089363098, "learning_rate": 0.0002, "epoch": 2.0784313725490198, "step": 3180}, {"loss": 1.1962, "grad_norm": 0.6491848230361938, "learning_rate": 0.0002, "epoch": 2.084967320261438, "step": 3190}, {"loss": 1.3711, "grad_norm": 0.613680362701416, "learning_rate": 0.0002, "epoch": 2.0915032679738563, "step": 3200}, {"loss": 1.2994, "grad_norm": 0.7318086624145508, "learning_rate": 0.0002, "epoch": 2.0980392156862746, "step": 3210}, {"loss": 1.2502, "grad_norm": 0.6025661826133728, "learning_rate": 0.0002, "epoch": 2.104575163398693, "step": 3220}, {"loss": 1.1374, "grad_norm": 0.6744484305381775, "learning_rate": 0.0002, "epoch": 2.111111111111111, "step": 3230}, {"loss": 1.3273, "grad_norm": 0.6062554121017456, "learning_rate": 0.0002, "epoch": 2.1176470588235294, "step": 3240}, {"loss": 1.3404, "grad_norm": 0.6801803112030029, "learning_rate": 0.0002, "epoch": 2.1241830065359477, "step": 3250}, {"loss": 1.4084, "grad_norm": 0.5218925476074219, "learning_rate": 0.0002, "epoch": 2.130718954248366, "step": 3260}, {"loss": 1.2867, "grad_norm": 0.7494263648986816, "learning_rate": 0.0002, "epoch": 2.1372549019607843, "step": 3270}, {"loss": 1.3059, "grad_norm": 0.7858565449714661, "learning_rate": 0.0002, "epoch": 2.1437908496732025, "step": 3280}, {"loss": 1.3214, "grad_norm": 0.6836692690849304, "learning_rate": 0.0002, "epoch": 2.150326797385621, "step": 3290}, {"loss": 1.1605, "grad_norm": 0.619848370552063, "learning_rate": 0.0002, "epoch": 2.156862745098039, "step": 3300}, {"loss": 1.3095, "grad_norm": 0.5761294364929199, "learning_rate": 0.0002, "epoch": 2.1633986928104574, "step": 3310}, {"loss": 1.2883, "grad_norm": 0.4713786542415619, "learning_rate": 0.0002, "epoch": 2.1699346405228757, "step": 3320}, {"loss": 1.3817, "grad_norm": 0.7613773345947266, "learning_rate": 0.0002, "epoch": 2.176470588235294, "step": 3330}, {"loss": 1.2354, "grad_norm": 0.6642718315124512, "learning_rate": 0.0002, "epoch": 2.183006535947712, "step": 3340}, {"loss": 1.2048, "grad_norm": 0.7162188291549683, "learning_rate": 0.0002, "epoch": 2.189542483660131, "step": 3350}, {"loss": 1.3886, "grad_norm": 0.6916783452033997, "learning_rate": 0.0002, "epoch": 2.196078431372549, "step": 3360}, {"loss": 1.3788, "grad_norm": 0.7205567955970764, "learning_rate": 0.0002, "epoch": 2.2026143790849675, "step": 3370}, {"loss": 1.2528, "grad_norm": 0.6038199067115784, "learning_rate": 0.0002, "epoch": 2.2091503267973858, "step": 3380}, {"loss": 1.2079, "grad_norm": 0.6284233927726746, "learning_rate": 0.0002, "epoch": 2.215686274509804, "step": 3390}, {"loss": 1.3057, "grad_norm": 0.7450672388076782, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3400}, {"loss": 1.3034, "grad_norm": 0.7755052447319031, "learning_rate": 0.0002, "epoch": 2.2287581699346406, "step": 3410}, {"loss": 1.2953, "grad_norm": 0.9066099524497986, "learning_rate": 0.0002, "epoch": 2.235294117647059, "step": 3420}, {"loss": 1.3072, "grad_norm": 0.8578207492828369, "learning_rate": 0.0002, "epoch": 2.241830065359477, "step": 3430}, {"loss": 1.3278, "grad_norm": 0.5900213718414307, "learning_rate": 0.0002, "epoch": 2.2483660130718954, "step": 3440}, {"loss": 1.3645, "grad_norm": 0.7821717262268066, "learning_rate": 0.0002, "epoch": 2.2549019607843137, "step": 3450}, {"loss": 1.183, "grad_norm": 0.6263150572776794, "learning_rate": 0.0002, "epoch": 2.261437908496732, "step": 3460}, {"loss": 1.178, "grad_norm": 0.591799259185791, "learning_rate": 0.0002, "epoch": 2.2679738562091503, "step": 3470}, {"loss": 1.2198, "grad_norm": 0.5999799966812134, "learning_rate": 0.0002, "epoch": 2.2745098039215685, "step": 3480}, {"loss": 1.2724, "grad_norm": 0.6227319240570068, "learning_rate": 0.0002, "epoch": 2.281045751633987, "step": 3490}, {"loss": 1.3865, "grad_norm": 0.719412624835968, "learning_rate": 0.0002, "epoch": 2.287581699346405, "step": 3500}, {"loss": 1.3275, "grad_norm": 1.0361769199371338, "learning_rate": 0.0002, "epoch": 2.2941176470588234, "step": 3510}, {"loss": 1.4834, "grad_norm": 0.5506668090820312, "learning_rate": 0.0002, "epoch": 2.3006535947712417, "step": 3520}, {"loss": 1.2273, "grad_norm": 0.6886829733848572, "learning_rate": 0.0002, "epoch": 2.30718954248366, "step": 3530}, {"loss": 1.2296, "grad_norm": 0.6226346492767334, "learning_rate": 0.0002, "epoch": 2.313725490196078, "step": 3540}, {"loss": 1.3087, "grad_norm": 0.8109908103942871, "learning_rate": 0.0002, "epoch": 2.3202614379084965, "step": 3550}, {"loss": 1.3311, "grad_norm": 0.8505511283874512, "learning_rate": 0.0002, "epoch": 2.326797385620915, "step": 3560}, {"loss": 1.2526, "grad_norm": 0.5763760209083557, "learning_rate": 0.0002, "epoch": 2.3333333333333335, "step": 3570}, {"loss": 1.4135, "grad_norm": 0.6460059881210327, "learning_rate": 0.0002, "epoch": 2.3398692810457518, "step": 3580}, {"loss": 1.2701, "grad_norm": 0.7175343036651611, "learning_rate": 0.0002, "epoch": 2.34640522875817, "step": 3590}, {"loss": 1.2645, "grad_norm": 0.6012630462646484, "learning_rate": 0.0002, "epoch": 2.3529411764705883, "step": 3600}, {"loss": 1.3214, "grad_norm": 0.6513685584068298, "learning_rate": 0.0002, "epoch": 2.3594771241830066, "step": 3610}, {"loss": 1.3271, "grad_norm": 0.7465183734893799, "learning_rate": 0.0002, "epoch": 2.366013071895425, "step": 3620}, {"loss": 1.3671, "grad_norm": 0.6413124203681946, "learning_rate": 0.0002, "epoch": 2.372549019607843, "step": 3630}, {"loss": 1.4026, "grad_norm": 0.7209562063217163, "learning_rate": 0.0002, "epoch": 2.3790849673202614, "step": 3640}, {"loss": 1.1616, "grad_norm": 0.6427558660507202, "learning_rate": 0.0002, "epoch": 2.3856209150326797, "step": 3650}, {"loss": 1.313, "grad_norm": 0.593958854675293, "learning_rate": 0.0002, "epoch": 2.392156862745098, "step": 3660}, {"loss": 1.2802, "grad_norm": 0.5944608449935913, "learning_rate": 0.0002, "epoch": 2.3986928104575163, "step": 3670}, {"loss": 1.3542, "grad_norm": 0.6606248617172241, "learning_rate": 0.0002, "epoch": 2.4052287581699345, "step": 3680}, {"loss": 1.2977, "grad_norm": 0.5632851719856262, "learning_rate": 0.0002, "epoch": 2.411764705882353, "step": 3690}, {"loss": 1.2032, "grad_norm": 0.4976513385772705, "learning_rate": 0.0002, "epoch": 2.418300653594771, "step": 3700}, {"loss": 1.1404, "grad_norm": 0.6318528056144714, "learning_rate": 0.0002, "epoch": 2.4248366013071894, "step": 3710}, {"loss": 1.1705, "grad_norm": 0.6306707859039307, "learning_rate": 0.0002, "epoch": 2.431372549019608, "step": 3720}, {"loss": 1.3524, "grad_norm": 0.6362553238868713, "learning_rate": 0.0002, "epoch": 2.4379084967320264, "step": 3730}, {"loss": 1.2345, "grad_norm": 0.634368896484375, "learning_rate": 0.0002, "epoch": 2.4444444444444446, "step": 3740}, {"loss": 1.2515, "grad_norm": 0.6623591184616089, "learning_rate": 0.0002, "epoch": 2.450980392156863, "step": 3750}, {"loss": 1.3246, "grad_norm": 0.6150440573692322, "learning_rate": 0.0002, "epoch": 2.457516339869281, "step": 3760}, {"loss": 1.2666, "grad_norm": 0.588935911655426, "learning_rate": 0.0002, "epoch": 2.4640522875816995, "step": 3770}, {"loss": 1.3918, "grad_norm": 0.7388206124305725, "learning_rate": 0.0002, "epoch": 2.4705882352941178, "step": 3780}, {"loss": 1.2512, "grad_norm": 0.621825098991394, "learning_rate": 0.0002, "epoch": 2.477124183006536, "step": 3790}, {"loss": 1.359, "grad_norm": 0.7691677212715149, "learning_rate": 0.0002, "epoch": 2.4836601307189543, "step": 3800}, {"loss": 1.3399, "grad_norm": 1.1661969423294067, "learning_rate": 0.0002, "epoch": 2.4901960784313726, "step": 3810}, {"loss": 1.461, "grad_norm": 0.6837884187698364, "learning_rate": 0.0002, "epoch": 2.496732026143791, "step": 3820}, {"loss": 1.2823, "grad_norm": 0.6978904008865356, "learning_rate": 0.0002, "epoch": 2.503267973856209, "step": 3830}, {"loss": 1.3688, "grad_norm": 0.6121411323547363, "learning_rate": 0.0002, "epoch": 2.5098039215686274, "step": 3840}, {"loss": 1.2587, "grad_norm": 0.7813326120376587, "learning_rate": 0.0002, "epoch": 2.5163398692810457, "step": 3850}, {"loss": 1.1543, "grad_norm": 0.5390260219573975, "learning_rate": 0.0002, "epoch": 2.522875816993464, "step": 3860}, {"loss": 1.2032, "grad_norm": 0.8283252716064453, "learning_rate": 0.0002, "epoch": 2.5294117647058822, "step": 3870}, {"loss": 1.3112, "grad_norm": 0.8527186512947083, "learning_rate": 0.0002, "epoch": 2.5359477124183005, "step": 3880}, {"loss": 1.3469, "grad_norm": 0.8405382633209229, "learning_rate": 0.0002, "epoch": 2.542483660130719, "step": 3890}, {"loss": 1.1801, "grad_norm": 0.5650738477706909, "learning_rate": 0.0002, "epoch": 2.549019607843137, "step": 3900}, {"loss": 1.2917, "grad_norm": 0.620121955871582, "learning_rate": 0.0002, "epoch": 2.5555555555555554, "step": 3910}, {"loss": 1.2524, "grad_norm": 0.5983527898788452, "learning_rate": 0.0002, "epoch": 2.5620915032679736, "step": 3920}, {"loss": 1.4408, "grad_norm": 0.686623215675354, "learning_rate": 0.0002, "epoch": 2.568627450980392, "step": 3930}, {"loss": 1.186, "grad_norm": 0.6805831789970398, "learning_rate": 0.0002, "epoch": 2.57516339869281, "step": 3940}, {"loss": 1.367, "grad_norm": 0.6994825601577759, "learning_rate": 0.0002, "epoch": 2.581699346405229, "step": 3950}, {"loss": 1.3446, "grad_norm": 0.728549599647522, "learning_rate": 0.0002, "epoch": 2.588235294117647, "step": 3960}, {"loss": 1.4039, "grad_norm": 0.775236964225769, "learning_rate": 0.0002, "epoch": 2.5947712418300655, "step": 3970}, {"loss": 1.2742, "grad_norm": 0.5057447552680969, "learning_rate": 0.0002, "epoch": 2.6013071895424837, "step": 3980}, {"loss": 1.2764, "grad_norm": 0.6564450263977051, "learning_rate": 0.0002, "epoch": 2.607843137254902, "step": 3990}, {"loss": 1.3269, "grad_norm": 0.5342249870300293, "learning_rate": 0.0002, "epoch": 2.6143790849673203, "step": 4000}, {"loss": 1.3102, "grad_norm": 0.5508961081504822, "learning_rate": 0.0002, "epoch": 2.6209150326797386, "step": 4010}, {"loss": 1.3636, "grad_norm": 0.5716235637664795, "learning_rate": 0.0002, "epoch": 2.627450980392157, "step": 4020}, {"loss": 1.3465, "grad_norm": 0.8049232363700867, "learning_rate": 0.0002, "epoch": 2.633986928104575, "step": 4030}, {"loss": 1.2342, "grad_norm": 0.5574354529380798, "learning_rate": 0.0002, "epoch": 2.6405228758169934, "step": 4040}, {"loss": 1.2419, "grad_norm": 0.6302093863487244, "learning_rate": 0.0002, "epoch": 2.6470588235294117, "step": 4050}, {"loss": 1.2565, "grad_norm": 1.1868736743927002, "learning_rate": 0.0002, "epoch": 2.65359477124183, "step": 4060}, {"loss": 1.1382, "grad_norm": 0.6738120317459106, "learning_rate": 0.0002, "epoch": 2.6601307189542482, "step": 4070}, {"loss": 1.2456, "grad_norm": 0.6614423990249634, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 4080}, {"loss": 1.2958, "grad_norm": 0.7297604084014893, "learning_rate": 0.0002, "epoch": 2.6732026143790852, "step": 4090}, {"loss": 1.1596, "grad_norm": 0.9421682357788086, "learning_rate": 0.0002, "epoch": 2.6797385620915035, "step": 4100}, {"loss": 1.3002, "grad_norm": 0.5286222696304321, "learning_rate": 0.0002, "epoch": 2.686274509803922, "step": 4110}, {"loss": 1.3936, "grad_norm": 0.6849271655082703, "learning_rate": 0.0002, "epoch": 2.69281045751634, "step": 4120}, {"loss": 1.2721, "grad_norm": 0.6811320185661316, "learning_rate": 0.0002, "epoch": 2.6993464052287583, "step": 4130}, {"loss": 1.2897, "grad_norm": 0.4968419373035431, "learning_rate": 0.0002, "epoch": 2.7058823529411766, "step": 4140}, {"loss": 1.3322, "grad_norm": 0.8074267506599426, "learning_rate": 0.0002, "epoch": 2.712418300653595, "step": 4150}, {"loss": 1.1759, "grad_norm": 0.6756376028060913, "learning_rate": 0.0002, "epoch": 2.718954248366013, "step": 4160}, {"loss": 1.2444, "grad_norm": 0.6921583414077759, "learning_rate": 0.0002, "epoch": 2.7254901960784315, "step": 4170}, {"loss": 1.3413, "grad_norm": 0.7049834132194519, "learning_rate": 0.0002, "epoch": 2.7320261437908497, "step": 4180}, {"loss": 1.1965, "grad_norm": 0.7011390328407288, "learning_rate": 0.0002, "epoch": 2.738562091503268, "step": 4190}, {"loss": 1.2364, "grad_norm": 0.6977843642234802, "learning_rate": 0.0002, "epoch": 2.7450980392156863, "step": 4200}, {"loss": 1.2533, "grad_norm": 0.6717000603675842, "learning_rate": 0.0002, "epoch": 2.7516339869281046, "step": 4210}, {"loss": 1.392, "grad_norm": 1.0223724842071533, "learning_rate": 0.0002, "epoch": 2.758169934640523, "step": 4220}, {"loss": 1.2451, "grad_norm": 0.6573330760002136, "learning_rate": 0.0002, "epoch": 2.764705882352941, "step": 4230}, {"loss": 1.4219, "grad_norm": 0.6684938073158264, "learning_rate": 0.0002, "epoch": 2.7712418300653594, "step": 4240}, {"loss": 1.2505, "grad_norm": 0.7426793575286865, "learning_rate": 0.0002, "epoch": 2.7777777777777777, "step": 4250}, {"loss": 1.2904, "grad_norm": 0.557826578617096, "learning_rate": 0.0002, "epoch": 2.784313725490196, "step": 4260}, {"loss": 1.3262, "grad_norm": 0.6669870018959045, "learning_rate": 0.0002, "epoch": 2.7908496732026142, "step": 4270}, {"loss": 1.2369, "grad_norm": 0.5349969267845154, "learning_rate": 0.0002, "epoch": 2.7973856209150325, "step": 4280}, {"loss": 1.3769, "grad_norm": 0.7262802124023438, "learning_rate": 0.0002, "epoch": 2.803921568627451, "step": 4290}, {"loss": 1.3373, "grad_norm": 0.768211841583252, "learning_rate": 0.0002, "epoch": 2.810457516339869, "step": 4300}, {"loss": 1.2444, "grad_norm": 0.5958252549171448, "learning_rate": 0.0002, "epoch": 2.8169934640522873, "step": 4310}, {"loss": 1.4113, "grad_norm": 0.8451310396194458, "learning_rate": 0.0002, "epoch": 2.8235294117647056, "step": 4320}, {"loss": 1.2454, "grad_norm": 0.6544435024261475, "learning_rate": 0.0002, "epoch": 2.8300653594771243, "step": 4330}, {"loss": 1.2777, "grad_norm": 0.6177433133125305, "learning_rate": 0.0002, "epoch": 2.8366013071895426, "step": 4340}, {"loss": 1.2562, "grad_norm": 0.6324988007545471, "learning_rate": 0.0002, "epoch": 2.843137254901961, "step": 4350}, {"loss": 1.4117, "grad_norm": 0.6884300708770752, "learning_rate": 0.0002, "epoch": 2.849673202614379, "step": 4360}, {"loss": 1.2391, "grad_norm": 0.8952897191047668, "learning_rate": 0.0002, "epoch": 2.8562091503267975, "step": 4370}, {"loss": 1.2814, "grad_norm": 1.0260103940963745, "learning_rate": 0.0002, "epoch": 2.8627450980392157, "step": 4380}, {"loss": 1.2893, "grad_norm": 0.9134647250175476, "learning_rate": 0.0002, "epoch": 2.869281045751634, "step": 4390}, {"loss": 1.171, "grad_norm": 0.5637717843055725, "learning_rate": 0.0002, "epoch": 2.8758169934640523, "step": 4400}, {"loss": 1.3422, "grad_norm": 0.7530393004417419, "learning_rate": 0.0002, "epoch": 2.8823529411764706, "step": 4410}, {"loss": 1.29, "grad_norm": 0.7202680706977844, "learning_rate": 0.0002, "epoch": 2.888888888888889, "step": 4420}, {"loss": 1.2913, "grad_norm": 0.7177144885063171, "learning_rate": 0.0002, "epoch": 2.895424836601307, "step": 4430}, {"loss": 1.1922, "grad_norm": 0.5996816754341125, "learning_rate": 0.0002, "epoch": 2.9019607843137254, "step": 4440}, {"loss": 1.4816, "grad_norm": 0.6542447209358215, "learning_rate": 0.0002, "epoch": 2.9084967320261437, "step": 4450}, {"loss": 1.503, "grad_norm": 1.0753740072250366, "learning_rate": 0.0002, "epoch": 2.915032679738562, "step": 4460}, {"loss": 1.3193, "grad_norm": 0.6956136226654053, "learning_rate": 0.0002, "epoch": 2.9215686274509802, "step": 4470}, {"loss": 1.2486, "grad_norm": 0.7702530026435852, "learning_rate": 0.0002, "epoch": 2.928104575163399, "step": 4480}, {"loss": 1.3371, "grad_norm": 0.7763232588768005, "learning_rate": 0.0002, "epoch": 2.9346405228758172, "step": 4490}, {"loss": 1.1647, "grad_norm": 0.6393085718154907, "learning_rate": 0.0002, "epoch": 2.9411764705882355, "step": 4500}, {"loss": 1.211, "grad_norm": 0.987770676612854, "learning_rate": 0.0002, "epoch": 2.947712418300654, "step": 4510}, {"loss": 1.1529, "grad_norm": 0.5995016098022461, "learning_rate": 0.0002, "epoch": 2.954248366013072, "step": 4520}, {"loss": 1.2358, "grad_norm": 0.745650053024292, "learning_rate": 0.0002, "epoch": 2.9607843137254903, "step": 4530}, {"loss": 1.2115, "grad_norm": 0.7429282069206238, "learning_rate": 0.0002, "epoch": 2.9673202614379086, "step": 4540}, {"loss": 1.2262, "grad_norm": 0.5927486419677734, "learning_rate": 0.0002, "epoch": 2.973856209150327, "step": 4550}, {"loss": 1.3173, "grad_norm": 0.6775153875350952, "learning_rate": 0.0002, "epoch": 2.980392156862745, "step": 4560}, {"loss": 1.279, "grad_norm": 0.7128435373306274, "learning_rate": 0.0002, "epoch": 2.9869281045751634, "step": 4570}, {"loss": 1.2451, "grad_norm": 0.7470937967300415, "learning_rate": 0.0002, "epoch": 2.9934640522875817, "step": 4580}, {"loss": 1.2701, "grad_norm": 0.9295375943183899, "learning_rate": 0.0002, "epoch": 3.0, "step": 4590}, {"eval_loss": 1.4131312370300293, "eval_runtime": 31.8967, "eval_samples_per_second": 13.669, "eval_steps_per_second": 1.724, "epoch": 3.0, "step": 4590}, {"loss": 1.1283, "grad_norm": 0.6926420331001282, "learning_rate": 0.0002, "epoch": 3.0065359477124183, "step": 4600}, {"loss": 1.1537, "grad_norm": 0.6656355857849121, "learning_rate": 0.0002, "epoch": 3.0130718954248366, "step": 4610}, {"loss": 1.308, "grad_norm": 0.9901936650276184, "learning_rate": 0.0002, "epoch": 3.019607843137255, "step": 4620}, {"loss": 1.22, "grad_norm": 0.6713474988937378, "learning_rate": 0.0002, "epoch": 3.026143790849673, "step": 4630}, {"loss": 1.2249, "grad_norm": 0.6199324131011963, "learning_rate": 0.0002, "epoch": 3.0326797385620914, "step": 4640}, {"loss": 1.242, "grad_norm": 0.7180785536766052, "learning_rate": 0.0002, "epoch": 3.0392156862745097, "step": 4650}, {"loss": 1.1349, "grad_norm": 0.8256588578224182, "learning_rate": 0.0002, "epoch": 3.045751633986928, "step": 4660}, {"loss": 1.1431, "grad_norm": 0.6637389063835144, "learning_rate": 0.0002, "epoch": 3.052287581699346, "step": 4670}, {"loss": 1.1096, "grad_norm": 0.6980698108673096, "learning_rate": 0.0002, "epoch": 3.0588235294117645, "step": 4680}, {"loss": 1.196, "grad_norm": 0.8091534972190857, "learning_rate": 0.0002, "epoch": 3.065359477124183, "step": 4690}, {"loss": 1.1652, "grad_norm": 0.5715174078941345, "learning_rate": 0.0002, "epoch": 3.0718954248366015, "step": 4700}, {"loss": 1.1427, "grad_norm": 0.735639750957489, "learning_rate": 0.0002, "epoch": 3.0784313725490198, "step": 4710}, {"loss": 1.1522, "grad_norm": 0.7619708180427551, "learning_rate": 0.0002, "epoch": 3.084967320261438, "step": 4720}, {"loss": 1.0853, "grad_norm": 1.263566017150879, "learning_rate": 0.0002, "epoch": 3.0915032679738563, "step": 4730}, {"loss": 1.1348, "grad_norm": 0.6600871682167053, "learning_rate": 0.0002, "epoch": 3.0980392156862746, "step": 4740}, {"loss": 1.1766, "grad_norm": 0.717792809009552, "learning_rate": 0.0002, "epoch": 3.104575163398693, "step": 4750}, {"loss": 1.088, "grad_norm": 0.853714644908905, "learning_rate": 0.0002, "epoch": 3.111111111111111, "step": 4760}, {"loss": 1.2031, "grad_norm": 1.1004153490066528, "learning_rate": 0.0002, "epoch": 3.1176470588235294, "step": 4770}, {"loss": 1.3295, "grad_norm": 0.8566235899925232, "learning_rate": 0.0002, "epoch": 3.1241830065359477, "step": 4780}, {"loss": 1.2436, "grad_norm": 0.8315296173095703, "learning_rate": 0.0002, "epoch": 3.130718954248366, "step": 4790}, {"loss": 1.32, "grad_norm": 0.8020524978637695, "learning_rate": 0.0002, "epoch": 3.1372549019607843, "step": 4800}, {"loss": 1.1238, "grad_norm": 0.7564275860786438, "learning_rate": 0.0002, "epoch": 3.1437908496732025, "step": 4810}, {"loss": 1.1244, "grad_norm": 0.9077776670455933, "learning_rate": 0.0002, "epoch": 3.150326797385621, "step": 4820}, {"loss": 1.1399, "grad_norm": 0.6323099732398987, "learning_rate": 0.0002, "epoch": 3.156862745098039, "step": 4830}, {"loss": 1.1983, "grad_norm": 0.6625368595123291, "learning_rate": 0.0002, "epoch": 3.1633986928104574, "step": 4840}, {"loss": 1.066, "grad_norm": 0.8119261860847473, "learning_rate": 0.0002, "epoch": 3.1699346405228757, "step": 4850}, {"loss": 1.0224, "grad_norm": 0.6399450898170471, "learning_rate": 0.0002, "epoch": 3.176470588235294, "step": 4860}, {"loss": 1.2181, "grad_norm": 1.0659016370773315, "learning_rate": 0.0002, "epoch": 3.183006535947712, "step": 4870}, {"loss": 1.2914, "grad_norm": 0.8040369749069214, "learning_rate": 0.0002, "epoch": 3.189542483660131, "step": 4880}, {"loss": 1.1996, "grad_norm": 0.7784733176231384, "learning_rate": 0.0002, "epoch": 3.196078431372549, "step": 4890}, {"loss": 1.2051, "grad_norm": 0.9660294651985168, "learning_rate": 0.0002, "epoch": 3.2026143790849675, "step": 4900}, {"loss": 1.0419, "grad_norm": 1.0676977634429932, "learning_rate": 0.0002, "epoch": 3.2091503267973858, "step": 4910}, {"loss": 1.0083, "grad_norm": 0.5877565741539001, "learning_rate": 0.0002, "epoch": 3.215686274509804, "step": 4920}, {"loss": 1.1046, "grad_norm": 0.6164032816886902, "learning_rate": 0.0002, "epoch": 3.2222222222222223, "step": 4930}, {"loss": 1.1079, "grad_norm": 0.7627606987953186, "learning_rate": 0.0002, "epoch": 3.2287581699346406, "step": 4940}, {"loss": 1.2453, "grad_norm": 0.7442803978919983, "learning_rate": 0.0002, "epoch": 3.235294117647059, "step": 4950}, {"loss": 1.1087, "grad_norm": 0.7277812361717224, "learning_rate": 0.0002, "epoch": 3.241830065359477, "step": 4960}, {"loss": 1.2237, "grad_norm": 1.0301902294158936, "learning_rate": 0.0002, "epoch": 3.2483660130718954, "step": 4970}, {"loss": 1.1466, "grad_norm": 0.7798232436180115, "learning_rate": 0.0002, "epoch": 3.2549019607843137, "step": 4980}, {"loss": 1.2142, "grad_norm": 1.210265874862671, "learning_rate": 0.0002, "epoch": 3.261437908496732, "step": 4990}, {"loss": 1.1557, "grad_norm": 0.6677713990211487, "learning_rate": 0.0002, "epoch": 3.2679738562091503, "step": 5000}, {"loss": 1.3294, "grad_norm": 1.0524500608444214, "learning_rate": 0.0002, "epoch": 3.2745098039215685, "step": 5010}, {"loss": 1.1939, "grad_norm": 0.7091745734214783, "learning_rate": 0.0002, "epoch": 3.281045751633987, "step": 5020}, {"loss": 1.1891, "grad_norm": 0.8523224592208862, "learning_rate": 0.0002, "epoch": 3.287581699346405, "step": 5030}, {"loss": 1.1925, "grad_norm": 0.6120608448982239, "learning_rate": 0.0002, "epoch": 3.2941176470588234, "step": 5040}, {"loss": 1.0603, "grad_norm": 0.7437472939491272, "learning_rate": 0.0002, "epoch": 3.3006535947712417, "step": 5050}, {"loss": 1.1295, "grad_norm": 0.7611715197563171, "learning_rate": 0.0002, "epoch": 3.30718954248366, "step": 5060}, {"loss": 1.0531, "grad_norm": 0.7249704003334045, "learning_rate": 0.0002, "epoch": 3.313725490196078, "step": 5070}, {"loss": 1.2292, "grad_norm": 0.7316247820854187, "learning_rate": 0.0002, "epoch": 3.3202614379084965, "step": 5080}, {"loss": 1.1974, "grad_norm": 0.562412440776825, "learning_rate": 0.0002, "epoch": 3.326797385620915, "step": 5090}, {"loss": 1.0736, "grad_norm": 0.7052176594734192, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 5100}, {"loss": 1.122, "grad_norm": 0.7714211344718933, "learning_rate": 0.0002, "epoch": 3.3398692810457518, "step": 5110}, {"loss": 1.1684, "grad_norm": 1.0436055660247803, "learning_rate": 0.0002, "epoch": 3.34640522875817, "step": 5120}, {"loss": 1.0945, "grad_norm": 0.8867271542549133, "learning_rate": 0.0002, "epoch": 3.3529411764705883, "step": 5130}, {"loss": 1.159, "grad_norm": 0.8371267914772034, "learning_rate": 0.0002, "epoch": 3.3594771241830066, "step": 5140}, {"loss": 1.1073, "grad_norm": 0.7257837057113647, "learning_rate": 0.0002, "epoch": 3.366013071895425, "step": 5150}, {"loss": 1.1162, "grad_norm": 0.7102002501487732, "learning_rate": 0.0002, "epoch": 3.372549019607843, "step": 5160}, {"loss": 1.2056, "grad_norm": 0.7636350393295288, "learning_rate": 0.0002, "epoch": 3.3790849673202614, "step": 5170}, {"loss": 1.0708, "grad_norm": 0.6887359619140625, "learning_rate": 0.0002, "epoch": 3.3856209150326797, "step": 5180}, {"loss": 1.3807, "grad_norm": 0.8141424655914307, "learning_rate": 0.0002, "epoch": 3.392156862745098, "step": 5190}, {"loss": 1.1986, "grad_norm": 0.694423496723175, "learning_rate": 0.0002, "epoch": 3.3986928104575163, "step": 5200}, {"loss": 1.2945, "grad_norm": 0.914013683795929, "learning_rate": 0.0002, "epoch": 3.4052287581699345, "step": 5210}, {"loss": 1.1413, "grad_norm": 0.8503239750862122, "learning_rate": 0.0002, "epoch": 3.411764705882353, "step": 5220}, {"loss": 1.2696, "grad_norm": 0.6196836233139038, "learning_rate": 0.0002, "epoch": 3.418300653594771, "step": 5230}, {"loss": 1.2431, "grad_norm": 1.0760811567306519, "learning_rate": 0.0002, "epoch": 3.4248366013071894, "step": 5240}, {"loss": 1.1686, "grad_norm": 0.6524698138237, "learning_rate": 0.0002, "epoch": 3.431372549019608, "step": 5250}, {"loss": 1.2012, "grad_norm": 0.674467921257019, "learning_rate": 0.0002, "epoch": 3.4379084967320264, "step": 5260}, {"loss": 1.1015, "grad_norm": 0.7690372467041016, "learning_rate": 0.0002, "epoch": 3.4444444444444446, "step": 5270}, {"loss": 1.2511, "grad_norm": 0.8751813769340515, "learning_rate": 0.0002, "epoch": 3.450980392156863, "step": 5280}, {"loss": 1.1841, "grad_norm": 0.750407874584198, "learning_rate": 0.0002, "epoch": 3.457516339869281, "step": 5290}, {"loss": 1.0605, "grad_norm": 0.5991823077201843, "learning_rate": 0.0002, "epoch": 3.4640522875816995, "step": 5300}, {"loss": 1.2347, "grad_norm": 1.0164772272109985, "learning_rate": 0.0002, "epoch": 3.4705882352941178, "step": 5310}, {"loss": 1.2354, "grad_norm": 0.8704105019569397, "learning_rate": 0.0002, "epoch": 3.477124183006536, "step": 5320}, {"loss": 1.2169, "grad_norm": 0.709102213382721, "learning_rate": 0.0002, "epoch": 3.4836601307189543, "step": 5330}, {"loss": 1.2425, "grad_norm": 0.6273632049560547, "learning_rate": 0.0002, "epoch": 3.4901960784313726, "step": 5340}, {"loss": 1.1585, "grad_norm": 0.6807359457015991, "learning_rate": 0.0002, "epoch": 3.496732026143791, "step": 5350}, {"loss": 1.131, "grad_norm": 0.7085188627243042, "learning_rate": 0.0002, "epoch": 3.503267973856209, "step": 5360}, {"loss": 1.1159, "grad_norm": 0.6938307881355286, "learning_rate": 0.0002, "epoch": 3.5098039215686274, "step": 5370}, {"loss": 1.1397, "grad_norm": 0.8544146418571472, "learning_rate": 0.0002, "epoch": 3.5163398692810457, "step": 5380}, {"loss": 1.2181, "grad_norm": 0.7889642119407654, "learning_rate": 0.0002, "epoch": 3.522875816993464, "step": 5390}, {"loss": 1.1691, "grad_norm": 0.7858421206474304, "learning_rate": 0.0002, "epoch": 3.5294117647058822, "step": 5400}, {"loss": 1.2374, "grad_norm": 0.8547123074531555, "learning_rate": 0.0002, "epoch": 3.5359477124183005, "step": 5410}, {"loss": 1.196, "grad_norm": 0.8218181133270264, "learning_rate": 0.0002, "epoch": 3.542483660130719, "step": 5420}, {"loss": 1.1961, "grad_norm": 1.153623342514038, "learning_rate": 0.0002, "epoch": 3.549019607843137, "step": 5430}, {"loss": 1.156, "grad_norm": 1.1321099996566772, "learning_rate": 0.0002, "epoch": 3.5555555555555554, "step": 5440}, {"loss": 1.2224, "grad_norm": 0.9495334029197693, "learning_rate": 0.0002, "epoch": 3.5620915032679736, "step": 5450}, {"loss": 1.2869, "grad_norm": 0.8743821978569031, "learning_rate": 0.0002, "epoch": 3.568627450980392, "step": 5460}, {"loss": 1.1018, "grad_norm": 0.7513086795806885, "learning_rate": 0.0002, "epoch": 3.57516339869281, "step": 5470}, {"loss": 1.1082, "grad_norm": 1.0139480829238892, "learning_rate": 0.0002, "epoch": 3.581699346405229, "step": 5480}, {"loss": 1.1706, "grad_norm": 0.6615135073661804, "learning_rate": 0.0002, "epoch": 3.588235294117647, "step": 5490}, {"loss": 1.3906, "grad_norm": 1.180798888206482, "learning_rate": 0.0002, "epoch": 3.5947712418300655, "step": 5500}, {"loss": 1.2391, "grad_norm": 0.7085279226303101, "learning_rate": 0.0002, "epoch": 3.6013071895424837, "step": 5510}, {"loss": 1.1623, "grad_norm": 0.540268063545227, "learning_rate": 0.0002, "epoch": 3.607843137254902, "step": 5520}, {"loss": 1.2132, "grad_norm": 0.7905671000480652, "learning_rate": 0.0002, "epoch": 3.6143790849673203, "step": 5530}, {"loss": 1.2731, "grad_norm": 0.8457717299461365, "learning_rate": 0.0002, "epoch": 3.6209150326797386, "step": 5540}, {"loss": 1.1799, "grad_norm": 0.7102677822113037, "learning_rate": 0.0002, "epoch": 3.627450980392157, "step": 5550}, {"loss": 1.2394, "grad_norm": 0.7179514765739441, "learning_rate": 0.0002, "epoch": 3.633986928104575, "step": 5560}, {"loss": 1.2019, "grad_norm": 1.0854148864746094, "learning_rate": 0.0002, "epoch": 3.6405228758169934, "step": 5570}, {"loss": 1.1986, "grad_norm": 0.8209951519966125, "learning_rate": 0.0002, "epoch": 3.6470588235294117, "step": 5580}, {"loss": 1.2289, "grad_norm": 0.6944138407707214, "learning_rate": 0.0002, "epoch": 3.65359477124183, "step": 5590}, {"loss": 1.3226, "grad_norm": 0.7675473093986511, "learning_rate": 0.0002, "epoch": 3.6601307189542482, "step": 5600}, {"loss": 1.2866, "grad_norm": 0.6683364510536194, "learning_rate": 0.0002, "epoch": 3.6666666666666665, "step": 5610}, {"loss": 1.1099, "grad_norm": 0.7920727133750916, "learning_rate": 0.0002, "epoch": 3.6732026143790852, "step": 5620}, {"loss": 1.2287, "grad_norm": 0.9440218806266785, "learning_rate": 0.0002, "epoch": 3.6797385620915035, "step": 5630}, {"loss": 1.2444, "grad_norm": 0.6600824594497681, "learning_rate": 0.0002, "epoch": 3.686274509803922, "step": 5640}, {"loss": 1.191, "grad_norm": 0.6860619187355042, "learning_rate": 0.0002, "epoch": 3.69281045751634, "step": 5650}, {"loss": 1.1914, "grad_norm": 0.6579713225364685, "learning_rate": 0.0002, "epoch": 3.6993464052287583, "step": 5660}, {"loss": 1.1464, "grad_norm": 0.661081075668335, "learning_rate": 0.0002, "epoch": 3.7058823529411766, "step": 5670}, {"loss": 1.289, "grad_norm": 1.0968825817108154, "learning_rate": 0.0002, "epoch": 3.712418300653595, "step": 5680}, {"loss": 1.192, "grad_norm": 0.8066844940185547, "learning_rate": 0.0002, "epoch": 3.718954248366013, "step": 5690}, {"loss": 1.2322, "grad_norm": 0.8341682553291321, "learning_rate": 0.0002, "epoch": 3.7254901960784315, "step": 5700}, {"loss": 1.1473, "grad_norm": 0.6682852506637573, "learning_rate": 0.0002, "epoch": 3.7320261437908497, "step": 5710}, {"loss": 1.1566, "grad_norm": 0.898595929145813, "learning_rate": 0.0002, "epoch": 3.738562091503268, "step": 5720}, {"loss": 1.0919, "grad_norm": 0.6876054406166077, "learning_rate": 0.0002, "epoch": 3.7450980392156863, "step": 5730}, {"loss": 1.2302, "grad_norm": 0.7817103266716003, "learning_rate": 0.0002, "epoch": 3.7516339869281046, "step": 5740}, {"loss": 1.2439, "grad_norm": 0.5840168595314026, "learning_rate": 0.0002, "epoch": 3.758169934640523, "step": 5750}, {"loss": 1.1279, "grad_norm": 0.6263918876647949, "learning_rate": 0.0002, "epoch": 3.764705882352941, "step": 5760}, {"loss": 1.2023, "grad_norm": 0.7948952317237854, "learning_rate": 0.0002, "epoch": 3.7712418300653594, "step": 5770}, {"loss": 1.149, "grad_norm": 0.6700998544692993, "learning_rate": 0.0002, "epoch": 3.7777777777777777, "step": 5780}, {"loss": 1.3207, "grad_norm": 1.1169519424438477, "learning_rate": 0.0002, "epoch": 3.784313725490196, "step": 5790}, {"loss": 1.064, "grad_norm": 0.8354471325874329, "learning_rate": 0.0002, "epoch": 3.7908496732026142, "step": 5800}, {"loss": 1.2104, "grad_norm": 0.6304181814193726, "learning_rate": 0.0002, "epoch": 3.7973856209150325, "step": 5810}, {"loss": 1.2059, "grad_norm": 0.6919655799865723, "learning_rate": 0.0002, "epoch": 3.803921568627451, "step": 5820}, {"loss": 1.217, "grad_norm": 0.600385844707489, "learning_rate": 0.0002, "epoch": 3.810457516339869, "step": 5830}, {"loss": 1.2324, "grad_norm": 0.8406319618225098, "learning_rate": 0.0002, "epoch": 3.8169934640522873, "step": 5840}, {"loss": 1.2418, "grad_norm": 0.7594282031059265, "learning_rate": 0.0002, "epoch": 3.8235294117647056, "step": 5850}, {"loss": 1.1903, "grad_norm": 0.8179879784584045, "learning_rate": 0.0002, "epoch": 3.8300653594771243, "step": 5860}, {"loss": 1.255, "grad_norm": 1.141430377960205, "learning_rate": 0.0002, "epoch": 3.8366013071895426, "step": 5870}, {"loss": 1.1467, "grad_norm": 0.6595550775527954, "learning_rate": 0.0002, "epoch": 3.843137254901961, "step": 5880}, {"loss": 1.2378, "grad_norm": 0.7499435544013977, "learning_rate": 0.0002, "epoch": 3.849673202614379, "step": 5890}, {"loss": 1.217, "grad_norm": 0.7851517200469971, "learning_rate": 0.0002, "epoch": 3.8562091503267975, "step": 5900}, {"loss": 1.162, "grad_norm": 1.0533545017242432, "learning_rate": 0.0002, "epoch": 3.8627450980392157, "step": 5910}, {"loss": 1.3576, "grad_norm": 0.960086464881897, "learning_rate": 0.0002, "epoch": 3.869281045751634, "step": 5920}, {"loss": 1.151, "grad_norm": 0.9952049851417542, "learning_rate": 0.0002, "epoch": 3.8758169934640523, "step": 5930}, {"loss": 1.2027, "grad_norm": 0.7884191274642944, "learning_rate": 0.0002, "epoch": 3.8823529411764706, "step": 5940}, {"loss": 1.1796, "grad_norm": 0.7461766600608826, "learning_rate": 0.0002, "epoch": 3.888888888888889, "step": 5950}, {"loss": 1.2251, "grad_norm": 0.9594355821609497, "learning_rate": 0.0002, "epoch": 3.895424836601307, "step": 5960}, {"loss": 1.1164, "grad_norm": 0.8179471492767334, "learning_rate": 0.0002, "epoch": 3.9019607843137254, "step": 5970}, {"loss": 1.2421, "grad_norm": 0.8240267634391785, "learning_rate": 0.0002, "epoch": 3.9084967320261437, "step": 5980}, {"loss": 1.3076, "grad_norm": 0.7462618350982666, "learning_rate": 0.0002, "epoch": 3.915032679738562, "step": 5990}, {"loss": 1.2124, "grad_norm": 0.711207389831543, "learning_rate": 0.0002, "epoch": 3.9215686274509802, "step": 6000}, {"loss": 1.2119, "grad_norm": 0.6910956501960754, "learning_rate": 0.0002, "epoch": 3.928104575163399, "step": 6010}, {"loss": 1.2127, "grad_norm": 0.749093770980835, "learning_rate": 0.0002, "epoch": 3.9346405228758172, "step": 6020}, {"loss": 1.1542, "grad_norm": 1.3332762718200684, "learning_rate": 0.0002, "epoch": 3.9411764705882355, "step": 6030}, {"loss": 1.1442, "grad_norm": 0.71457439661026, "learning_rate": 0.0002, "epoch": 3.947712418300654, "step": 6040}, {"loss": 1.339, "grad_norm": 1.1205238103866577, "learning_rate": 0.0002, "epoch": 3.954248366013072, "step": 6050}, {"loss": 1.2962, "grad_norm": 0.6958928108215332, "learning_rate": 0.0002, "epoch": 3.9607843137254903, "step": 6060}, {"loss": 1.1802, "grad_norm": 0.7518056035041809, "learning_rate": 0.0002, "epoch": 3.9673202614379086, "step": 6070}, {"loss": 1.1179, "grad_norm": 0.8010755777359009, "learning_rate": 0.0002, "epoch": 3.973856209150327, "step": 6080}, {"loss": 1.2867, "grad_norm": 0.7492658495903015, "learning_rate": 0.0002, "epoch": 3.980392156862745, "step": 6090}, {"loss": 1.2113, "grad_norm": 0.900704562664032, "learning_rate": 0.0002, "epoch": 3.9869281045751634, "step": 6100}, {"loss": 1.1106, "grad_norm": 0.7997331619262695, "learning_rate": 0.0002, "epoch": 3.9934640522875817, "step": 6110}, {"loss": 1.1244, "grad_norm": 0.7163209319114685, "learning_rate": 0.0002, "epoch": 4.0, "step": 6120}, {"eval_loss": 1.4113320112228394, "eval_runtime": 33.7199, "eval_samples_per_second": 12.93, "eval_steps_per_second": 1.631, "epoch": 4.0, "step": 6120}, {"loss": 1.0423, "grad_norm": 0.9527022838592529, "learning_rate": 0.0002, "epoch": 4.006535947712418, "step": 6130}, {"loss": 1.101, "grad_norm": 0.7603210210800171, "learning_rate": 0.0002, "epoch": 4.0130718954248366, "step": 6140}, {"loss": 1.1834, "grad_norm": 1.127387523651123, "learning_rate": 0.0002, "epoch": 4.019607843137255, "step": 6150}, {"loss": 1.0734, "grad_norm": 0.8290133476257324, "learning_rate": 0.0002, "epoch": 4.026143790849673, "step": 6160}, {"loss": 1.0785, "grad_norm": 0.9912241101264954, "learning_rate": 0.0002, "epoch": 4.032679738562091, "step": 6170}, {"loss": 1.0719, "grad_norm": 0.947005033493042, "learning_rate": 0.0002, "epoch": 4.03921568627451, "step": 6180}, {"loss": 1.0835, "grad_norm": 0.707466185092926, "learning_rate": 0.0002, "epoch": 4.045751633986928, "step": 6190}, {"loss": 1.1079, "grad_norm": 1.0604327917099, "learning_rate": 0.0002, "epoch": 4.052287581699346, "step": 6200}, {"loss": 1.0375, "grad_norm": 0.7848685383796692, "learning_rate": 0.0002, "epoch": 4.0588235294117645, "step": 6210}, {"loss": 1.1167, "grad_norm": 0.8475256562232971, "learning_rate": 0.0002, "epoch": 4.065359477124183, "step": 6220}, {"loss": 1.1104, "grad_norm": 0.9759448766708374, "learning_rate": 0.0002, "epoch": 4.071895424836601, "step": 6230}, {"loss": 1.1538, "grad_norm": 0.9324519038200378, "learning_rate": 0.0002, "epoch": 4.078431372549019, "step": 6240}, {"loss": 1.0817, "grad_norm": 0.8723901510238647, "learning_rate": 0.0002, "epoch": 4.084967320261438, "step": 6250}, {"loss": 1.0977, "grad_norm": 0.8343415856361389, "learning_rate": 0.0002, "epoch": 4.091503267973856, "step": 6260}, {"loss": 0.9887, "grad_norm": 0.7490310072898865, "learning_rate": 0.0002, "epoch": 4.098039215686274, "step": 6270}, {"loss": 1.2084, "grad_norm": 0.8961182832717896, "learning_rate": 0.0002, "epoch": 4.104575163398692, "step": 6280}, {"loss": 1.1349, "grad_norm": 0.7124854922294617, "learning_rate": 0.0002, "epoch": 4.111111111111111, "step": 6290}, {"loss": 1.0081, "grad_norm": 0.8338138461112976, "learning_rate": 0.0002, "epoch": 4.117647058823529, "step": 6300}, {"loss": 1.1091, "grad_norm": 0.8075833320617676, "learning_rate": 0.0002, "epoch": 4.124183006535947, "step": 6310}, {"loss": 1.0193, "grad_norm": 0.8069391846656799, "learning_rate": 0.0002, "epoch": 4.130718954248366, "step": 6320}, {"loss": 0.948, "grad_norm": 0.9567893147468567, "learning_rate": 0.0002, "epoch": 4.137254901960785, "step": 6330}, {"loss": 1.0241, "grad_norm": 1.2184662818908691, "learning_rate": 0.0002, "epoch": 4.143790849673203, "step": 6340}, {"loss": 1.0756, "grad_norm": 1.030976414680481, "learning_rate": 0.0002, "epoch": 4.150326797385621, "step": 6350}, {"loss": 1.1124, "grad_norm": 0.9749957323074341, "learning_rate": 0.0002, "epoch": 4.1568627450980395, "step": 6360}, {"loss": 1.1038, "grad_norm": 0.7089483141899109, "learning_rate": 0.0002, "epoch": 4.163398692810458, "step": 6370}, {"loss": 1.2175, "grad_norm": 1.1084946393966675, "learning_rate": 0.0002, "epoch": 4.169934640522876, "step": 6380}, {"loss": 1.0274, "grad_norm": 0.7998497486114502, "learning_rate": 0.0002, "epoch": 4.176470588235294, "step": 6390}, {"loss": 1.005, "grad_norm": 0.8997811675071716, "learning_rate": 0.0002, "epoch": 4.183006535947713, "step": 6400}, {"loss": 1.0704, "grad_norm": 0.8359479904174805, "learning_rate": 0.0002, "epoch": 4.189542483660131, "step": 6410}, {"loss": 1.1056, "grad_norm": 0.9087472558021545, "learning_rate": 0.0002, "epoch": 4.196078431372549, "step": 6420}, {"loss": 1.0657, "grad_norm": 1.1100451946258545, "learning_rate": 0.0002, "epoch": 4.2026143790849675, "step": 6430}, {"loss": 1.1443, "grad_norm": 0.9376999735832214, "learning_rate": 0.0002, "epoch": 4.209150326797386, "step": 6440}, {"loss": 1.0862, "grad_norm": 0.8179266452789307, "learning_rate": 0.0002, "epoch": 4.215686274509804, "step": 6450}, {"loss": 1.0679, "grad_norm": 0.9953271746635437, "learning_rate": 0.0002, "epoch": 4.222222222222222, "step": 6460}, {"loss": 1.1034, "grad_norm": 0.8476650714874268, "learning_rate": 0.0002, "epoch": 4.228758169934641, "step": 6470}, {"loss": 1.2512, "grad_norm": 0.8406323194503784, "learning_rate": 0.0002, "epoch": 4.235294117647059, "step": 6480}, {"loss": 1.057, "grad_norm": 0.819134533405304, "learning_rate": 0.0002, "epoch": 4.241830065359477, "step": 6490}, {"loss": 1.1082, "grad_norm": 0.7764983773231506, "learning_rate": 0.0002, "epoch": 4.248366013071895, "step": 6500}, {"loss": 1.1593, "grad_norm": 0.8252112865447998, "learning_rate": 0.0002, "epoch": 4.254901960784314, "step": 6510}, {"loss": 1.1369, "grad_norm": 0.7941019535064697, "learning_rate": 0.0002, "epoch": 4.261437908496732, "step": 6520}, {"loss": 1.0296, "grad_norm": 0.7673905491828918, "learning_rate": 0.0002, "epoch": 4.26797385620915, "step": 6530}, {"loss": 1.1387, "grad_norm": 0.8749890327453613, "learning_rate": 0.0002, "epoch": 4.2745098039215685, "step": 6540}, {"loss": 1.0595, "grad_norm": 0.7343207597732544, "learning_rate": 0.0002, "epoch": 4.281045751633987, "step": 6550}, {"loss": 1.1715, "grad_norm": 1.2786651849746704, "learning_rate": 0.0002, "epoch": 4.287581699346405, "step": 6560}, {"loss": 1.0514, "grad_norm": 1.316875696182251, "learning_rate": 0.0002, "epoch": 4.294117647058823, "step": 6570}, {"loss": 1.1125, "grad_norm": 0.8349189162254333, "learning_rate": 0.0002, "epoch": 4.300653594771242, "step": 6580}, {"loss": 1.0732, "grad_norm": 0.7510647177696228, "learning_rate": 0.0002, "epoch": 4.30718954248366, "step": 6590}, {"loss": 1.1387, "grad_norm": 0.932420551776886, "learning_rate": 0.0002, "epoch": 4.313725490196078, "step": 6600}, {"loss": 1.1115, "grad_norm": 0.8510616421699524, "learning_rate": 0.0002, "epoch": 4.3202614379084965, "step": 6610}, {"loss": 1.0957, "grad_norm": 0.7661547064781189, "learning_rate": 0.0002, "epoch": 4.326797385620915, "step": 6620}, {"loss": 1.2064, "grad_norm": 1.0370930433273315, "learning_rate": 0.0002, "epoch": 4.333333333333333, "step": 6630}, {"loss": 1.1064, "grad_norm": 0.9302158951759338, "learning_rate": 0.0002, "epoch": 4.339869281045751, "step": 6640}, {"loss": 0.968, "grad_norm": 0.9203811883926392, "learning_rate": 0.0002, "epoch": 4.34640522875817, "step": 6650}, {"loss": 1.0123, "grad_norm": 0.9986332654953003, "learning_rate": 0.0002, "epoch": 4.352941176470588, "step": 6660}, {"loss": 1.1079, "grad_norm": 0.8001713156700134, "learning_rate": 0.0002, "epoch": 4.359477124183006, "step": 6670}, {"loss": 1.0248, "grad_norm": 0.829714298248291, "learning_rate": 0.0002, "epoch": 4.366013071895424, "step": 6680}, {"loss": 1.0389, "grad_norm": 0.8253079056739807, "learning_rate": 0.0002, "epoch": 4.372549019607844, "step": 6690}, {"loss": 1.1087, "grad_norm": 0.824666440486908, "learning_rate": 0.0002, "epoch": 4.379084967320262, "step": 6700}, {"loss": 1.1968, "grad_norm": 0.8872972130775452, "learning_rate": 0.0002, "epoch": 4.38562091503268, "step": 6710}, {"loss": 1.0474, "grad_norm": 0.8729761838912964, "learning_rate": 0.0002, "epoch": 4.392156862745098, "step": 6720}, {"loss": 1.0961, "grad_norm": 1.1367264986038208, "learning_rate": 0.0002, "epoch": 4.398692810457517, "step": 6730}, {"loss": 1.0184, "grad_norm": 0.9699058532714844, "learning_rate": 0.0002, "epoch": 4.405228758169935, "step": 6740}, {"loss": 1.006, "grad_norm": 0.8266763687133789, "learning_rate": 0.0002, "epoch": 4.411764705882353, "step": 6750}, {"loss": 1.0735, "grad_norm": 1.0249767303466797, "learning_rate": 0.0002, "epoch": 4.4183006535947715, "step": 6760}, {"loss": 1.1726, "grad_norm": 0.73606938123703, "learning_rate": 0.0002, "epoch": 4.42483660130719, "step": 6770}, {"loss": 1.1037, "grad_norm": 1.4050679206848145, "learning_rate": 0.0002, "epoch": 4.431372549019608, "step": 6780}, {"loss": 1.1418, "grad_norm": 1.1114081144332886, "learning_rate": 0.0002, "epoch": 4.437908496732026, "step": 6790}, {"loss": 0.9682, "grad_norm": 0.8031067848205566, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 6800}, {"loss": 1.0753, "grad_norm": 0.8513566851615906, "learning_rate": 0.0002, "epoch": 4.450980392156863, "step": 6810}, {"loss": 1.1852, "grad_norm": 1.332741379737854, "learning_rate": 0.0002, "epoch": 4.457516339869281, "step": 6820}, {"loss": 1.0966, "grad_norm": 1.5032578706741333, "learning_rate": 0.0002, "epoch": 4.4640522875816995, "step": 6830}, {"loss": 1.1124, "grad_norm": 0.7677283883094788, "learning_rate": 0.0002, "epoch": 4.470588235294118, "step": 6840}, {"loss": 1.1501, "grad_norm": 0.989148736000061, "learning_rate": 0.0002, "epoch": 4.477124183006536, "step": 6850}, {"loss": 1.2239, "grad_norm": 1.5316275358200073, "learning_rate": 0.0002, "epoch": 4.483660130718954, "step": 6860}, {"loss": 1.1171, "grad_norm": 0.9427124261856079, "learning_rate": 0.0002, "epoch": 4.490196078431373, "step": 6870}, {"loss": 1.1314, "grad_norm": 1.215287685394287, "learning_rate": 0.0002, "epoch": 4.496732026143791, "step": 6880}, {"loss": 1.0809, "grad_norm": 0.7286760210990906, "learning_rate": 0.0002, "epoch": 4.503267973856209, "step": 6890}, {"loss": 1.0179, "grad_norm": 0.874829888343811, "learning_rate": 0.0002, "epoch": 4.509803921568627, "step": 6900}, {"loss": 1.0233, "grad_norm": 0.8058359622955322, "learning_rate": 0.0002, "epoch": 4.516339869281046, "step": 6910}, {"loss": 1.0463, "grad_norm": 1.248195767402649, "learning_rate": 0.0002, "epoch": 4.522875816993464, "step": 6920}, {"loss": 1.0347, "grad_norm": 0.8033645749092102, "learning_rate": 0.0002, "epoch": 4.529411764705882, "step": 6930}, {"loss": 1.1068, "grad_norm": 1.7361950874328613, "learning_rate": 0.0002, "epoch": 4.5359477124183005, "step": 6940}, {"loss": 0.9856, "grad_norm": 0.8058095574378967, "learning_rate": 0.0002, "epoch": 4.542483660130719, "step": 6950}, {"loss": 1.0057, "grad_norm": 1.254089593887329, "learning_rate": 0.0002, "epoch": 4.549019607843137, "step": 6960}, {"loss": 1.1723, "grad_norm": 0.9180455803871155, "learning_rate": 0.0002, "epoch": 4.555555555555555, "step": 6970}, {"loss": 1.0559, "grad_norm": 0.6677682399749756, "learning_rate": 0.0002, "epoch": 4.562091503267974, "step": 6980}, {"loss": 1.0453, "grad_norm": 0.8127354383468628, "learning_rate": 0.0002, "epoch": 4.568627450980392, "step": 6990}, {"loss": 1.0828, "grad_norm": 1.0263001918792725, "learning_rate": 0.0002, "epoch": 4.57516339869281, "step": 7000}, {"loss": 1.0703, "grad_norm": 0.9641909003257751, "learning_rate": 0.0002, "epoch": 4.5816993464052285, "step": 7010}, {"loss": 1.179, "grad_norm": 0.9440861344337463, "learning_rate": 0.0002, "epoch": 4.588235294117647, "step": 7020}, {"loss": 1.0931, "grad_norm": 0.9539011716842651, "learning_rate": 0.0002, "epoch": 4.594771241830065, "step": 7030}, {"loss": 1.0963, "grad_norm": 1.0449910163879395, "learning_rate": 0.0002, "epoch": 4.601307189542483, "step": 7040}, {"loss": 0.9944, "grad_norm": 0.8766893744468689, "learning_rate": 0.0002, "epoch": 4.607843137254902, "step": 7050}, {"loss": 1.0169, "grad_norm": 0.6983462572097778, "learning_rate": 0.0002, "epoch": 4.61437908496732, "step": 7060}, {"loss": 1.1778, "grad_norm": 0.9505505561828613, "learning_rate": 0.0002, "epoch": 4.620915032679738, "step": 7070}, {"loss": 1.121, "grad_norm": 1.2506657838821411, "learning_rate": 0.0002, "epoch": 4.627450980392156, "step": 7080}, {"loss": 1.1329, "grad_norm": 0.9602801203727722, "learning_rate": 0.0002, "epoch": 4.633986928104575, "step": 7090}, {"loss": 1.1499, "grad_norm": 0.7398977875709534, "learning_rate": 0.0002, "epoch": 4.640522875816993, "step": 7100}, {"loss": 1.0769, "grad_norm": 1.3862425088882446, "learning_rate": 0.0002, "epoch": 4.647058823529412, "step": 7110}, {"loss": 1.0571, "grad_norm": 1.1451990604400635, "learning_rate": 0.0002, "epoch": 4.65359477124183, "step": 7120}, {"loss": 1.1271, "grad_norm": 0.9010422229766846, "learning_rate": 0.0002, "epoch": 4.660130718954249, "step": 7130}, {"loss": 1.0165, "grad_norm": 0.7102518081665039, "learning_rate": 0.0002, "epoch": 4.666666666666667, "step": 7140}, {"loss": 1.0819, "grad_norm": 0.7963796257972717, "learning_rate": 0.0002, "epoch": 4.673202614379085, "step": 7150}, {"loss": 1.1114, "grad_norm": 0.7726007699966431, "learning_rate": 0.0002, "epoch": 4.6797385620915035, "step": 7160}, {"loss": 1.2088, "grad_norm": 0.8097564578056335, "learning_rate": 0.0002, "epoch": 4.686274509803922, "step": 7170}, {"loss": 1.1386, "grad_norm": 0.9070925116539001, "learning_rate": 0.0002, "epoch": 4.69281045751634, "step": 7180}, {"loss": 1.0315, "grad_norm": 0.7543528079986572, "learning_rate": 0.0002, "epoch": 4.699346405228758, "step": 7190}, {"loss": 1.0984, "grad_norm": 0.9900904893875122, "learning_rate": 0.0002, "epoch": 4.705882352941177, "step": 7200}, {"loss": 1.1552, "grad_norm": 0.8033412098884583, "learning_rate": 0.0002, "epoch": 4.712418300653595, "step": 7210}, {"loss": 1.1773, "grad_norm": 0.8440839052200317, "learning_rate": 0.0002, "epoch": 4.718954248366013, "step": 7220}, {"loss": 1.1258, "grad_norm": 0.9325555562973022, "learning_rate": 0.0002, "epoch": 4.7254901960784315, "step": 7230}, {"loss": 1.1384, "grad_norm": 0.7881146669387817, "learning_rate": 0.0002, "epoch": 4.73202614379085, "step": 7240}, {"loss": 1.1219, "grad_norm": 0.884453296661377, "learning_rate": 0.0002, "epoch": 4.738562091503268, "step": 7250}, {"loss": 1.1036, "grad_norm": 0.9274539351463318, "learning_rate": 0.0002, "epoch": 4.745098039215686, "step": 7260}, {"loss": 1.0906, "grad_norm": 1.2367479801177979, "learning_rate": 0.0002, "epoch": 4.751633986928105, "step": 7270}, {"loss": 1.0741, "grad_norm": 0.9499821066856384, "learning_rate": 0.0002, "epoch": 4.758169934640523, "step": 7280}, {"loss": 1.1625, "grad_norm": 2.1918580532073975, "learning_rate": 0.0002, "epoch": 4.764705882352941, "step": 7290}, {"loss": 0.954, "grad_norm": 0.8221880793571472, "learning_rate": 0.0002, "epoch": 4.771241830065359, "step": 7300}, {"loss": 1.1358, "grad_norm": 0.871972918510437, "learning_rate": 0.0002, "epoch": 4.777777777777778, "step": 7310}, {"loss": 1.0599, "grad_norm": 0.8034510612487793, "learning_rate": 0.0002, "epoch": 4.784313725490196, "step": 7320}, {"loss": 1.1059, "grad_norm": 0.8959605693817139, "learning_rate": 0.0002, "epoch": 4.790849673202614, "step": 7330}, {"loss": 1.0176, "grad_norm": 1.2326215505599976, "learning_rate": 0.0002, "epoch": 4.7973856209150325, "step": 7340}, {"loss": 1.1095, "grad_norm": 0.9725791811943054, "learning_rate": 0.0002, "epoch": 4.803921568627451, "step": 7350}, {"loss": 1.1229, "grad_norm": 0.7240816354751587, "learning_rate": 0.0002, "epoch": 4.810457516339869, "step": 7360}, {"loss": 1.0669, "grad_norm": 0.8265769481658936, "learning_rate": 0.0002, "epoch": 4.816993464052287, "step": 7370}, {"loss": 1.042, "grad_norm": 0.8888696432113647, "learning_rate": 0.0002, "epoch": 4.823529411764706, "step": 7380}, {"loss": 1.0981, "grad_norm": 0.7776556015014648, "learning_rate": 0.0002, "epoch": 4.830065359477124, "step": 7390}, {"loss": 1.0819, "grad_norm": 0.8772371411323547, "learning_rate": 0.0002, "epoch": 4.836601307189542, "step": 7400}, {"loss": 1.0819, "grad_norm": 0.9786531925201416, "learning_rate": 0.0002, "epoch": 4.8431372549019605, "step": 7410}, {"loss": 1.1358, "grad_norm": 0.9059745073318481, "learning_rate": 0.0002, "epoch": 4.849673202614379, "step": 7420}, {"loss": 1.0324, "grad_norm": 0.7422552108764648, "learning_rate": 0.0002, "epoch": 4.856209150326797, "step": 7430}, {"loss": 1.0423, "grad_norm": 1.3040380477905273, "learning_rate": 0.0002, "epoch": 4.862745098039216, "step": 7440}, {"loss": 1.1161, "grad_norm": 1.3278473615646362, "learning_rate": 0.0002, "epoch": 4.8692810457516345, "step": 7450}, {"loss": 1.0713, "grad_norm": 1.2705849409103394, "learning_rate": 0.0002, "epoch": 4.875816993464053, "step": 7460}, {"loss": 1.0034, "grad_norm": 0.8837892413139343, "learning_rate": 0.0002, "epoch": 4.882352941176471, "step": 7470}, {"loss": 1.1716, "grad_norm": 0.8670691251754761, "learning_rate": 0.0002, "epoch": 4.888888888888889, "step": 7480}, {"loss": 1.1723, "grad_norm": 0.9662758111953735, "learning_rate": 0.0002, "epoch": 4.895424836601308, "step": 7490}, {"loss": 1.1056, "grad_norm": 0.8188302516937256, "learning_rate": 0.0002, "epoch": 4.901960784313726, "step": 7500}, {"loss": 1.0419, "grad_norm": 0.769442617893219, "learning_rate": 0.0002, "epoch": 4.908496732026144, "step": 7510}, {"loss": 1.1671, "grad_norm": 1.1465084552764893, "learning_rate": 0.0002, "epoch": 4.915032679738562, "step": 7520}, {"loss": 1.0768, "grad_norm": 1.253214955329895, "learning_rate": 0.0002, "epoch": 4.921568627450981, "step": 7530}, {"loss": 1.011, "grad_norm": 0.7922375202178955, "learning_rate": 0.0002, "epoch": 4.928104575163399, "step": 7540}, {"loss": 1.1256, "grad_norm": 0.8306851387023926, "learning_rate": 0.0002, "epoch": 4.934640522875817, "step": 7550}, {"loss": 1.206, "grad_norm": 0.8486151099205017, "learning_rate": 0.0002, "epoch": 4.9411764705882355, "step": 7560}, {"loss": 1.0161, "grad_norm": 1.2601467370986938, "learning_rate": 0.0002, "epoch": 4.947712418300654, "step": 7570}, {"loss": 1.1078, "grad_norm": 0.7980747818946838, "learning_rate": 0.0002, "epoch": 4.954248366013072, "step": 7580}, {"loss": 1.0607, "grad_norm": 0.8653254508972168, "learning_rate": 0.0002, "epoch": 4.96078431372549, "step": 7590}, {"loss": 1.0292, "grad_norm": 0.9680571556091309, "learning_rate": 0.0002, "epoch": 4.967320261437909, "step": 7600}, {"loss": 1.1795, "grad_norm": 0.9554466605186462, "learning_rate": 0.0002, "epoch": 4.973856209150327, "step": 7610}, {"loss": 1.0935, "grad_norm": 1.3693897724151611, "learning_rate": 0.0002, "epoch": 4.980392156862745, "step": 7620}, {"loss": 1.0838, "grad_norm": 0.7809282541275024, "learning_rate": 0.0002, "epoch": 4.9869281045751634, "step": 7630}, {"loss": 1.0844, "grad_norm": 0.7528006434440613, "learning_rate": 0.0002, "epoch": 4.993464052287582, "step": 7640}, {"loss": 0.9951, "grad_norm": 1.7491309642791748, "learning_rate": 0.0002, "epoch": 5.0, "step": 7650}, {"eval_loss": 1.4197258949279785, "eval_runtime": 33.6327, "eval_samples_per_second": 12.964, "eval_steps_per_second": 1.635, "epoch": 5.0, "step": 7650}, {"loss": 0.9744, "grad_norm": 0.8840063214302063, "learning_rate": 0.0002, "epoch": 5.006535947712418, "step": 7660}, {"loss": 1.0274, "grad_norm": 1.0118401050567627, "learning_rate": 0.0002, "epoch": 5.0130718954248366, "step": 7670}, {"loss": 1.1667, "grad_norm": 1.0040518045425415, "learning_rate": 0.0002, "epoch": 5.019607843137255, "step": 7680}, {"loss": 0.9426, "grad_norm": 0.7541199922561646, "learning_rate": 0.0002, "epoch": 5.026143790849673, "step": 7690}, {"loss": 1.0797, "grad_norm": 0.9106482863426208, "learning_rate": 0.0002, "epoch": 5.032679738562091, "step": 7700}, {"loss": 1.0096, "grad_norm": 1.3691469430923462, "learning_rate": 0.0002, "epoch": 5.03921568627451, "step": 7710}, {"loss": 0.9889, "grad_norm": 0.9449689388275146, "learning_rate": 0.0002, "epoch": 5.045751633986928, "step": 7720}, {"loss": 0.9087, "grad_norm": 1.1678508520126343, "learning_rate": 0.0002, "epoch": 5.052287581699346, "step": 7730}, {"loss": 1.0556, "grad_norm": 1.1296145915985107, "learning_rate": 0.0002, "epoch": 5.0588235294117645, "step": 7740}, {"loss": 0.9339, "grad_norm": 0.7863904237747192, "learning_rate": 0.0002, "epoch": 5.065359477124183, "step": 7750}, {"loss": 1.0135, "grad_norm": 0.8691433072090149, "learning_rate": 0.0002, "epoch": 5.071895424836601, "step": 7760}, {"loss": 0.9776, "grad_norm": 1.0722088813781738, "learning_rate": 0.0002, "epoch": 5.078431372549019, "step": 7770}, {"loss": 1.0595, "grad_norm": 0.9625038504600525, "learning_rate": 0.0002, "epoch": 5.084967320261438, "step": 7780}, {"loss": 1.0241, "grad_norm": 1.2618783712387085, "learning_rate": 0.0002, "epoch": 5.091503267973856, "step": 7790}, {"loss": 0.9396, "grad_norm": 0.9970650672912598, "learning_rate": 0.0002, "epoch": 5.098039215686274, "step": 7800}, {"loss": 0.9186, "grad_norm": 1.3946677446365356, "learning_rate": 0.0002, "epoch": 5.104575163398692, "step": 7810}, {"loss": 0.9957, "grad_norm": 1.0260052680969238, "learning_rate": 0.0002, "epoch": 5.111111111111111, "step": 7820}, {"loss": 0.9865, "grad_norm": 1.105521559715271, "learning_rate": 0.0002, "epoch": 5.117647058823529, "step": 7830}, {"loss": 0.9788, "grad_norm": 1.003641128540039, "learning_rate": 0.0002, "epoch": 5.124183006535947, "step": 7840}, {"loss": 0.9688, "grad_norm": 1.0315021276474, "learning_rate": 0.0002, "epoch": 5.130718954248366, "step": 7850}, {"loss": 1.0001, "grad_norm": 0.9469530582427979, "learning_rate": 0.0002, "epoch": 5.137254901960785, "step": 7860}, {"loss": 0.9659, "grad_norm": 1.3244667053222656, "learning_rate": 0.0002, "epoch": 5.143790849673203, "step": 7870}, {"loss": 0.9657, "grad_norm": 1.1732033491134644, "learning_rate": 0.0002, "epoch": 5.150326797385621, "step": 7880}, {"loss": 0.9978, "grad_norm": 1.3129149675369263, "learning_rate": 0.0002, "epoch": 5.1568627450980395, "step": 7890}, {"loss": 0.9894, "grad_norm": 0.8589454293251038, "learning_rate": 0.0002, "epoch": 5.163398692810458, "step": 7900}, {"loss": 1.0161, "grad_norm": 0.8954233527183533, "learning_rate": 0.0002, "epoch": 5.169934640522876, "step": 7910}, {"loss": 0.8741, "grad_norm": 0.7426522970199585, "learning_rate": 0.0002, "epoch": 5.176470588235294, "step": 7920}, {"loss": 1.0106, "grad_norm": 1.1990121603012085, "learning_rate": 0.0002, "epoch": 5.183006535947713, "step": 7930}, {"loss": 0.9453, "grad_norm": 0.8867580890655518, "learning_rate": 0.0002, "epoch": 5.189542483660131, "step": 7940}, {"loss": 0.9727, "grad_norm": 1.016276478767395, "learning_rate": 0.0002, "epoch": 5.196078431372549, "step": 7950}, {"loss": 0.9908, "grad_norm": 1.0210685729980469, "learning_rate": 0.0002, "epoch": 5.2026143790849675, "step": 7960}, {"loss": 1.0522, "grad_norm": 1.0093122720718384, "learning_rate": 0.0002, "epoch": 5.209150326797386, "step": 7970}, {"loss": 1.0055, "grad_norm": 0.9746801853179932, "learning_rate": 0.0002, "epoch": 5.215686274509804, "step": 7980}, {"loss": 1.0611, "grad_norm": 0.9113537073135376, "learning_rate": 0.0002, "epoch": 5.222222222222222, "step": 7990}, {"loss": 0.9167, "grad_norm": 1.2782206535339355, "learning_rate": 0.0002, "epoch": 5.228758169934641, "step": 8000}, {"loss": 1.0212, "grad_norm": 1.3223118782043457, "learning_rate": 0.0002, "epoch": 5.235294117647059, "step": 8010}, {"loss": 0.9244, "grad_norm": 0.7898629307746887, "learning_rate": 0.0002, "epoch": 5.241830065359477, "step": 8020}, {"loss": 1.0574, "grad_norm": 0.9822350740432739, "learning_rate": 0.0002, "epoch": 5.248366013071895, "step": 8030}, {"loss": 1.0102, "grad_norm": 1.5114340782165527, "learning_rate": 0.0002, "epoch": 5.254901960784314, "step": 8040}, {"loss": 0.9816, "grad_norm": 0.859006941318512, "learning_rate": 0.0002, "epoch": 5.261437908496732, "step": 8050}, {"loss": 0.9445, "grad_norm": 1.0495043992996216, "learning_rate": 0.0002, "epoch": 5.26797385620915, "step": 8060}, {"loss": 0.9724, "grad_norm": 1.329483151435852, "learning_rate": 0.0002, "epoch": 5.2745098039215685, "step": 8070}, {"loss": 0.9296, "grad_norm": 1.1333061456680298, "learning_rate": 0.0002, "epoch": 5.281045751633987, "step": 8080}, {"loss": 0.9577, "grad_norm": 0.8153108358383179, "learning_rate": 0.0002, "epoch": 5.287581699346405, "step": 8090}, {"loss": 0.9002, "grad_norm": 0.9395004510879517, "learning_rate": 0.0002, "epoch": 5.294117647058823, "step": 8100}, {"loss": 1.0371, "grad_norm": 0.8907593488693237, "learning_rate": 0.0002, "epoch": 5.300653594771242, "step": 8110}, {"loss": 0.9301, "grad_norm": 0.9808667898178101, "learning_rate": 0.0002, "epoch": 5.30718954248366, "step": 8120}, {"loss": 1.0136, "grad_norm": 0.984779417514801, "learning_rate": 0.0002, "epoch": 5.313725490196078, "step": 8130}, {"loss": 0.9621, "grad_norm": 0.9787270426750183, "learning_rate": 0.0002, "epoch": 5.3202614379084965, "step": 8140}, {"loss": 0.9336, "grad_norm": 0.9857710599899292, "learning_rate": 0.0002, "epoch": 5.326797385620915, "step": 8150}, {"loss": 0.9884, "grad_norm": 0.9774303436279297, "learning_rate": 0.0002, "epoch": 5.333333333333333, "step": 8160}, {"loss": 1.0561, "grad_norm": 0.677925169467926, "learning_rate": 0.0002, "epoch": 5.339869281045751, "step": 8170}, {"loss": 1.1345, "grad_norm": 0.9576456546783447, "learning_rate": 0.0002, "epoch": 5.34640522875817, "step": 8180}, {"loss": 0.9554, "grad_norm": 1.8970937728881836, "learning_rate": 0.0002, "epoch": 5.352941176470588, "step": 8190}, {"loss": 1.0474, "grad_norm": 0.9458389282226562, "learning_rate": 0.0002, "epoch": 5.359477124183006, "step": 8200}, {"loss": 1.0365, "grad_norm": 1.761794924736023, "learning_rate": 0.0002, "epoch": 5.366013071895424, "step": 8210}, {"loss": 0.9426, "grad_norm": 1.0693724155426025, "learning_rate": 0.0002, "epoch": 5.372549019607844, "step": 8220}, {"loss": 1.0299, "grad_norm": 0.9025877714157104, "learning_rate": 0.0002, "epoch": 5.379084967320262, "step": 8230}, {"loss": 0.9652, "grad_norm": 1.258857250213623, "learning_rate": 0.0002, "epoch": 5.38562091503268, "step": 8240}, {"loss": 0.9735, "grad_norm": 1.084849238395691, "learning_rate": 0.0002, "epoch": 5.392156862745098, "step": 8250}, {"loss": 0.9999, "grad_norm": 0.9530340433120728, "learning_rate": 0.0002, "epoch": 5.398692810457517, "step": 8260}, {"loss": 1.0268, "grad_norm": 0.830240786075592, "learning_rate": 0.0002, "epoch": 5.405228758169935, "step": 8270}, {"loss": 1.0332, "grad_norm": 1.5807015895843506, "learning_rate": 0.0002, "epoch": 5.411764705882353, "step": 8280}, {"loss": 0.9146, "grad_norm": 0.9486905336380005, "learning_rate": 0.0002, "epoch": 5.4183006535947715, "step": 8290}, {"loss": 1.0336, "grad_norm": 1.0415093898773193, "learning_rate": 0.0002, "epoch": 5.42483660130719, "step": 8300}, {"loss": 0.8933, "grad_norm": 1.0501102209091187, "learning_rate": 0.0002, "epoch": 5.431372549019608, "step": 8310}, {"loss": 0.9983, "grad_norm": 0.9751836061477661, "learning_rate": 0.0002, "epoch": 5.437908496732026, "step": 8320}, {"loss": 1.0755, "grad_norm": 1.5529173612594604, "learning_rate": 0.0002, "epoch": 5.444444444444445, "step": 8330}, {"loss": 0.9814, "grad_norm": 0.8314350247383118, "learning_rate": 0.0002, "epoch": 5.450980392156863, "step": 8340}, {"loss": 1.0596, "grad_norm": 1.2555103302001953, "learning_rate": 0.0002, "epoch": 5.457516339869281, "step": 8350}, {"loss": 1.0127, "grad_norm": 0.9408367872238159, "learning_rate": 0.0002, "epoch": 5.4640522875816995, "step": 8360}, {"loss": 0.9241, "grad_norm": 0.9483312964439392, "learning_rate": 0.0002, "epoch": 5.470588235294118, "step": 8370}, {"loss": 0.9678, "grad_norm": 0.957905650138855, "learning_rate": 0.0002, "epoch": 5.477124183006536, "step": 8380}, {"loss": 1.0985, "grad_norm": 1.4000147581100464, "learning_rate": 0.0002, "epoch": 5.483660130718954, "step": 8390}, {"loss": 0.9966, "grad_norm": 1.7032461166381836, "learning_rate": 0.0002, "epoch": 5.490196078431373, "step": 8400}, {"loss": 0.9539, "grad_norm": 0.8978716731071472, "learning_rate": 0.0002, "epoch": 5.496732026143791, "step": 8410}, {"loss": 0.9544, "grad_norm": 0.8659300804138184, "learning_rate": 0.0002, "epoch": 5.503267973856209, "step": 8420}, {"loss": 1.0526, "grad_norm": 1.3629727363586426, "learning_rate": 0.0002, "epoch": 5.509803921568627, "step": 8430}, {"loss": 0.9696, "grad_norm": 1.2741984128952026, "learning_rate": 0.0002, "epoch": 5.516339869281046, "step": 8440}, {"loss": 1.0191, "grad_norm": 1.3867180347442627, "learning_rate": 0.0002, "epoch": 5.522875816993464, "step": 8450}, {"loss": 1.0835, "grad_norm": 1.0662001371383667, "learning_rate": 0.0002, "epoch": 5.529411764705882, "step": 8460}, {"loss": 0.9779, "grad_norm": 1.7005380392074585, "learning_rate": 0.0002, "epoch": 5.5359477124183005, "step": 8470}, {"loss": 1.0221, "grad_norm": 1.3730385303497314, "learning_rate": 0.0002, "epoch": 5.542483660130719, "step": 8480}, {"loss": 0.9586, "grad_norm": 1.7737441062927246, "learning_rate": 0.0002, "epoch": 5.549019607843137, "step": 8490}, {"loss": 0.9729, "grad_norm": 0.907487690448761, "learning_rate": 0.0002, "epoch": 5.555555555555555, "step": 8500}, {"loss": 0.9891, "grad_norm": 0.8882441520690918, "learning_rate": 0.0002, "epoch": 5.562091503267974, "step": 8510}, {"loss": 0.973, "grad_norm": 0.8655388951301575, "learning_rate": 0.0002, "epoch": 5.568627450980392, "step": 8520}, {"loss": 0.9523, "grad_norm": 1.379992961883545, "learning_rate": 0.0002, "epoch": 5.57516339869281, "step": 8530}, {"loss": 1.0174, "grad_norm": 1.0021201372146606, "learning_rate": 0.0002, "epoch": 5.5816993464052285, "step": 8540}, {"loss": 1.0113, "grad_norm": 1.2636926174163818, "learning_rate": 0.0002, "epoch": 5.588235294117647, "step": 8550}, {"loss": 1.0243, "grad_norm": 1.279025912284851, "learning_rate": 0.0002, "epoch": 5.594771241830065, "step": 8560}, {"loss": 0.9917, "grad_norm": 0.8885834217071533, "learning_rate": 0.0002, "epoch": 5.601307189542483, "step": 8570}, {"loss": 0.9849, "grad_norm": 1.1975032091140747, "learning_rate": 0.0002, "epoch": 5.607843137254902, "step": 8580}, {"loss": 1.0363, "grad_norm": 1.005470871925354, "learning_rate": 0.0002, "epoch": 5.61437908496732, "step": 8590}, {"loss": 0.9947, "grad_norm": 1.104286551475525, "learning_rate": 0.0002, "epoch": 5.620915032679738, "step": 8600}, {"loss": 1.0585, "grad_norm": 1.435445785522461, "learning_rate": 0.0002, "epoch": 5.627450980392156, "step": 8610}, {"loss": 0.9156, "grad_norm": 1.0270172357559204, "learning_rate": 0.0002, "epoch": 5.633986928104575, "step": 8620}, {"loss": 1.0522, "grad_norm": 1.0929527282714844, "learning_rate": 0.0002, "epoch": 5.640522875816993, "step": 8630}, {"loss": 0.9694, "grad_norm": 1.1061221361160278, "learning_rate": 0.0002, "epoch": 5.647058823529412, "step": 8640}, {"loss": 1.0826, "grad_norm": 0.9563149213790894, "learning_rate": 0.0002, "epoch": 5.65359477124183, "step": 8650}, {"loss": 1.0042, "grad_norm": 1.0434954166412354, "learning_rate": 0.0002, "epoch": 5.660130718954249, "step": 8660}, {"loss": 0.9463, "grad_norm": 1.3695117235183716, "learning_rate": 0.0002, "epoch": 5.666666666666667, "step": 8670}, {"loss": 0.9441, "grad_norm": 1.0540564060211182, "learning_rate": 0.0002, "epoch": 5.673202614379085, "step": 8680}, {"loss": 0.9755, "grad_norm": 1.5942492485046387, "learning_rate": 0.0002, "epoch": 5.6797385620915035, "step": 8690}, {"loss": 1.0071, "grad_norm": 0.9485495090484619, "learning_rate": 0.0002, "epoch": 5.686274509803922, "step": 8700}, {"loss": 0.9998, "grad_norm": 1.1483162641525269, "learning_rate": 0.0002, "epoch": 5.69281045751634, "step": 8710}, {"loss": 0.9578, "grad_norm": 0.9075471758842468, "learning_rate": 0.0002, "epoch": 5.699346405228758, "step": 8720}, {"loss": 0.9488, "grad_norm": 1.7908551692962646, "learning_rate": 0.0002, "epoch": 5.705882352941177, "step": 8730}, {"loss": 1.0163, "grad_norm": 0.8867162466049194, "learning_rate": 0.0002, "epoch": 5.712418300653595, "step": 8740}, {"loss": 1.0041, "grad_norm": 1.7165148258209229, "learning_rate": 0.0002, "epoch": 5.718954248366013, "step": 8750}, {"loss": 1.1061, "grad_norm": 0.9529356956481934, "learning_rate": 0.0002, "epoch": 5.7254901960784315, "step": 8760}, {"loss": 1.1119, "grad_norm": 1.01852548122406, "learning_rate": 0.0002, "epoch": 5.73202614379085, "step": 8770}, {"loss": 1.0471, "grad_norm": 0.9538423418998718, "learning_rate": 0.0002, "epoch": 5.738562091503268, "step": 8780}, {"loss": 1.0913, "grad_norm": 0.9007737636566162, "learning_rate": 0.0002, "epoch": 5.745098039215686, "step": 8790}, {"loss": 0.9766, "grad_norm": 0.9107874035835266, "learning_rate": 0.0002, "epoch": 5.751633986928105, "step": 8800}, {"loss": 0.9212, "grad_norm": 0.7379238605499268, "learning_rate": 0.0002, "epoch": 5.758169934640523, "step": 8810}, {"loss": 1.0966, "grad_norm": 1.072645902633667, "learning_rate": 0.0002, "epoch": 5.764705882352941, "step": 8820}, {"loss": 1.0845, "grad_norm": 1.002008080482483, "learning_rate": 0.0002, "epoch": 5.771241830065359, "step": 8830}, {"loss": 0.9978, "grad_norm": 1.0435924530029297, "learning_rate": 0.0002, "epoch": 5.777777777777778, "step": 8840}, {"loss": 0.9458, "grad_norm": 0.9874551296234131, "learning_rate": 0.0002, "epoch": 5.784313725490196, "step": 8850}, {"loss": 1.1241, "grad_norm": 1.1729662418365479, "learning_rate": 0.0002, "epoch": 5.790849673202614, "step": 8860}, {"loss": 1.0451, "grad_norm": 1.3300775289535522, "learning_rate": 0.0002, "epoch": 5.7973856209150325, "step": 8870}, {"loss": 1.0989, "grad_norm": 1.612707257270813, "learning_rate": 0.0002, "epoch": 5.803921568627451, "step": 8880}, {"loss": 0.9119, "grad_norm": 0.9047797322273254, "learning_rate": 0.0002, "epoch": 5.810457516339869, "step": 8890}, {"loss": 0.989, "grad_norm": 1.0958741903305054, "learning_rate": 0.0002, "epoch": 5.816993464052287, "step": 8900}, {"loss": 1.1922, "grad_norm": 1.0099612474441528, "learning_rate": 0.0002, "epoch": 5.823529411764706, "step": 8910}, {"loss": 1.0623, "grad_norm": 0.8442328572273254, "learning_rate": 0.0002, "epoch": 5.830065359477124, "step": 8920}, {"loss": 0.9134, "grad_norm": 1.1388301849365234, "learning_rate": 0.0002, "epoch": 5.836601307189542, "step": 8930}, {"loss": 1.0019, "grad_norm": 0.8296026587486267, "learning_rate": 0.0002, "epoch": 5.8431372549019605, "step": 8940}, {"loss": 1.0363, "grad_norm": 1.0843533277511597, "learning_rate": 0.0002, "epoch": 5.849673202614379, "step": 8950}, {"loss": 1.0009, "grad_norm": 0.8496834635734558, "learning_rate": 0.0002, "epoch": 5.856209150326797, "step": 8960}, {"loss": 0.9927, "grad_norm": 1.6894690990447998, "learning_rate": 0.0002, "epoch": 5.862745098039216, "step": 8970}, {"loss": 1.0939, "grad_norm": 1.0012282133102417, "learning_rate": 0.0002, "epoch": 5.8692810457516345, "step": 8980}, {"loss": 0.9722, "grad_norm": 0.8521103262901306, "learning_rate": 0.0002, "epoch": 5.875816993464053, "step": 8990}, {"loss": 1.0885, "grad_norm": 1.246841311454773, "learning_rate": 0.0002, "epoch": 5.882352941176471, "step": 9000}, {"loss": 0.9702, "grad_norm": 0.9941892027854919, "learning_rate": 0.0002, "epoch": 5.888888888888889, "step": 9010}, {"loss": 0.8754, "grad_norm": 1.067413568496704, "learning_rate": 0.0002, "epoch": 5.895424836601308, "step": 9020}, {"loss": 1.0153, "grad_norm": 1.0045088529586792, "learning_rate": 0.0002, "epoch": 5.901960784313726, "step": 9030}, {"loss": 1.0134, "grad_norm": 1.383063554763794, "learning_rate": 0.0002, "epoch": 5.908496732026144, "step": 9040}, {"loss": 1.0845, "grad_norm": 0.8754428625106812, "learning_rate": 0.0002, "epoch": 5.915032679738562, "step": 9050}, {"loss": 0.9571, "grad_norm": 0.8577388525009155, "learning_rate": 0.0002, "epoch": 5.921568627450981, "step": 9060}, {"loss": 1.0532, "grad_norm": 0.8718975186347961, "learning_rate": 0.0002, "epoch": 5.928104575163399, "step": 9070}, {"loss": 1.0667, "grad_norm": 1.1762131452560425, "learning_rate": 0.0002, "epoch": 5.934640522875817, "step": 9080}, {"loss": 1.1114, "grad_norm": 1.1025866270065308, "learning_rate": 0.0002, "epoch": 5.9411764705882355, "step": 9090}, {"loss": 0.9155, "grad_norm": 1.0439870357513428, "learning_rate": 0.0002, "epoch": 5.947712418300654, "step": 9100}, {"loss": 1.0055, "grad_norm": 1.2411525249481201, "learning_rate": 0.0002, "epoch": 5.954248366013072, "step": 9110}, {"loss": 0.9747, "grad_norm": 1.0317714214324951, "learning_rate": 0.0002, "epoch": 5.96078431372549, "step": 9120}, {"loss": 1.0352, "grad_norm": 0.9880492091178894, "learning_rate": 0.0002, "epoch": 5.967320261437909, "step": 9130}, {"loss": 1.0459, "grad_norm": 0.9039815664291382, "learning_rate": 0.0002, "epoch": 5.973856209150327, "step": 9140}, {"loss": 1.0413, "grad_norm": 0.9049116373062134, "learning_rate": 0.0002, "epoch": 5.980392156862745, "step": 9150}, {"loss": 0.9792, "grad_norm": 0.996749222278595, "learning_rate": 0.0002, "epoch": 5.9869281045751634, "step": 9160}, {"loss": 0.8857, "grad_norm": 0.8716062307357788, "learning_rate": 0.0002, "epoch": 5.993464052287582, "step": 9170}, {"loss": 1.019, "grad_norm": 1.3081294298171997, "learning_rate": 0.0002, "epoch": 6.0, "step": 9180}]} +{"epoch": 7.0, "step": 10710, "epoch_duration": 1677.8897953033447, "total_accumulated_duration": 11969.50663638115, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.1748046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7451, "grad_norm": 1.5105072259902954, "learning_rate": 0.0002, "epoch": 0.006535947712418301, "step": 10}, {"loss": 3.3158, "grad_norm": 2.1156165599823, "learning_rate": 0.0002, "epoch": 0.013071895424836602, "step": 20}, {"loss": 2.643, "grad_norm": 1.0578808784484863, "learning_rate": 0.0002, "epoch": 0.0196078431372549, "step": 30}, {"loss": 2.3948, "grad_norm": 2.725064516067505, "learning_rate": 0.0002, "epoch": 0.026143790849673203, "step": 40}, {"loss": 2.3134, "grad_norm": 2.9575750827789307, "learning_rate": 0.0002, "epoch": 0.032679738562091505, "step": 50}, {"loss": 2.2778, "grad_norm": 1.2158117294311523, "learning_rate": 0.0002, "epoch": 0.0392156862745098, "step": 60}, {"loss": 1.9742, "grad_norm": 1.0850954055786133, "learning_rate": 0.0002, "epoch": 0.0457516339869281, "step": 70}, {"loss": 1.8872, "grad_norm": 1.299196720123291, "learning_rate": 0.0002, "epoch": 0.05228758169934641, "step": 80}, {"loss": 1.947, "grad_norm": 0.8310191035270691, "learning_rate": 0.0002, "epoch": 0.058823529411764705, "step": 90}, {"loss": 1.9098, "grad_norm": 0.9854435920715332, "learning_rate": 0.0002, "epoch": 0.06535947712418301, "step": 100}, {"loss": 1.7508, "grad_norm": 0.7951157689094543, "learning_rate": 0.0002, "epoch": 0.0718954248366013, "step": 110}, {"loss": 1.9035, "grad_norm": 0.7593062520027161, "learning_rate": 0.0002, "epoch": 0.0784313725490196, "step": 120}, {"loss": 1.8517, "grad_norm": 0.6783032417297363, "learning_rate": 0.0002, "epoch": 0.08496732026143791, "step": 130}, {"loss": 1.6805, "grad_norm": 0.8350756764411926, "learning_rate": 0.0002, "epoch": 0.0915032679738562, "step": 140}, {"loss": 1.6123, "grad_norm": 1.0203173160552979, "learning_rate": 0.0002, "epoch": 0.09803921568627451, "step": 150}, {"loss": 1.7248, "grad_norm": 0.8820539712905884, "learning_rate": 0.0002, "epoch": 0.10457516339869281, "step": 160}, {"loss": 1.6762, "grad_norm": 0.7286128997802734, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 170}, {"loss": 1.8841, "grad_norm": 0.7874041795730591, "learning_rate": 0.0002, "epoch": 0.11764705882352941, "step": 180}, {"loss": 1.5656, "grad_norm": 0.6630475521087646, "learning_rate": 0.0002, "epoch": 0.12418300653594772, "step": 190}, {"loss": 1.6149, "grad_norm": 0.686413586139679, "learning_rate": 0.0002, "epoch": 0.13071895424836602, "step": 200}, {"loss": 1.6227, "grad_norm": 0.7793629765510559, "learning_rate": 0.0002, "epoch": 0.13725490196078433, "step": 210}, {"loss": 1.7223, "grad_norm": 0.6893141865730286, "learning_rate": 0.0002, "epoch": 0.1437908496732026, "step": 220}, {"loss": 1.6808, "grad_norm": 0.5804724097251892, "learning_rate": 0.0002, "epoch": 0.1503267973856209, "step": 230}, {"loss": 1.5578, "grad_norm": 0.6053574085235596, "learning_rate": 0.0002, "epoch": 0.1568627450980392, "step": 240}, {"loss": 1.7394, "grad_norm": 0.7566025853157043, "learning_rate": 0.0002, "epoch": 0.16339869281045752, "step": 250}, {"loss": 1.6216, "grad_norm": 0.6112990975379944, "learning_rate": 0.0002, "epoch": 0.16993464052287582, "step": 260}, {"loss": 1.5564, "grad_norm": 0.6839066743850708, "learning_rate": 0.0002, "epoch": 0.17647058823529413, "step": 270}, {"loss": 1.7129, "grad_norm": 0.6368117928504944, "learning_rate": 0.0002, "epoch": 0.1830065359477124, "step": 280}, {"loss": 1.5646, "grad_norm": 0.6144475936889648, "learning_rate": 0.0002, "epoch": 0.1895424836601307, "step": 290}, {"loss": 1.8383, "grad_norm": 0.6743767261505127, "learning_rate": 0.0002, "epoch": 0.19607843137254902, "step": 300}, {"loss": 1.421, "grad_norm": 0.6807955503463745, "learning_rate": 0.0002, "epoch": 0.20261437908496732, "step": 310}, {"loss": 1.5961, "grad_norm": 0.6717963814735413, "learning_rate": 0.0002, "epoch": 0.20915032679738563, "step": 320}, {"loss": 1.6842, "grad_norm": 0.5917780995368958, "learning_rate": 0.0002, "epoch": 0.21568627450980393, "step": 330}, {"loss": 1.6264, "grad_norm": 0.6783658862113953, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 340}, {"loss": 1.4635, "grad_norm": 0.5820256471633911, "learning_rate": 0.0002, "epoch": 0.22875816993464052, "step": 350}, {"loss": 1.6514, "grad_norm": 0.5345938801765442, "learning_rate": 0.0002, "epoch": 0.23529411764705882, "step": 360}, {"loss": 1.6441, "grad_norm": 0.755929172039032, "learning_rate": 0.0002, "epoch": 0.24183006535947713, "step": 370}, {"loss": 1.5177, "grad_norm": 0.6183189749717712, "learning_rate": 0.0002, "epoch": 0.24836601307189543, "step": 380}, {"loss": 1.5935, "grad_norm": 0.7277782559394836, "learning_rate": 0.0002, "epoch": 0.2549019607843137, "step": 390}, {"loss": 1.6957, "grad_norm": 0.9998756051063538, "learning_rate": 0.0002, "epoch": 0.26143790849673204, "step": 400}, {"loss": 1.5738, "grad_norm": 0.7523853778839111, "learning_rate": 0.0002, "epoch": 0.2679738562091503, "step": 410}, {"loss": 1.5649, "grad_norm": 0.6548714637756348, "learning_rate": 0.0002, "epoch": 0.27450980392156865, "step": 420}, {"loss": 1.4564, "grad_norm": 0.6979796290397644, "learning_rate": 0.0002, "epoch": 0.28104575163398693, "step": 430}, {"loss": 1.5927, "grad_norm": 0.840915322303772, "learning_rate": 0.0002, "epoch": 0.2875816993464052, "step": 440}, {"loss": 1.5199, "grad_norm": 0.6142978072166443, "learning_rate": 0.0002, "epoch": 0.29411764705882354, "step": 450}, {"loss": 1.4903, "grad_norm": 0.9482691884040833, "learning_rate": 0.0002, "epoch": 0.3006535947712418, "step": 460}, {"loss": 1.6553, "grad_norm": 0.7001156806945801, "learning_rate": 0.0002, "epoch": 0.30718954248366015, "step": 470}, {"loss": 1.5957, "grad_norm": 0.6665455102920532, "learning_rate": 0.0002, "epoch": 0.3137254901960784, "step": 480}, {"loss": 1.587, "grad_norm": 0.6012697815895081, "learning_rate": 0.0002, "epoch": 0.3202614379084967, "step": 490}, {"loss": 1.4468, "grad_norm": 0.8770062327384949, "learning_rate": 0.0002, "epoch": 0.32679738562091504, "step": 500}, {"loss": 1.3558, "grad_norm": 0.7029962539672852, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 510}, {"loss": 1.4435, "grad_norm": 0.6682832837104797, "learning_rate": 0.0002, "epoch": 0.33986928104575165, "step": 520}, {"loss": 1.4242, "grad_norm": 0.5548969507217407, "learning_rate": 0.0002, "epoch": 0.3464052287581699, "step": 530}, {"loss": 1.5081, "grad_norm": 0.6640702486038208, "learning_rate": 0.0002, "epoch": 0.35294117647058826, "step": 540}, {"loss": 1.4998, "grad_norm": 0.656292200088501, "learning_rate": 0.0002, "epoch": 0.35947712418300654, "step": 550}, {"loss": 1.5415, "grad_norm": 0.618910551071167, "learning_rate": 0.0002, "epoch": 0.3660130718954248, "step": 560}, {"loss": 1.5178, "grad_norm": 0.644859790802002, "learning_rate": 0.0002, "epoch": 0.37254901960784315, "step": 570}, {"loss": 1.645, "grad_norm": 0.679042398929596, "learning_rate": 0.0002, "epoch": 0.3790849673202614, "step": 580}, {"loss": 1.5193, "grad_norm": 0.980681836605072, "learning_rate": 0.0002, "epoch": 0.38562091503267976, "step": 590}, {"loss": 1.4262, "grad_norm": 0.632219672203064, "learning_rate": 0.0002, "epoch": 0.39215686274509803, "step": 600}, {"loss": 1.5533, "grad_norm": 0.7003744840621948, "learning_rate": 0.0002, "epoch": 0.39869281045751637, "step": 610}, {"loss": 1.7747, "grad_norm": 0.7090577483177185, "learning_rate": 0.0002, "epoch": 0.40522875816993464, "step": 620}, {"loss": 1.7506, "grad_norm": 0.657819926738739, "learning_rate": 0.0002, "epoch": 0.4117647058823529, "step": 630}, {"loss": 1.621, "grad_norm": 0.7034208178520203, "learning_rate": 0.0002, "epoch": 0.41830065359477125, "step": 640}, {"loss": 1.5357, "grad_norm": 0.7274866104125977, "learning_rate": 0.0002, "epoch": 0.42483660130718953, "step": 650}, {"loss": 1.6304, "grad_norm": 0.5876233577728271, "learning_rate": 0.0002, "epoch": 0.43137254901960786, "step": 660}, {"loss": 1.7683, "grad_norm": 0.595494270324707, "learning_rate": 0.0002, "epoch": 0.43790849673202614, "step": 670}, {"loss": 1.5117, "grad_norm": 0.8253804445266724, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 680}, {"loss": 1.5199, "grad_norm": 0.652225911617279, "learning_rate": 0.0002, "epoch": 0.45098039215686275, "step": 690}, {"loss": 1.5419, "grad_norm": 0.6242014169692993, "learning_rate": 0.0002, "epoch": 0.45751633986928103, "step": 700}, {"loss": 1.53, "grad_norm": 0.7283986210823059, "learning_rate": 0.0002, "epoch": 0.46405228758169936, "step": 710}, {"loss": 1.43, "grad_norm": 0.7016081213951111, "learning_rate": 0.0002, "epoch": 0.47058823529411764, "step": 720}, {"loss": 1.4626, "grad_norm": 0.5211893916130066, "learning_rate": 0.0002, "epoch": 0.477124183006536, "step": 730}, {"loss": 1.6885, "grad_norm": 0.6221150159835815, "learning_rate": 0.0002, "epoch": 0.48366013071895425, "step": 740}, {"loss": 1.5677, "grad_norm": 0.76594477891922, "learning_rate": 0.0002, "epoch": 0.49019607843137253, "step": 750}, {"loss": 1.4982, "grad_norm": 0.5777859091758728, "learning_rate": 0.0002, "epoch": 0.49673202614379086, "step": 760}, {"loss": 1.5253, "grad_norm": 0.5793519616127014, "learning_rate": 0.0002, "epoch": 0.5032679738562091, "step": 770}, {"loss": 1.3562, "grad_norm": 0.5425786375999451, "learning_rate": 0.0002, "epoch": 0.5098039215686274, "step": 780}, {"loss": 1.3398, "grad_norm": 0.6004197001457214, "learning_rate": 0.0002, "epoch": 0.5163398692810458, "step": 790}, {"loss": 1.5346, "grad_norm": 0.7167016863822937, "learning_rate": 0.0002, "epoch": 0.5228758169934641, "step": 800}, {"loss": 1.48, "grad_norm": 0.710218071937561, "learning_rate": 0.0002, "epoch": 0.5294117647058824, "step": 810}, {"loss": 1.3943, "grad_norm": 0.699528694152832, "learning_rate": 0.0002, "epoch": 0.5359477124183006, "step": 820}, {"loss": 1.6014, "grad_norm": 0.579629123210907, "learning_rate": 0.0002, "epoch": 0.5424836601307189, "step": 830}, {"loss": 1.3894, "grad_norm": 0.595407247543335, "learning_rate": 0.0002, "epoch": 0.5490196078431373, "step": 840}, {"loss": 1.6394, "grad_norm": 0.544563889503479, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 850}, {"loss": 1.4692, "grad_norm": 0.553166389465332, "learning_rate": 0.0002, "epoch": 0.5620915032679739, "step": 860}, {"loss": 1.5155, "grad_norm": 0.5645018815994263, "learning_rate": 0.0002, "epoch": 0.5686274509803921, "step": 870}, {"loss": 1.7019, "grad_norm": 0.6576932668685913, "learning_rate": 0.0002, "epoch": 0.5751633986928104, "step": 880}, {"loss": 1.5891, "grad_norm": 0.6684197187423706, "learning_rate": 0.0002, "epoch": 0.5816993464052288, "step": 890}, {"loss": 1.5348, "grad_norm": 0.6706975698471069, "learning_rate": 0.0002, "epoch": 0.5882352941176471, "step": 900}, {"loss": 1.4038, "grad_norm": 0.6762327551841736, "learning_rate": 0.0002, "epoch": 0.5947712418300654, "step": 910}, {"loss": 1.61, "grad_norm": 0.764032244682312, "learning_rate": 0.0002, "epoch": 0.6013071895424836, "step": 920}, {"loss": 1.436, "grad_norm": 0.6996400952339172, "learning_rate": 0.0002, "epoch": 0.6078431372549019, "step": 930}, {"loss": 1.6038, "grad_norm": 0.686735987663269, "learning_rate": 0.0002, "epoch": 0.6143790849673203, "step": 940}, {"loss": 1.5194, "grad_norm": 0.6086131930351257, "learning_rate": 0.0002, "epoch": 0.6209150326797386, "step": 950}, {"loss": 1.4457, "grad_norm": 0.5627856850624084, "learning_rate": 0.0002, "epoch": 0.6274509803921569, "step": 960}, {"loss": 1.506, "grad_norm": 0.5781503319740295, "learning_rate": 0.0002, "epoch": 0.6339869281045751, "step": 970}, {"loss": 1.5668, "grad_norm": 0.6347246766090393, "learning_rate": 0.0002, "epoch": 0.6405228758169934, "step": 980}, {"loss": 1.3819, "grad_norm": 0.6581300497055054, "learning_rate": 0.0002, "epoch": 0.6470588235294118, "step": 990}, {"loss": 1.6425, "grad_norm": 0.8343676924705505, "learning_rate": 0.0002, "epoch": 0.6535947712418301, "step": 1000}, {"loss": 1.5188, "grad_norm": 0.5708910226821899, "learning_rate": 0.0002, "epoch": 0.6601307189542484, "step": 1010}, {"loss": 1.3882, "grad_norm": 0.6832585334777832, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 1020}, {"loss": 1.645, "grad_norm": 0.5767837166786194, "learning_rate": 0.0002, "epoch": 0.673202614379085, "step": 1030}, {"loss": 1.4206, "grad_norm": 0.5637745261192322, "learning_rate": 0.0002, "epoch": 0.6797385620915033, "step": 1040}, {"loss": 1.4325, "grad_norm": 0.8193050026893616, "learning_rate": 0.0002, "epoch": 0.6862745098039216, "step": 1050}, {"loss": 1.4196, "grad_norm": 0.6157439351081848, "learning_rate": 0.0002, "epoch": 0.6928104575163399, "step": 1060}, {"loss": 1.5547, "grad_norm": 0.7476664781570435, "learning_rate": 0.0002, "epoch": 0.6993464052287581, "step": 1070}, {"loss": 1.5337, "grad_norm": 0.8569361567497253, "learning_rate": 0.0002, "epoch": 0.7058823529411765, "step": 1080}, {"loss": 1.482, "grad_norm": 0.5671911835670471, "learning_rate": 0.0002, "epoch": 0.7124183006535948, "step": 1090}, {"loss": 1.5398, "grad_norm": 0.5151128768920898, "learning_rate": 0.0002, "epoch": 0.7189542483660131, "step": 1100}, {"loss": 1.4848, "grad_norm": 0.568037211894989, "learning_rate": 0.0002, "epoch": 0.7254901960784313, "step": 1110}, {"loss": 1.4708, "grad_norm": 0.6756396889686584, "learning_rate": 0.0002, "epoch": 0.7320261437908496, "step": 1120}, {"loss": 1.4017, "grad_norm": 0.638975977897644, "learning_rate": 0.0002, "epoch": 0.738562091503268, "step": 1130}, {"loss": 1.6028, "grad_norm": 0.7103341221809387, "learning_rate": 0.0002, "epoch": 0.7450980392156863, "step": 1140}, {"loss": 1.3766, "grad_norm": 0.7403952479362488, "learning_rate": 0.0002, "epoch": 0.7516339869281046, "step": 1150}, {"loss": 1.4757, "grad_norm": 0.6266511082649231, "learning_rate": 0.0002, "epoch": 0.7581699346405228, "step": 1160}, {"loss": 1.4468, "grad_norm": 0.5939070582389832, "learning_rate": 0.0002, "epoch": 0.7647058823529411, "step": 1170}, {"loss": 1.4145, "grad_norm": 0.5735430717468262, "learning_rate": 0.0002, "epoch": 0.7712418300653595, "step": 1180}, {"loss": 1.3891, "grad_norm": 0.5155234932899475, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 1190}, {"loss": 1.4942, "grad_norm": 0.5115423202514648, "learning_rate": 0.0002, "epoch": 0.7843137254901961, "step": 1200}, {"loss": 1.4508, "grad_norm": 0.693588137626648, "learning_rate": 0.0002, "epoch": 0.7908496732026143, "step": 1210}, {"loss": 1.308, "grad_norm": 0.5504693984985352, "learning_rate": 0.0002, "epoch": 0.7973856209150327, "step": 1220}, {"loss": 1.5412, "grad_norm": 0.5555992126464844, "learning_rate": 0.0002, "epoch": 0.803921568627451, "step": 1230}, {"loss": 1.5506, "grad_norm": 0.7211785316467285, "learning_rate": 0.0002, "epoch": 0.8104575163398693, "step": 1240}, {"loss": 1.6163, "grad_norm": 0.735003650188446, "learning_rate": 0.0002, "epoch": 0.8169934640522876, "step": 1250}, {"loss": 1.5836, "grad_norm": 0.5245152711868286, "learning_rate": 0.0002, "epoch": 0.8235294117647058, "step": 1260}, {"loss": 1.4505, "grad_norm": 0.5883445739746094, "learning_rate": 0.0002, "epoch": 0.8300653594771242, "step": 1270}, {"loss": 1.3642, "grad_norm": 0.6835859417915344, "learning_rate": 0.0002, "epoch": 0.8366013071895425, "step": 1280}, {"loss": 1.5526, "grad_norm": 0.6592142581939697, "learning_rate": 0.0002, "epoch": 0.8431372549019608, "step": 1290}, {"loss": 1.52, "grad_norm": 0.6087474226951599, "learning_rate": 0.0002, "epoch": 0.8496732026143791, "step": 1300}, {"loss": 1.3807, "grad_norm": 0.565387487411499, "learning_rate": 0.0002, "epoch": 0.8562091503267973, "step": 1310}, {"loss": 1.4809, "grad_norm": 0.7363151907920837, "learning_rate": 0.0002, "epoch": 0.8627450980392157, "step": 1320}, {"loss": 1.5683, "grad_norm": 0.5964524149894714, "learning_rate": 0.0002, "epoch": 0.869281045751634, "step": 1330}, {"loss": 1.3284, "grad_norm": 0.5169979929924011, "learning_rate": 0.0002, "epoch": 0.8758169934640523, "step": 1340}, {"loss": 1.6279, "grad_norm": 0.7063422799110413, "learning_rate": 0.0002, "epoch": 0.8823529411764706, "step": 1350}, {"loss": 1.3072, "grad_norm": 0.7261926531791687, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 1360}, {"loss": 1.3619, "grad_norm": 0.6759744882583618, "learning_rate": 0.0002, "epoch": 0.8954248366013072, "step": 1370}, {"loss": 1.4079, "grad_norm": 0.675051212310791, "learning_rate": 0.0002, "epoch": 0.9019607843137255, "step": 1380}, {"loss": 1.6606, "grad_norm": 0.5613595843315125, "learning_rate": 0.0002, "epoch": 0.9084967320261438, "step": 1390}, {"loss": 1.414, "grad_norm": 0.611732006072998, "learning_rate": 0.0002, "epoch": 0.9150326797385621, "step": 1400}, {"loss": 1.5766, "grad_norm": 0.6365187168121338, "learning_rate": 0.0002, "epoch": 0.9215686274509803, "step": 1410}, {"loss": 1.7832, "grad_norm": 0.7810426354408264, "learning_rate": 0.0002, "epoch": 0.9281045751633987, "step": 1420}, {"loss": 1.5377, "grad_norm": 0.593891441822052, "learning_rate": 0.0002, "epoch": 0.934640522875817, "step": 1430}, {"loss": 1.4468, "grad_norm": 0.761585533618927, "learning_rate": 0.0002, "epoch": 0.9411764705882353, "step": 1440}, {"loss": 1.589, "grad_norm": 0.6114464998245239, "learning_rate": 0.0002, "epoch": 0.9477124183006536, "step": 1450}, {"loss": 1.4973, "grad_norm": 0.601044774055481, "learning_rate": 0.0002, "epoch": 0.954248366013072, "step": 1460}, {"loss": 1.4162, "grad_norm": 0.5484876036643982, "learning_rate": 0.0002, "epoch": 0.9607843137254902, "step": 1470}, {"loss": 1.4825, "grad_norm": 0.5383428335189819, "learning_rate": 0.0002, "epoch": 0.9673202614379085, "step": 1480}, {"loss": 1.5543, "grad_norm": 0.648106575012207, "learning_rate": 0.0002, "epoch": 0.9738562091503268, "step": 1490}, {"loss": 1.3638, "grad_norm": 0.6847249865531921, "learning_rate": 0.0002, "epoch": 0.9803921568627451, "step": 1500}, {"loss": 1.4247, "grad_norm": 0.6361058354377747, "learning_rate": 0.0002, "epoch": 0.9869281045751634, "step": 1510}, {"loss": 1.5131, "grad_norm": 0.646392285823822, "learning_rate": 0.0002, "epoch": 0.9934640522875817, "step": 1520}, {"loss": 1.3738, "grad_norm": 0.5391159057617188, "learning_rate": 0.0002, "epoch": 1.0, "step": 1530}, {"eval_loss": 1.4715123176574707, "eval_runtime": 30.5701, "eval_samples_per_second": 14.262, "eval_steps_per_second": 1.799, "epoch": 1.0, "step": 1530}, {"loss": 1.4827, "grad_norm": 0.5468988418579102, "learning_rate": 0.0002, "epoch": 1.0065359477124183, "step": 1540}, {"loss": 1.4342, "grad_norm": 0.629940927028656, "learning_rate": 0.0002, "epoch": 1.0130718954248366, "step": 1550}, {"loss": 1.4259, "grad_norm": 0.6411303281784058, "learning_rate": 0.0002, "epoch": 1.0196078431372548, "step": 1560}, {"loss": 1.3924, "grad_norm": 0.5619024038314819, "learning_rate": 0.0002, "epoch": 1.026143790849673, "step": 1570}, {"loss": 1.6086, "grad_norm": 0.6093462705612183, "learning_rate": 0.0002, "epoch": 1.0326797385620916, "step": 1580}, {"loss": 1.4547, "grad_norm": 0.5543286204338074, "learning_rate": 0.0002, "epoch": 1.0392156862745099, "step": 1590}, {"loss": 1.3738, "grad_norm": 0.6079006195068359, "learning_rate": 0.0002, "epoch": 1.0457516339869282, "step": 1600}, {"loss": 1.4574, "grad_norm": 0.6240813136100769, "learning_rate": 0.0002, "epoch": 1.0522875816993464, "step": 1610}, {"loss": 1.3504, "grad_norm": 0.6141977310180664, "learning_rate": 0.0002, "epoch": 1.0588235294117647, "step": 1620}, {"loss": 1.3668, "grad_norm": 0.5920178294181824, "learning_rate": 0.0002, "epoch": 1.065359477124183, "step": 1630}, {"loss": 1.3204, "grad_norm": 0.47620782256126404, "learning_rate": 0.0002, "epoch": 1.0718954248366013, "step": 1640}, {"loss": 1.3249, "grad_norm": 0.6826292872428894, "learning_rate": 0.0002, "epoch": 1.0784313725490196, "step": 1650}, {"loss": 1.2285, "grad_norm": 0.6182006597518921, "learning_rate": 0.0002, "epoch": 1.0849673202614378, "step": 1660}, {"loss": 1.2907, "grad_norm": 0.57639479637146, "learning_rate": 0.0002, "epoch": 1.091503267973856, "step": 1670}, {"loss": 1.4575, "grad_norm": 0.6696860194206238, "learning_rate": 0.0002, "epoch": 1.0980392156862746, "step": 1680}, {"loss": 1.4104, "grad_norm": 0.699221670627594, "learning_rate": 0.0002, "epoch": 1.1045751633986929, "step": 1690}, {"loss": 1.3667, "grad_norm": 0.7138059139251709, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 1700}, {"loss": 1.3468, "grad_norm": 0.6930422186851501, "learning_rate": 0.0002, "epoch": 1.1176470588235294, "step": 1710}, {"loss": 1.5033, "grad_norm": 0.7484048008918762, "learning_rate": 0.0002, "epoch": 1.1241830065359477, "step": 1720}, {"loss": 1.4582, "grad_norm": 0.5820090174674988, "learning_rate": 0.0002, "epoch": 1.130718954248366, "step": 1730}, {"loss": 1.3704, "grad_norm": 0.7143406867980957, "learning_rate": 0.0002, "epoch": 1.1372549019607843, "step": 1740}, {"loss": 1.277, "grad_norm": 0.5597584247589111, "learning_rate": 0.0002, "epoch": 1.1437908496732025, "step": 1750}, {"loss": 1.5403, "grad_norm": 0.5171173214912415, "learning_rate": 0.0002, "epoch": 1.1503267973856208, "step": 1760}, {"loss": 1.419, "grad_norm": 0.5951920747756958, "learning_rate": 0.0002, "epoch": 1.156862745098039, "step": 1770}, {"loss": 1.2929, "grad_norm": 0.7506247758865356, "learning_rate": 0.0002, "epoch": 1.1633986928104576, "step": 1780}, {"loss": 1.5475, "grad_norm": 0.5936487913131714, "learning_rate": 0.0002, "epoch": 1.1699346405228759, "step": 1790}, {"loss": 1.3567, "grad_norm": 0.688450038433075, "learning_rate": 0.0002, "epoch": 1.1764705882352942, "step": 1800}, {"loss": 1.314, "grad_norm": 0.671623170375824, "learning_rate": 0.0002, "epoch": 1.1830065359477124, "step": 1810}, {"loss": 1.3803, "grad_norm": 0.6911860704421997, "learning_rate": 0.0002, "epoch": 1.1895424836601307, "step": 1820}, {"loss": 1.363, "grad_norm": 0.60726398229599, "learning_rate": 0.0002, "epoch": 1.196078431372549, "step": 1830}, {"loss": 1.5236, "grad_norm": 0.7542088627815247, "learning_rate": 0.0002, "epoch": 1.2026143790849673, "step": 1840}, {"loss": 1.4343, "grad_norm": 0.6810969710350037, "learning_rate": 0.0002, "epoch": 1.2091503267973855, "step": 1850}, {"loss": 1.446, "grad_norm": 0.579741895198822, "learning_rate": 0.0002, "epoch": 1.215686274509804, "step": 1860}, {"loss": 1.4564, "grad_norm": 0.9925695657730103, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 1870}, {"loss": 1.5516, "grad_norm": 0.5919767618179321, "learning_rate": 0.0002, "epoch": 1.2287581699346406, "step": 1880}, {"loss": 1.5015, "grad_norm": 0.7377090454101562, "learning_rate": 0.0002, "epoch": 1.2352941176470589, "step": 1890}, {"loss": 1.4756, "grad_norm": 0.5753688812255859, "learning_rate": 0.0002, "epoch": 1.2418300653594772, "step": 1900}, {"loss": 1.3543, "grad_norm": 0.6362486481666565, "learning_rate": 0.0002, "epoch": 1.2483660130718954, "step": 1910}, {"loss": 1.4153, "grad_norm": 0.5747467875480652, "learning_rate": 0.0002, "epoch": 1.2549019607843137, "step": 1920}, {"loss": 1.5082, "grad_norm": 0.6831939220428467, "learning_rate": 0.0002, "epoch": 1.261437908496732, "step": 1930}, {"loss": 1.3509, "grad_norm": 0.6414040327072144, "learning_rate": 0.0002, "epoch": 1.2679738562091503, "step": 1940}, {"loss": 1.5099, "grad_norm": 0.5613330006599426, "learning_rate": 0.0002, "epoch": 1.2745098039215685, "step": 1950}, {"loss": 1.377, "grad_norm": 0.5838454961776733, "learning_rate": 0.0002, "epoch": 1.2810457516339868, "step": 1960}, {"loss": 1.3548, "grad_norm": 0.5367192029953003, "learning_rate": 0.0002, "epoch": 1.287581699346405, "step": 1970}, {"loss": 1.4602, "grad_norm": 0.5829346776008606, "learning_rate": 0.0002, "epoch": 1.2941176470588236, "step": 1980}, {"loss": 1.3821, "grad_norm": 0.756534218788147, "learning_rate": 0.0002, "epoch": 1.3006535947712419, "step": 1990}, {"loss": 1.389, "grad_norm": 0.48002561926841736, "learning_rate": 0.0002, "epoch": 1.3071895424836601, "step": 2000}, {"loss": 1.256, "grad_norm": 0.5461082458496094, "learning_rate": 0.0002, "epoch": 1.3137254901960784, "step": 2010}, {"loss": 1.6257, "grad_norm": 0.570399284362793, "learning_rate": 0.0002, "epoch": 1.3202614379084967, "step": 2020}, {"loss": 1.4356, "grad_norm": 0.5130975842475891, "learning_rate": 0.0002, "epoch": 1.326797385620915, "step": 2030}, {"loss": 1.3552, "grad_norm": 0.6290071606636047, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 2040}, {"loss": 1.3873, "grad_norm": 0.6165726184844971, "learning_rate": 0.0002, "epoch": 1.3398692810457518, "step": 2050}, {"loss": 1.4376, "grad_norm": 0.5302083492279053, "learning_rate": 0.0002, "epoch": 1.34640522875817, "step": 2060}, {"loss": 1.4722, "grad_norm": 0.6531406044960022, "learning_rate": 0.0002, "epoch": 1.3529411764705883, "step": 2070}, {"loss": 1.3632, "grad_norm": 0.5981236100196838, "learning_rate": 0.0002, "epoch": 1.3594771241830066, "step": 2080}, {"loss": 1.4846, "grad_norm": 0.8534150123596191, "learning_rate": 0.0002, "epoch": 1.3660130718954249, "step": 2090}, {"loss": 1.3249, "grad_norm": 0.695918083190918, "learning_rate": 0.0002, "epoch": 1.3725490196078431, "step": 2100}, {"loss": 1.4989, "grad_norm": 0.5830431580543518, "learning_rate": 0.0002, "epoch": 1.3790849673202614, "step": 2110}, {"loss": 1.5009, "grad_norm": 0.5641306638717651, "learning_rate": 0.0002, "epoch": 1.3856209150326797, "step": 2120}, {"loss": 1.3985, "grad_norm": 0.6354436874389648, "learning_rate": 0.0002, "epoch": 1.392156862745098, "step": 2130}, {"loss": 1.2737, "grad_norm": 0.5707540512084961, "learning_rate": 0.0002, "epoch": 1.3986928104575163, "step": 2140}, {"loss": 1.3815, "grad_norm": 0.7308434844017029, "learning_rate": 0.0002, "epoch": 1.4052287581699345, "step": 2150}, {"loss": 1.3993, "grad_norm": 0.5879750847816467, "learning_rate": 0.0002, "epoch": 1.4117647058823528, "step": 2160}, {"loss": 1.3729, "grad_norm": 0.627909243106842, "learning_rate": 0.0002, "epoch": 1.4183006535947713, "step": 2170}, {"loss": 1.3391, "grad_norm": 0.5228193998336792, "learning_rate": 0.0002, "epoch": 1.4248366013071896, "step": 2180}, {"loss": 1.457, "grad_norm": 0.6162880659103394, "learning_rate": 0.0002, "epoch": 1.4313725490196079, "step": 2190}, {"loss": 1.4052, "grad_norm": 0.751610517501831, "learning_rate": 0.0002, "epoch": 1.4379084967320261, "step": 2200}, {"loss": 1.4105, "grad_norm": 0.5623487234115601, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 2210}, {"loss": 1.3795, "grad_norm": 0.5293187499046326, "learning_rate": 0.0002, "epoch": 1.4509803921568627, "step": 2220}, {"loss": 1.4247, "grad_norm": 0.5903629660606384, "learning_rate": 0.0002, "epoch": 1.457516339869281, "step": 2230}, {"loss": 1.6167, "grad_norm": 0.6084659099578857, "learning_rate": 0.0002, "epoch": 1.4640522875816995, "step": 2240}, {"loss": 1.319, "grad_norm": 0.5289803147315979, "learning_rate": 0.0002, "epoch": 1.4705882352941178, "step": 2250}, {"loss": 1.3106, "grad_norm": 0.49499568343162537, "learning_rate": 0.0002, "epoch": 1.477124183006536, "step": 2260}, {"loss": 1.3586, "grad_norm": 0.7774190306663513, "learning_rate": 0.0002, "epoch": 1.4836601307189543, "step": 2270}, {"loss": 1.3075, "grad_norm": 0.5932538509368896, "learning_rate": 0.0002, "epoch": 1.4901960784313726, "step": 2280}, {"loss": 1.3241, "grad_norm": 0.6009492874145508, "learning_rate": 0.0002, "epoch": 1.4967320261437909, "step": 2290}, {"loss": 1.3728, "grad_norm": 0.5559343099594116, "learning_rate": 0.0002, "epoch": 1.5032679738562091, "step": 2300}, {"loss": 1.2379, "grad_norm": 0.5956196188926697, "learning_rate": 0.0002, "epoch": 1.5098039215686274, "step": 2310}, {"loss": 1.5292, "grad_norm": 0.5624083876609802, "learning_rate": 0.0002, "epoch": 1.5163398692810457, "step": 2320}, {"loss": 1.4779, "grad_norm": 0.7195250391960144, "learning_rate": 0.0002, "epoch": 1.522875816993464, "step": 2330}, {"loss": 1.2938, "grad_norm": 0.6010490655899048, "learning_rate": 0.0002, "epoch": 1.5294117647058822, "step": 2340}, {"loss": 1.4121, "grad_norm": 0.664929211139679, "learning_rate": 0.0002, "epoch": 1.5359477124183005, "step": 2350}, {"loss": 1.4362, "grad_norm": 0.5158776640892029, "learning_rate": 0.0002, "epoch": 1.5424836601307188, "step": 2360}, {"loss": 1.2157, "grad_norm": 0.5147154927253723, "learning_rate": 0.0002, "epoch": 1.5490196078431373, "step": 2370}, {"loss": 1.2643, "grad_norm": 0.6507977843284607, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 2380}, {"loss": 1.2786, "grad_norm": 0.5193192362785339, "learning_rate": 0.0002, "epoch": 1.5620915032679739, "step": 2390}, {"loss": 1.3209, "grad_norm": 0.5982314944267273, "learning_rate": 0.0002, "epoch": 1.5686274509803921, "step": 2400}, {"loss": 1.3585, "grad_norm": 0.49106258153915405, "learning_rate": 0.0002, "epoch": 1.5751633986928104, "step": 2410}, {"loss": 1.3618, "grad_norm": 0.6459611654281616, "learning_rate": 0.0002, "epoch": 1.581699346405229, "step": 2420}, {"loss": 1.3305, "grad_norm": 0.7038363218307495, "learning_rate": 0.0002, "epoch": 1.5882352941176472, "step": 2430}, {"loss": 1.3198, "grad_norm": 0.5245680212974548, "learning_rate": 0.0002, "epoch": 1.5947712418300655, "step": 2440}, {"loss": 1.4756, "grad_norm": 0.6562076210975647, "learning_rate": 0.0002, "epoch": 1.6013071895424837, "step": 2450}, {"loss": 1.5635, "grad_norm": 0.6491968035697937, "learning_rate": 0.0002, "epoch": 1.607843137254902, "step": 2460}, {"loss": 1.3657, "grad_norm": 0.604034960269928, "learning_rate": 0.0002, "epoch": 1.6143790849673203, "step": 2470}, {"loss": 1.2693, "grad_norm": 0.5759671330451965, "learning_rate": 0.0002, "epoch": 1.6209150326797386, "step": 2480}, {"loss": 1.4136, "grad_norm": 0.6157698631286621, "learning_rate": 0.0002, "epoch": 1.6274509803921569, "step": 2490}, {"loss": 1.3929, "grad_norm": 0.6513794660568237, "learning_rate": 0.0002, "epoch": 1.6339869281045751, "step": 2500}, {"loss": 1.4283, "grad_norm": 0.71990966796875, "learning_rate": 0.0002, "epoch": 1.6405228758169934, "step": 2510}, {"loss": 1.4356, "grad_norm": 0.7316617369651794, "learning_rate": 0.0002, "epoch": 1.6470588235294117, "step": 2520}, {"loss": 1.3119, "grad_norm": 0.5475177764892578, "learning_rate": 0.0002, "epoch": 1.65359477124183, "step": 2530}, {"loss": 1.2998, "grad_norm": 0.4911293089389801, "learning_rate": 0.0002, "epoch": 1.6601307189542482, "step": 2540}, {"loss": 1.4198, "grad_norm": 0.6122882962226868, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 2550}, {"loss": 1.3099, "grad_norm": 0.5735281705856323, "learning_rate": 0.0002, "epoch": 1.673202614379085, "step": 2560}, {"loss": 1.2205, "grad_norm": 0.5046352744102478, "learning_rate": 0.0002, "epoch": 1.6797385620915033, "step": 2570}, {"loss": 1.3191, "grad_norm": 0.6043242812156677, "learning_rate": 0.0002, "epoch": 1.6862745098039216, "step": 2580}, {"loss": 1.3079, "grad_norm": 0.5397698283195496, "learning_rate": 0.0002, "epoch": 1.6928104575163399, "step": 2590}, {"loss": 1.4916, "grad_norm": 0.8066475987434387, "learning_rate": 0.0002, "epoch": 1.6993464052287581, "step": 2600}, {"loss": 1.3703, "grad_norm": 0.52901691198349, "learning_rate": 0.0002, "epoch": 1.7058823529411766, "step": 2610}, {"loss": 1.409, "grad_norm": 0.7588503956794739, "learning_rate": 0.0002, "epoch": 1.712418300653595, "step": 2620}, {"loss": 1.3806, "grad_norm": 0.6012966632843018, "learning_rate": 0.0002, "epoch": 1.7189542483660132, "step": 2630}, {"loss": 1.2583, "grad_norm": 0.5927302837371826, "learning_rate": 0.0002, "epoch": 1.7254901960784315, "step": 2640}, {"loss": 1.4523, "grad_norm": 0.5086990594863892, "learning_rate": 0.0002, "epoch": 1.7320261437908497, "step": 2650}, {"loss": 1.5452, "grad_norm": 0.6000628471374512, "learning_rate": 0.0002, "epoch": 1.738562091503268, "step": 2660}, {"loss": 1.3269, "grad_norm": 0.6560431718826294, "learning_rate": 0.0002, "epoch": 1.7450980392156863, "step": 2670}, {"loss": 1.3982, "grad_norm": 0.5738165378570557, "learning_rate": 0.0002, "epoch": 1.7516339869281046, "step": 2680}, {"loss": 1.3766, "grad_norm": 0.5576106905937195, "learning_rate": 0.0002, "epoch": 1.7581699346405228, "step": 2690}, {"loss": 1.3277, "grad_norm": 0.7298802137374878, "learning_rate": 0.0002, "epoch": 1.7647058823529411, "step": 2700}, {"loss": 1.2618, "grad_norm": 0.5751826167106628, "learning_rate": 0.0002, "epoch": 1.7712418300653594, "step": 2710}, {"loss": 1.35, "grad_norm": 0.6069957613945007, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 2720}, {"loss": 1.3492, "grad_norm": 0.7513017654418945, "learning_rate": 0.0002, "epoch": 1.784313725490196, "step": 2730}, {"loss": 1.2979, "grad_norm": 0.6058869957923889, "learning_rate": 0.0002, "epoch": 1.7908496732026142, "step": 2740}, {"loss": 1.299, "grad_norm": 0.6805883049964905, "learning_rate": 0.0002, "epoch": 1.7973856209150327, "step": 2750}, {"loss": 1.4062, "grad_norm": 0.6864324808120728, "learning_rate": 0.0002, "epoch": 1.803921568627451, "step": 2760}, {"loss": 1.355, "grad_norm": 0.6261002421379089, "learning_rate": 0.0002, "epoch": 1.8104575163398693, "step": 2770}, {"loss": 1.5145, "grad_norm": 0.532684862613678, "learning_rate": 0.0002, "epoch": 1.8169934640522876, "step": 2780}, {"loss": 1.3248, "grad_norm": 0.6209020018577576, "learning_rate": 0.0002, "epoch": 1.8235294117647058, "step": 2790}, {"loss": 1.3908, "grad_norm": 0.67111736536026, "learning_rate": 0.0002, "epoch": 1.8300653594771243, "step": 2800}, {"loss": 1.5088, "grad_norm": 0.700467586517334, "learning_rate": 0.0002, "epoch": 1.8366013071895426, "step": 2810}, {"loss": 1.348, "grad_norm": 0.6968029141426086, "learning_rate": 0.0002, "epoch": 1.843137254901961, "step": 2820}, {"loss": 1.3943, "grad_norm": 0.6405863761901855, "learning_rate": 0.0002, "epoch": 1.8496732026143792, "step": 2830}, {"loss": 1.4035, "grad_norm": 0.5192584991455078, "learning_rate": 0.0002, "epoch": 1.8562091503267975, "step": 2840}, {"loss": 1.2745, "grad_norm": 0.4888569414615631, "learning_rate": 0.0002, "epoch": 1.8627450980392157, "step": 2850}, {"loss": 1.4324, "grad_norm": 0.7625455856323242, "learning_rate": 0.0002, "epoch": 1.869281045751634, "step": 2860}, {"loss": 1.4989, "grad_norm": 0.9162808656692505, "learning_rate": 0.0002, "epoch": 1.8758169934640523, "step": 2870}, {"loss": 1.3978, "grad_norm": 0.5472783446311951, "learning_rate": 0.0002, "epoch": 1.8823529411764706, "step": 2880}, {"loss": 1.3026, "grad_norm": 0.5221137404441833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 2890}, {"loss": 1.33, "grad_norm": 0.49258849024772644, "learning_rate": 0.0002, "epoch": 1.8954248366013071, "step": 2900}, {"loss": 1.3503, "grad_norm": 0.5260750651359558, "learning_rate": 0.0002, "epoch": 1.9019607843137254, "step": 2910}, {"loss": 1.3381, "grad_norm": 0.6583314538002014, "learning_rate": 0.0002, "epoch": 1.9084967320261437, "step": 2920}, {"loss": 1.356, "grad_norm": 0.5728915929794312, "learning_rate": 0.0002, "epoch": 1.915032679738562, "step": 2930}, {"loss": 1.3993, "grad_norm": 0.7661453485488892, "learning_rate": 0.0002, "epoch": 1.9215686274509802, "step": 2940}, {"loss": 1.428, "grad_norm": 0.7193911075592041, "learning_rate": 0.0002, "epoch": 1.9281045751633987, "step": 2950}, {"loss": 1.287, "grad_norm": 0.5007768869400024, "learning_rate": 0.0002, "epoch": 1.934640522875817, "step": 2960}, {"loss": 1.372, "grad_norm": 0.626681923866272, "learning_rate": 0.0002, "epoch": 1.9411764705882353, "step": 2970}, {"loss": 1.375, "grad_norm": 0.8692840933799744, "learning_rate": 0.0002, "epoch": 1.9477124183006536, "step": 2980}, {"loss": 1.3292, "grad_norm": 0.6388291120529175, "learning_rate": 0.0002, "epoch": 1.954248366013072, "step": 2990}, {"loss": 1.4593, "grad_norm": 0.7710477113723755, "learning_rate": 0.0002, "epoch": 1.9607843137254903, "step": 3000}, {"loss": 1.5228, "grad_norm": 0.641704261302948, "learning_rate": 0.0002, "epoch": 1.9673202614379086, "step": 3010}, {"loss": 1.3246, "grad_norm": 0.621148943901062, "learning_rate": 0.0002, "epoch": 1.973856209150327, "step": 3020}, {"loss": 1.3017, "grad_norm": 0.5119547247886658, "learning_rate": 0.0002, "epoch": 1.9803921568627452, "step": 3030}, {"loss": 1.4923, "grad_norm": 0.8104137778282166, "learning_rate": 0.0002, "epoch": 1.9869281045751634, "step": 3040}, {"loss": 1.3331, "grad_norm": 0.5856240391731262, "learning_rate": 0.0002, "epoch": 1.9934640522875817, "step": 3050}, {"loss": 1.4346, "grad_norm": 0.5263566374778748, "learning_rate": 0.0002, "epoch": 2.0, "step": 3060}, {"eval_loss": 1.4276371002197266, "eval_runtime": 30.5759, "eval_samples_per_second": 14.26, "eval_steps_per_second": 1.799, "epoch": 2.0, "step": 3060}, {"loss": 1.1636, "grad_norm": 0.5143898725509644, "learning_rate": 0.0002, "epoch": 2.0065359477124183, "step": 3070}, {"loss": 1.3335, "grad_norm": 0.5749367475509644, "learning_rate": 0.0002, "epoch": 2.0130718954248366, "step": 3080}, {"loss": 1.2784, "grad_norm": 0.5784284472465515, "learning_rate": 0.0002, "epoch": 2.019607843137255, "step": 3090}, {"loss": 1.2463, "grad_norm": 0.5933429598808289, "learning_rate": 0.0002, "epoch": 2.026143790849673, "step": 3100}, {"loss": 1.2984, "grad_norm": 0.6748974919319153, "learning_rate": 0.0002, "epoch": 2.0326797385620914, "step": 3110}, {"loss": 1.2307, "grad_norm": 0.626399576663971, "learning_rate": 0.0002, "epoch": 2.0392156862745097, "step": 3120}, {"loss": 1.299, "grad_norm": 0.6173238754272461, "learning_rate": 0.0002, "epoch": 2.045751633986928, "step": 3130}, {"loss": 1.4144, "grad_norm": 0.807790219783783, "learning_rate": 0.0002, "epoch": 2.052287581699346, "step": 3140}, {"loss": 1.1953, "grad_norm": 0.6222215890884399, "learning_rate": 0.0002, "epoch": 2.0588235294117645, "step": 3150}, {"loss": 1.4059, "grad_norm": 0.5859580636024475, "learning_rate": 0.0002, "epoch": 2.065359477124183, "step": 3160}, {"loss": 1.3607, "grad_norm": 0.581304132938385, "learning_rate": 0.0002, "epoch": 2.0718954248366015, "step": 3170}, {"loss": 1.1212, "grad_norm": 0.9814971089363098, "learning_rate": 0.0002, "epoch": 2.0784313725490198, "step": 3180}, {"loss": 1.1962, "grad_norm": 0.6491848230361938, "learning_rate": 0.0002, "epoch": 2.084967320261438, "step": 3190}, {"loss": 1.3711, "grad_norm": 0.613680362701416, "learning_rate": 0.0002, "epoch": 2.0915032679738563, "step": 3200}, {"loss": 1.2994, "grad_norm": 0.7318086624145508, "learning_rate": 0.0002, "epoch": 2.0980392156862746, "step": 3210}, {"loss": 1.2502, "grad_norm": 0.6025661826133728, "learning_rate": 0.0002, "epoch": 2.104575163398693, "step": 3220}, {"loss": 1.1374, "grad_norm": 0.6744484305381775, "learning_rate": 0.0002, "epoch": 2.111111111111111, "step": 3230}, {"loss": 1.3273, "grad_norm": 0.6062554121017456, "learning_rate": 0.0002, "epoch": 2.1176470588235294, "step": 3240}, {"loss": 1.3404, "grad_norm": 0.6801803112030029, "learning_rate": 0.0002, "epoch": 2.1241830065359477, "step": 3250}, {"loss": 1.4084, "grad_norm": 0.5218925476074219, "learning_rate": 0.0002, "epoch": 2.130718954248366, "step": 3260}, {"loss": 1.2867, "grad_norm": 0.7494263648986816, "learning_rate": 0.0002, "epoch": 2.1372549019607843, "step": 3270}, {"loss": 1.3059, "grad_norm": 0.7858565449714661, "learning_rate": 0.0002, "epoch": 2.1437908496732025, "step": 3280}, {"loss": 1.3214, "grad_norm": 0.6836692690849304, "learning_rate": 0.0002, "epoch": 2.150326797385621, "step": 3290}, {"loss": 1.1605, "grad_norm": 0.619848370552063, "learning_rate": 0.0002, "epoch": 2.156862745098039, "step": 3300}, {"loss": 1.3095, "grad_norm": 0.5761294364929199, "learning_rate": 0.0002, "epoch": 2.1633986928104574, "step": 3310}, {"loss": 1.2883, "grad_norm": 0.4713786542415619, "learning_rate": 0.0002, "epoch": 2.1699346405228757, "step": 3320}, {"loss": 1.3817, "grad_norm": 0.7613773345947266, "learning_rate": 0.0002, "epoch": 2.176470588235294, "step": 3330}, {"loss": 1.2354, "grad_norm": 0.6642718315124512, "learning_rate": 0.0002, "epoch": 2.183006535947712, "step": 3340}, {"loss": 1.2048, "grad_norm": 0.7162188291549683, "learning_rate": 0.0002, "epoch": 2.189542483660131, "step": 3350}, {"loss": 1.3886, "grad_norm": 0.6916783452033997, "learning_rate": 0.0002, "epoch": 2.196078431372549, "step": 3360}, {"loss": 1.3788, "grad_norm": 0.7205567955970764, "learning_rate": 0.0002, "epoch": 2.2026143790849675, "step": 3370}, {"loss": 1.2528, "grad_norm": 0.6038199067115784, "learning_rate": 0.0002, "epoch": 2.2091503267973858, "step": 3380}, {"loss": 1.2079, "grad_norm": 0.6284233927726746, "learning_rate": 0.0002, "epoch": 2.215686274509804, "step": 3390}, {"loss": 1.3057, "grad_norm": 0.7450672388076782, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3400}, {"loss": 1.3034, "grad_norm": 0.7755052447319031, "learning_rate": 0.0002, "epoch": 2.2287581699346406, "step": 3410}, {"loss": 1.2953, "grad_norm": 0.9066099524497986, "learning_rate": 0.0002, "epoch": 2.235294117647059, "step": 3420}, {"loss": 1.3072, "grad_norm": 0.8578207492828369, "learning_rate": 0.0002, "epoch": 2.241830065359477, "step": 3430}, {"loss": 1.3278, "grad_norm": 0.5900213718414307, "learning_rate": 0.0002, "epoch": 2.2483660130718954, "step": 3440}, {"loss": 1.3645, "grad_norm": 0.7821717262268066, "learning_rate": 0.0002, "epoch": 2.2549019607843137, "step": 3450}, {"loss": 1.183, "grad_norm": 0.6263150572776794, "learning_rate": 0.0002, "epoch": 2.261437908496732, "step": 3460}, {"loss": 1.178, "grad_norm": 0.591799259185791, "learning_rate": 0.0002, "epoch": 2.2679738562091503, "step": 3470}, {"loss": 1.2198, "grad_norm": 0.5999799966812134, "learning_rate": 0.0002, "epoch": 2.2745098039215685, "step": 3480}, {"loss": 1.2724, "grad_norm": 0.6227319240570068, "learning_rate": 0.0002, "epoch": 2.281045751633987, "step": 3490}, {"loss": 1.3865, "grad_norm": 0.719412624835968, "learning_rate": 0.0002, "epoch": 2.287581699346405, "step": 3500}, {"loss": 1.3275, "grad_norm": 1.0361769199371338, "learning_rate": 0.0002, "epoch": 2.2941176470588234, "step": 3510}, {"loss": 1.4834, "grad_norm": 0.5506668090820312, "learning_rate": 0.0002, "epoch": 2.3006535947712417, "step": 3520}, {"loss": 1.2273, "grad_norm": 0.6886829733848572, "learning_rate": 0.0002, "epoch": 2.30718954248366, "step": 3530}, {"loss": 1.2296, "grad_norm": 0.6226346492767334, "learning_rate": 0.0002, "epoch": 2.313725490196078, "step": 3540}, {"loss": 1.3087, "grad_norm": 0.8109908103942871, "learning_rate": 0.0002, "epoch": 2.3202614379084965, "step": 3550}, {"loss": 1.3311, "grad_norm": 0.8505511283874512, "learning_rate": 0.0002, "epoch": 2.326797385620915, "step": 3560}, {"loss": 1.2526, "grad_norm": 0.5763760209083557, "learning_rate": 0.0002, "epoch": 2.3333333333333335, "step": 3570}, {"loss": 1.4135, "grad_norm": 0.6460059881210327, "learning_rate": 0.0002, "epoch": 2.3398692810457518, "step": 3580}, {"loss": 1.2701, "grad_norm": 0.7175343036651611, "learning_rate": 0.0002, "epoch": 2.34640522875817, "step": 3590}, {"loss": 1.2645, "grad_norm": 0.6012630462646484, "learning_rate": 0.0002, "epoch": 2.3529411764705883, "step": 3600}, {"loss": 1.3214, "grad_norm": 0.6513685584068298, "learning_rate": 0.0002, "epoch": 2.3594771241830066, "step": 3610}, {"loss": 1.3271, "grad_norm": 0.7465183734893799, "learning_rate": 0.0002, "epoch": 2.366013071895425, "step": 3620}, {"loss": 1.3671, "grad_norm": 0.6413124203681946, "learning_rate": 0.0002, "epoch": 2.372549019607843, "step": 3630}, {"loss": 1.4026, "grad_norm": 0.7209562063217163, "learning_rate": 0.0002, "epoch": 2.3790849673202614, "step": 3640}, {"loss": 1.1616, "grad_norm": 0.6427558660507202, "learning_rate": 0.0002, "epoch": 2.3856209150326797, "step": 3650}, {"loss": 1.313, "grad_norm": 0.593958854675293, "learning_rate": 0.0002, "epoch": 2.392156862745098, "step": 3660}, {"loss": 1.2802, "grad_norm": 0.5944608449935913, "learning_rate": 0.0002, "epoch": 2.3986928104575163, "step": 3670}, {"loss": 1.3542, "grad_norm": 0.6606248617172241, "learning_rate": 0.0002, "epoch": 2.4052287581699345, "step": 3680}, {"loss": 1.2977, "grad_norm": 0.5632851719856262, "learning_rate": 0.0002, "epoch": 2.411764705882353, "step": 3690}, {"loss": 1.2032, "grad_norm": 0.4976513385772705, "learning_rate": 0.0002, "epoch": 2.418300653594771, "step": 3700}, {"loss": 1.1404, "grad_norm": 0.6318528056144714, "learning_rate": 0.0002, "epoch": 2.4248366013071894, "step": 3710}, {"loss": 1.1705, "grad_norm": 0.6306707859039307, "learning_rate": 0.0002, "epoch": 2.431372549019608, "step": 3720}, {"loss": 1.3524, "grad_norm": 0.6362553238868713, "learning_rate": 0.0002, "epoch": 2.4379084967320264, "step": 3730}, {"loss": 1.2345, "grad_norm": 0.634368896484375, "learning_rate": 0.0002, "epoch": 2.4444444444444446, "step": 3740}, {"loss": 1.2515, "grad_norm": 0.6623591184616089, "learning_rate": 0.0002, "epoch": 2.450980392156863, "step": 3750}, {"loss": 1.3246, "grad_norm": 0.6150440573692322, "learning_rate": 0.0002, "epoch": 2.457516339869281, "step": 3760}, {"loss": 1.2666, "grad_norm": 0.588935911655426, "learning_rate": 0.0002, "epoch": 2.4640522875816995, "step": 3770}, {"loss": 1.3918, "grad_norm": 0.7388206124305725, "learning_rate": 0.0002, "epoch": 2.4705882352941178, "step": 3780}, {"loss": 1.2512, "grad_norm": 0.621825098991394, "learning_rate": 0.0002, "epoch": 2.477124183006536, "step": 3790}, {"loss": 1.359, "grad_norm": 0.7691677212715149, "learning_rate": 0.0002, "epoch": 2.4836601307189543, "step": 3800}, {"loss": 1.3399, "grad_norm": 1.1661969423294067, "learning_rate": 0.0002, "epoch": 2.4901960784313726, "step": 3810}, {"loss": 1.461, "grad_norm": 0.6837884187698364, "learning_rate": 0.0002, "epoch": 2.496732026143791, "step": 3820}, {"loss": 1.2823, "grad_norm": 0.6978904008865356, "learning_rate": 0.0002, "epoch": 2.503267973856209, "step": 3830}, {"loss": 1.3688, "grad_norm": 0.6121411323547363, "learning_rate": 0.0002, "epoch": 2.5098039215686274, "step": 3840}, {"loss": 1.2587, "grad_norm": 0.7813326120376587, "learning_rate": 0.0002, "epoch": 2.5163398692810457, "step": 3850}, {"loss": 1.1543, "grad_norm": 0.5390260219573975, "learning_rate": 0.0002, "epoch": 2.522875816993464, "step": 3860}, {"loss": 1.2032, "grad_norm": 0.8283252716064453, "learning_rate": 0.0002, "epoch": 2.5294117647058822, "step": 3870}, {"loss": 1.3112, "grad_norm": 0.8527186512947083, "learning_rate": 0.0002, "epoch": 2.5359477124183005, "step": 3880}, {"loss": 1.3469, "grad_norm": 0.8405382633209229, "learning_rate": 0.0002, "epoch": 2.542483660130719, "step": 3890}, {"loss": 1.1801, "grad_norm": 0.5650738477706909, "learning_rate": 0.0002, "epoch": 2.549019607843137, "step": 3900}, {"loss": 1.2917, "grad_norm": 0.620121955871582, "learning_rate": 0.0002, "epoch": 2.5555555555555554, "step": 3910}, {"loss": 1.2524, "grad_norm": 0.5983527898788452, "learning_rate": 0.0002, "epoch": 2.5620915032679736, "step": 3920}, {"loss": 1.4408, "grad_norm": 0.686623215675354, "learning_rate": 0.0002, "epoch": 2.568627450980392, "step": 3930}, {"loss": 1.186, "grad_norm": 0.6805831789970398, "learning_rate": 0.0002, "epoch": 2.57516339869281, "step": 3940}, {"loss": 1.367, "grad_norm": 0.6994825601577759, "learning_rate": 0.0002, "epoch": 2.581699346405229, "step": 3950}, {"loss": 1.3446, "grad_norm": 0.728549599647522, "learning_rate": 0.0002, "epoch": 2.588235294117647, "step": 3960}, {"loss": 1.4039, "grad_norm": 0.775236964225769, "learning_rate": 0.0002, "epoch": 2.5947712418300655, "step": 3970}, {"loss": 1.2742, "grad_norm": 0.5057447552680969, "learning_rate": 0.0002, "epoch": 2.6013071895424837, "step": 3980}, {"loss": 1.2764, "grad_norm": 0.6564450263977051, "learning_rate": 0.0002, "epoch": 2.607843137254902, "step": 3990}, {"loss": 1.3269, "grad_norm": 0.5342249870300293, "learning_rate": 0.0002, "epoch": 2.6143790849673203, "step": 4000}, {"loss": 1.3102, "grad_norm": 0.5508961081504822, "learning_rate": 0.0002, "epoch": 2.6209150326797386, "step": 4010}, {"loss": 1.3636, "grad_norm": 0.5716235637664795, "learning_rate": 0.0002, "epoch": 2.627450980392157, "step": 4020}, {"loss": 1.3465, "grad_norm": 0.8049232363700867, "learning_rate": 0.0002, "epoch": 2.633986928104575, "step": 4030}, {"loss": 1.2342, "grad_norm": 0.5574354529380798, "learning_rate": 0.0002, "epoch": 2.6405228758169934, "step": 4040}, {"loss": 1.2419, "grad_norm": 0.6302093863487244, "learning_rate": 0.0002, "epoch": 2.6470588235294117, "step": 4050}, {"loss": 1.2565, "grad_norm": 1.1868736743927002, "learning_rate": 0.0002, "epoch": 2.65359477124183, "step": 4060}, {"loss": 1.1382, "grad_norm": 0.6738120317459106, "learning_rate": 0.0002, "epoch": 2.6601307189542482, "step": 4070}, {"loss": 1.2456, "grad_norm": 0.6614423990249634, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 4080}, {"loss": 1.2958, "grad_norm": 0.7297604084014893, "learning_rate": 0.0002, "epoch": 2.6732026143790852, "step": 4090}, {"loss": 1.1596, "grad_norm": 0.9421682357788086, "learning_rate": 0.0002, "epoch": 2.6797385620915035, "step": 4100}, {"loss": 1.3002, "grad_norm": 0.5286222696304321, "learning_rate": 0.0002, "epoch": 2.686274509803922, "step": 4110}, {"loss": 1.3936, "grad_norm": 0.6849271655082703, "learning_rate": 0.0002, "epoch": 2.69281045751634, "step": 4120}, {"loss": 1.2721, "grad_norm": 0.6811320185661316, "learning_rate": 0.0002, "epoch": 2.6993464052287583, "step": 4130}, {"loss": 1.2897, "grad_norm": 0.4968419373035431, "learning_rate": 0.0002, "epoch": 2.7058823529411766, "step": 4140}, {"loss": 1.3322, "grad_norm": 0.8074267506599426, "learning_rate": 0.0002, "epoch": 2.712418300653595, "step": 4150}, {"loss": 1.1759, "grad_norm": 0.6756376028060913, "learning_rate": 0.0002, "epoch": 2.718954248366013, "step": 4160}, {"loss": 1.2444, "grad_norm": 0.6921583414077759, "learning_rate": 0.0002, "epoch": 2.7254901960784315, "step": 4170}, {"loss": 1.3413, "grad_norm": 0.7049834132194519, "learning_rate": 0.0002, "epoch": 2.7320261437908497, "step": 4180}, {"loss": 1.1965, "grad_norm": 0.7011390328407288, "learning_rate": 0.0002, "epoch": 2.738562091503268, "step": 4190}, {"loss": 1.2364, "grad_norm": 0.6977843642234802, "learning_rate": 0.0002, "epoch": 2.7450980392156863, "step": 4200}, {"loss": 1.2533, "grad_norm": 0.6717000603675842, "learning_rate": 0.0002, "epoch": 2.7516339869281046, "step": 4210}, {"loss": 1.392, "grad_norm": 1.0223724842071533, "learning_rate": 0.0002, "epoch": 2.758169934640523, "step": 4220}, {"loss": 1.2451, "grad_norm": 0.6573330760002136, "learning_rate": 0.0002, "epoch": 2.764705882352941, "step": 4230}, {"loss": 1.4219, "grad_norm": 0.6684938073158264, "learning_rate": 0.0002, "epoch": 2.7712418300653594, "step": 4240}, {"loss": 1.2505, "grad_norm": 0.7426793575286865, "learning_rate": 0.0002, "epoch": 2.7777777777777777, "step": 4250}, {"loss": 1.2904, "grad_norm": 0.557826578617096, "learning_rate": 0.0002, "epoch": 2.784313725490196, "step": 4260}, {"loss": 1.3262, "grad_norm": 0.6669870018959045, "learning_rate": 0.0002, "epoch": 2.7908496732026142, "step": 4270}, {"loss": 1.2369, "grad_norm": 0.5349969267845154, "learning_rate": 0.0002, "epoch": 2.7973856209150325, "step": 4280}, {"loss": 1.3769, "grad_norm": 0.7262802124023438, "learning_rate": 0.0002, "epoch": 2.803921568627451, "step": 4290}, {"loss": 1.3373, "grad_norm": 0.768211841583252, "learning_rate": 0.0002, "epoch": 2.810457516339869, "step": 4300}, {"loss": 1.2444, "grad_norm": 0.5958252549171448, "learning_rate": 0.0002, "epoch": 2.8169934640522873, "step": 4310}, {"loss": 1.4113, "grad_norm": 0.8451310396194458, "learning_rate": 0.0002, "epoch": 2.8235294117647056, "step": 4320}, {"loss": 1.2454, "grad_norm": 0.6544435024261475, "learning_rate": 0.0002, "epoch": 2.8300653594771243, "step": 4330}, {"loss": 1.2777, "grad_norm": 0.6177433133125305, "learning_rate": 0.0002, "epoch": 2.8366013071895426, "step": 4340}, {"loss": 1.2562, "grad_norm": 0.6324988007545471, "learning_rate": 0.0002, "epoch": 2.843137254901961, "step": 4350}, {"loss": 1.4117, "grad_norm": 0.6884300708770752, "learning_rate": 0.0002, "epoch": 2.849673202614379, "step": 4360}, {"loss": 1.2391, "grad_norm": 0.8952897191047668, "learning_rate": 0.0002, "epoch": 2.8562091503267975, "step": 4370}, {"loss": 1.2814, "grad_norm": 1.0260103940963745, "learning_rate": 0.0002, "epoch": 2.8627450980392157, "step": 4380}, {"loss": 1.2893, "grad_norm": 0.9134647250175476, "learning_rate": 0.0002, "epoch": 2.869281045751634, "step": 4390}, {"loss": 1.171, "grad_norm": 0.5637717843055725, "learning_rate": 0.0002, "epoch": 2.8758169934640523, "step": 4400}, {"loss": 1.3422, "grad_norm": 0.7530393004417419, "learning_rate": 0.0002, "epoch": 2.8823529411764706, "step": 4410}, {"loss": 1.29, "grad_norm": 0.7202680706977844, "learning_rate": 0.0002, "epoch": 2.888888888888889, "step": 4420}, {"loss": 1.2913, "grad_norm": 0.7177144885063171, "learning_rate": 0.0002, "epoch": 2.895424836601307, "step": 4430}, {"loss": 1.1922, "grad_norm": 0.5996816754341125, "learning_rate": 0.0002, "epoch": 2.9019607843137254, "step": 4440}, {"loss": 1.4816, "grad_norm": 0.6542447209358215, "learning_rate": 0.0002, "epoch": 2.9084967320261437, "step": 4450}, {"loss": 1.503, "grad_norm": 1.0753740072250366, "learning_rate": 0.0002, "epoch": 2.915032679738562, "step": 4460}, {"loss": 1.3193, "grad_norm": 0.6956136226654053, "learning_rate": 0.0002, "epoch": 2.9215686274509802, "step": 4470}, {"loss": 1.2486, "grad_norm": 0.7702530026435852, "learning_rate": 0.0002, "epoch": 2.928104575163399, "step": 4480}, {"loss": 1.3371, "grad_norm": 0.7763232588768005, "learning_rate": 0.0002, "epoch": 2.9346405228758172, "step": 4490}, {"loss": 1.1647, "grad_norm": 0.6393085718154907, "learning_rate": 0.0002, "epoch": 2.9411764705882355, "step": 4500}, {"loss": 1.211, "grad_norm": 0.987770676612854, "learning_rate": 0.0002, "epoch": 2.947712418300654, "step": 4510}, {"loss": 1.1529, "grad_norm": 0.5995016098022461, "learning_rate": 0.0002, "epoch": 2.954248366013072, "step": 4520}, {"loss": 1.2358, "grad_norm": 0.745650053024292, "learning_rate": 0.0002, "epoch": 2.9607843137254903, "step": 4530}, {"loss": 1.2115, "grad_norm": 0.7429282069206238, "learning_rate": 0.0002, "epoch": 2.9673202614379086, "step": 4540}, {"loss": 1.2262, "grad_norm": 0.5927486419677734, "learning_rate": 0.0002, "epoch": 2.973856209150327, "step": 4550}, {"loss": 1.3173, "grad_norm": 0.6775153875350952, "learning_rate": 0.0002, "epoch": 2.980392156862745, "step": 4560}, {"loss": 1.279, "grad_norm": 0.7128435373306274, "learning_rate": 0.0002, "epoch": 2.9869281045751634, "step": 4570}, {"loss": 1.2451, "grad_norm": 0.7470937967300415, "learning_rate": 0.0002, "epoch": 2.9934640522875817, "step": 4580}, {"loss": 1.2701, "grad_norm": 0.9295375943183899, "learning_rate": 0.0002, "epoch": 3.0, "step": 4590}, {"eval_loss": 1.4131312370300293, "eval_runtime": 31.8967, "eval_samples_per_second": 13.669, "eval_steps_per_second": 1.724, "epoch": 3.0, "step": 4590}, {"loss": 1.1283, "grad_norm": 0.6926420331001282, "learning_rate": 0.0002, "epoch": 3.0065359477124183, "step": 4600}, {"loss": 1.1537, "grad_norm": 0.6656355857849121, "learning_rate": 0.0002, "epoch": 3.0130718954248366, "step": 4610}, {"loss": 1.308, "grad_norm": 0.9901936650276184, "learning_rate": 0.0002, "epoch": 3.019607843137255, "step": 4620}, {"loss": 1.22, "grad_norm": 0.6713474988937378, "learning_rate": 0.0002, "epoch": 3.026143790849673, "step": 4630}, {"loss": 1.2249, "grad_norm": 0.6199324131011963, "learning_rate": 0.0002, "epoch": 3.0326797385620914, "step": 4640}, {"loss": 1.242, "grad_norm": 0.7180785536766052, "learning_rate": 0.0002, "epoch": 3.0392156862745097, "step": 4650}, {"loss": 1.1349, "grad_norm": 0.8256588578224182, "learning_rate": 0.0002, "epoch": 3.045751633986928, "step": 4660}, {"loss": 1.1431, "grad_norm": 0.6637389063835144, "learning_rate": 0.0002, "epoch": 3.052287581699346, "step": 4670}, {"loss": 1.1096, "grad_norm": 0.6980698108673096, "learning_rate": 0.0002, "epoch": 3.0588235294117645, "step": 4680}, {"loss": 1.196, "grad_norm": 0.8091534972190857, "learning_rate": 0.0002, "epoch": 3.065359477124183, "step": 4690}, {"loss": 1.1652, "grad_norm": 0.5715174078941345, "learning_rate": 0.0002, "epoch": 3.0718954248366015, "step": 4700}, {"loss": 1.1427, "grad_norm": 0.735639750957489, "learning_rate": 0.0002, "epoch": 3.0784313725490198, "step": 4710}, {"loss": 1.1522, "grad_norm": 0.7619708180427551, "learning_rate": 0.0002, "epoch": 3.084967320261438, "step": 4720}, {"loss": 1.0853, "grad_norm": 1.263566017150879, "learning_rate": 0.0002, "epoch": 3.0915032679738563, "step": 4730}, {"loss": 1.1348, "grad_norm": 0.6600871682167053, "learning_rate": 0.0002, "epoch": 3.0980392156862746, "step": 4740}, {"loss": 1.1766, "grad_norm": 0.717792809009552, "learning_rate": 0.0002, "epoch": 3.104575163398693, "step": 4750}, {"loss": 1.088, "grad_norm": 0.853714644908905, "learning_rate": 0.0002, "epoch": 3.111111111111111, "step": 4760}, {"loss": 1.2031, "grad_norm": 1.1004153490066528, "learning_rate": 0.0002, "epoch": 3.1176470588235294, "step": 4770}, {"loss": 1.3295, "grad_norm": 0.8566235899925232, "learning_rate": 0.0002, "epoch": 3.1241830065359477, "step": 4780}, {"loss": 1.2436, "grad_norm": 0.8315296173095703, "learning_rate": 0.0002, "epoch": 3.130718954248366, "step": 4790}, {"loss": 1.32, "grad_norm": 0.8020524978637695, "learning_rate": 0.0002, "epoch": 3.1372549019607843, "step": 4800}, {"loss": 1.1238, "grad_norm": 0.7564275860786438, "learning_rate": 0.0002, "epoch": 3.1437908496732025, "step": 4810}, {"loss": 1.1244, "grad_norm": 0.9077776670455933, "learning_rate": 0.0002, "epoch": 3.150326797385621, "step": 4820}, {"loss": 1.1399, "grad_norm": 0.6323099732398987, "learning_rate": 0.0002, "epoch": 3.156862745098039, "step": 4830}, {"loss": 1.1983, "grad_norm": 0.6625368595123291, "learning_rate": 0.0002, "epoch": 3.1633986928104574, "step": 4840}, {"loss": 1.066, "grad_norm": 0.8119261860847473, "learning_rate": 0.0002, "epoch": 3.1699346405228757, "step": 4850}, {"loss": 1.0224, "grad_norm": 0.6399450898170471, "learning_rate": 0.0002, "epoch": 3.176470588235294, "step": 4860}, {"loss": 1.2181, "grad_norm": 1.0659016370773315, "learning_rate": 0.0002, "epoch": 3.183006535947712, "step": 4870}, {"loss": 1.2914, "grad_norm": 0.8040369749069214, "learning_rate": 0.0002, "epoch": 3.189542483660131, "step": 4880}, {"loss": 1.1996, "grad_norm": 0.7784733176231384, "learning_rate": 0.0002, "epoch": 3.196078431372549, "step": 4890}, {"loss": 1.2051, "grad_norm": 0.9660294651985168, "learning_rate": 0.0002, "epoch": 3.2026143790849675, "step": 4900}, {"loss": 1.0419, "grad_norm": 1.0676977634429932, "learning_rate": 0.0002, "epoch": 3.2091503267973858, "step": 4910}, {"loss": 1.0083, "grad_norm": 0.5877565741539001, "learning_rate": 0.0002, "epoch": 3.215686274509804, "step": 4920}, {"loss": 1.1046, "grad_norm": 0.6164032816886902, "learning_rate": 0.0002, "epoch": 3.2222222222222223, "step": 4930}, {"loss": 1.1079, "grad_norm": 0.7627606987953186, "learning_rate": 0.0002, "epoch": 3.2287581699346406, "step": 4940}, {"loss": 1.2453, "grad_norm": 0.7442803978919983, "learning_rate": 0.0002, "epoch": 3.235294117647059, "step": 4950}, {"loss": 1.1087, "grad_norm": 0.7277812361717224, "learning_rate": 0.0002, "epoch": 3.241830065359477, "step": 4960}, {"loss": 1.2237, "grad_norm": 1.0301902294158936, "learning_rate": 0.0002, "epoch": 3.2483660130718954, "step": 4970}, {"loss": 1.1466, "grad_norm": 0.7798232436180115, "learning_rate": 0.0002, "epoch": 3.2549019607843137, "step": 4980}, {"loss": 1.2142, "grad_norm": 1.210265874862671, "learning_rate": 0.0002, "epoch": 3.261437908496732, "step": 4990}, {"loss": 1.1557, "grad_norm": 0.6677713990211487, "learning_rate": 0.0002, "epoch": 3.2679738562091503, "step": 5000}, {"loss": 1.3294, "grad_norm": 1.0524500608444214, "learning_rate": 0.0002, "epoch": 3.2745098039215685, "step": 5010}, {"loss": 1.1939, "grad_norm": 0.7091745734214783, "learning_rate": 0.0002, "epoch": 3.281045751633987, "step": 5020}, {"loss": 1.1891, "grad_norm": 0.8523224592208862, "learning_rate": 0.0002, "epoch": 3.287581699346405, "step": 5030}, {"loss": 1.1925, "grad_norm": 0.6120608448982239, "learning_rate": 0.0002, "epoch": 3.2941176470588234, "step": 5040}, {"loss": 1.0603, "grad_norm": 0.7437472939491272, "learning_rate": 0.0002, "epoch": 3.3006535947712417, "step": 5050}, {"loss": 1.1295, "grad_norm": 0.7611715197563171, "learning_rate": 0.0002, "epoch": 3.30718954248366, "step": 5060}, {"loss": 1.0531, "grad_norm": 0.7249704003334045, "learning_rate": 0.0002, "epoch": 3.313725490196078, "step": 5070}, {"loss": 1.2292, "grad_norm": 0.7316247820854187, "learning_rate": 0.0002, "epoch": 3.3202614379084965, "step": 5080}, {"loss": 1.1974, "grad_norm": 0.562412440776825, "learning_rate": 0.0002, "epoch": 3.326797385620915, "step": 5090}, {"loss": 1.0736, "grad_norm": 0.7052176594734192, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 5100}, {"loss": 1.122, "grad_norm": 0.7714211344718933, "learning_rate": 0.0002, "epoch": 3.3398692810457518, "step": 5110}, {"loss": 1.1684, "grad_norm": 1.0436055660247803, "learning_rate": 0.0002, "epoch": 3.34640522875817, "step": 5120}, {"loss": 1.0945, "grad_norm": 0.8867271542549133, "learning_rate": 0.0002, "epoch": 3.3529411764705883, "step": 5130}, {"loss": 1.159, "grad_norm": 0.8371267914772034, "learning_rate": 0.0002, "epoch": 3.3594771241830066, "step": 5140}, {"loss": 1.1073, "grad_norm": 0.7257837057113647, "learning_rate": 0.0002, "epoch": 3.366013071895425, "step": 5150}, {"loss": 1.1162, "grad_norm": 0.7102002501487732, "learning_rate": 0.0002, "epoch": 3.372549019607843, "step": 5160}, {"loss": 1.2056, "grad_norm": 0.7636350393295288, "learning_rate": 0.0002, "epoch": 3.3790849673202614, "step": 5170}, {"loss": 1.0708, "grad_norm": 0.6887359619140625, "learning_rate": 0.0002, "epoch": 3.3856209150326797, "step": 5180}, {"loss": 1.3807, "grad_norm": 0.8141424655914307, "learning_rate": 0.0002, "epoch": 3.392156862745098, "step": 5190}, {"loss": 1.1986, "grad_norm": 0.694423496723175, "learning_rate": 0.0002, "epoch": 3.3986928104575163, "step": 5200}, {"loss": 1.2945, "grad_norm": 0.914013683795929, "learning_rate": 0.0002, "epoch": 3.4052287581699345, "step": 5210}, {"loss": 1.1413, "grad_norm": 0.8503239750862122, "learning_rate": 0.0002, "epoch": 3.411764705882353, "step": 5220}, {"loss": 1.2696, "grad_norm": 0.6196836233139038, "learning_rate": 0.0002, "epoch": 3.418300653594771, "step": 5230}, {"loss": 1.2431, "grad_norm": 1.0760811567306519, "learning_rate": 0.0002, "epoch": 3.4248366013071894, "step": 5240}, {"loss": 1.1686, "grad_norm": 0.6524698138237, "learning_rate": 0.0002, "epoch": 3.431372549019608, "step": 5250}, {"loss": 1.2012, "grad_norm": 0.674467921257019, "learning_rate": 0.0002, "epoch": 3.4379084967320264, "step": 5260}, {"loss": 1.1015, "grad_norm": 0.7690372467041016, "learning_rate": 0.0002, "epoch": 3.4444444444444446, "step": 5270}, {"loss": 1.2511, "grad_norm": 0.8751813769340515, "learning_rate": 0.0002, "epoch": 3.450980392156863, "step": 5280}, {"loss": 1.1841, "grad_norm": 0.750407874584198, "learning_rate": 0.0002, "epoch": 3.457516339869281, "step": 5290}, {"loss": 1.0605, "grad_norm": 0.5991823077201843, "learning_rate": 0.0002, "epoch": 3.4640522875816995, "step": 5300}, {"loss": 1.2347, "grad_norm": 1.0164772272109985, "learning_rate": 0.0002, "epoch": 3.4705882352941178, "step": 5310}, {"loss": 1.2354, "grad_norm": 0.8704105019569397, "learning_rate": 0.0002, "epoch": 3.477124183006536, "step": 5320}, {"loss": 1.2169, "grad_norm": 0.709102213382721, "learning_rate": 0.0002, "epoch": 3.4836601307189543, "step": 5330}, {"loss": 1.2425, "grad_norm": 0.6273632049560547, "learning_rate": 0.0002, "epoch": 3.4901960784313726, "step": 5340}, {"loss": 1.1585, "grad_norm": 0.6807359457015991, "learning_rate": 0.0002, "epoch": 3.496732026143791, "step": 5350}, {"loss": 1.131, "grad_norm": 0.7085188627243042, "learning_rate": 0.0002, "epoch": 3.503267973856209, "step": 5360}, {"loss": 1.1159, "grad_norm": 0.6938307881355286, "learning_rate": 0.0002, "epoch": 3.5098039215686274, "step": 5370}, {"loss": 1.1397, "grad_norm": 0.8544146418571472, "learning_rate": 0.0002, "epoch": 3.5163398692810457, "step": 5380}, {"loss": 1.2181, "grad_norm": 0.7889642119407654, "learning_rate": 0.0002, "epoch": 3.522875816993464, "step": 5390}, {"loss": 1.1691, "grad_norm": 0.7858421206474304, "learning_rate": 0.0002, "epoch": 3.5294117647058822, "step": 5400}, {"loss": 1.2374, "grad_norm": 0.8547123074531555, "learning_rate": 0.0002, "epoch": 3.5359477124183005, "step": 5410}, {"loss": 1.196, "grad_norm": 0.8218181133270264, "learning_rate": 0.0002, "epoch": 3.542483660130719, "step": 5420}, {"loss": 1.1961, "grad_norm": 1.153623342514038, "learning_rate": 0.0002, "epoch": 3.549019607843137, "step": 5430}, {"loss": 1.156, "grad_norm": 1.1321099996566772, "learning_rate": 0.0002, "epoch": 3.5555555555555554, "step": 5440}, {"loss": 1.2224, "grad_norm": 0.9495334029197693, "learning_rate": 0.0002, "epoch": 3.5620915032679736, "step": 5450}, {"loss": 1.2869, "grad_norm": 0.8743821978569031, "learning_rate": 0.0002, "epoch": 3.568627450980392, "step": 5460}, {"loss": 1.1018, "grad_norm": 0.7513086795806885, "learning_rate": 0.0002, "epoch": 3.57516339869281, "step": 5470}, {"loss": 1.1082, "grad_norm": 1.0139480829238892, "learning_rate": 0.0002, "epoch": 3.581699346405229, "step": 5480}, {"loss": 1.1706, "grad_norm": 0.6615135073661804, "learning_rate": 0.0002, "epoch": 3.588235294117647, "step": 5490}, {"loss": 1.3906, "grad_norm": 1.180798888206482, "learning_rate": 0.0002, "epoch": 3.5947712418300655, "step": 5500}, {"loss": 1.2391, "grad_norm": 0.7085279226303101, "learning_rate": 0.0002, "epoch": 3.6013071895424837, "step": 5510}, {"loss": 1.1623, "grad_norm": 0.540268063545227, "learning_rate": 0.0002, "epoch": 3.607843137254902, "step": 5520}, {"loss": 1.2132, "grad_norm": 0.7905671000480652, "learning_rate": 0.0002, "epoch": 3.6143790849673203, "step": 5530}, {"loss": 1.2731, "grad_norm": 0.8457717299461365, "learning_rate": 0.0002, "epoch": 3.6209150326797386, "step": 5540}, {"loss": 1.1799, "grad_norm": 0.7102677822113037, "learning_rate": 0.0002, "epoch": 3.627450980392157, "step": 5550}, {"loss": 1.2394, "grad_norm": 0.7179514765739441, "learning_rate": 0.0002, "epoch": 3.633986928104575, "step": 5560}, {"loss": 1.2019, "grad_norm": 1.0854148864746094, "learning_rate": 0.0002, "epoch": 3.6405228758169934, "step": 5570}, {"loss": 1.1986, "grad_norm": 0.8209951519966125, "learning_rate": 0.0002, "epoch": 3.6470588235294117, "step": 5580}, {"loss": 1.2289, "grad_norm": 0.6944138407707214, "learning_rate": 0.0002, "epoch": 3.65359477124183, "step": 5590}, {"loss": 1.3226, "grad_norm": 0.7675473093986511, "learning_rate": 0.0002, "epoch": 3.6601307189542482, "step": 5600}, {"loss": 1.2866, "grad_norm": 0.6683364510536194, "learning_rate": 0.0002, "epoch": 3.6666666666666665, "step": 5610}, {"loss": 1.1099, "grad_norm": 0.7920727133750916, "learning_rate": 0.0002, "epoch": 3.6732026143790852, "step": 5620}, {"loss": 1.2287, "grad_norm": 0.9440218806266785, "learning_rate": 0.0002, "epoch": 3.6797385620915035, "step": 5630}, {"loss": 1.2444, "grad_norm": 0.6600824594497681, "learning_rate": 0.0002, "epoch": 3.686274509803922, "step": 5640}, {"loss": 1.191, "grad_norm": 0.6860619187355042, "learning_rate": 0.0002, "epoch": 3.69281045751634, "step": 5650}, {"loss": 1.1914, "grad_norm": 0.6579713225364685, "learning_rate": 0.0002, "epoch": 3.6993464052287583, "step": 5660}, {"loss": 1.1464, "grad_norm": 0.661081075668335, "learning_rate": 0.0002, "epoch": 3.7058823529411766, "step": 5670}, {"loss": 1.289, "grad_norm": 1.0968825817108154, "learning_rate": 0.0002, "epoch": 3.712418300653595, "step": 5680}, {"loss": 1.192, "grad_norm": 0.8066844940185547, "learning_rate": 0.0002, "epoch": 3.718954248366013, "step": 5690}, {"loss": 1.2322, "grad_norm": 0.8341682553291321, "learning_rate": 0.0002, "epoch": 3.7254901960784315, "step": 5700}, {"loss": 1.1473, "grad_norm": 0.6682852506637573, "learning_rate": 0.0002, "epoch": 3.7320261437908497, "step": 5710}, {"loss": 1.1566, "grad_norm": 0.898595929145813, "learning_rate": 0.0002, "epoch": 3.738562091503268, "step": 5720}, {"loss": 1.0919, "grad_norm": 0.6876054406166077, "learning_rate": 0.0002, "epoch": 3.7450980392156863, "step": 5730}, {"loss": 1.2302, "grad_norm": 0.7817103266716003, "learning_rate": 0.0002, "epoch": 3.7516339869281046, "step": 5740}, {"loss": 1.2439, "grad_norm": 0.5840168595314026, "learning_rate": 0.0002, "epoch": 3.758169934640523, "step": 5750}, {"loss": 1.1279, "grad_norm": 0.6263918876647949, "learning_rate": 0.0002, "epoch": 3.764705882352941, "step": 5760}, {"loss": 1.2023, "grad_norm": 0.7948952317237854, "learning_rate": 0.0002, "epoch": 3.7712418300653594, "step": 5770}, {"loss": 1.149, "grad_norm": 0.6700998544692993, "learning_rate": 0.0002, "epoch": 3.7777777777777777, "step": 5780}, {"loss": 1.3207, "grad_norm": 1.1169519424438477, "learning_rate": 0.0002, "epoch": 3.784313725490196, "step": 5790}, {"loss": 1.064, "grad_norm": 0.8354471325874329, "learning_rate": 0.0002, "epoch": 3.7908496732026142, "step": 5800}, {"loss": 1.2104, "grad_norm": 0.6304181814193726, "learning_rate": 0.0002, "epoch": 3.7973856209150325, "step": 5810}, {"loss": 1.2059, "grad_norm": 0.6919655799865723, "learning_rate": 0.0002, "epoch": 3.803921568627451, "step": 5820}, {"loss": 1.217, "grad_norm": 0.600385844707489, "learning_rate": 0.0002, "epoch": 3.810457516339869, "step": 5830}, {"loss": 1.2324, "grad_norm": 0.8406319618225098, "learning_rate": 0.0002, "epoch": 3.8169934640522873, "step": 5840}, {"loss": 1.2418, "grad_norm": 0.7594282031059265, "learning_rate": 0.0002, "epoch": 3.8235294117647056, "step": 5850}, {"loss": 1.1903, "grad_norm": 0.8179879784584045, "learning_rate": 0.0002, "epoch": 3.8300653594771243, "step": 5860}, {"loss": 1.255, "grad_norm": 1.141430377960205, "learning_rate": 0.0002, "epoch": 3.8366013071895426, "step": 5870}, {"loss": 1.1467, "grad_norm": 0.6595550775527954, "learning_rate": 0.0002, "epoch": 3.843137254901961, "step": 5880}, {"loss": 1.2378, "grad_norm": 0.7499435544013977, "learning_rate": 0.0002, "epoch": 3.849673202614379, "step": 5890}, {"loss": 1.217, "grad_norm": 0.7851517200469971, "learning_rate": 0.0002, "epoch": 3.8562091503267975, "step": 5900}, {"loss": 1.162, "grad_norm": 1.0533545017242432, "learning_rate": 0.0002, "epoch": 3.8627450980392157, "step": 5910}, {"loss": 1.3576, "grad_norm": 0.960086464881897, "learning_rate": 0.0002, "epoch": 3.869281045751634, "step": 5920}, {"loss": 1.151, "grad_norm": 0.9952049851417542, "learning_rate": 0.0002, "epoch": 3.8758169934640523, "step": 5930}, {"loss": 1.2027, "grad_norm": 0.7884191274642944, "learning_rate": 0.0002, "epoch": 3.8823529411764706, "step": 5940}, {"loss": 1.1796, "grad_norm": 0.7461766600608826, "learning_rate": 0.0002, "epoch": 3.888888888888889, "step": 5950}, {"loss": 1.2251, "grad_norm": 0.9594355821609497, "learning_rate": 0.0002, "epoch": 3.895424836601307, "step": 5960}, {"loss": 1.1164, "grad_norm": 0.8179471492767334, "learning_rate": 0.0002, "epoch": 3.9019607843137254, "step": 5970}, {"loss": 1.2421, "grad_norm": 0.8240267634391785, "learning_rate": 0.0002, "epoch": 3.9084967320261437, "step": 5980}, {"loss": 1.3076, "grad_norm": 0.7462618350982666, "learning_rate": 0.0002, "epoch": 3.915032679738562, "step": 5990}, {"loss": 1.2124, "grad_norm": 0.711207389831543, "learning_rate": 0.0002, "epoch": 3.9215686274509802, "step": 6000}, {"loss": 1.2119, "grad_norm": 0.6910956501960754, "learning_rate": 0.0002, "epoch": 3.928104575163399, "step": 6010}, {"loss": 1.2127, "grad_norm": 0.749093770980835, "learning_rate": 0.0002, "epoch": 3.9346405228758172, "step": 6020}, {"loss": 1.1542, "grad_norm": 1.3332762718200684, "learning_rate": 0.0002, "epoch": 3.9411764705882355, "step": 6030}, {"loss": 1.1442, "grad_norm": 0.71457439661026, "learning_rate": 0.0002, "epoch": 3.947712418300654, "step": 6040}, {"loss": 1.339, "grad_norm": 1.1205238103866577, "learning_rate": 0.0002, "epoch": 3.954248366013072, "step": 6050}, {"loss": 1.2962, "grad_norm": 0.6958928108215332, "learning_rate": 0.0002, "epoch": 3.9607843137254903, "step": 6060}, {"loss": 1.1802, "grad_norm": 0.7518056035041809, "learning_rate": 0.0002, "epoch": 3.9673202614379086, "step": 6070}, {"loss": 1.1179, "grad_norm": 0.8010755777359009, "learning_rate": 0.0002, "epoch": 3.973856209150327, "step": 6080}, {"loss": 1.2867, "grad_norm": 0.7492658495903015, "learning_rate": 0.0002, "epoch": 3.980392156862745, "step": 6090}, {"loss": 1.2113, "grad_norm": 0.900704562664032, "learning_rate": 0.0002, "epoch": 3.9869281045751634, "step": 6100}, {"loss": 1.1106, "grad_norm": 0.7997331619262695, "learning_rate": 0.0002, "epoch": 3.9934640522875817, "step": 6110}, {"loss": 1.1244, "grad_norm": 0.7163209319114685, "learning_rate": 0.0002, "epoch": 4.0, "step": 6120}, {"eval_loss": 1.4113320112228394, "eval_runtime": 33.7199, "eval_samples_per_second": 12.93, "eval_steps_per_second": 1.631, "epoch": 4.0, "step": 6120}, {"loss": 1.0423, "grad_norm": 0.9527022838592529, "learning_rate": 0.0002, "epoch": 4.006535947712418, "step": 6130}, {"loss": 1.101, "grad_norm": 0.7603210210800171, "learning_rate": 0.0002, "epoch": 4.0130718954248366, "step": 6140}, {"loss": 1.1834, "grad_norm": 1.127387523651123, "learning_rate": 0.0002, "epoch": 4.019607843137255, "step": 6150}, {"loss": 1.0734, "grad_norm": 0.8290133476257324, "learning_rate": 0.0002, "epoch": 4.026143790849673, "step": 6160}, {"loss": 1.0785, "grad_norm": 0.9912241101264954, "learning_rate": 0.0002, "epoch": 4.032679738562091, "step": 6170}, {"loss": 1.0719, "grad_norm": 0.947005033493042, "learning_rate": 0.0002, "epoch": 4.03921568627451, "step": 6180}, {"loss": 1.0835, "grad_norm": 0.707466185092926, "learning_rate": 0.0002, "epoch": 4.045751633986928, "step": 6190}, {"loss": 1.1079, "grad_norm": 1.0604327917099, "learning_rate": 0.0002, "epoch": 4.052287581699346, "step": 6200}, {"loss": 1.0375, "grad_norm": 0.7848685383796692, "learning_rate": 0.0002, "epoch": 4.0588235294117645, "step": 6210}, {"loss": 1.1167, "grad_norm": 0.8475256562232971, "learning_rate": 0.0002, "epoch": 4.065359477124183, "step": 6220}, {"loss": 1.1104, "grad_norm": 0.9759448766708374, "learning_rate": 0.0002, "epoch": 4.071895424836601, "step": 6230}, {"loss": 1.1538, "grad_norm": 0.9324519038200378, "learning_rate": 0.0002, "epoch": 4.078431372549019, "step": 6240}, {"loss": 1.0817, "grad_norm": 0.8723901510238647, "learning_rate": 0.0002, "epoch": 4.084967320261438, "step": 6250}, {"loss": 1.0977, "grad_norm": 0.8343415856361389, "learning_rate": 0.0002, "epoch": 4.091503267973856, "step": 6260}, {"loss": 0.9887, "grad_norm": 0.7490310072898865, "learning_rate": 0.0002, "epoch": 4.098039215686274, "step": 6270}, {"loss": 1.2084, "grad_norm": 0.8961182832717896, "learning_rate": 0.0002, "epoch": 4.104575163398692, "step": 6280}, {"loss": 1.1349, "grad_norm": 0.7124854922294617, "learning_rate": 0.0002, "epoch": 4.111111111111111, "step": 6290}, {"loss": 1.0081, "grad_norm": 0.8338138461112976, "learning_rate": 0.0002, "epoch": 4.117647058823529, "step": 6300}, {"loss": 1.1091, "grad_norm": 0.8075833320617676, "learning_rate": 0.0002, "epoch": 4.124183006535947, "step": 6310}, {"loss": 1.0193, "grad_norm": 0.8069391846656799, "learning_rate": 0.0002, "epoch": 4.130718954248366, "step": 6320}, {"loss": 0.948, "grad_norm": 0.9567893147468567, "learning_rate": 0.0002, "epoch": 4.137254901960785, "step": 6330}, {"loss": 1.0241, "grad_norm": 1.2184662818908691, "learning_rate": 0.0002, "epoch": 4.143790849673203, "step": 6340}, {"loss": 1.0756, "grad_norm": 1.030976414680481, "learning_rate": 0.0002, "epoch": 4.150326797385621, "step": 6350}, {"loss": 1.1124, "grad_norm": 0.9749957323074341, "learning_rate": 0.0002, "epoch": 4.1568627450980395, "step": 6360}, {"loss": 1.1038, "grad_norm": 0.7089483141899109, "learning_rate": 0.0002, "epoch": 4.163398692810458, "step": 6370}, {"loss": 1.2175, "grad_norm": 1.1084946393966675, "learning_rate": 0.0002, "epoch": 4.169934640522876, "step": 6380}, {"loss": 1.0274, "grad_norm": 0.7998497486114502, "learning_rate": 0.0002, "epoch": 4.176470588235294, "step": 6390}, {"loss": 1.005, "grad_norm": 0.8997811675071716, "learning_rate": 0.0002, "epoch": 4.183006535947713, "step": 6400}, {"loss": 1.0704, "grad_norm": 0.8359479904174805, "learning_rate": 0.0002, "epoch": 4.189542483660131, "step": 6410}, {"loss": 1.1056, "grad_norm": 0.9087472558021545, "learning_rate": 0.0002, "epoch": 4.196078431372549, "step": 6420}, {"loss": 1.0657, "grad_norm": 1.1100451946258545, "learning_rate": 0.0002, "epoch": 4.2026143790849675, "step": 6430}, {"loss": 1.1443, "grad_norm": 0.9376999735832214, "learning_rate": 0.0002, "epoch": 4.209150326797386, "step": 6440}, {"loss": 1.0862, "grad_norm": 0.8179266452789307, "learning_rate": 0.0002, "epoch": 4.215686274509804, "step": 6450}, {"loss": 1.0679, "grad_norm": 0.9953271746635437, "learning_rate": 0.0002, "epoch": 4.222222222222222, "step": 6460}, {"loss": 1.1034, "grad_norm": 0.8476650714874268, "learning_rate": 0.0002, "epoch": 4.228758169934641, "step": 6470}, {"loss": 1.2512, "grad_norm": 0.8406323194503784, "learning_rate": 0.0002, "epoch": 4.235294117647059, "step": 6480}, {"loss": 1.057, "grad_norm": 0.819134533405304, "learning_rate": 0.0002, "epoch": 4.241830065359477, "step": 6490}, {"loss": 1.1082, "grad_norm": 0.7764983773231506, "learning_rate": 0.0002, "epoch": 4.248366013071895, "step": 6500}, {"loss": 1.1593, "grad_norm": 0.8252112865447998, "learning_rate": 0.0002, "epoch": 4.254901960784314, "step": 6510}, {"loss": 1.1369, "grad_norm": 0.7941019535064697, "learning_rate": 0.0002, "epoch": 4.261437908496732, "step": 6520}, {"loss": 1.0296, "grad_norm": 0.7673905491828918, "learning_rate": 0.0002, "epoch": 4.26797385620915, "step": 6530}, {"loss": 1.1387, "grad_norm": 0.8749890327453613, "learning_rate": 0.0002, "epoch": 4.2745098039215685, "step": 6540}, {"loss": 1.0595, "grad_norm": 0.7343207597732544, "learning_rate": 0.0002, "epoch": 4.281045751633987, "step": 6550}, {"loss": 1.1715, "grad_norm": 1.2786651849746704, "learning_rate": 0.0002, "epoch": 4.287581699346405, "step": 6560}, {"loss": 1.0514, "grad_norm": 1.316875696182251, "learning_rate": 0.0002, "epoch": 4.294117647058823, "step": 6570}, {"loss": 1.1125, "grad_norm": 0.8349189162254333, "learning_rate": 0.0002, "epoch": 4.300653594771242, "step": 6580}, {"loss": 1.0732, "grad_norm": 0.7510647177696228, "learning_rate": 0.0002, "epoch": 4.30718954248366, "step": 6590}, {"loss": 1.1387, "grad_norm": 0.932420551776886, "learning_rate": 0.0002, "epoch": 4.313725490196078, "step": 6600}, {"loss": 1.1115, "grad_norm": 0.8510616421699524, "learning_rate": 0.0002, "epoch": 4.3202614379084965, "step": 6610}, {"loss": 1.0957, "grad_norm": 0.7661547064781189, "learning_rate": 0.0002, "epoch": 4.326797385620915, "step": 6620}, {"loss": 1.2064, "grad_norm": 1.0370930433273315, "learning_rate": 0.0002, "epoch": 4.333333333333333, "step": 6630}, {"loss": 1.1064, "grad_norm": 0.9302158951759338, "learning_rate": 0.0002, "epoch": 4.339869281045751, "step": 6640}, {"loss": 0.968, "grad_norm": 0.9203811883926392, "learning_rate": 0.0002, "epoch": 4.34640522875817, "step": 6650}, {"loss": 1.0123, "grad_norm": 0.9986332654953003, "learning_rate": 0.0002, "epoch": 4.352941176470588, "step": 6660}, {"loss": 1.1079, "grad_norm": 0.8001713156700134, "learning_rate": 0.0002, "epoch": 4.359477124183006, "step": 6670}, {"loss": 1.0248, "grad_norm": 0.829714298248291, "learning_rate": 0.0002, "epoch": 4.366013071895424, "step": 6680}, {"loss": 1.0389, "grad_norm": 0.8253079056739807, "learning_rate": 0.0002, "epoch": 4.372549019607844, "step": 6690}, {"loss": 1.1087, "grad_norm": 0.824666440486908, "learning_rate": 0.0002, "epoch": 4.379084967320262, "step": 6700}, {"loss": 1.1968, "grad_norm": 0.8872972130775452, "learning_rate": 0.0002, "epoch": 4.38562091503268, "step": 6710}, {"loss": 1.0474, "grad_norm": 0.8729761838912964, "learning_rate": 0.0002, "epoch": 4.392156862745098, "step": 6720}, {"loss": 1.0961, "grad_norm": 1.1367264986038208, "learning_rate": 0.0002, "epoch": 4.398692810457517, "step": 6730}, {"loss": 1.0184, "grad_norm": 0.9699058532714844, "learning_rate": 0.0002, "epoch": 4.405228758169935, "step": 6740}, {"loss": 1.006, "grad_norm": 0.8266763687133789, "learning_rate": 0.0002, "epoch": 4.411764705882353, "step": 6750}, {"loss": 1.0735, "grad_norm": 1.0249767303466797, "learning_rate": 0.0002, "epoch": 4.4183006535947715, "step": 6760}, {"loss": 1.1726, "grad_norm": 0.73606938123703, "learning_rate": 0.0002, "epoch": 4.42483660130719, "step": 6770}, {"loss": 1.1037, "grad_norm": 1.4050679206848145, "learning_rate": 0.0002, "epoch": 4.431372549019608, "step": 6780}, {"loss": 1.1418, "grad_norm": 1.1114081144332886, "learning_rate": 0.0002, "epoch": 4.437908496732026, "step": 6790}, {"loss": 0.9682, "grad_norm": 0.8031067848205566, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 6800}, {"loss": 1.0753, "grad_norm": 0.8513566851615906, "learning_rate": 0.0002, "epoch": 4.450980392156863, "step": 6810}, {"loss": 1.1852, "grad_norm": 1.332741379737854, "learning_rate": 0.0002, "epoch": 4.457516339869281, "step": 6820}, {"loss": 1.0966, "grad_norm": 1.5032578706741333, "learning_rate": 0.0002, "epoch": 4.4640522875816995, "step": 6830}, {"loss": 1.1124, "grad_norm": 0.7677283883094788, "learning_rate": 0.0002, "epoch": 4.470588235294118, "step": 6840}, {"loss": 1.1501, "grad_norm": 0.989148736000061, "learning_rate": 0.0002, "epoch": 4.477124183006536, "step": 6850}, {"loss": 1.2239, "grad_norm": 1.5316275358200073, "learning_rate": 0.0002, "epoch": 4.483660130718954, "step": 6860}, {"loss": 1.1171, "grad_norm": 0.9427124261856079, "learning_rate": 0.0002, "epoch": 4.490196078431373, "step": 6870}, {"loss": 1.1314, "grad_norm": 1.215287685394287, "learning_rate": 0.0002, "epoch": 4.496732026143791, "step": 6880}, {"loss": 1.0809, "grad_norm": 0.7286760210990906, "learning_rate": 0.0002, "epoch": 4.503267973856209, "step": 6890}, {"loss": 1.0179, "grad_norm": 0.874829888343811, "learning_rate": 0.0002, "epoch": 4.509803921568627, "step": 6900}, {"loss": 1.0233, "grad_norm": 0.8058359622955322, "learning_rate": 0.0002, "epoch": 4.516339869281046, "step": 6910}, {"loss": 1.0463, "grad_norm": 1.248195767402649, "learning_rate": 0.0002, "epoch": 4.522875816993464, "step": 6920}, {"loss": 1.0347, "grad_norm": 0.8033645749092102, "learning_rate": 0.0002, "epoch": 4.529411764705882, "step": 6930}, {"loss": 1.1068, "grad_norm": 1.7361950874328613, "learning_rate": 0.0002, "epoch": 4.5359477124183005, "step": 6940}, {"loss": 0.9856, "grad_norm": 0.8058095574378967, "learning_rate": 0.0002, "epoch": 4.542483660130719, "step": 6950}, {"loss": 1.0057, "grad_norm": 1.254089593887329, "learning_rate": 0.0002, "epoch": 4.549019607843137, "step": 6960}, {"loss": 1.1723, "grad_norm": 0.9180455803871155, "learning_rate": 0.0002, "epoch": 4.555555555555555, "step": 6970}, {"loss": 1.0559, "grad_norm": 0.6677682399749756, "learning_rate": 0.0002, "epoch": 4.562091503267974, "step": 6980}, {"loss": 1.0453, "grad_norm": 0.8127354383468628, "learning_rate": 0.0002, "epoch": 4.568627450980392, "step": 6990}, {"loss": 1.0828, "grad_norm": 1.0263001918792725, "learning_rate": 0.0002, "epoch": 4.57516339869281, "step": 7000}, {"loss": 1.0703, "grad_norm": 0.9641909003257751, "learning_rate": 0.0002, "epoch": 4.5816993464052285, "step": 7010}, {"loss": 1.179, "grad_norm": 0.9440861344337463, "learning_rate": 0.0002, "epoch": 4.588235294117647, "step": 7020}, {"loss": 1.0931, "grad_norm": 0.9539011716842651, "learning_rate": 0.0002, "epoch": 4.594771241830065, "step": 7030}, {"loss": 1.0963, "grad_norm": 1.0449910163879395, "learning_rate": 0.0002, "epoch": 4.601307189542483, "step": 7040}, {"loss": 0.9944, "grad_norm": 0.8766893744468689, "learning_rate": 0.0002, "epoch": 4.607843137254902, "step": 7050}, {"loss": 1.0169, "grad_norm": 0.6983462572097778, "learning_rate": 0.0002, "epoch": 4.61437908496732, "step": 7060}, {"loss": 1.1778, "grad_norm": 0.9505505561828613, "learning_rate": 0.0002, "epoch": 4.620915032679738, "step": 7070}, {"loss": 1.121, "grad_norm": 1.2506657838821411, "learning_rate": 0.0002, "epoch": 4.627450980392156, "step": 7080}, {"loss": 1.1329, "grad_norm": 0.9602801203727722, "learning_rate": 0.0002, "epoch": 4.633986928104575, "step": 7090}, {"loss": 1.1499, "grad_norm": 0.7398977875709534, "learning_rate": 0.0002, "epoch": 4.640522875816993, "step": 7100}, {"loss": 1.0769, "grad_norm": 1.3862425088882446, "learning_rate": 0.0002, "epoch": 4.647058823529412, "step": 7110}, {"loss": 1.0571, "grad_norm": 1.1451990604400635, "learning_rate": 0.0002, "epoch": 4.65359477124183, "step": 7120}, {"loss": 1.1271, "grad_norm": 0.9010422229766846, "learning_rate": 0.0002, "epoch": 4.660130718954249, "step": 7130}, {"loss": 1.0165, "grad_norm": 0.7102518081665039, "learning_rate": 0.0002, "epoch": 4.666666666666667, "step": 7140}, {"loss": 1.0819, "grad_norm": 0.7963796257972717, "learning_rate": 0.0002, "epoch": 4.673202614379085, "step": 7150}, {"loss": 1.1114, "grad_norm": 0.7726007699966431, "learning_rate": 0.0002, "epoch": 4.6797385620915035, "step": 7160}, {"loss": 1.2088, "grad_norm": 0.8097564578056335, "learning_rate": 0.0002, "epoch": 4.686274509803922, "step": 7170}, {"loss": 1.1386, "grad_norm": 0.9070925116539001, "learning_rate": 0.0002, "epoch": 4.69281045751634, "step": 7180}, {"loss": 1.0315, "grad_norm": 0.7543528079986572, "learning_rate": 0.0002, "epoch": 4.699346405228758, "step": 7190}, {"loss": 1.0984, "grad_norm": 0.9900904893875122, "learning_rate": 0.0002, "epoch": 4.705882352941177, "step": 7200}, {"loss": 1.1552, "grad_norm": 0.8033412098884583, "learning_rate": 0.0002, "epoch": 4.712418300653595, "step": 7210}, {"loss": 1.1773, "grad_norm": 0.8440839052200317, "learning_rate": 0.0002, "epoch": 4.718954248366013, "step": 7220}, {"loss": 1.1258, "grad_norm": 0.9325555562973022, "learning_rate": 0.0002, "epoch": 4.7254901960784315, "step": 7230}, {"loss": 1.1384, "grad_norm": 0.7881146669387817, "learning_rate": 0.0002, "epoch": 4.73202614379085, "step": 7240}, {"loss": 1.1219, "grad_norm": 0.884453296661377, "learning_rate": 0.0002, "epoch": 4.738562091503268, "step": 7250}, {"loss": 1.1036, "grad_norm": 0.9274539351463318, "learning_rate": 0.0002, "epoch": 4.745098039215686, "step": 7260}, {"loss": 1.0906, "grad_norm": 1.2367479801177979, "learning_rate": 0.0002, "epoch": 4.751633986928105, "step": 7270}, {"loss": 1.0741, "grad_norm": 0.9499821066856384, "learning_rate": 0.0002, "epoch": 4.758169934640523, "step": 7280}, {"loss": 1.1625, "grad_norm": 2.1918580532073975, "learning_rate": 0.0002, "epoch": 4.764705882352941, "step": 7290}, {"loss": 0.954, "grad_norm": 0.8221880793571472, "learning_rate": 0.0002, "epoch": 4.771241830065359, "step": 7300}, {"loss": 1.1358, "grad_norm": 0.871972918510437, "learning_rate": 0.0002, "epoch": 4.777777777777778, "step": 7310}, {"loss": 1.0599, "grad_norm": 0.8034510612487793, "learning_rate": 0.0002, "epoch": 4.784313725490196, "step": 7320}, {"loss": 1.1059, "grad_norm": 0.8959605693817139, "learning_rate": 0.0002, "epoch": 4.790849673202614, "step": 7330}, {"loss": 1.0176, "grad_norm": 1.2326215505599976, "learning_rate": 0.0002, "epoch": 4.7973856209150325, "step": 7340}, {"loss": 1.1095, "grad_norm": 0.9725791811943054, "learning_rate": 0.0002, "epoch": 4.803921568627451, "step": 7350}, {"loss": 1.1229, "grad_norm": 0.7240816354751587, "learning_rate": 0.0002, "epoch": 4.810457516339869, "step": 7360}, {"loss": 1.0669, "grad_norm": 0.8265769481658936, "learning_rate": 0.0002, "epoch": 4.816993464052287, "step": 7370}, {"loss": 1.042, "grad_norm": 0.8888696432113647, "learning_rate": 0.0002, "epoch": 4.823529411764706, "step": 7380}, {"loss": 1.0981, "grad_norm": 0.7776556015014648, "learning_rate": 0.0002, "epoch": 4.830065359477124, "step": 7390}, {"loss": 1.0819, "grad_norm": 0.8772371411323547, "learning_rate": 0.0002, "epoch": 4.836601307189542, "step": 7400}, {"loss": 1.0819, "grad_norm": 0.9786531925201416, "learning_rate": 0.0002, "epoch": 4.8431372549019605, "step": 7410}, {"loss": 1.1358, "grad_norm": 0.9059745073318481, "learning_rate": 0.0002, "epoch": 4.849673202614379, "step": 7420}, {"loss": 1.0324, "grad_norm": 0.7422552108764648, "learning_rate": 0.0002, "epoch": 4.856209150326797, "step": 7430}, {"loss": 1.0423, "grad_norm": 1.3040380477905273, "learning_rate": 0.0002, "epoch": 4.862745098039216, "step": 7440}, {"loss": 1.1161, "grad_norm": 1.3278473615646362, "learning_rate": 0.0002, "epoch": 4.8692810457516345, "step": 7450}, {"loss": 1.0713, "grad_norm": 1.2705849409103394, "learning_rate": 0.0002, "epoch": 4.875816993464053, "step": 7460}, {"loss": 1.0034, "grad_norm": 0.8837892413139343, "learning_rate": 0.0002, "epoch": 4.882352941176471, "step": 7470}, {"loss": 1.1716, "grad_norm": 0.8670691251754761, "learning_rate": 0.0002, "epoch": 4.888888888888889, "step": 7480}, {"loss": 1.1723, "grad_norm": 0.9662758111953735, "learning_rate": 0.0002, "epoch": 4.895424836601308, "step": 7490}, {"loss": 1.1056, "grad_norm": 0.8188302516937256, "learning_rate": 0.0002, "epoch": 4.901960784313726, "step": 7500}, {"loss": 1.0419, "grad_norm": 0.769442617893219, "learning_rate": 0.0002, "epoch": 4.908496732026144, "step": 7510}, {"loss": 1.1671, "grad_norm": 1.1465084552764893, "learning_rate": 0.0002, "epoch": 4.915032679738562, "step": 7520}, {"loss": 1.0768, "grad_norm": 1.253214955329895, "learning_rate": 0.0002, "epoch": 4.921568627450981, "step": 7530}, {"loss": 1.011, "grad_norm": 0.7922375202178955, "learning_rate": 0.0002, "epoch": 4.928104575163399, "step": 7540}, {"loss": 1.1256, "grad_norm": 0.8306851387023926, "learning_rate": 0.0002, "epoch": 4.934640522875817, "step": 7550}, {"loss": 1.206, "grad_norm": 0.8486151099205017, "learning_rate": 0.0002, "epoch": 4.9411764705882355, "step": 7560}, {"loss": 1.0161, "grad_norm": 1.2601467370986938, "learning_rate": 0.0002, "epoch": 4.947712418300654, "step": 7570}, {"loss": 1.1078, "grad_norm": 0.7980747818946838, "learning_rate": 0.0002, "epoch": 4.954248366013072, "step": 7580}, {"loss": 1.0607, "grad_norm": 0.8653254508972168, "learning_rate": 0.0002, "epoch": 4.96078431372549, "step": 7590}, {"loss": 1.0292, "grad_norm": 0.9680571556091309, "learning_rate": 0.0002, "epoch": 4.967320261437909, "step": 7600}, {"loss": 1.1795, "grad_norm": 0.9554466605186462, "learning_rate": 0.0002, "epoch": 4.973856209150327, "step": 7610}, {"loss": 1.0935, "grad_norm": 1.3693897724151611, "learning_rate": 0.0002, "epoch": 4.980392156862745, "step": 7620}, {"loss": 1.0838, "grad_norm": 0.7809282541275024, "learning_rate": 0.0002, "epoch": 4.9869281045751634, "step": 7630}, {"loss": 1.0844, "grad_norm": 0.7528006434440613, "learning_rate": 0.0002, "epoch": 4.993464052287582, "step": 7640}, {"loss": 0.9951, "grad_norm": 1.7491309642791748, "learning_rate": 0.0002, "epoch": 5.0, "step": 7650}, {"eval_loss": 1.4197258949279785, "eval_runtime": 33.6327, "eval_samples_per_second": 12.964, "eval_steps_per_second": 1.635, "epoch": 5.0, "step": 7650}, {"loss": 0.9744, "grad_norm": 0.8840063214302063, "learning_rate": 0.0002, "epoch": 5.006535947712418, "step": 7660}, {"loss": 1.0274, "grad_norm": 1.0118401050567627, "learning_rate": 0.0002, "epoch": 5.0130718954248366, "step": 7670}, {"loss": 1.1667, "grad_norm": 1.0040518045425415, "learning_rate": 0.0002, "epoch": 5.019607843137255, "step": 7680}, {"loss": 0.9426, "grad_norm": 0.7541199922561646, "learning_rate": 0.0002, "epoch": 5.026143790849673, "step": 7690}, {"loss": 1.0797, "grad_norm": 0.9106482863426208, "learning_rate": 0.0002, "epoch": 5.032679738562091, "step": 7700}, {"loss": 1.0096, "grad_norm": 1.3691469430923462, "learning_rate": 0.0002, "epoch": 5.03921568627451, "step": 7710}, {"loss": 0.9889, "grad_norm": 0.9449689388275146, "learning_rate": 0.0002, "epoch": 5.045751633986928, "step": 7720}, {"loss": 0.9087, "grad_norm": 1.1678508520126343, "learning_rate": 0.0002, "epoch": 5.052287581699346, "step": 7730}, {"loss": 1.0556, "grad_norm": 1.1296145915985107, "learning_rate": 0.0002, "epoch": 5.0588235294117645, "step": 7740}, {"loss": 0.9339, "grad_norm": 0.7863904237747192, "learning_rate": 0.0002, "epoch": 5.065359477124183, "step": 7750}, {"loss": 1.0135, "grad_norm": 0.8691433072090149, "learning_rate": 0.0002, "epoch": 5.071895424836601, "step": 7760}, {"loss": 0.9776, "grad_norm": 1.0722088813781738, "learning_rate": 0.0002, "epoch": 5.078431372549019, "step": 7770}, {"loss": 1.0595, "grad_norm": 0.9625038504600525, "learning_rate": 0.0002, "epoch": 5.084967320261438, "step": 7780}, {"loss": 1.0241, "grad_norm": 1.2618783712387085, "learning_rate": 0.0002, "epoch": 5.091503267973856, "step": 7790}, {"loss": 0.9396, "grad_norm": 0.9970650672912598, "learning_rate": 0.0002, "epoch": 5.098039215686274, "step": 7800}, {"loss": 0.9186, "grad_norm": 1.3946677446365356, "learning_rate": 0.0002, "epoch": 5.104575163398692, "step": 7810}, {"loss": 0.9957, "grad_norm": 1.0260052680969238, "learning_rate": 0.0002, "epoch": 5.111111111111111, "step": 7820}, {"loss": 0.9865, "grad_norm": 1.105521559715271, "learning_rate": 0.0002, "epoch": 5.117647058823529, "step": 7830}, {"loss": 0.9788, "grad_norm": 1.003641128540039, "learning_rate": 0.0002, "epoch": 5.124183006535947, "step": 7840}, {"loss": 0.9688, "grad_norm": 1.0315021276474, "learning_rate": 0.0002, "epoch": 5.130718954248366, "step": 7850}, {"loss": 1.0001, "grad_norm": 0.9469530582427979, "learning_rate": 0.0002, "epoch": 5.137254901960785, "step": 7860}, {"loss": 0.9659, "grad_norm": 1.3244667053222656, "learning_rate": 0.0002, "epoch": 5.143790849673203, "step": 7870}, {"loss": 0.9657, "grad_norm": 1.1732033491134644, "learning_rate": 0.0002, "epoch": 5.150326797385621, "step": 7880}, {"loss": 0.9978, "grad_norm": 1.3129149675369263, "learning_rate": 0.0002, "epoch": 5.1568627450980395, "step": 7890}, {"loss": 0.9894, "grad_norm": 0.8589454293251038, "learning_rate": 0.0002, "epoch": 5.163398692810458, "step": 7900}, {"loss": 1.0161, "grad_norm": 0.8954233527183533, "learning_rate": 0.0002, "epoch": 5.169934640522876, "step": 7910}, {"loss": 0.8741, "grad_norm": 0.7426522970199585, "learning_rate": 0.0002, "epoch": 5.176470588235294, "step": 7920}, {"loss": 1.0106, "grad_norm": 1.1990121603012085, "learning_rate": 0.0002, "epoch": 5.183006535947713, "step": 7930}, {"loss": 0.9453, "grad_norm": 0.8867580890655518, "learning_rate": 0.0002, "epoch": 5.189542483660131, "step": 7940}, {"loss": 0.9727, "grad_norm": 1.016276478767395, "learning_rate": 0.0002, "epoch": 5.196078431372549, "step": 7950}, {"loss": 0.9908, "grad_norm": 1.0210685729980469, "learning_rate": 0.0002, "epoch": 5.2026143790849675, "step": 7960}, {"loss": 1.0522, "grad_norm": 1.0093122720718384, "learning_rate": 0.0002, "epoch": 5.209150326797386, "step": 7970}, {"loss": 1.0055, "grad_norm": 0.9746801853179932, "learning_rate": 0.0002, "epoch": 5.215686274509804, "step": 7980}, {"loss": 1.0611, "grad_norm": 0.9113537073135376, "learning_rate": 0.0002, "epoch": 5.222222222222222, "step": 7990}, {"loss": 0.9167, "grad_norm": 1.2782206535339355, "learning_rate": 0.0002, "epoch": 5.228758169934641, "step": 8000}, {"loss": 1.0212, "grad_norm": 1.3223118782043457, "learning_rate": 0.0002, "epoch": 5.235294117647059, "step": 8010}, {"loss": 0.9244, "grad_norm": 0.7898629307746887, "learning_rate": 0.0002, "epoch": 5.241830065359477, "step": 8020}, {"loss": 1.0574, "grad_norm": 0.9822350740432739, "learning_rate": 0.0002, "epoch": 5.248366013071895, "step": 8030}, {"loss": 1.0102, "grad_norm": 1.5114340782165527, "learning_rate": 0.0002, "epoch": 5.254901960784314, "step": 8040}, {"loss": 0.9816, "grad_norm": 0.859006941318512, "learning_rate": 0.0002, "epoch": 5.261437908496732, "step": 8050}, {"loss": 0.9445, "grad_norm": 1.0495043992996216, "learning_rate": 0.0002, "epoch": 5.26797385620915, "step": 8060}, {"loss": 0.9724, "grad_norm": 1.329483151435852, "learning_rate": 0.0002, "epoch": 5.2745098039215685, "step": 8070}, {"loss": 0.9296, "grad_norm": 1.1333061456680298, "learning_rate": 0.0002, "epoch": 5.281045751633987, "step": 8080}, {"loss": 0.9577, "grad_norm": 0.8153108358383179, "learning_rate": 0.0002, "epoch": 5.287581699346405, "step": 8090}, {"loss": 0.9002, "grad_norm": 0.9395004510879517, "learning_rate": 0.0002, "epoch": 5.294117647058823, "step": 8100}, {"loss": 1.0371, "grad_norm": 0.8907593488693237, "learning_rate": 0.0002, "epoch": 5.300653594771242, "step": 8110}, {"loss": 0.9301, "grad_norm": 0.9808667898178101, "learning_rate": 0.0002, "epoch": 5.30718954248366, "step": 8120}, {"loss": 1.0136, "grad_norm": 0.984779417514801, "learning_rate": 0.0002, "epoch": 5.313725490196078, "step": 8130}, {"loss": 0.9621, "grad_norm": 0.9787270426750183, "learning_rate": 0.0002, "epoch": 5.3202614379084965, "step": 8140}, {"loss": 0.9336, "grad_norm": 0.9857710599899292, "learning_rate": 0.0002, "epoch": 5.326797385620915, "step": 8150}, {"loss": 0.9884, "grad_norm": 0.9774303436279297, "learning_rate": 0.0002, "epoch": 5.333333333333333, "step": 8160}, {"loss": 1.0561, "grad_norm": 0.677925169467926, "learning_rate": 0.0002, "epoch": 5.339869281045751, "step": 8170}, {"loss": 1.1345, "grad_norm": 0.9576456546783447, "learning_rate": 0.0002, "epoch": 5.34640522875817, "step": 8180}, {"loss": 0.9554, "grad_norm": 1.8970937728881836, "learning_rate": 0.0002, "epoch": 5.352941176470588, "step": 8190}, {"loss": 1.0474, "grad_norm": 0.9458389282226562, "learning_rate": 0.0002, "epoch": 5.359477124183006, "step": 8200}, {"loss": 1.0365, "grad_norm": 1.761794924736023, "learning_rate": 0.0002, "epoch": 5.366013071895424, "step": 8210}, {"loss": 0.9426, "grad_norm": 1.0693724155426025, "learning_rate": 0.0002, "epoch": 5.372549019607844, "step": 8220}, {"loss": 1.0299, "grad_norm": 0.9025877714157104, "learning_rate": 0.0002, "epoch": 5.379084967320262, "step": 8230}, {"loss": 0.9652, "grad_norm": 1.258857250213623, "learning_rate": 0.0002, "epoch": 5.38562091503268, "step": 8240}, {"loss": 0.9735, "grad_norm": 1.084849238395691, "learning_rate": 0.0002, "epoch": 5.392156862745098, "step": 8250}, {"loss": 0.9999, "grad_norm": 0.9530340433120728, "learning_rate": 0.0002, "epoch": 5.398692810457517, "step": 8260}, {"loss": 1.0268, "grad_norm": 0.830240786075592, "learning_rate": 0.0002, "epoch": 5.405228758169935, "step": 8270}, {"loss": 1.0332, "grad_norm": 1.5807015895843506, "learning_rate": 0.0002, "epoch": 5.411764705882353, "step": 8280}, {"loss": 0.9146, "grad_norm": 0.9486905336380005, "learning_rate": 0.0002, "epoch": 5.4183006535947715, "step": 8290}, {"loss": 1.0336, "grad_norm": 1.0415093898773193, "learning_rate": 0.0002, "epoch": 5.42483660130719, "step": 8300}, {"loss": 0.8933, "grad_norm": 1.0501102209091187, "learning_rate": 0.0002, "epoch": 5.431372549019608, "step": 8310}, {"loss": 0.9983, "grad_norm": 0.9751836061477661, "learning_rate": 0.0002, "epoch": 5.437908496732026, "step": 8320}, {"loss": 1.0755, "grad_norm": 1.5529173612594604, "learning_rate": 0.0002, "epoch": 5.444444444444445, "step": 8330}, {"loss": 0.9814, "grad_norm": 0.8314350247383118, "learning_rate": 0.0002, "epoch": 5.450980392156863, "step": 8340}, {"loss": 1.0596, "grad_norm": 1.2555103302001953, "learning_rate": 0.0002, "epoch": 5.457516339869281, "step": 8350}, {"loss": 1.0127, "grad_norm": 0.9408367872238159, "learning_rate": 0.0002, "epoch": 5.4640522875816995, "step": 8360}, {"loss": 0.9241, "grad_norm": 0.9483312964439392, "learning_rate": 0.0002, "epoch": 5.470588235294118, "step": 8370}, {"loss": 0.9678, "grad_norm": 0.957905650138855, "learning_rate": 0.0002, "epoch": 5.477124183006536, "step": 8380}, {"loss": 1.0985, "grad_norm": 1.4000147581100464, "learning_rate": 0.0002, "epoch": 5.483660130718954, "step": 8390}, {"loss": 0.9966, "grad_norm": 1.7032461166381836, "learning_rate": 0.0002, "epoch": 5.490196078431373, "step": 8400}, {"loss": 0.9539, "grad_norm": 0.8978716731071472, "learning_rate": 0.0002, "epoch": 5.496732026143791, "step": 8410}, {"loss": 0.9544, "grad_norm": 0.8659300804138184, "learning_rate": 0.0002, "epoch": 5.503267973856209, "step": 8420}, {"loss": 1.0526, "grad_norm": 1.3629727363586426, "learning_rate": 0.0002, "epoch": 5.509803921568627, "step": 8430}, {"loss": 0.9696, "grad_norm": 1.2741984128952026, "learning_rate": 0.0002, "epoch": 5.516339869281046, "step": 8440}, {"loss": 1.0191, "grad_norm": 1.3867180347442627, "learning_rate": 0.0002, "epoch": 5.522875816993464, "step": 8450}, {"loss": 1.0835, "grad_norm": 1.0662001371383667, "learning_rate": 0.0002, "epoch": 5.529411764705882, "step": 8460}, {"loss": 0.9779, "grad_norm": 1.7005380392074585, "learning_rate": 0.0002, "epoch": 5.5359477124183005, "step": 8470}, {"loss": 1.0221, "grad_norm": 1.3730385303497314, "learning_rate": 0.0002, "epoch": 5.542483660130719, "step": 8480}, {"loss": 0.9586, "grad_norm": 1.7737441062927246, "learning_rate": 0.0002, "epoch": 5.549019607843137, "step": 8490}, {"loss": 0.9729, "grad_norm": 0.907487690448761, "learning_rate": 0.0002, "epoch": 5.555555555555555, "step": 8500}, {"loss": 0.9891, "grad_norm": 0.8882441520690918, "learning_rate": 0.0002, "epoch": 5.562091503267974, "step": 8510}, {"loss": 0.973, "grad_norm": 0.8655388951301575, "learning_rate": 0.0002, "epoch": 5.568627450980392, "step": 8520}, {"loss": 0.9523, "grad_norm": 1.379992961883545, "learning_rate": 0.0002, "epoch": 5.57516339869281, "step": 8530}, {"loss": 1.0174, "grad_norm": 1.0021201372146606, "learning_rate": 0.0002, "epoch": 5.5816993464052285, "step": 8540}, {"loss": 1.0113, "grad_norm": 1.2636926174163818, "learning_rate": 0.0002, "epoch": 5.588235294117647, "step": 8550}, {"loss": 1.0243, "grad_norm": 1.279025912284851, "learning_rate": 0.0002, "epoch": 5.594771241830065, "step": 8560}, {"loss": 0.9917, "grad_norm": 0.8885834217071533, "learning_rate": 0.0002, "epoch": 5.601307189542483, "step": 8570}, {"loss": 0.9849, "grad_norm": 1.1975032091140747, "learning_rate": 0.0002, "epoch": 5.607843137254902, "step": 8580}, {"loss": 1.0363, "grad_norm": 1.005470871925354, "learning_rate": 0.0002, "epoch": 5.61437908496732, "step": 8590}, {"loss": 0.9947, "grad_norm": 1.104286551475525, "learning_rate": 0.0002, "epoch": 5.620915032679738, "step": 8600}, {"loss": 1.0585, "grad_norm": 1.435445785522461, "learning_rate": 0.0002, "epoch": 5.627450980392156, "step": 8610}, {"loss": 0.9156, "grad_norm": 1.0270172357559204, "learning_rate": 0.0002, "epoch": 5.633986928104575, "step": 8620}, {"loss": 1.0522, "grad_norm": 1.0929527282714844, "learning_rate": 0.0002, "epoch": 5.640522875816993, "step": 8630}, {"loss": 0.9694, "grad_norm": 1.1061221361160278, "learning_rate": 0.0002, "epoch": 5.647058823529412, "step": 8640}, {"loss": 1.0826, "grad_norm": 0.9563149213790894, "learning_rate": 0.0002, "epoch": 5.65359477124183, "step": 8650}, {"loss": 1.0042, "grad_norm": 1.0434954166412354, "learning_rate": 0.0002, "epoch": 5.660130718954249, "step": 8660}, {"loss": 0.9463, "grad_norm": 1.3695117235183716, "learning_rate": 0.0002, "epoch": 5.666666666666667, "step": 8670}, {"loss": 0.9441, "grad_norm": 1.0540564060211182, "learning_rate": 0.0002, "epoch": 5.673202614379085, "step": 8680}, {"loss": 0.9755, "grad_norm": 1.5942492485046387, "learning_rate": 0.0002, "epoch": 5.6797385620915035, "step": 8690}, {"loss": 1.0071, "grad_norm": 0.9485495090484619, "learning_rate": 0.0002, "epoch": 5.686274509803922, "step": 8700}, {"loss": 0.9998, "grad_norm": 1.1483162641525269, "learning_rate": 0.0002, "epoch": 5.69281045751634, "step": 8710}, {"loss": 0.9578, "grad_norm": 0.9075471758842468, "learning_rate": 0.0002, "epoch": 5.699346405228758, "step": 8720}, {"loss": 0.9488, "grad_norm": 1.7908551692962646, "learning_rate": 0.0002, "epoch": 5.705882352941177, "step": 8730}, {"loss": 1.0163, "grad_norm": 0.8867162466049194, "learning_rate": 0.0002, "epoch": 5.712418300653595, "step": 8740}, {"loss": 1.0041, "grad_norm": 1.7165148258209229, "learning_rate": 0.0002, "epoch": 5.718954248366013, "step": 8750}, {"loss": 1.1061, "grad_norm": 0.9529356956481934, "learning_rate": 0.0002, "epoch": 5.7254901960784315, "step": 8760}, {"loss": 1.1119, "grad_norm": 1.01852548122406, "learning_rate": 0.0002, "epoch": 5.73202614379085, "step": 8770}, {"loss": 1.0471, "grad_norm": 0.9538423418998718, "learning_rate": 0.0002, "epoch": 5.738562091503268, "step": 8780}, {"loss": 1.0913, "grad_norm": 0.9007737636566162, "learning_rate": 0.0002, "epoch": 5.745098039215686, "step": 8790}, {"loss": 0.9766, "grad_norm": 0.9107874035835266, "learning_rate": 0.0002, "epoch": 5.751633986928105, "step": 8800}, {"loss": 0.9212, "grad_norm": 0.7379238605499268, "learning_rate": 0.0002, "epoch": 5.758169934640523, "step": 8810}, {"loss": 1.0966, "grad_norm": 1.072645902633667, "learning_rate": 0.0002, "epoch": 5.764705882352941, "step": 8820}, {"loss": 1.0845, "grad_norm": 1.002008080482483, "learning_rate": 0.0002, "epoch": 5.771241830065359, "step": 8830}, {"loss": 0.9978, "grad_norm": 1.0435924530029297, "learning_rate": 0.0002, "epoch": 5.777777777777778, "step": 8840}, {"loss": 0.9458, "grad_norm": 0.9874551296234131, "learning_rate": 0.0002, "epoch": 5.784313725490196, "step": 8850}, {"loss": 1.1241, "grad_norm": 1.1729662418365479, "learning_rate": 0.0002, "epoch": 5.790849673202614, "step": 8860}, {"loss": 1.0451, "grad_norm": 1.3300775289535522, "learning_rate": 0.0002, "epoch": 5.7973856209150325, "step": 8870}, {"loss": 1.0989, "grad_norm": 1.612707257270813, "learning_rate": 0.0002, "epoch": 5.803921568627451, "step": 8880}, {"loss": 0.9119, "grad_norm": 0.9047797322273254, "learning_rate": 0.0002, "epoch": 5.810457516339869, "step": 8890}, {"loss": 0.989, "grad_norm": 1.0958741903305054, "learning_rate": 0.0002, "epoch": 5.816993464052287, "step": 8900}, {"loss": 1.1922, "grad_norm": 1.0099612474441528, "learning_rate": 0.0002, "epoch": 5.823529411764706, "step": 8910}, {"loss": 1.0623, "grad_norm": 0.8442328572273254, "learning_rate": 0.0002, "epoch": 5.830065359477124, "step": 8920}, {"loss": 0.9134, "grad_norm": 1.1388301849365234, "learning_rate": 0.0002, "epoch": 5.836601307189542, "step": 8930}, {"loss": 1.0019, "grad_norm": 0.8296026587486267, "learning_rate": 0.0002, "epoch": 5.8431372549019605, "step": 8940}, {"loss": 1.0363, "grad_norm": 1.0843533277511597, "learning_rate": 0.0002, "epoch": 5.849673202614379, "step": 8950}, {"loss": 1.0009, "grad_norm": 0.8496834635734558, "learning_rate": 0.0002, "epoch": 5.856209150326797, "step": 8960}, {"loss": 0.9927, "grad_norm": 1.6894690990447998, "learning_rate": 0.0002, "epoch": 5.862745098039216, "step": 8970}, {"loss": 1.0939, "grad_norm": 1.0012282133102417, "learning_rate": 0.0002, "epoch": 5.8692810457516345, "step": 8980}, {"loss": 0.9722, "grad_norm": 0.8521103262901306, "learning_rate": 0.0002, "epoch": 5.875816993464053, "step": 8990}, {"loss": 1.0885, "grad_norm": 1.246841311454773, "learning_rate": 0.0002, "epoch": 5.882352941176471, "step": 9000}, {"loss": 0.9702, "grad_norm": 0.9941892027854919, "learning_rate": 0.0002, "epoch": 5.888888888888889, "step": 9010}, {"loss": 0.8754, "grad_norm": 1.067413568496704, "learning_rate": 0.0002, "epoch": 5.895424836601308, "step": 9020}, {"loss": 1.0153, "grad_norm": 1.0045088529586792, "learning_rate": 0.0002, "epoch": 5.901960784313726, "step": 9030}, {"loss": 1.0134, "grad_norm": 1.383063554763794, "learning_rate": 0.0002, "epoch": 5.908496732026144, "step": 9040}, {"loss": 1.0845, "grad_norm": 0.8754428625106812, "learning_rate": 0.0002, "epoch": 5.915032679738562, "step": 9050}, {"loss": 0.9571, "grad_norm": 0.8577388525009155, "learning_rate": 0.0002, "epoch": 5.921568627450981, "step": 9060}, {"loss": 1.0532, "grad_norm": 0.8718975186347961, "learning_rate": 0.0002, "epoch": 5.928104575163399, "step": 9070}, {"loss": 1.0667, "grad_norm": 1.1762131452560425, "learning_rate": 0.0002, "epoch": 5.934640522875817, "step": 9080}, {"loss": 1.1114, "grad_norm": 1.1025866270065308, "learning_rate": 0.0002, "epoch": 5.9411764705882355, "step": 9090}, {"loss": 0.9155, "grad_norm": 1.0439870357513428, "learning_rate": 0.0002, "epoch": 5.947712418300654, "step": 9100}, {"loss": 1.0055, "grad_norm": 1.2411525249481201, "learning_rate": 0.0002, "epoch": 5.954248366013072, "step": 9110}, {"loss": 0.9747, "grad_norm": 1.0317714214324951, "learning_rate": 0.0002, "epoch": 5.96078431372549, "step": 9120}, {"loss": 1.0352, "grad_norm": 0.9880492091178894, "learning_rate": 0.0002, "epoch": 5.967320261437909, "step": 9130}, {"loss": 1.0459, "grad_norm": 0.9039815664291382, "learning_rate": 0.0002, "epoch": 5.973856209150327, "step": 9140}, {"loss": 1.0413, "grad_norm": 0.9049116373062134, "learning_rate": 0.0002, "epoch": 5.980392156862745, "step": 9150}, {"loss": 0.9792, "grad_norm": 0.996749222278595, "learning_rate": 0.0002, "epoch": 5.9869281045751634, "step": 9160}, {"loss": 0.8857, "grad_norm": 0.8716062307357788, "learning_rate": 0.0002, "epoch": 5.993464052287582, "step": 9170}, {"loss": 1.019, "grad_norm": 1.3081294298171997, "learning_rate": 0.0002, "epoch": 6.0, "step": 9180}, {"eval_loss": 1.45111083984375, "eval_runtime": 34.7121, "eval_samples_per_second": 12.56, "eval_steps_per_second": 1.584, "epoch": 6.0, "step": 9180}, {"loss": 0.9306, "grad_norm": 1.1378029584884644, "learning_rate": 0.0002, "epoch": 6.006535947712418, "step": 9190}, {"loss": 0.8794, "grad_norm": 1.2921233177185059, "learning_rate": 0.0002, "epoch": 6.0130718954248366, "step": 9200}, {"loss": 0.8145, "grad_norm": 1.039211630821228, "learning_rate": 0.0002, "epoch": 6.019607843137255, "step": 9210}, {"loss": 0.8524, "grad_norm": 0.9715196490287781, "learning_rate": 0.0002, "epoch": 6.026143790849673, "step": 9220}, {"loss": 1.035, "grad_norm": 1.220642328262329, "learning_rate": 0.0002, "epoch": 6.032679738562091, "step": 9230}, {"loss": 0.8468, "grad_norm": 0.854360044002533, "learning_rate": 0.0002, "epoch": 6.03921568627451, "step": 9240}, {"loss": 0.8534, "grad_norm": 0.8806933164596558, "learning_rate": 0.0002, "epoch": 6.045751633986928, "step": 9250}, {"loss": 0.8305, "grad_norm": 1.4315874576568604, "learning_rate": 0.0002, "epoch": 6.052287581699346, "step": 9260}, {"loss": 0.8462, "grad_norm": 0.9382007122039795, "learning_rate": 0.0002, "epoch": 6.0588235294117645, "step": 9270}, {"loss": 0.9653, "grad_norm": 1.2184561491012573, "learning_rate": 0.0002, "epoch": 6.065359477124183, "step": 9280}, {"loss": 0.8806, "grad_norm": 1.2331548929214478, "learning_rate": 0.0002, "epoch": 6.071895424836601, "step": 9290}, {"loss": 0.8354, "grad_norm": 1.1112796068191528, "learning_rate": 0.0002, "epoch": 6.078431372549019, "step": 9300}, {"loss": 0.8008, "grad_norm": 1.4753731489181519, "learning_rate": 0.0002, "epoch": 6.084967320261438, "step": 9310}, {"loss": 0.9198, "grad_norm": 1.2783401012420654, "learning_rate": 0.0002, "epoch": 6.091503267973856, "step": 9320}, {"loss": 0.8294, "grad_norm": 0.9916909337043762, "learning_rate": 0.0002, "epoch": 6.098039215686274, "step": 9330}, {"loss": 0.876, "grad_norm": 0.9300099015235901, "learning_rate": 0.0002, "epoch": 6.104575163398692, "step": 9340}, {"loss": 0.9064, "grad_norm": 1.4985264539718628, "learning_rate": 0.0002, "epoch": 6.111111111111111, "step": 9350}, {"loss": 1.0106, "grad_norm": 1.276380181312561, "learning_rate": 0.0002, "epoch": 6.117647058823529, "step": 9360}, {"loss": 0.9068, "grad_norm": 1.181113600730896, "learning_rate": 0.0002, "epoch": 6.124183006535947, "step": 9370}, {"loss": 0.9165, "grad_norm": 1.698729395866394, "learning_rate": 0.0002, "epoch": 6.130718954248366, "step": 9380}, {"loss": 0.7997, "grad_norm": 0.9793189764022827, "learning_rate": 0.0002, "epoch": 6.137254901960785, "step": 9390}, {"loss": 0.9731, "grad_norm": 1.1942132711410522, "learning_rate": 0.0002, "epoch": 6.143790849673203, "step": 9400}, {"loss": 0.8762, "grad_norm": 1.2160184383392334, "learning_rate": 0.0002, "epoch": 6.150326797385621, "step": 9410}, {"loss": 0.801, "grad_norm": 1.0802825689315796, "learning_rate": 0.0002, "epoch": 6.1568627450980395, "step": 9420}, {"loss": 0.9055, "grad_norm": 3.024529218673706, "learning_rate": 0.0002, "epoch": 6.163398692810458, "step": 9430}, {"loss": 0.8739, "grad_norm": 0.975062370300293, "learning_rate": 0.0002, "epoch": 6.169934640522876, "step": 9440}, {"loss": 0.8485, "grad_norm": 0.9243306517601013, "learning_rate": 0.0002, "epoch": 6.176470588235294, "step": 9450}, {"loss": 0.947, "grad_norm": 0.8892099857330322, "learning_rate": 0.0002, "epoch": 6.183006535947713, "step": 9460}, {"loss": 0.9165, "grad_norm": 1.4151731729507446, "learning_rate": 0.0002, "epoch": 6.189542483660131, "step": 9470}, {"loss": 1.022, "grad_norm": 1.064701795578003, "learning_rate": 0.0002, "epoch": 6.196078431372549, "step": 9480}, {"loss": 0.906, "grad_norm": 1.1104519367218018, "learning_rate": 0.0002, "epoch": 6.2026143790849675, "step": 9490}, {"loss": 0.9572, "grad_norm": 1.4788947105407715, "learning_rate": 0.0002, "epoch": 6.209150326797386, "step": 9500}, {"loss": 0.8014, "grad_norm": 0.7976077795028687, "learning_rate": 0.0002, "epoch": 6.215686274509804, "step": 9510}, {"loss": 0.886, "grad_norm": 1.256864070892334, "learning_rate": 0.0002, "epoch": 6.222222222222222, "step": 9520}, {"loss": 0.9104, "grad_norm": 1.3874554634094238, "learning_rate": 0.0002, "epoch": 6.228758169934641, "step": 9530}, {"loss": 0.8583, "grad_norm": 1.9012963771820068, "learning_rate": 0.0002, "epoch": 6.235294117647059, "step": 9540}, {"loss": 0.9585, "grad_norm": 1.275212287902832, "learning_rate": 0.0002, "epoch": 6.241830065359477, "step": 9550}, {"loss": 0.8416, "grad_norm": 1.1007417440414429, "learning_rate": 0.0002, "epoch": 6.248366013071895, "step": 9560}, {"loss": 0.9191, "grad_norm": 1.0602147579193115, "learning_rate": 0.0002, "epoch": 6.254901960784314, "step": 9570}, {"loss": 0.909, "grad_norm": 1.2276418209075928, "learning_rate": 0.0002, "epoch": 6.261437908496732, "step": 9580}, {"loss": 0.9363, "grad_norm": 1.0111924409866333, "learning_rate": 0.0002, "epoch": 6.26797385620915, "step": 9590}, {"loss": 0.9941, "grad_norm": 0.9031485915184021, "learning_rate": 0.0002, "epoch": 6.2745098039215685, "step": 9600}, {"loss": 0.9138, "grad_norm": 0.9893783926963806, "learning_rate": 0.0002, "epoch": 6.281045751633987, "step": 9610}, {"loss": 0.9114, "grad_norm": 1.1979725360870361, "learning_rate": 0.0002, "epoch": 6.287581699346405, "step": 9620}, {"loss": 0.8858, "grad_norm": 1.380516767501831, "learning_rate": 0.0002, "epoch": 6.294117647058823, "step": 9630}, {"loss": 0.8898, "grad_norm": 1.1370083093643188, "learning_rate": 0.0002, "epoch": 6.300653594771242, "step": 9640}, {"loss": 0.9073, "grad_norm": 1.4091558456420898, "learning_rate": 0.0002, "epoch": 6.30718954248366, "step": 9650}, {"loss": 0.9096, "grad_norm": 1.0670944452285767, "learning_rate": 0.0002, "epoch": 6.313725490196078, "step": 9660}, {"loss": 0.9376, "grad_norm": 0.9150263667106628, "learning_rate": 0.0002, "epoch": 6.3202614379084965, "step": 9670}, {"loss": 0.9169, "grad_norm": 1.1342853307724, "learning_rate": 0.0002, "epoch": 6.326797385620915, "step": 9680}, {"loss": 1.002, "grad_norm": 1.2733415365219116, "learning_rate": 0.0002, "epoch": 6.333333333333333, "step": 9690}, {"loss": 0.9579, "grad_norm": 1.3647292852401733, "learning_rate": 0.0002, "epoch": 6.339869281045751, "step": 9700}, {"loss": 0.87, "grad_norm": 1.0435094833374023, "learning_rate": 0.0002, "epoch": 6.34640522875817, "step": 9710}, {"loss": 0.8812, "grad_norm": 1.3641071319580078, "learning_rate": 0.0002, "epoch": 6.352941176470588, "step": 9720}, {"loss": 0.8888, "grad_norm": 1.2806159257888794, "learning_rate": 0.0002, "epoch": 6.359477124183006, "step": 9730}, {"loss": 0.9481, "grad_norm": 1.0193076133728027, "learning_rate": 0.0002, "epoch": 6.366013071895424, "step": 9740}, {"loss": 0.931, "grad_norm": 1.2349408864974976, "learning_rate": 0.0002, "epoch": 6.372549019607844, "step": 9750}, {"loss": 0.8837, "grad_norm": 1.2062549591064453, "learning_rate": 0.0002, "epoch": 6.379084967320262, "step": 9760}, {"loss": 0.8947, "grad_norm": 1.4402194023132324, "learning_rate": 0.0002, "epoch": 6.38562091503268, "step": 9770}, {"loss": 0.8724, "grad_norm": 1.1730891466140747, "learning_rate": 0.0002, "epoch": 6.392156862745098, "step": 9780}, {"loss": 0.9005, "grad_norm": 1.1481093168258667, "learning_rate": 0.0002, "epoch": 6.398692810457517, "step": 9790}, {"loss": 0.9431, "grad_norm": 1.0012723207473755, "learning_rate": 0.0002, "epoch": 6.405228758169935, "step": 9800}, {"loss": 0.8856, "grad_norm": 0.8839848041534424, "learning_rate": 0.0002, "epoch": 6.411764705882353, "step": 9810}, {"loss": 0.8147, "grad_norm": 1.096693992614746, "learning_rate": 0.0002, "epoch": 6.4183006535947715, "step": 9820}, {"loss": 0.846, "grad_norm": 1.4713369607925415, "learning_rate": 0.0002, "epoch": 6.42483660130719, "step": 9830}, {"loss": 0.9563, "grad_norm": 1.2529761791229248, "learning_rate": 0.0002, "epoch": 6.431372549019608, "step": 9840}, {"loss": 0.8551, "grad_norm": 1.5575600862503052, "learning_rate": 0.0002, "epoch": 6.437908496732026, "step": 9850}, {"loss": 0.836, "grad_norm": 1.2188916206359863, "learning_rate": 0.0002, "epoch": 6.444444444444445, "step": 9860}, {"loss": 0.9132, "grad_norm": 1.1558794975280762, "learning_rate": 0.0002, "epoch": 6.450980392156863, "step": 9870}, {"loss": 0.8632, "grad_norm": 1.1506937742233276, "learning_rate": 0.0002, "epoch": 6.457516339869281, "step": 9880}, {"loss": 1.0575, "grad_norm": 1.1168335676193237, "learning_rate": 0.0002, "epoch": 6.4640522875816995, "step": 9890}, {"loss": 0.99, "grad_norm": 1.192449688911438, "learning_rate": 0.0002, "epoch": 6.470588235294118, "step": 9900}, {"loss": 0.9478, "grad_norm": 1.0451104640960693, "learning_rate": 0.0002, "epoch": 6.477124183006536, "step": 9910}, {"loss": 0.9034, "grad_norm": 1.1111775636672974, "learning_rate": 0.0002, "epoch": 6.483660130718954, "step": 9920}, {"loss": 0.8971, "grad_norm": 1.2094531059265137, "learning_rate": 0.0002, "epoch": 6.490196078431373, "step": 9930}, {"loss": 0.9047, "grad_norm": 1.0547380447387695, "learning_rate": 0.0002, "epoch": 6.496732026143791, "step": 9940}, {"loss": 1.0727, "grad_norm": 1.5547202825546265, "learning_rate": 0.0002, "epoch": 6.503267973856209, "step": 9950}, {"loss": 0.9109, "grad_norm": 1.1917903423309326, "learning_rate": 0.0002, "epoch": 6.509803921568627, "step": 9960}, {"loss": 0.8708, "grad_norm": 1.0918153524398804, "learning_rate": 0.0002, "epoch": 6.516339869281046, "step": 9970}, {"loss": 0.8752, "grad_norm": 1.146968960762024, "learning_rate": 0.0002, "epoch": 6.522875816993464, "step": 9980}, {"loss": 0.9593, "grad_norm": 0.9899234771728516, "learning_rate": 0.0002, "epoch": 6.529411764705882, "step": 9990}, {"loss": 0.91, "grad_norm": 2.160924196243286, "learning_rate": 0.0002, "epoch": 6.5359477124183005, "step": 10000}, {"loss": 0.9683, "grad_norm": 1.6366891860961914, "learning_rate": 0.0002, "epoch": 6.542483660130719, "step": 10010}, {"loss": 0.8582, "grad_norm": 0.9876762628555298, "learning_rate": 0.0002, "epoch": 6.549019607843137, "step": 10020}, {"loss": 0.8385, "grad_norm": 1.5622549057006836, "learning_rate": 0.0002, "epoch": 6.555555555555555, "step": 10030}, {"loss": 0.8791, "grad_norm": 1.0108020305633545, "learning_rate": 0.0002, "epoch": 6.562091503267974, "step": 10040}, {"loss": 0.9574, "grad_norm": 1.0725725889205933, "learning_rate": 0.0002, "epoch": 6.568627450980392, "step": 10050}, {"loss": 0.8297, "grad_norm": 1.1551216840744019, "learning_rate": 0.0002, "epoch": 6.57516339869281, "step": 10060}, {"loss": 0.8199, "grad_norm": 1.5174646377563477, "learning_rate": 0.0002, "epoch": 6.5816993464052285, "step": 10070}, {"loss": 0.8203, "grad_norm": 1.041877031326294, "learning_rate": 0.0002, "epoch": 6.588235294117647, "step": 10080}, {"loss": 0.9684, "grad_norm": 0.9939621686935425, "learning_rate": 0.0002, "epoch": 6.594771241830065, "step": 10090}, {"loss": 0.9324, "grad_norm": 1.2706589698791504, "learning_rate": 0.0002, "epoch": 6.601307189542483, "step": 10100}, {"loss": 0.9614, "grad_norm": 1.1071467399597168, "learning_rate": 0.0002, "epoch": 6.607843137254902, "step": 10110}, {"loss": 0.9747, "grad_norm": 0.9449541568756104, "learning_rate": 0.0002, "epoch": 6.61437908496732, "step": 10120}, {"loss": 0.9557, "grad_norm": 1.0961830615997314, "learning_rate": 0.0002, "epoch": 6.620915032679738, "step": 10130}, {"loss": 0.9865, "grad_norm": 1.7726300954818726, "learning_rate": 0.0002, "epoch": 6.627450980392156, "step": 10140}, {"loss": 0.9657, "grad_norm": 1.2345516681671143, "learning_rate": 0.0002, "epoch": 6.633986928104575, "step": 10150}, {"loss": 0.9573, "grad_norm": 1.2062907218933105, "learning_rate": 0.0002, "epoch": 6.640522875816993, "step": 10160}, {"loss": 0.918, "grad_norm": 1.029327154159546, "learning_rate": 0.0002, "epoch": 6.647058823529412, "step": 10170}, {"loss": 0.9211, "grad_norm": 1.442307710647583, "learning_rate": 0.0002, "epoch": 6.65359477124183, "step": 10180}, {"loss": 0.8924, "grad_norm": 1.2579066753387451, "learning_rate": 0.0002, "epoch": 6.660130718954249, "step": 10190}, {"loss": 0.9836, "grad_norm": 1.4563188552856445, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 10200}, {"loss": 0.8876, "grad_norm": 0.9699450135231018, "learning_rate": 0.0002, "epoch": 6.673202614379085, "step": 10210}, {"loss": 0.9589, "grad_norm": 1.812523603439331, "learning_rate": 0.0002, "epoch": 6.6797385620915035, "step": 10220}, {"loss": 1.0241, "grad_norm": 1.124000906944275, "learning_rate": 0.0002, "epoch": 6.686274509803922, "step": 10230}, {"loss": 0.8924, "grad_norm": 1.0957475900650024, "learning_rate": 0.0002, "epoch": 6.69281045751634, "step": 10240}, {"loss": 0.8891, "grad_norm": 0.989689826965332, "learning_rate": 0.0002, "epoch": 6.699346405228758, "step": 10250}, {"loss": 0.9049, "grad_norm": 1.4353317022323608, "learning_rate": 0.0002, "epoch": 6.705882352941177, "step": 10260}, {"loss": 0.9311, "grad_norm": 1.0245451927185059, "learning_rate": 0.0002, "epoch": 6.712418300653595, "step": 10270}, {"loss": 0.8814, "grad_norm": 1.097334861755371, "learning_rate": 0.0002, "epoch": 6.718954248366013, "step": 10280}, {"loss": 0.9927, "grad_norm": 0.982356071472168, "learning_rate": 0.0002, "epoch": 6.7254901960784315, "step": 10290}, {"loss": 0.9909, "grad_norm": 1.8842819929122925, "learning_rate": 0.0002, "epoch": 6.73202614379085, "step": 10300}, {"loss": 0.9286, "grad_norm": 0.8648947477340698, "learning_rate": 0.0002, "epoch": 6.738562091503268, "step": 10310}, {"loss": 0.987, "grad_norm": 1.1510577201843262, "learning_rate": 0.0002, "epoch": 6.745098039215686, "step": 10320}, {"loss": 0.9217, "grad_norm": 1.874495506286621, "learning_rate": 0.0002, "epoch": 6.751633986928105, "step": 10330}, {"loss": 0.8914, "grad_norm": 1.1126408576965332, "learning_rate": 0.0002, "epoch": 6.758169934640523, "step": 10340}, {"loss": 0.8508, "grad_norm": 1.6654644012451172, "learning_rate": 0.0002, "epoch": 6.764705882352941, "step": 10350}, {"loss": 0.9653, "grad_norm": 1.0699580907821655, "learning_rate": 0.0002, "epoch": 6.771241830065359, "step": 10360}, {"loss": 0.882, "grad_norm": 0.9460757374763489, "learning_rate": 0.0002, "epoch": 6.777777777777778, "step": 10370}, {"loss": 0.9589, "grad_norm": 1.2553058862686157, "learning_rate": 0.0002, "epoch": 6.784313725490196, "step": 10380}, {"loss": 0.8782, "grad_norm": 1.0939891338348389, "learning_rate": 0.0002, "epoch": 6.790849673202614, "step": 10390}, {"loss": 0.9189, "grad_norm": 1.0647451877593994, "learning_rate": 0.0002, "epoch": 6.7973856209150325, "step": 10400}, {"loss": 0.9478, "grad_norm": 1.0954521894454956, "learning_rate": 0.0002, "epoch": 6.803921568627451, "step": 10410}, {"loss": 1.0385, "grad_norm": 1.4371392726898193, "learning_rate": 0.0002, "epoch": 6.810457516339869, "step": 10420}, {"loss": 1.0024, "grad_norm": 1.0063464641571045, "learning_rate": 0.0002, "epoch": 6.816993464052287, "step": 10430}, {"loss": 0.8737, "grad_norm": 1.5189263820648193, "learning_rate": 0.0002, "epoch": 6.823529411764706, "step": 10440}, {"loss": 0.9246, "grad_norm": 0.9715501070022583, "learning_rate": 0.0002, "epoch": 6.830065359477124, "step": 10450}, {"loss": 0.9659, "grad_norm": 1.114586353302002, "learning_rate": 0.0002, "epoch": 6.836601307189542, "step": 10460}, {"loss": 1.0081, "grad_norm": 1.2991431951522827, "learning_rate": 0.0002, "epoch": 6.8431372549019605, "step": 10470}, {"loss": 0.9323, "grad_norm": 1.203114628791809, "learning_rate": 0.0002, "epoch": 6.849673202614379, "step": 10480}, {"loss": 1.0032, "grad_norm": 1.476167917251587, "learning_rate": 0.0002, "epoch": 6.856209150326797, "step": 10490}, {"loss": 1.0275, "grad_norm": 1.0933326482772827, "learning_rate": 0.0002, "epoch": 6.862745098039216, "step": 10500}, {"loss": 1.0068, "grad_norm": 1.2831504344940186, "learning_rate": 0.0002, "epoch": 6.8692810457516345, "step": 10510}, {"loss": 0.9973, "grad_norm": 1.1967637538909912, "learning_rate": 0.0002, "epoch": 6.875816993464053, "step": 10520}, {"loss": 0.9549, "grad_norm": 1.1276888847351074, "learning_rate": 0.0002, "epoch": 6.882352941176471, "step": 10530}, {"loss": 0.9568, "grad_norm": 1.2680490016937256, "learning_rate": 0.0002, "epoch": 6.888888888888889, "step": 10540}, {"loss": 0.9177, "grad_norm": 1.5469038486480713, "learning_rate": 0.0002, "epoch": 6.895424836601308, "step": 10550}, {"loss": 0.8545, "grad_norm": 1.1731038093566895, "learning_rate": 0.0002, "epoch": 6.901960784313726, "step": 10560}, {"loss": 0.9795, "grad_norm": 0.968008816242218, "learning_rate": 0.0002, "epoch": 6.908496732026144, "step": 10570}, {"loss": 0.9439, "grad_norm": 0.9082416892051697, "learning_rate": 0.0002, "epoch": 6.915032679738562, "step": 10580}, {"loss": 0.9898, "grad_norm": 1.5816899538040161, "learning_rate": 0.0002, "epoch": 6.921568627450981, "step": 10590}, {"loss": 0.9692, "grad_norm": 0.9462234377861023, "learning_rate": 0.0002, "epoch": 6.928104575163399, "step": 10600}, {"loss": 1.0193, "grad_norm": 1.4950200319290161, "learning_rate": 0.0002, "epoch": 6.934640522875817, "step": 10610}, {"loss": 0.8888, "grad_norm": 1.2929182052612305, "learning_rate": 0.0002, "epoch": 6.9411764705882355, "step": 10620}, {"loss": 1.0141, "grad_norm": 1.2995754480361938, "learning_rate": 0.0002, "epoch": 6.947712418300654, "step": 10630}, {"loss": 0.9863, "grad_norm": 0.9407122135162354, "learning_rate": 0.0002, "epoch": 6.954248366013072, "step": 10640}, {"loss": 0.9041, "grad_norm": 1.1735378503799438, "learning_rate": 0.0002, "epoch": 6.96078431372549, "step": 10650}, {"loss": 0.936, "grad_norm": 0.9937344193458557, "learning_rate": 0.0002, "epoch": 6.967320261437909, "step": 10660}, {"loss": 0.9577, "grad_norm": 1.2498728036880493, "learning_rate": 0.0002, "epoch": 6.973856209150327, "step": 10670}, {"loss": 1.0504, "grad_norm": 1.0513341426849365, "learning_rate": 0.0002, "epoch": 6.980392156862745, "step": 10680}, {"loss": 0.9259, "grad_norm": 1.4611467123031616, "learning_rate": 0.0002, "epoch": 6.9869281045751634, "step": 10690}, {"loss": 0.9779, "grad_norm": 1.2924799919128418, "learning_rate": 0.0002, "epoch": 6.993464052287582, "step": 10700}, {"loss": 0.8953, "grad_norm": 1.2024929523468018, "learning_rate": 0.0002, "epoch": 7.0, "step": 10710}]} +{"epoch": 8.0, "step": 12240, "epoch_duration": 1811.3724415302277, "total_accumulated_duration": 13780.879077911377, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.1748046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-6120", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7451, "grad_norm": 1.5105072259902954, "learning_rate": 0.0002, "epoch": 0.006535947712418301, "step": 10}, {"loss": 3.3158, "grad_norm": 2.1156165599823, "learning_rate": 0.0002, "epoch": 0.013071895424836602, "step": 20}, {"loss": 2.643, "grad_norm": 1.0578808784484863, "learning_rate": 0.0002, "epoch": 0.0196078431372549, "step": 30}, {"loss": 2.3948, "grad_norm": 2.725064516067505, "learning_rate": 0.0002, "epoch": 0.026143790849673203, "step": 40}, {"loss": 2.3134, "grad_norm": 2.9575750827789307, "learning_rate": 0.0002, "epoch": 0.032679738562091505, "step": 50}, {"loss": 2.2778, "grad_norm": 1.2158117294311523, "learning_rate": 0.0002, "epoch": 0.0392156862745098, "step": 60}, {"loss": 1.9742, "grad_norm": 1.0850954055786133, "learning_rate": 0.0002, "epoch": 0.0457516339869281, "step": 70}, {"loss": 1.8872, "grad_norm": 1.299196720123291, "learning_rate": 0.0002, "epoch": 0.05228758169934641, "step": 80}, {"loss": 1.947, "grad_norm": 0.8310191035270691, "learning_rate": 0.0002, "epoch": 0.058823529411764705, "step": 90}, {"loss": 1.9098, "grad_norm": 0.9854435920715332, "learning_rate": 0.0002, "epoch": 0.06535947712418301, "step": 100}, {"loss": 1.7508, "grad_norm": 0.7951157689094543, "learning_rate": 0.0002, "epoch": 0.0718954248366013, "step": 110}, {"loss": 1.9035, "grad_norm": 0.7593062520027161, "learning_rate": 0.0002, "epoch": 0.0784313725490196, "step": 120}, {"loss": 1.8517, "grad_norm": 0.6783032417297363, "learning_rate": 0.0002, "epoch": 0.08496732026143791, "step": 130}, {"loss": 1.6805, "grad_norm": 0.8350756764411926, "learning_rate": 0.0002, "epoch": 0.0915032679738562, "step": 140}, {"loss": 1.6123, "grad_norm": 1.0203173160552979, "learning_rate": 0.0002, "epoch": 0.09803921568627451, "step": 150}, {"loss": 1.7248, "grad_norm": 0.8820539712905884, "learning_rate": 0.0002, "epoch": 0.10457516339869281, "step": 160}, {"loss": 1.6762, "grad_norm": 0.7286128997802734, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 170}, {"loss": 1.8841, "grad_norm": 0.7874041795730591, "learning_rate": 0.0002, "epoch": 0.11764705882352941, "step": 180}, {"loss": 1.5656, "grad_norm": 0.6630475521087646, "learning_rate": 0.0002, "epoch": 0.12418300653594772, "step": 190}, {"loss": 1.6149, "grad_norm": 0.686413586139679, "learning_rate": 0.0002, "epoch": 0.13071895424836602, "step": 200}, {"loss": 1.6227, "grad_norm": 0.7793629765510559, "learning_rate": 0.0002, "epoch": 0.13725490196078433, "step": 210}, {"loss": 1.7223, "grad_norm": 0.6893141865730286, "learning_rate": 0.0002, "epoch": 0.1437908496732026, "step": 220}, {"loss": 1.6808, "grad_norm": 0.5804724097251892, "learning_rate": 0.0002, "epoch": 0.1503267973856209, "step": 230}, {"loss": 1.5578, "grad_norm": 0.6053574085235596, "learning_rate": 0.0002, "epoch": 0.1568627450980392, "step": 240}, {"loss": 1.7394, "grad_norm": 0.7566025853157043, "learning_rate": 0.0002, "epoch": 0.16339869281045752, "step": 250}, {"loss": 1.6216, "grad_norm": 0.6112990975379944, "learning_rate": 0.0002, "epoch": 0.16993464052287582, "step": 260}, {"loss": 1.5564, "grad_norm": 0.6839066743850708, "learning_rate": 0.0002, "epoch": 0.17647058823529413, "step": 270}, {"loss": 1.7129, "grad_norm": 0.6368117928504944, "learning_rate": 0.0002, "epoch": 0.1830065359477124, "step": 280}, {"loss": 1.5646, "grad_norm": 0.6144475936889648, "learning_rate": 0.0002, "epoch": 0.1895424836601307, "step": 290}, {"loss": 1.8383, "grad_norm": 0.6743767261505127, "learning_rate": 0.0002, "epoch": 0.19607843137254902, "step": 300}, {"loss": 1.421, "grad_norm": 0.6807955503463745, "learning_rate": 0.0002, "epoch": 0.20261437908496732, "step": 310}, {"loss": 1.5961, "grad_norm": 0.6717963814735413, "learning_rate": 0.0002, "epoch": 0.20915032679738563, "step": 320}, {"loss": 1.6842, "grad_norm": 0.5917780995368958, "learning_rate": 0.0002, "epoch": 0.21568627450980393, "step": 330}, {"loss": 1.6264, "grad_norm": 0.6783658862113953, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 340}, {"loss": 1.4635, "grad_norm": 0.5820256471633911, "learning_rate": 0.0002, "epoch": 0.22875816993464052, "step": 350}, {"loss": 1.6514, "grad_norm": 0.5345938801765442, "learning_rate": 0.0002, "epoch": 0.23529411764705882, "step": 360}, {"loss": 1.6441, "grad_norm": 0.755929172039032, "learning_rate": 0.0002, "epoch": 0.24183006535947713, "step": 370}, {"loss": 1.5177, "grad_norm": 0.6183189749717712, "learning_rate": 0.0002, "epoch": 0.24836601307189543, "step": 380}, {"loss": 1.5935, "grad_norm": 0.7277782559394836, "learning_rate": 0.0002, "epoch": 0.2549019607843137, "step": 390}, {"loss": 1.6957, "grad_norm": 0.9998756051063538, "learning_rate": 0.0002, "epoch": 0.26143790849673204, "step": 400}, {"loss": 1.5738, "grad_norm": 0.7523853778839111, "learning_rate": 0.0002, "epoch": 0.2679738562091503, "step": 410}, {"loss": 1.5649, "grad_norm": 0.6548714637756348, "learning_rate": 0.0002, "epoch": 0.27450980392156865, "step": 420}, {"loss": 1.4564, "grad_norm": 0.6979796290397644, "learning_rate": 0.0002, "epoch": 0.28104575163398693, "step": 430}, {"loss": 1.5927, "grad_norm": 0.840915322303772, "learning_rate": 0.0002, "epoch": 0.2875816993464052, "step": 440}, {"loss": 1.5199, "grad_norm": 0.6142978072166443, "learning_rate": 0.0002, "epoch": 0.29411764705882354, "step": 450}, {"loss": 1.4903, "grad_norm": 0.9482691884040833, "learning_rate": 0.0002, "epoch": 0.3006535947712418, "step": 460}, {"loss": 1.6553, "grad_norm": 0.7001156806945801, "learning_rate": 0.0002, "epoch": 0.30718954248366015, "step": 470}, {"loss": 1.5957, "grad_norm": 0.6665455102920532, "learning_rate": 0.0002, "epoch": 0.3137254901960784, "step": 480}, {"loss": 1.587, "grad_norm": 0.6012697815895081, "learning_rate": 0.0002, "epoch": 0.3202614379084967, "step": 490}, {"loss": 1.4468, "grad_norm": 0.8770062327384949, "learning_rate": 0.0002, "epoch": 0.32679738562091504, "step": 500}, {"loss": 1.3558, "grad_norm": 0.7029962539672852, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 510}, {"loss": 1.4435, "grad_norm": 0.6682832837104797, "learning_rate": 0.0002, "epoch": 0.33986928104575165, "step": 520}, {"loss": 1.4242, "grad_norm": 0.5548969507217407, "learning_rate": 0.0002, "epoch": 0.3464052287581699, "step": 530}, {"loss": 1.5081, "grad_norm": 0.6640702486038208, "learning_rate": 0.0002, "epoch": 0.35294117647058826, "step": 540}, {"loss": 1.4998, "grad_norm": 0.656292200088501, "learning_rate": 0.0002, "epoch": 0.35947712418300654, "step": 550}, {"loss": 1.5415, "grad_norm": 0.618910551071167, "learning_rate": 0.0002, "epoch": 0.3660130718954248, "step": 560}, {"loss": 1.5178, "grad_norm": 0.644859790802002, "learning_rate": 0.0002, "epoch": 0.37254901960784315, "step": 570}, {"loss": 1.645, "grad_norm": 0.679042398929596, "learning_rate": 0.0002, "epoch": 0.3790849673202614, "step": 580}, {"loss": 1.5193, "grad_norm": 0.980681836605072, "learning_rate": 0.0002, "epoch": 0.38562091503267976, "step": 590}, {"loss": 1.4262, "grad_norm": 0.632219672203064, "learning_rate": 0.0002, "epoch": 0.39215686274509803, "step": 600}, {"loss": 1.5533, "grad_norm": 0.7003744840621948, "learning_rate": 0.0002, "epoch": 0.39869281045751637, "step": 610}, {"loss": 1.7747, "grad_norm": 0.7090577483177185, "learning_rate": 0.0002, "epoch": 0.40522875816993464, "step": 620}, {"loss": 1.7506, "grad_norm": 0.657819926738739, "learning_rate": 0.0002, "epoch": 0.4117647058823529, "step": 630}, {"loss": 1.621, "grad_norm": 0.7034208178520203, "learning_rate": 0.0002, "epoch": 0.41830065359477125, "step": 640}, {"loss": 1.5357, "grad_norm": 0.7274866104125977, "learning_rate": 0.0002, "epoch": 0.42483660130718953, "step": 650}, {"loss": 1.6304, "grad_norm": 0.5876233577728271, "learning_rate": 0.0002, "epoch": 0.43137254901960786, "step": 660}, {"loss": 1.7683, "grad_norm": 0.595494270324707, "learning_rate": 0.0002, "epoch": 0.43790849673202614, "step": 670}, {"loss": 1.5117, "grad_norm": 0.8253804445266724, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 680}, {"loss": 1.5199, "grad_norm": 0.652225911617279, "learning_rate": 0.0002, "epoch": 0.45098039215686275, "step": 690}, {"loss": 1.5419, "grad_norm": 0.6242014169692993, "learning_rate": 0.0002, "epoch": 0.45751633986928103, "step": 700}, {"loss": 1.53, "grad_norm": 0.7283986210823059, "learning_rate": 0.0002, "epoch": 0.46405228758169936, "step": 710}, {"loss": 1.43, "grad_norm": 0.7016081213951111, "learning_rate": 0.0002, "epoch": 0.47058823529411764, "step": 720}, {"loss": 1.4626, "grad_norm": 0.5211893916130066, "learning_rate": 0.0002, "epoch": 0.477124183006536, "step": 730}, {"loss": 1.6885, "grad_norm": 0.6221150159835815, "learning_rate": 0.0002, "epoch": 0.48366013071895425, "step": 740}, {"loss": 1.5677, "grad_norm": 0.76594477891922, "learning_rate": 0.0002, "epoch": 0.49019607843137253, "step": 750}, {"loss": 1.4982, "grad_norm": 0.5777859091758728, "learning_rate": 0.0002, "epoch": 0.49673202614379086, "step": 760}, {"loss": 1.5253, "grad_norm": 0.5793519616127014, "learning_rate": 0.0002, "epoch": 0.5032679738562091, "step": 770}, {"loss": 1.3562, "grad_norm": 0.5425786375999451, "learning_rate": 0.0002, "epoch": 0.5098039215686274, "step": 780}, {"loss": 1.3398, "grad_norm": 0.6004197001457214, "learning_rate": 0.0002, "epoch": 0.5163398692810458, "step": 790}, {"loss": 1.5346, "grad_norm": 0.7167016863822937, "learning_rate": 0.0002, "epoch": 0.5228758169934641, "step": 800}, {"loss": 1.48, "grad_norm": 0.710218071937561, "learning_rate": 0.0002, "epoch": 0.5294117647058824, "step": 810}, {"loss": 1.3943, "grad_norm": 0.699528694152832, "learning_rate": 0.0002, "epoch": 0.5359477124183006, "step": 820}, {"loss": 1.6014, "grad_norm": 0.579629123210907, "learning_rate": 0.0002, "epoch": 0.5424836601307189, "step": 830}, {"loss": 1.3894, "grad_norm": 0.595407247543335, "learning_rate": 0.0002, "epoch": 0.5490196078431373, "step": 840}, {"loss": 1.6394, "grad_norm": 0.544563889503479, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 850}, {"loss": 1.4692, "grad_norm": 0.553166389465332, "learning_rate": 0.0002, "epoch": 0.5620915032679739, "step": 860}, {"loss": 1.5155, "grad_norm": 0.5645018815994263, "learning_rate": 0.0002, "epoch": 0.5686274509803921, "step": 870}, {"loss": 1.7019, "grad_norm": 0.6576932668685913, "learning_rate": 0.0002, "epoch": 0.5751633986928104, "step": 880}, {"loss": 1.5891, "grad_norm": 0.6684197187423706, "learning_rate": 0.0002, "epoch": 0.5816993464052288, "step": 890}, {"loss": 1.5348, "grad_norm": 0.6706975698471069, "learning_rate": 0.0002, "epoch": 0.5882352941176471, "step": 900}, {"loss": 1.4038, "grad_norm": 0.6762327551841736, "learning_rate": 0.0002, "epoch": 0.5947712418300654, "step": 910}, {"loss": 1.61, "grad_norm": 0.764032244682312, "learning_rate": 0.0002, "epoch": 0.6013071895424836, "step": 920}, {"loss": 1.436, "grad_norm": 0.6996400952339172, "learning_rate": 0.0002, "epoch": 0.6078431372549019, "step": 930}, {"loss": 1.6038, "grad_norm": 0.686735987663269, "learning_rate": 0.0002, "epoch": 0.6143790849673203, "step": 940}, {"loss": 1.5194, "grad_norm": 0.6086131930351257, "learning_rate": 0.0002, "epoch": 0.6209150326797386, "step": 950}, {"loss": 1.4457, "grad_norm": 0.5627856850624084, "learning_rate": 0.0002, "epoch": 0.6274509803921569, "step": 960}, {"loss": 1.506, "grad_norm": 0.5781503319740295, "learning_rate": 0.0002, "epoch": 0.6339869281045751, "step": 970}, {"loss": 1.5668, "grad_norm": 0.6347246766090393, "learning_rate": 0.0002, "epoch": 0.6405228758169934, "step": 980}, {"loss": 1.3819, "grad_norm": 0.6581300497055054, "learning_rate": 0.0002, "epoch": 0.6470588235294118, "step": 990}, {"loss": 1.6425, "grad_norm": 0.8343676924705505, "learning_rate": 0.0002, "epoch": 0.6535947712418301, "step": 1000}, {"loss": 1.5188, "grad_norm": 0.5708910226821899, "learning_rate": 0.0002, "epoch": 0.6601307189542484, "step": 1010}, {"loss": 1.3882, "grad_norm": 0.6832585334777832, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 1020}, {"loss": 1.645, "grad_norm": 0.5767837166786194, "learning_rate": 0.0002, "epoch": 0.673202614379085, "step": 1030}, {"loss": 1.4206, "grad_norm": 0.5637745261192322, "learning_rate": 0.0002, "epoch": 0.6797385620915033, "step": 1040}, {"loss": 1.4325, "grad_norm": 0.8193050026893616, "learning_rate": 0.0002, "epoch": 0.6862745098039216, "step": 1050}, {"loss": 1.4196, "grad_norm": 0.6157439351081848, "learning_rate": 0.0002, "epoch": 0.6928104575163399, "step": 1060}, {"loss": 1.5547, "grad_norm": 0.7476664781570435, "learning_rate": 0.0002, "epoch": 0.6993464052287581, "step": 1070}, {"loss": 1.5337, "grad_norm": 0.8569361567497253, "learning_rate": 0.0002, "epoch": 0.7058823529411765, "step": 1080}, {"loss": 1.482, "grad_norm": 0.5671911835670471, "learning_rate": 0.0002, "epoch": 0.7124183006535948, "step": 1090}, {"loss": 1.5398, "grad_norm": 0.5151128768920898, "learning_rate": 0.0002, "epoch": 0.7189542483660131, "step": 1100}, {"loss": 1.4848, "grad_norm": 0.568037211894989, "learning_rate": 0.0002, "epoch": 0.7254901960784313, "step": 1110}, {"loss": 1.4708, "grad_norm": 0.6756396889686584, "learning_rate": 0.0002, "epoch": 0.7320261437908496, "step": 1120}, {"loss": 1.4017, "grad_norm": 0.638975977897644, "learning_rate": 0.0002, "epoch": 0.738562091503268, "step": 1130}, {"loss": 1.6028, "grad_norm": 0.7103341221809387, "learning_rate": 0.0002, "epoch": 0.7450980392156863, "step": 1140}, {"loss": 1.3766, "grad_norm": 0.7403952479362488, "learning_rate": 0.0002, "epoch": 0.7516339869281046, "step": 1150}, {"loss": 1.4757, "grad_norm": 0.6266511082649231, "learning_rate": 0.0002, "epoch": 0.7581699346405228, "step": 1160}, {"loss": 1.4468, "grad_norm": 0.5939070582389832, "learning_rate": 0.0002, "epoch": 0.7647058823529411, "step": 1170}, {"loss": 1.4145, "grad_norm": 0.5735430717468262, "learning_rate": 0.0002, "epoch": 0.7712418300653595, "step": 1180}, {"loss": 1.3891, "grad_norm": 0.5155234932899475, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 1190}, {"loss": 1.4942, "grad_norm": 0.5115423202514648, "learning_rate": 0.0002, "epoch": 0.7843137254901961, "step": 1200}, {"loss": 1.4508, "grad_norm": 0.693588137626648, "learning_rate": 0.0002, "epoch": 0.7908496732026143, "step": 1210}, {"loss": 1.308, "grad_norm": 0.5504693984985352, "learning_rate": 0.0002, "epoch": 0.7973856209150327, "step": 1220}, {"loss": 1.5412, "grad_norm": 0.5555992126464844, "learning_rate": 0.0002, "epoch": 0.803921568627451, "step": 1230}, {"loss": 1.5506, "grad_norm": 0.7211785316467285, "learning_rate": 0.0002, "epoch": 0.8104575163398693, "step": 1240}, {"loss": 1.6163, "grad_norm": 0.735003650188446, "learning_rate": 0.0002, "epoch": 0.8169934640522876, "step": 1250}, {"loss": 1.5836, "grad_norm": 0.5245152711868286, "learning_rate": 0.0002, "epoch": 0.8235294117647058, "step": 1260}, {"loss": 1.4505, "grad_norm": 0.5883445739746094, "learning_rate": 0.0002, "epoch": 0.8300653594771242, "step": 1270}, {"loss": 1.3642, "grad_norm": 0.6835859417915344, "learning_rate": 0.0002, "epoch": 0.8366013071895425, "step": 1280}, {"loss": 1.5526, "grad_norm": 0.6592142581939697, "learning_rate": 0.0002, "epoch": 0.8431372549019608, "step": 1290}, {"loss": 1.52, "grad_norm": 0.6087474226951599, "learning_rate": 0.0002, "epoch": 0.8496732026143791, "step": 1300}, {"loss": 1.3807, "grad_norm": 0.565387487411499, "learning_rate": 0.0002, "epoch": 0.8562091503267973, "step": 1310}, {"loss": 1.4809, "grad_norm": 0.7363151907920837, "learning_rate": 0.0002, "epoch": 0.8627450980392157, "step": 1320}, {"loss": 1.5683, "grad_norm": 0.5964524149894714, "learning_rate": 0.0002, "epoch": 0.869281045751634, "step": 1330}, {"loss": 1.3284, "grad_norm": 0.5169979929924011, "learning_rate": 0.0002, "epoch": 0.8758169934640523, "step": 1340}, {"loss": 1.6279, "grad_norm": 0.7063422799110413, "learning_rate": 0.0002, "epoch": 0.8823529411764706, "step": 1350}, {"loss": 1.3072, "grad_norm": 0.7261926531791687, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 1360}, {"loss": 1.3619, "grad_norm": 0.6759744882583618, "learning_rate": 0.0002, "epoch": 0.8954248366013072, "step": 1370}, {"loss": 1.4079, "grad_norm": 0.675051212310791, "learning_rate": 0.0002, "epoch": 0.9019607843137255, "step": 1380}, {"loss": 1.6606, "grad_norm": 0.5613595843315125, "learning_rate": 0.0002, "epoch": 0.9084967320261438, "step": 1390}, {"loss": 1.414, "grad_norm": 0.611732006072998, "learning_rate": 0.0002, "epoch": 0.9150326797385621, "step": 1400}, {"loss": 1.5766, "grad_norm": 0.6365187168121338, "learning_rate": 0.0002, "epoch": 0.9215686274509803, "step": 1410}, {"loss": 1.7832, "grad_norm": 0.7810426354408264, "learning_rate": 0.0002, "epoch": 0.9281045751633987, "step": 1420}, {"loss": 1.5377, "grad_norm": 0.593891441822052, "learning_rate": 0.0002, "epoch": 0.934640522875817, "step": 1430}, {"loss": 1.4468, "grad_norm": 0.761585533618927, "learning_rate": 0.0002, "epoch": 0.9411764705882353, "step": 1440}, {"loss": 1.589, "grad_norm": 0.6114464998245239, "learning_rate": 0.0002, "epoch": 0.9477124183006536, "step": 1450}, {"loss": 1.4973, "grad_norm": 0.601044774055481, "learning_rate": 0.0002, "epoch": 0.954248366013072, "step": 1460}, {"loss": 1.4162, "grad_norm": 0.5484876036643982, "learning_rate": 0.0002, "epoch": 0.9607843137254902, "step": 1470}, {"loss": 1.4825, "grad_norm": 0.5383428335189819, "learning_rate": 0.0002, "epoch": 0.9673202614379085, "step": 1480}, {"loss": 1.5543, "grad_norm": 0.648106575012207, "learning_rate": 0.0002, "epoch": 0.9738562091503268, "step": 1490}, {"loss": 1.3638, "grad_norm": 0.6847249865531921, "learning_rate": 0.0002, "epoch": 0.9803921568627451, "step": 1500}, {"loss": 1.4247, "grad_norm": 0.6361058354377747, "learning_rate": 0.0002, "epoch": 0.9869281045751634, "step": 1510}, {"loss": 1.5131, "grad_norm": 0.646392285823822, "learning_rate": 0.0002, "epoch": 0.9934640522875817, "step": 1520}, {"loss": 1.3738, "grad_norm": 0.5391159057617188, "learning_rate": 0.0002, "epoch": 1.0, "step": 1530}, {"eval_loss": 1.4715123176574707, "eval_runtime": 30.5701, "eval_samples_per_second": 14.262, "eval_steps_per_second": 1.799, "epoch": 1.0, "step": 1530}, {"loss": 1.4827, "grad_norm": 0.5468988418579102, "learning_rate": 0.0002, "epoch": 1.0065359477124183, "step": 1540}, {"loss": 1.4342, "grad_norm": 0.629940927028656, "learning_rate": 0.0002, "epoch": 1.0130718954248366, "step": 1550}, {"loss": 1.4259, "grad_norm": 0.6411303281784058, "learning_rate": 0.0002, "epoch": 1.0196078431372548, "step": 1560}, {"loss": 1.3924, "grad_norm": 0.5619024038314819, "learning_rate": 0.0002, "epoch": 1.026143790849673, "step": 1570}, {"loss": 1.6086, "grad_norm": 0.6093462705612183, "learning_rate": 0.0002, "epoch": 1.0326797385620916, "step": 1580}, {"loss": 1.4547, "grad_norm": 0.5543286204338074, "learning_rate": 0.0002, "epoch": 1.0392156862745099, "step": 1590}, {"loss": 1.3738, "grad_norm": 0.6079006195068359, "learning_rate": 0.0002, "epoch": 1.0457516339869282, "step": 1600}, {"loss": 1.4574, "grad_norm": 0.6240813136100769, "learning_rate": 0.0002, "epoch": 1.0522875816993464, "step": 1610}, {"loss": 1.3504, "grad_norm": 0.6141977310180664, "learning_rate": 0.0002, "epoch": 1.0588235294117647, "step": 1620}, {"loss": 1.3668, "grad_norm": 0.5920178294181824, "learning_rate": 0.0002, "epoch": 1.065359477124183, "step": 1630}, {"loss": 1.3204, "grad_norm": 0.47620782256126404, "learning_rate": 0.0002, "epoch": 1.0718954248366013, "step": 1640}, {"loss": 1.3249, "grad_norm": 0.6826292872428894, "learning_rate": 0.0002, "epoch": 1.0784313725490196, "step": 1650}, {"loss": 1.2285, "grad_norm": 0.6182006597518921, "learning_rate": 0.0002, "epoch": 1.0849673202614378, "step": 1660}, {"loss": 1.2907, "grad_norm": 0.57639479637146, "learning_rate": 0.0002, "epoch": 1.091503267973856, "step": 1670}, {"loss": 1.4575, "grad_norm": 0.6696860194206238, "learning_rate": 0.0002, "epoch": 1.0980392156862746, "step": 1680}, {"loss": 1.4104, "grad_norm": 0.699221670627594, "learning_rate": 0.0002, "epoch": 1.1045751633986929, "step": 1690}, {"loss": 1.3667, "grad_norm": 0.7138059139251709, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 1700}, {"loss": 1.3468, "grad_norm": 0.6930422186851501, "learning_rate": 0.0002, "epoch": 1.1176470588235294, "step": 1710}, {"loss": 1.5033, "grad_norm": 0.7484048008918762, "learning_rate": 0.0002, "epoch": 1.1241830065359477, "step": 1720}, {"loss": 1.4582, "grad_norm": 0.5820090174674988, "learning_rate": 0.0002, "epoch": 1.130718954248366, "step": 1730}, {"loss": 1.3704, "grad_norm": 0.7143406867980957, "learning_rate": 0.0002, "epoch": 1.1372549019607843, "step": 1740}, {"loss": 1.277, "grad_norm": 0.5597584247589111, "learning_rate": 0.0002, "epoch": 1.1437908496732025, "step": 1750}, {"loss": 1.5403, "grad_norm": 0.5171173214912415, "learning_rate": 0.0002, "epoch": 1.1503267973856208, "step": 1760}, {"loss": 1.419, "grad_norm": 0.5951920747756958, "learning_rate": 0.0002, "epoch": 1.156862745098039, "step": 1770}, {"loss": 1.2929, "grad_norm": 0.7506247758865356, "learning_rate": 0.0002, "epoch": 1.1633986928104576, "step": 1780}, {"loss": 1.5475, "grad_norm": 0.5936487913131714, "learning_rate": 0.0002, "epoch": 1.1699346405228759, "step": 1790}, {"loss": 1.3567, "grad_norm": 0.688450038433075, "learning_rate": 0.0002, "epoch": 1.1764705882352942, "step": 1800}, {"loss": 1.314, "grad_norm": 0.671623170375824, "learning_rate": 0.0002, "epoch": 1.1830065359477124, "step": 1810}, {"loss": 1.3803, "grad_norm": 0.6911860704421997, "learning_rate": 0.0002, "epoch": 1.1895424836601307, "step": 1820}, {"loss": 1.363, "grad_norm": 0.60726398229599, "learning_rate": 0.0002, "epoch": 1.196078431372549, "step": 1830}, {"loss": 1.5236, "grad_norm": 0.7542088627815247, "learning_rate": 0.0002, "epoch": 1.2026143790849673, "step": 1840}, {"loss": 1.4343, "grad_norm": 0.6810969710350037, "learning_rate": 0.0002, "epoch": 1.2091503267973855, "step": 1850}, {"loss": 1.446, "grad_norm": 0.579741895198822, "learning_rate": 0.0002, "epoch": 1.215686274509804, "step": 1860}, {"loss": 1.4564, "grad_norm": 0.9925695657730103, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 1870}, {"loss": 1.5516, "grad_norm": 0.5919767618179321, "learning_rate": 0.0002, "epoch": 1.2287581699346406, "step": 1880}, {"loss": 1.5015, "grad_norm": 0.7377090454101562, "learning_rate": 0.0002, "epoch": 1.2352941176470589, "step": 1890}, {"loss": 1.4756, "grad_norm": 0.5753688812255859, "learning_rate": 0.0002, "epoch": 1.2418300653594772, "step": 1900}, {"loss": 1.3543, "grad_norm": 0.6362486481666565, "learning_rate": 0.0002, "epoch": 1.2483660130718954, "step": 1910}, {"loss": 1.4153, "grad_norm": 0.5747467875480652, "learning_rate": 0.0002, "epoch": 1.2549019607843137, "step": 1920}, {"loss": 1.5082, "grad_norm": 0.6831939220428467, "learning_rate": 0.0002, "epoch": 1.261437908496732, "step": 1930}, {"loss": 1.3509, "grad_norm": 0.6414040327072144, "learning_rate": 0.0002, "epoch": 1.2679738562091503, "step": 1940}, {"loss": 1.5099, "grad_norm": 0.5613330006599426, "learning_rate": 0.0002, "epoch": 1.2745098039215685, "step": 1950}, {"loss": 1.377, "grad_norm": 0.5838454961776733, "learning_rate": 0.0002, "epoch": 1.2810457516339868, "step": 1960}, {"loss": 1.3548, "grad_norm": 0.5367192029953003, "learning_rate": 0.0002, "epoch": 1.287581699346405, "step": 1970}, {"loss": 1.4602, "grad_norm": 0.5829346776008606, "learning_rate": 0.0002, "epoch": 1.2941176470588236, "step": 1980}, {"loss": 1.3821, "grad_norm": 0.756534218788147, "learning_rate": 0.0002, "epoch": 1.3006535947712419, "step": 1990}, {"loss": 1.389, "grad_norm": 0.48002561926841736, "learning_rate": 0.0002, "epoch": 1.3071895424836601, "step": 2000}, {"loss": 1.256, "grad_norm": 0.5461082458496094, "learning_rate": 0.0002, "epoch": 1.3137254901960784, "step": 2010}, {"loss": 1.6257, "grad_norm": 0.570399284362793, "learning_rate": 0.0002, "epoch": 1.3202614379084967, "step": 2020}, {"loss": 1.4356, "grad_norm": 0.5130975842475891, "learning_rate": 0.0002, "epoch": 1.326797385620915, "step": 2030}, {"loss": 1.3552, "grad_norm": 0.6290071606636047, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 2040}, {"loss": 1.3873, "grad_norm": 0.6165726184844971, "learning_rate": 0.0002, "epoch": 1.3398692810457518, "step": 2050}, {"loss": 1.4376, "grad_norm": 0.5302083492279053, "learning_rate": 0.0002, "epoch": 1.34640522875817, "step": 2060}, {"loss": 1.4722, "grad_norm": 0.6531406044960022, "learning_rate": 0.0002, "epoch": 1.3529411764705883, "step": 2070}, {"loss": 1.3632, "grad_norm": 0.5981236100196838, "learning_rate": 0.0002, "epoch": 1.3594771241830066, "step": 2080}, {"loss": 1.4846, "grad_norm": 0.8534150123596191, "learning_rate": 0.0002, "epoch": 1.3660130718954249, "step": 2090}, {"loss": 1.3249, "grad_norm": 0.695918083190918, "learning_rate": 0.0002, "epoch": 1.3725490196078431, "step": 2100}, {"loss": 1.4989, "grad_norm": 0.5830431580543518, "learning_rate": 0.0002, "epoch": 1.3790849673202614, "step": 2110}, {"loss": 1.5009, "grad_norm": 0.5641306638717651, "learning_rate": 0.0002, "epoch": 1.3856209150326797, "step": 2120}, {"loss": 1.3985, "grad_norm": 0.6354436874389648, "learning_rate": 0.0002, "epoch": 1.392156862745098, "step": 2130}, {"loss": 1.2737, "grad_norm": 0.5707540512084961, "learning_rate": 0.0002, "epoch": 1.3986928104575163, "step": 2140}, {"loss": 1.3815, "grad_norm": 0.7308434844017029, "learning_rate": 0.0002, "epoch": 1.4052287581699345, "step": 2150}, {"loss": 1.3993, "grad_norm": 0.5879750847816467, "learning_rate": 0.0002, "epoch": 1.4117647058823528, "step": 2160}, {"loss": 1.3729, "grad_norm": 0.627909243106842, "learning_rate": 0.0002, "epoch": 1.4183006535947713, "step": 2170}, {"loss": 1.3391, "grad_norm": 0.5228193998336792, "learning_rate": 0.0002, "epoch": 1.4248366013071896, "step": 2180}, {"loss": 1.457, "grad_norm": 0.6162880659103394, "learning_rate": 0.0002, "epoch": 1.4313725490196079, "step": 2190}, {"loss": 1.4052, "grad_norm": 0.751610517501831, "learning_rate": 0.0002, "epoch": 1.4379084967320261, "step": 2200}, {"loss": 1.4105, "grad_norm": 0.5623487234115601, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 2210}, {"loss": 1.3795, "grad_norm": 0.5293187499046326, "learning_rate": 0.0002, "epoch": 1.4509803921568627, "step": 2220}, {"loss": 1.4247, "grad_norm": 0.5903629660606384, "learning_rate": 0.0002, "epoch": 1.457516339869281, "step": 2230}, {"loss": 1.6167, "grad_norm": 0.6084659099578857, "learning_rate": 0.0002, "epoch": 1.4640522875816995, "step": 2240}, {"loss": 1.319, "grad_norm": 0.5289803147315979, "learning_rate": 0.0002, "epoch": 1.4705882352941178, "step": 2250}, {"loss": 1.3106, "grad_norm": 0.49499568343162537, "learning_rate": 0.0002, "epoch": 1.477124183006536, "step": 2260}, {"loss": 1.3586, "grad_norm": 0.7774190306663513, "learning_rate": 0.0002, "epoch": 1.4836601307189543, "step": 2270}, {"loss": 1.3075, "grad_norm": 0.5932538509368896, "learning_rate": 0.0002, "epoch": 1.4901960784313726, "step": 2280}, {"loss": 1.3241, "grad_norm": 0.6009492874145508, "learning_rate": 0.0002, "epoch": 1.4967320261437909, "step": 2290}, {"loss": 1.3728, "grad_norm": 0.5559343099594116, "learning_rate": 0.0002, "epoch": 1.5032679738562091, "step": 2300}, {"loss": 1.2379, "grad_norm": 0.5956196188926697, "learning_rate": 0.0002, "epoch": 1.5098039215686274, "step": 2310}, {"loss": 1.5292, "grad_norm": 0.5624083876609802, "learning_rate": 0.0002, "epoch": 1.5163398692810457, "step": 2320}, {"loss": 1.4779, "grad_norm": 0.7195250391960144, "learning_rate": 0.0002, "epoch": 1.522875816993464, "step": 2330}, {"loss": 1.2938, "grad_norm": 0.6010490655899048, "learning_rate": 0.0002, "epoch": 1.5294117647058822, "step": 2340}, {"loss": 1.4121, "grad_norm": 0.664929211139679, "learning_rate": 0.0002, "epoch": 1.5359477124183005, "step": 2350}, {"loss": 1.4362, "grad_norm": 0.5158776640892029, "learning_rate": 0.0002, "epoch": 1.5424836601307188, "step": 2360}, {"loss": 1.2157, "grad_norm": 0.5147154927253723, "learning_rate": 0.0002, "epoch": 1.5490196078431373, "step": 2370}, {"loss": 1.2643, "grad_norm": 0.6507977843284607, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 2380}, {"loss": 1.2786, "grad_norm": 0.5193192362785339, "learning_rate": 0.0002, "epoch": 1.5620915032679739, "step": 2390}, {"loss": 1.3209, "grad_norm": 0.5982314944267273, "learning_rate": 0.0002, "epoch": 1.5686274509803921, "step": 2400}, {"loss": 1.3585, "grad_norm": 0.49106258153915405, "learning_rate": 0.0002, "epoch": 1.5751633986928104, "step": 2410}, {"loss": 1.3618, "grad_norm": 0.6459611654281616, "learning_rate": 0.0002, "epoch": 1.581699346405229, "step": 2420}, {"loss": 1.3305, "grad_norm": 0.7038363218307495, "learning_rate": 0.0002, "epoch": 1.5882352941176472, "step": 2430}, {"loss": 1.3198, "grad_norm": 0.5245680212974548, "learning_rate": 0.0002, "epoch": 1.5947712418300655, "step": 2440}, {"loss": 1.4756, "grad_norm": 0.6562076210975647, "learning_rate": 0.0002, "epoch": 1.6013071895424837, "step": 2450}, {"loss": 1.5635, "grad_norm": 0.6491968035697937, "learning_rate": 0.0002, "epoch": 1.607843137254902, "step": 2460}, {"loss": 1.3657, "grad_norm": 0.604034960269928, "learning_rate": 0.0002, "epoch": 1.6143790849673203, "step": 2470}, {"loss": 1.2693, "grad_norm": 0.5759671330451965, "learning_rate": 0.0002, "epoch": 1.6209150326797386, "step": 2480}, {"loss": 1.4136, "grad_norm": 0.6157698631286621, "learning_rate": 0.0002, "epoch": 1.6274509803921569, "step": 2490}, {"loss": 1.3929, "grad_norm": 0.6513794660568237, "learning_rate": 0.0002, "epoch": 1.6339869281045751, "step": 2500}, {"loss": 1.4283, "grad_norm": 0.71990966796875, "learning_rate": 0.0002, "epoch": 1.6405228758169934, "step": 2510}, {"loss": 1.4356, "grad_norm": 0.7316617369651794, "learning_rate": 0.0002, "epoch": 1.6470588235294117, "step": 2520}, {"loss": 1.3119, "grad_norm": 0.5475177764892578, "learning_rate": 0.0002, "epoch": 1.65359477124183, "step": 2530}, {"loss": 1.2998, "grad_norm": 0.4911293089389801, "learning_rate": 0.0002, "epoch": 1.6601307189542482, "step": 2540}, {"loss": 1.4198, "grad_norm": 0.6122882962226868, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 2550}, {"loss": 1.3099, "grad_norm": 0.5735281705856323, "learning_rate": 0.0002, "epoch": 1.673202614379085, "step": 2560}, {"loss": 1.2205, "grad_norm": 0.5046352744102478, "learning_rate": 0.0002, "epoch": 1.6797385620915033, "step": 2570}, {"loss": 1.3191, "grad_norm": 0.6043242812156677, "learning_rate": 0.0002, "epoch": 1.6862745098039216, "step": 2580}, {"loss": 1.3079, "grad_norm": 0.5397698283195496, "learning_rate": 0.0002, "epoch": 1.6928104575163399, "step": 2590}, {"loss": 1.4916, "grad_norm": 0.8066475987434387, "learning_rate": 0.0002, "epoch": 1.6993464052287581, "step": 2600}, {"loss": 1.3703, "grad_norm": 0.52901691198349, "learning_rate": 0.0002, "epoch": 1.7058823529411766, "step": 2610}, {"loss": 1.409, "grad_norm": 0.7588503956794739, "learning_rate": 0.0002, "epoch": 1.712418300653595, "step": 2620}, {"loss": 1.3806, "grad_norm": 0.6012966632843018, "learning_rate": 0.0002, "epoch": 1.7189542483660132, "step": 2630}, {"loss": 1.2583, "grad_norm": 0.5927302837371826, "learning_rate": 0.0002, "epoch": 1.7254901960784315, "step": 2640}, {"loss": 1.4523, "grad_norm": 0.5086990594863892, "learning_rate": 0.0002, "epoch": 1.7320261437908497, "step": 2650}, {"loss": 1.5452, "grad_norm": 0.6000628471374512, "learning_rate": 0.0002, "epoch": 1.738562091503268, "step": 2660}, {"loss": 1.3269, "grad_norm": 0.6560431718826294, "learning_rate": 0.0002, "epoch": 1.7450980392156863, "step": 2670}, {"loss": 1.3982, "grad_norm": 0.5738165378570557, "learning_rate": 0.0002, "epoch": 1.7516339869281046, "step": 2680}, {"loss": 1.3766, "grad_norm": 0.5576106905937195, "learning_rate": 0.0002, "epoch": 1.7581699346405228, "step": 2690}, {"loss": 1.3277, "grad_norm": 0.7298802137374878, "learning_rate": 0.0002, "epoch": 1.7647058823529411, "step": 2700}, {"loss": 1.2618, "grad_norm": 0.5751826167106628, "learning_rate": 0.0002, "epoch": 1.7712418300653594, "step": 2710}, {"loss": 1.35, "grad_norm": 0.6069957613945007, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 2720}, {"loss": 1.3492, "grad_norm": 0.7513017654418945, "learning_rate": 0.0002, "epoch": 1.784313725490196, "step": 2730}, {"loss": 1.2979, "grad_norm": 0.6058869957923889, "learning_rate": 0.0002, "epoch": 1.7908496732026142, "step": 2740}, {"loss": 1.299, "grad_norm": 0.6805883049964905, "learning_rate": 0.0002, "epoch": 1.7973856209150327, "step": 2750}, {"loss": 1.4062, "grad_norm": 0.6864324808120728, "learning_rate": 0.0002, "epoch": 1.803921568627451, "step": 2760}, {"loss": 1.355, "grad_norm": 0.6261002421379089, "learning_rate": 0.0002, "epoch": 1.8104575163398693, "step": 2770}, {"loss": 1.5145, "grad_norm": 0.532684862613678, "learning_rate": 0.0002, "epoch": 1.8169934640522876, "step": 2780}, {"loss": 1.3248, "grad_norm": 0.6209020018577576, "learning_rate": 0.0002, "epoch": 1.8235294117647058, "step": 2790}, {"loss": 1.3908, "grad_norm": 0.67111736536026, "learning_rate": 0.0002, "epoch": 1.8300653594771243, "step": 2800}, {"loss": 1.5088, "grad_norm": 0.700467586517334, "learning_rate": 0.0002, "epoch": 1.8366013071895426, "step": 2810}, {"loss": 1.348, "grad_norm": 0.6968029141426086, "learning_rate": 0.0002, "epoch": 1.843137254901961, "step": 2820}, {"loss": 1.3943, "grad_norm": 0.6405863761901855, "learning_rate": 0.0002, "epoch": 1.8496732026143792, "step": 2830}, {"loss": 1.4035, "grad_norm": 0.5192584991455078, "learning_rate": 0.0002, "epoch": 1.8562091503267975, "step": 2840}, {"loss": 1.2745, "grad_norm": 0.4888569414615631, "learning_rate": 0.0002, "epoch": 1.8627450980392157, "step": 2850}, {"loss": 1.4324, "grad_norm": 0.7625455856323242, "learning_rate": 0.0002, "epoch": 1.869281045751634, "step": 2860}, {"loss": 1.4989, "grad_norm": 0.9162808656692505, "learning_rate": 0.0002, "epoch": 1.8758169934640523, "step": 2870}, {"loss": 1.3978, "grad_norm": 0.5472783446311951, "learning_rate": 0.0002, "epoch": 1.8823529411764706, "step": 2880}, {"loss": 1.3026, "grad_norm": 0.5221137404441833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 2890}, {"loss": 1.33, "grad_norm": 0.49258849024772644, "learning_rate": 0.0002, "epoch": 1.8954248366013071, "step": 2900}, {"loss": 1.3503, "grad_norm": 0.5260750651359558, "learning_rate": 0.0002, "epoch": 1.9019607843137254, "step": 2910}, {"loss": 1.3381, "grad_norm": 0.6583314538002014, "learning_rate": 0.0002, "epoch": 1.9084967320261437, "step": 2920}, {"loss": 1.356, "grad_norm": 0.5728915929794312, "learning_rate": 0.0002, "epoch": 1.915032679738562, "step": 2930}, {"loss": 1.3993, "grad_norm": 0.7661453485488892, "learning_rate": 0.0002, "epoch": 1.9215686274509802, "step": 2940}, {"loss": 1.428, "grad_norm": 0.7193911075592041, "learning_rate": 0.0002, "epoch": 1.9281045751633987, "step": 2950}, {"loss": 1.287, "grad_norm": 0.5007768869400024, "learning_rate": 0.0002, "epoch": 1.934640522875817, "step": 2960}, {"loss": 1.372, "grad_norm": 0.626681923866272, "learning_rate": 0.0002, "epoch": 1.9411764705882353, "step": 2970}, {"loss": 1.375, "grad_norm": 0.8692840933799744, "learning_rate": 0.0002, "epoch": 1.9477124183006536, "step": 2980}, {"loss": 1.3292, "grad_norm": 0.6388291120529175, "learning_rate": 0.0002, "epoch": 1.954248366013072, "step": 2990}, {"loss": 1.4593, "grad_norm": 0.7710477113723755, "learning_rate": 0.0002, "epoch": 1.9607843137254903, "step": 3000}, {"loss": 1.5228, "grad_norm": 0.641704261302948, "learning_rate": 0.0002, "epoch": 1.9673202614379086, "step": 3010}, {"loss": 1.3246, "grad_norm": 0.621148943901062, "learning_rate": 0.0002, "epoch": 1.973856209150327, "step": 3020}, {"loss": 1.3017, "grad_norm": 0.5119547247886658, "learning_rate": 0.0002, "epoch": 1.9803921568627452, "step": 3030}, {"loss": 1.4923, "grad_norm": 0.8104137778282166, "learning_rate": 0.0002, "epoch": 1.9869281045751634, "step": 3040}, {"loss": 1.3331, "grad_norm": 0.5856240391731262, "learning_rate": 0.0002, "epoch": 1.9934640522875817, "step": 3050}, {"loss": 1.4346, "grad_norm": 0.5263566374778748, "learning_rate": 0.0002, "epoch": 2.0, "step": 3060}, {"eval_loss": 1.4276371002197266, "eval_runtime": 30.5759, "eval_samples_per_second": 14.26, "eval_steps_per_second": 1.799, "epoch": 2.0, "step": 3060}, {"loss": 1.1636, "grad_norm": 0.5143898725509644, "learning_rate": 0.0002, "epoch": 2.0065359477124183, "step": 3070}, {"loss": 1.3335, "grad_norm": 0.5749367475509644, "learning_rate": 0.0002, "epoch": 2.0130718954248366, "step": 3080}, {"loss": 1.2784, "grad_norm": 0.5784284472465515, "learning_rate": 0.0002, "epoch": 2.019607843137255, "step": 3090}, {"loss": 1.2463, "grad_norm": 0.5933429598808289, "learning_rate": 0.0002, "epoch": 2.026143790849673, "step": 3100}, {"loss": 1.2984, "grad_norm": 0.6748974919319153, "learning_rate": 0.0002, "epoch": 2.0326797385620914, "step": 3110}, {"loss": 1.2307, "grad_norm": 0.626399576663971, "learning_rate": 0.0002, "epoch": 2.0392156862745097, "step": 3120}, {"loss": 1.299, "grad_norm": 0.6173238754272461, "learning_rate": 0.0002, "epoch": 2.045751633986928, "step": 3130}, {"loss": 1.4144, "grad_norm": 0.807790219783783, "learning_rate": 0.0002, "epoch": 2.052287581699346, "step": 3140}, {"loss": 1.1953, "grad_norm": 0.6222215890884399, "learning_rate": 0.0002, "epoch": 2.0588235294117645, "step": 3150}, {"loss": 1.4059, "grad_norm": 0.5859580636024475, "learning_rate": 0.0002, "epoch": 2.065359477124183, "step": 3160}, {"loss": 1.3607, "grad_norm": 0.581304132938385, "learning_rate": 0.0002, "epoch": 2.0718954248366015, "step": 3170}, {"loss": 1.1212, "grad_norm": 0.9814971089363098, "learning_rate": 0.0002, "epoch": 2.0784313725490198, "step": 3180}, {"loss": 1.1962, "grad_norm": 0.6491848230361938, "learning_rate": 0.0002, "epoch": 2.084967320261438, "step": 3190}, {"loss": 1.3711, "grad_norm": 0.613680362701416, "learning_rate": 0.0002, "epoch": 2.0915032679738563, "step": 3200}, {"loss": 1.2994, "grad_norm": 0.7318086624145508, "learning_rate": 0.0002, "epoch": 2.0980392156862746, "step": 3210}, {"loss": 1.2502, "grad_norm": 0.6025661826133728, "learning_rate": 0.0002, "epoch": 2.104575163398693, "step": 3220}, {"loss": 1.1374, "grad_norm": 0.6744484305381775, "learning_rate": 0.0002, "epoch": 2.111111111111111, "step": 3230}, {"loss": 1.3273, "grad_norm": 0.6062554121017456, "learning_rate": 0.0002, "epoch": 2.1176470588235294, "step": 3240}, {"loss": 1.3404, "grad_norm": 0.6801803112030029, "learning_rate": 0.0002, "epoch": 2.1241830065359477, "step": 3250}, {"loss": 1.4084, "grad_norm": 0.5218925476074219, "learning_rate": 0.0002, "epoch": 2.130718954248366, "step": 3260}, {"loss": 1.2867, "grad_norm": 0.7494263648986816, "learning_rate": 0.0002, "epoch": 2.1372549019607843, "step": 3270}, {"loss": 1.3059, "grad_norm": 0.7858565449714661, "learning_rate": 0.0002, "epoch": 2.1437908496732025, "step": 3280}, {"loss": 1.3214, "grad_norm": 0.6836692690849304, "learning_rate": 0.0002, "epoch": 2.150326797385621, "step": 3290}, {"loss": 1.1605, "grad_norm": 0.619848370552063, "learning_rate": 0.0002, "epoch": 2.156862745098039, "step": 3300}, {"loss": 1.3095, "grad_norm": 0.5761294364929199, "learning_rate": 0.0002, "epoch": 2.1633986928104574, "step": 3310}, {"loss": 1.2883, "grad_norm": 0.4713786542415619, "learning_rate": 0.0002, "epoch": 2.1699346405228757, "step": 3320}, {"loss": 1.3817, "grad_norm": 0.7613773345947266, "learning_rate": 0.0002, "epoch": 2.176470588235294, "step": 3330}, {"loss": 1.2354, "grad_norm": 0.6642718315124512, "learning_rate": 0.0002, "epoch": 2.183006535947712, "step": 3340}, {"loss": 1.2048, "grad_norm": 0.7162188291549683, "learning_rate": 0.0002, "epoch": 2.189542483660131, "step": 3350}, {"loss": 1.3886, "grad_norm": 0.6916783452033997, "learning_rate": 0.0002, "epoch": 2.196078431372549, "step": 3360}, {"loss": 1.3788, "grad_norm": 0.7205567955970764, "learning_rate": 0.0002, "epoch": 2.2026143790849675, "step": 3370}, {"loss": 1.2528, "grad_norm": 0.6038199067115784, "learning_rate": 0.0002, "epoch": 2.2091503267973858, "step": 3380}, {"loss": 1.2079, "grad_norm": 0.6284233927726746, "learning_rate": 0.0002, "epoch": 2.215686274509804, "step": 3390}, {"loss": 1.3057, "grad_norm": 0.7450672388076782, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3400}, {"loss": 1.3034, "grad_norm": 0.7755052447319031, "learning_rate": 0.0002, "epoch": 2.2287581699346406, "step": 3410}, {"loss": 1.2953, "grad_norm": 0.9066099524497986, "learning_rate": 0.0002, "epoch": 2.235294117647059, "step": 3420}, {"loss": 1.3072, "grad_norm": 0.8578207492828369, "learning_rate": 0.0002, "epoch": 2.241830065359477, "step": 3430}, {"loss": 1.3278, "grad_norm": 0.5900213718414307, "learning_rate": 0.0002, "epoch": 2.2483660130718954, "step": 3440}, {"loss": 1.3645, "grad_norm": 0.7821717262268066, "learning_rate": 0.0002, "epoch": 2.2549019607843137, "step": 3450}, {"loss": 1.183, "grad_norm": 0.6263150572776794, "learning_rate": 0.0002, "epoch": 2.261437908496732, "step": 3460}, {"loss": 1.178, "grad_norm": 0.591799259185791, "learning_rate": 0.0002, "epoch": 2.2679738562091503, "step": 3470}, {"loss": 1.2198, "grad_norm": 0.5999799966812134, "learning_rate": 0.0002, "epoch": 2.2745098039215685, "step": 3480}, {"loss": 1.2724, "grad_norm": 0.6227319240570068, "learning_rate": 0.0002, "epoch": 2.281045751633987, "step": 3490}, {"loss": 1.3865, "grad_norm": 0.719412624835968, "learning_rate": 0.0002, "epoch": 2.287581699346405, "step": 3500}, {"loss": 1.3275, "grad_norm": 1.0361769199371338, "learning_rate": 0.0002, "epoch": 2.2941176470588234, "step": 3510}, {"loss": 1.4834, "grad_norm": 0.5506668090820312, "learning_rate": 0.0002, "epoch": 2.3006535947712417, "step": 3520}, {"loss": 1.2273, "grad_norm": 0.6886829733848572, "learning_rate": 0.0002, "epoch": 2.30718954248366, "step": 3530}, {"loss": 1.2296, "grad_norm": 0.6226346492767334, "learning_rate": 0.0002, "epoch": 2.313725490196078, "step": 3540}, {"loss": 1.3087, "grad_norm": 0.8109908103942871, "learning_rate": 0.0002, "epoch": 2.3202614379084965, "step": 3550}, {"loss": 1.3311, "grad_norm": 0.8505511283874512, "learning_rate": 0.0002, "epoch": 2.326797385620915, "step": 3560}, {"loss": 1.2526, "grad_norm": 0.5763760209083557, "learning_rate": 0.0002, "epoch": 2.3333333333333335, "step": 3570}, {"loss": 1.4135, "grad_norm": 0.6460059881210327, "learning_rate": 0.0002, "epoch": 2.3398692810457518, "step": 3580}, {"loss": 1.2701, "grad_norm": 0.7175343036651611, "learning_rate": 0.0002, "epoch": 2.34640522875817, "step": 3590}, {"loss": 1.2645, "grad_norm": 0.6012630462646484, "learning_rate": 0.0002, "epoch": 2.3529411764705883, "step": 3600}, {"loss": 1.3214, "grad_norm": 0.6513685584068298, "learning_rate": 0.0002, "epoch": 2.3594771241830066, "step": 3610}, {"loss": 1.3271, "grad_norm": 0.7465183734893799, "learning_rate": 0.0002, "epoch": 2.366013071895425, "step": 3620}, {"loss": 1.3671, "grad_norm": 0.6413124203681946, "learning_rate": 0.0002, "epoch": 2.372549019607843, "step": 3630}, {"loss": 1.4026, "grad_norm": 0.7209562063217163, "learning_rate": 0.0002, "epoch": 2.3790849673202614, "step": 3640}, {"loss": 1.1616, "grad_norm": 0.6427558660507202, "learning_rate": 0.0002, "epoch": 2.3856209150326797, "step": 3650}, {"loss": 1.313, "grad_norm": 0.593958854675293, "learning_rate": 0.0002, "epoch": 2.392156862745098, "step": 3660}, {"loss": 1.2802, "grad_norm": 0.5944608449935913, "learning_rate": 0.0002, "epoch": 2.3986928104575163, "step": 3670}, {"loss": 1.3542, "grad_norm": 0.6606248617172241, "learning_rate": 0.0002, "epoch": 2.4052287581699345, "step": 3680}, {"loss": 1.2977, "grad_norm": 0.5632851719856262, "learning_rate": 0.0002, "epoch": 2.411764705882353, "step": 3690}, {"loss": 1.2032, "grad_norm": 0.4976513385772705, "learning_rate": 0.0002, "epoch": 2.418300653594771, "step": 3700}, {"loss": 1.1404, "grad_norm": 0.6318528056144714, "learning_rate": 0.0002, "epoch": 2.4248366013071894, "step": 3710}, {"loss": 1.1705, "grad_norm": 0.6306707859039307, "learning_rate": 0.0002, "epoch": 2.431372549019608, "step": 3720}, {"loss": 1.3524, "grad_norm": 0.6362553238868713, "learning_rate": 0.0002, "epoch": 2.4379084967320264, "step": 3730}, {"loss": 1.2345, "grad_norm": 0.634368896484375, "learning_rate": 0.0002, "epoch": 2.4444444444444446, "step": 3740}, {"loss": 1.2515, "grad_norm": 0.6623591184616089, "learning_rate": 0.0002, "epoch": 2.450980392156863, "step": 3750}, {"loss": 1.3246, "grad_norm": 0.6150440573692322, "learning_rate": 0.0002, "epoch": 2.457516339869281, "step": 3760}, {"loss": 1.2666, "grad_norm": 0.588935911655426, "learning_rate": 0.0002, "epoch": 2.4640522875816995, "step": 3770}, {"loss": 1.3918, "grad_norm": 0.7388206124305725, "learning_rate": 0.0002, "epoch": 2.4705882352941178, "step": 3780}, {"loss": 1.2512, "grad_norm": 0.621825098991394, "learning_rate": 0.0002, "epoch": 2.477124183006536, "step": 3790}, {"loss": 1.359, "grad_norm": 0.7691677212715149, "learning_rate": 0.0002, "epoch": 2.4836601307189543, "step": 3800}, {"loss": 1.3399, "grad_norm": 1.1661969423294067, "learning_rate": 0.0002, "epoch": 2.4901960784313726, "step": 3810}, {"loss": 1.461, "grad_norm": 0.6837884187698364, "learning_rate": 0.0002, "epoch": 2.496732026143791, "step": 3820}, {"loss": 1.2823, "grad_norm": 0.6978904008865356, "learning_rate": 0.0002, "epoch": 2.503267973856209, "step": 3830}, {"loss": 1.3688, "grad_norm": 0.6121411323547363, "learning_rate": 0.0002, "epoch": 2.5098039215686274, "step": 3840}, {"loss": 1.2587, "grad_norm": 0.7813326120376587, "learning_rate": 0.0002, "epoch": 2.5163398692810457, "step": 3850}, {"loss": 1.1543, "grad_norm": 0.5390260219573975, "learning_rate": 0.0002, "epoch": 2.522875816993464, "step": 3860}, {"loss": 1.2032, "grad_norm": 0.8283252716064453, "learning_rate": 0.0002, "epoch": 2.5294117647058822, "step": 3870}, {"loss": 1.3112, "grad_norm": 0.8527186512947083, "learning_rate": 0.0002, "epoch": 2.5359477124183005, "step": 3880}, {"loss": 1.3469, "grad_norm": 0.8405382633209229, "learning_rate": 0.0002, "epoch": 2.542483660130719, "step": 3890}, {"loss": 1.1801, "grad_norm": 0.5650738477706909, "learning_rate": 0.0002, "epoch": 2.549019607843137, "step": 3900}, {"loss": 1.2917, "grad_norm": 0.620121955871582, "learning_rate": 0.0002, "epoch": 2.5555555555555554, "step": 3910}, {"loss": 1.2524, "grad_norm": 0.5983527898788452, "learning_rate": 0.0002, "epoch": 2.5620915032679736, "step": 3920}, {"loss": 1.4408, "grad_norm": 0.686623215675354, "learning_rate": 0.0002, "epoch": 2.568627450980392, "step": 3930}, {"loss": 1.186, "grad_norm": 0.6805831789970398, "learning_rate": 0.0002, "epoch": 2.57516339869281, "step": 3940}, {"loss": 1.367, "grad_norm": 0.6994825601577759, "learning_rate": 0.0002, "epoch": 2.581699346405229, "step": 3950}, {"loss": 1.3446, "grad_norm": 0.728549599647522, "learning_rate": 0.0002, "epoch": 2.588235294117647, "step": 3960}, {"loss": 1.4039, "grad_norm": 0.775236964225769, "learning_rate": 0.0002, "epoch": 2.5947712418300655, "step": 3970}, {"loss": 1.2742, "grad_norm": 0.5057447552680969, "learning_rate": 0.0002, "epoch": 2.6013071895424837, "step": 3980}, {"loss": 1.2764, "grad_norm": 0.6564450263977051, "learning_rate": 0.0002, "epoch": 2.607843137254902, "step": 3990}, {"loss": 1.3269, "grad_norm": 0.5342249870300293, "learning_rate": 0.0002, "epoch": 2.6143790849673203, "step": 4000}, {"loss": 1.3102, "grad_norm": 0.5508961081504822, "learning_rate": 0.0002, "epoch": 2.6209150326797386, "step": 4010}, {"loss": 1.3636, "grad_norm": 0.5716235637664795, "learning_rate": 0.0002, "epoch": 2.627450980392157, "step": 4020}, {"loss": 1.3465, "grad_norm": 0.8049232363700867, "learning_rate": 0.0002, "epoch": 2.633986928104575, "step": 4030}, {"loss": 1.2342, "grad_norm": 0.5574354529380798, "learning_rate": 0.0002, "epoch": 2.6405228758169934, "step": 4040}, {"loss": 1.2419, "grad_norm": 0.6302093863487244, "learning_rate": 0.0002, "epoch": 2.6470588235294117, "step": 4050}, {"loss": 1.2565, "grad_norm": 1.1868736743927002, "learning_rate": 0.0002, "epoch": 2.65359477124183, "step": 4060}, {"loss": 1.1382, "grad_norm": 0.6738120317459106, "learning_rate": 0.0002, "epoch": 2.6601307189542482, "step": 4070}, {"loss": 1.2456, "grad_norm": 0.6614423990249634, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 4080}, {"loss": 1.2958, "grad_norm": 0.7297604084014893, "learning_rate": 0.0002, "epoch": 2.6732026143790852, "step": 4090}, {"loss": 1.1596, "grad_norm": 0.9421682357788086, "learning_rate": 0.0002, "epoch": 2.6797385620915035, "step": 4100}, {"loss": 1.3002, "grad_norm": 0.5286222696304321, "learning_rate": 0.0002, "epoch": 2.686274509803922, "step": 4110}, {"loss": 1.3936, "grad_norm": 0.6849271655082703, "learning_rate": 0.0002, "epoch": 2.69281045751634, "step": 4120}, {"loss": 1.2721, "grad_norm": 0.6811320185661316, "learning_rate": 0.0002, "epoch": 2.6993464052287583, "step": 4130}, {"loss": 1.2897, "grad_norm": 0.4968419373035431, "learning_rate": 0.0002, "epoch": 2.7058823529411766, "step": 4140}, {"loss": 1.3322, "grad_norm": 0.8074267506599426, "learning_rate": 0.0002, "epoch": 2.712418300653595, "step": 4150}, {"loss": 1.1759, "grad_norm": 0.6756376028060913, "learning_rate": 0.0002, "epoch": 2.718954248366013, "step": 4160}, {"loss": 1.2444, "grad_norm": 0.6921583414077759, "learning_rate": 0.0002, "epoch": 2.7254901960784315, "step": 4170}, {"loss": 1.3413, "grad_norm": 0.7049834132194519, "learning_rate": 0.0002, "epoch": 2.7320261437908497, "step": 4180}, {"loss": 1.1965, "grad_norm": 0.7011390328407288, "learning_rate": 0.0002, "epoch": 2.738562091503268, "step": 4190}, {"loss": 1.2364, "grad_norm": 0.6977843642234802, "learning_rate": 0.0002, "epoch": 2.7450980392156863, "step": 4200}, {"loss": 1.2533, "grad_norm": 0.6717000603675842, "learning_rate": 0.0002, "epoch": 2.7516339869281046, "step": 4210}, {"loss": 1.392, "grad_norm": 1.0223724842071533, "learning_rate": 0.0002, "epoch": 2.758169934640523, "step": 4220}, {"loss": 1.2451, "grad_norm": 0.6573330760002136, "learning_rate": 0.0002, "epoch": 2.764705882352941, "step": 4230}, {"loss": 1.4219, "grad_norm": 0.6684938073158264, "learning_rate": 0.0002, "epoch": 2.7712418300653594, "step": 4240}, {"loss": 1.2505, "grad_norm": 0.7426793575286865, "learning_rate": 0.0002, "epoch": 2.7777777777777777, "step": 4250}, {"loss": 1.2904, "grad_norm": 0.557826578617096, "learning_rate": 0.0002, "epoch": 2.784313725490196, "step": 4260}, {"loss": 1.3262, "grad_norm": 0.6669870018959045, "learning_rate": 0.0002, "epoch": 2.7908496732026142, "step": 4270}, {"loss": 1.2369, "grad_norm": 0.5349969267845154, "learning_rate": 0.0002, "epoch": 2.7973856209150325, "step": 4280}, {"loss": 1.3769, "grad_norm": 0.7262802124023438, "learning_rate": 0.0002, "epoch": 2.803921568627451, "step": 4290}, {"loss": 1.3373, "grad_norm": 0.768211841583252, "learning_rate": 0.0002, "epoch": 2.810457516339869, "step": 4300}, {"loss": 1.2444, "grad_norm": 0.5958252549171448, "learning_rate": 0.0002, "epoch": 2.8169934640522873, "step": 4310}, {"loss": 1.4113, "grad_norm": 0.8451310396194458, "learning_rate": 0.0002, "epoch": 2.8235294117647056, "step": 4320}, {"loss": 1.2454, "grad_norm": 0.6544435024261475, "learning_rate": 0.0002, "epoch": 2.8300653594771243, "step": 4330}, {"loss": 1.2777, "grad_norm": 0.6177433133125305, "learning_rate": 0.0002, "epoch": 2.8366013071895426, "step": 4340}, {"loss": 1.2562, "grad_norm": 0.6324988007545471, "learning_rate": 0.0002, "epoch": 2.843137254901961, "step": 4350}, {"loss": 1.4117, "grad_norm": 0.6884300708770752, "learning_rate": 0.0002, "epoch": 2.849673202614379, "step": 4360}, {"loss": 1.2391, "grad_norm": 0.8952897191047668, "learning_rate": 0.0002, "epoch": 2.8562091503267975, "step": 4370}, {"loss": 1.2814, "grad_norm": 1.0260103940963745, "learning_rate": 0.0002, "epoch": 2.8627450980392157, "step": 4380}, {"loss": 1.2893, "grad_norm": 0.9134647250175476, "learning_rate": 0.0002, "epoch": 2.869281045751634, "step": 4390}, {"loss": 1.171, "grad_norm": 0.5637717843055725, "learning_rate": 0.0002, "epoch": 2.8758169934640523, "step": 4400}, {"loss": 1.3422, "grad_norm": 0.7530393004417419, "learning_rate": 0.0002, "epoch": 2.8823529411764706, "step": 4410}, {"loss": 1.29, "grad_norm": 0.7202680706977844, "learning_rate": 0.0002, "epoch": 2.888888888888889, "step": 4420}, {"loss": 1.2913, "grad_norm": 0.7177144885063171, "learning_rate": 0.0002, "epoch": 2.895424836601307, "step": 4430}, {"loss": 1.1922, "grad_norm": 0.5996816754341125, "learning_rate": 0.0002, "epoch": 2.9019607843137254, "step": 4440}, {"loss": 1.4816, "grad_norm": 0.6542447209358215, "learning_rate": 0.0002, "epoch": 2.9084967320261437, "step": 4450}, {"loss": 1.503, "grad_norm": 1.0753740072250366, "learning_rate": 0.0002, "epoch": 2.915032679738562, "step": 4460}, {"loss": 1.3193, "grad_norm": 0.6956136226654053, "learning_rate": 0.0002, "epoch": 2.9215686274509802, "step": 4470}, {"loss": 1.2486, "grad_norm": 0.7702530026435852, "learning_rate": 0.0002, "epoch": 2.928104575163399, "step": 4480}, {"loss": 1.3371, "grad_norm": 0.7763232588768005, "learning_rate": 0.0002, "epoch": 2.9346405228758172, "step": 4490}, {"loss": 1.1647, "grad_norm": 0.6393085718154907, "learning_rate": 0.0002, "epoch": 2.9411764705882355, "step": 4500}, {"loss": 1.211, "grad_norm": 0.987770676612854, "learning_rate": 0.0002, "epoch": 2.947712418300654, "step": 4510}, {"loss": 1.1529, "grad_norm": 0.5995016098022461, "learning_rate": 0.0002, "epoch": 2.954248366013072, "step": 4520}, {"loss": 1.2358, "grad_norm": 0.745650053024292, "learning_rate": 0.0002, "epoch": 2.9607843137254903, "step": 4530}, {"loss": 1.2115, "grad_norm": 0.7429282069206238, "learning_rate": 0.0002, "epoch": 2.9673202614379086, "step": 4540}, {"loss": 1.2262, "grad_norm": 0.5927486419677734, "learning_rate": 0.0002, "epoch": 2.973856209150327, "step": 4550}, {"loss": 1.3173, "grad_norm": 0.6775153875350952, "learning_rate": 0.0002, "epoch": 2.980392156862745, "step": 4560}, {"loss": 1.279, "grad_norm": 0.7128435373306274, "learning_rate": 0.0002, "epoch": 2.9869281045751634, "step": 4570}, {"loss": 1.2451, "grad_norm": 0.7470937967300415, "learning_rate": 0.0002, "epoch": 2.9934640522875817, "step": 4580}, {"loss": 1.2701, "grad_norm": 0.9295375943183899, "learning_rate": 0.0002, "epoch": 3.0, "step": 4590}, {"eval_loss": 1.4131312370300293, "eval_runtime": 31.8967, "eval_samples_per_second": 13.669, "eval_steps_per_second": 1.724, "epoch": 3.0, "step": 4590}, {"loss": 1.1283, "grad_norm": 0.6926420331001282, "learning_rate": 0.0002, "epoch": 3.0065359477124183, "step": 4600}, {"loss": 1.1537, "grad_norm": 0.6656355857849121, "learning_rate": 0.0002, "epoch": 3.0130718954248366, "step": 4610}, {"loss": 1.308, "grad_norm": 0.9901936650276184, "learning_rate": 0.0002, "epoch": 3.019607843137255, "step": 4620}, {"loss": 1.22, "grad_norm": 0.6713474988937378, "learning_rate": 0.0002, "epoch": 3.026143790849673, "step": 4630}, {"loss": 1.2249, "grad_norm": 0.6199324131011963, "learning_rate": 0.0002, "epoch": 3.0326797385620914, "step": 4640}, {"loss": 1.242, "grad_norm": 0.7180785536766052, "learning_rate": 0.0002, "epoch": 3.0392156862745097, "step": 4650}, {"loss": 1.1349, "grad_norm": 0.8256588578224182, "learning_rate": 0.0002, "epoch": 3.045751633986928, "step": 4660}, {"loss": 1.1431, "grad_norm": 0.6637389063835144, "learning_rate": 0.0002, "epoch": 3.052287581699346, "step": 4670}, {"loss": 1.1096, "grad_norm": 0.6980698108673096, "learning_rate": 0.0002, "epoch": 3.0588235294117645, "step": 4680}, {"loss": 1.196, "grad_norm": 0.8091534972190857, "learning_rate": 0.0002, "epoch": 3.065359477124183, "step": 4690}, {"loss": 1.1652, "grad_norm": 0.5715174078941345, "learning_rate": 0.0002, "epoch": 3.0718954248366015, "step": 4700}, {"loss": 1.1427, "grad_norm": 0.735639750957489, "learning_rate": 0.0002, "epoch": 3.0784313725490198, "step": 4710}, {"loss": 1.1522, "grad_norm": 0.7619708180427551, "learning_rate": 0.0002, "epoch": 3.084967320261438, "step": 4720}, {"loss": 1.0853, "grad_norm": 1.263566017150879, "learning_rate": 0.0002, "epoch": 3.0915032679738563, "step": 4730}, {"loss": 1.1348, "grad_norm": 0.6600871682167053, "learning_rate": 0.0002, "epoch": 3.0980392156862746, "step": 4740}, {"loss": 1.1766, "grad_norm": 0.717792809009552, "learning_rate": 0.0002, "epoch": 3.104575163398693, "step": 4750}, {"loss": 1.088, "grad_norm": 0.853714644908905, "learning_rate": 0.0002, "epoch": 3.111111111111111, "step": 4760}, {"loss": 1.2031, "grad_norm": 1.1004153490066528, "learning_rate": 0.0002, "epoch": 3.1176470588235294, "step": 4770}, {"loss": 1.3295, "grad_norm": 0.8566235899925232, "learning_rate": 0.0002, "epoch": 3.1241830065359477, "step": 4780}, {"loss": 1.2436, "grad_norm": 0.8315296173095703, "learning_rate": 0.0002, "epoch": 3.130718954248366, "step": 4790}, {"loss": 1.32, "grad_norm": 0.8020524978637695, "learning_rate": 0.0002, "epoch": 3.1372549019607843, "step": 4800}, {"loss": 1.1238, "grad_norm": 0.7564275860786438, "learning_rate": 0.0002, "epoch": 3.1437908496732025, "step": 4810}, {"loss": 1.1244, "grad_norm": 0.9077776670455933, "learning_rate": 0.0002, "epoch": 3.150326797385621, "step": 4820}, {"loss": 1.1399, "grad_norm": 0.6323099732398987, "learning_rate": 0.0002, "epoch": 3.156862745098039, "step": 4830}, {"loss": 1.1983, "grad_norm": 0.6625368595123291, "learning_rate": 0.0002, "epoch": 3.1633986928104574, "step": 4840}, {"loss": 1.066, "grad_norm": 0.8119261860847473, "learning_rate": 0.0002, "epoch": 3.1699346405228757, "step": 4850}, {"loss": 1.0224, "grad_norm": 0.6399450898170471, "learning_rate": 0.0002, "epoch": 3.176470588235294, "step": 4860}, {"loss": 1.2181, "grad_norm": 1.0659016370773315, "learning_rate": 0.0002, "epoch": 3.183006535947712, "step": 4870}, {"loss": 1.2914, "grad_norm": 0.8040369749069214, "learning_rate": 0.0002, "epoch": 3.189542483660131, "step": 4880}, {"loss": 1.1996, "grad_norm": 0.7784733176231384, "learning_rate": 0.0002, "epoch": 3.196078431372549, "step": 4890}, {"loss": 1.2051, "grad_norm": 0.9660294651985168, "learning_rate": 0.0002, "epoch": 3.2026143790849675, "step": 4900}, {"loss": 1.0419, "grad_norm": 1.0676977634429932, "learning_rate": 0.0002, "epoch": 3.2091503267973858, "step": 4910}, {"loss": 1.0083, "grad_norm": 0.5877565741539001, "learning_rate": 0.0002, "epoch": 3.215686274509804, "step": 4920}, {"loss": 1.1046, "grad_norm": 0.6164032816886902, "learning_rate": 0.0002, "epoch": 3.2222222222222223, "step": 4930}, {"loss": 1.1079, "grad_norm": 0.7627606987953186, "learning_rate": 0.0002, "epoch": 3.2287581699346406, "step": 4940}, {"loss": 1.2453, "grad_norm": 0.7442803978919983, "learning_rate": 0.0002, "epoch": 3.235294117647059, "step": 4950}, {"loss": 1.1087, "grad_norm": 0.7277812361717224, "learning_rate": 0.0002, "epoch": 3.241830065359477, "step": 4960}, {"loss": 1.2237, "grad_norm": 1.0301902294158936, "learning_rate": 0.0002, "epoch": 3.2483660130718954, "step": 4970}, {"loss": 1.1466, "grad_norm": 0.7798232436180115, "learning_rate": 0.0002, "epoch": 3.2549019607843137, "step": 4980}, {"loss": 1.2142, "grad_norm": 1.210265874862671, "learning_rate": 0.0002, "epoch": 3.261437908496732, "step": 4990}, {"loss": 1.1557, "grad_norm": 0.6677713990211487, "learning_rate": 0.0002, "epoch": 3.2679738562091503, "step": 5000}, {"loss": 1.3294, "grad_norm": 1.0524500608444214, "learning_rate": 0.0002, "epoch": 3.2745098039215685, "step": 5010}, {"loss": 1.1939, "grad_norm": 0.7091745734214783, "learning_rate": 0.0002, "epoch": 3.281045751633987, "step": 5020}, {"loss": 1.1891, "grad_norm": 0.8523224592208862, "learning_rate": 0.0002, "epoch": 3.287581699346405, "step": 5030}, {"loss": 1.1925, "grad_norm": 0.6120608448982239, "learning_rate": 0.0002, "epoch": 3.2941176470588234, "step": 5040}, {"loss": 1.0603, "grad_norm": 0.7437472939491272, "learning_rate": 0.0002, "epoch": 3.3006535947712417, "step": 5050}, {"loss": 1.1295, "grad_norm": 0.7611715197563171, "learning_rate": 0.0002, "epoch": 3.30718954248366, "step": 5060}, {"loss": 1.0531, "grad_norm": 0.7249704003334045, "learning_rate": 0.0002, "epoch": 3.313725490196078, "step": 5070}, {"loss": 1.2292, "grad_norm": 0.7316247820854187, "learning_rate": 0.0002, "epoch": 3.3202614379084965, "step": 5080}, {"loss": 1.1974, "grad_norm": 0.562412440776825, "learning_rate": 0.0002, "epoch": 3.326797385620915, "step": 5090}, {"loss": 1.0736, "grad_norm": 0.7052176594734192, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 5100}, {"loss": 1.122, "grad_norm": 0.7714211344718933, "learning_rate": 0.0002, "epoch": 3.3398692810457518, "step": 5110}, {"loss": 1.1684, "grad_norm": 1.0436055660247803, "learning_rate": 0.0002, "epoch": 3.34640522875817, "step": 5120}, {"loss": 1.0945, "grad_norm": 0.8867271542549133, "learning_rate": 0.0002, "epoch": 3.3529411764705883, "step": 5130}, {"loss": 1.159, "grad_norm": 0.8371267914772034, "learning_rate": 0.0002, "epoch": 3.3594771241830066, "step": 5140}, {"loss": 1.1073, "grad_norm": 0.7257837057113647, "learning_rate": 0.0002, "epoch": 3.366013071895425, "step": 5150}, {"loss": 1.1162, "grad_norm": 0.7102002501487732, "learning_rate": 0.0002, "epoch": 3.372549019607843, "step": 5160}, {"loss": 1.2056, "grad_norm": 0.7636350393295288, "learning_rate": 0.0002, "epoch": 3.3790849673202614, "step": 5170}, {"loss": 1.0708, "grad_norm": 0.6887359619140625, "learning_rate": 0.0002, "epoch": 3.3856209150326797, "step": 5180}, {"loss": 1.3807, "grad_norm": 0.8141424655914307, "learning_rate": 0.0002, "epoch": 3.392156862745098, "step": 5190}, {"loss": 1.1986, "grad_norm": 0.694423496723175, "learning_rate": 0.0002, "epoch": 3.3986928104575163, "step": 5200}, {"loss": 1.2945, "grad_norm": 0.914013683795929, "learning_rate": 0.0002, "epoch": 3.4052287581699345, "step": 5210}, {"loss": 1.1413, "grad_norm": 0.8503239750862122, "learning_rate": 0.0002, "epoch": 3.411764705882353, "step": 5220}, {"loss": 1.2696, "grad_norm": 0.6196836233139038, "learning_rate": 0.0002, "epoch": 3.418300653594771, "step": 5230}, {"loss": 1.2431, "grad_norm": 1.0760811567306519, "learning_rate": 0.0002, "epoch": 3.4248366013071894, "step": 5240}, {"loss": 1.1686, "grad_norm": 0.6524698138237, "learning_rate": 0.0002, "epoch": 3.431372549019608, "step": 5250}, {"loss": 1.2012, "grad_norm": 0.674467921257019, "learning_rate": 0.0002, "epoch": 3.4379084967320264, "step": 5260}, {"loss": 1.1015, "grad_norm": 0.7690372467041016, "learning_rate": 0.0002, "epoch": 3.4444444444444446, "step": 5270}, {"loss": 1.2511, "grad_norm": 0.8751813769340515, "learning_rate": 0.0002, "epoch": 3.450980392156863, "step": 5280}, {"loss": 1.1841, "grad_norm": 0.750407874584198, "learning_rate": 0.0002, "epoch": 3.457516339869281, "step": 5290}, {"loss": 1.0605, "grad_norm": 0.5991823077201843, "learning_rate": 0.0002, "epoch": 3.4640522875816995, "step": 5300}, {"loss": 1.2347, "grad_norm": 1.0164772272109985, "learning_rate": 0.0002, "epoch": 3.4705882352941178, "step": 5310}, {"loss": 1.2354, "grad_norm": 0.8704105019569397, "learning_rate": 0.0002, "epoch": 3.477124183006536, "step": 5320}, {"loss": 1.2169, "grad_norm": 0.709102213382721, "learning_rate": 0.0002, "epoch": 3.4836601307189543, "step": 5330}, {"loss": 1.2425, "grad_norm": 0.6273632049560547, "learning_rate": 0.0002, "epoch": 3.4901960784313726, "step": 5340}, {"loss": 1.1585, "grad_norm": 0.6807359457015991, "learning_rate": 0.0002, "epoch": 3.496732026143791, "step": 5350}, {"loss": 1.131, "grad_norm": 0.7085188627243042, "learning_rate": 0.0002, "epoch": 3.503267973856209, "step": 5360}, {"loss": 1.1159, "grad_norm": 0.6938307881355286, "learning_rate": 0.0002, "epoch": 3.5098039215686274, "step": 5370}, {"loss": 1.1397, "grad_norm": 0.8544146418571472, "learning_rate": 0.0002, "epoch": 3.5163398692810457, "step": 5380}, {"loss": 1.2181, "grad_norm": 0.7889642119407654, "learning_rate": 0.0002, "epoch": 3.522875816993464, "step": 5390}, {"loss": 1.1691, "grad_norm": 0.7858421206474304, "learning_rate": 0.0002, "epoch": 3.5294117647058822, "step": 5400}, {"loss": 1.2374, "grad_norm": 0.8547123074531555, "learning_rate": 0.0002, "epoch": 3.5359477124183005, "step": 5410}, {"loss": 1.196, "grad_norm": 0.8218181133270264, "learning_rate": 0.0002, "epoch": 3.542483660130719, "step": 5420}, {"loss": 1.1961, "grad_norm": 1.153623342514038, "learning_rate": 0.0002, "epoch": 3.549019607843137, "step": 5430}, {"loss": 1.156, "grad_norm": 1.1321099996566772, "learning_rate": 0.0002, "epoch": 3.5555555555555554, "step": 5440}, {"loss": 1.2224, "grad_norm": 0.9495334029197693, "learning_rate": 0.0002, "epoch": 3.5620915032679736, "step": 5450}, {"loss": 1.2869, "grad_norm": 0.8743821978569031, "learning_rate": 0.0002, "epoch": 3.568627450980392, "step": 5460}, {"loss": 1.1018, "grad_norm": 0.7513086795806885, "learning_rate": 0.0002, "epoch": 3.57516339869281, "step": 5470}, {"loss": 1.1082, "grad_norm": 1.0139480829238892, "learning_rate": 0.0002, "epoch": 3.581699346405229, "step": 5480}, {"loss": 1.1706, "grad_norm": 0.6615135073661804, "learning_rate": 0.0002, "epoch": 3.588235294117647, "step": 5490}, {"loss": 1.3906, "grad_norm": 1.180798888206482, "learning_rate": 0.0002, "epoch": 3.5947712418300655, "step": 5500}, {"loss": 1.2391, "grad_norm": 0.7085279226303101, "learning_rate": 0.0002, "epoch": 3.6013071895424837, "step": 5510}, {"loss": 1.1623, "grad_norm": 0.540268063545227, "learning_rate": 0.0002, "epoch": 3.607843137254902, "step": 5520}, {"loss": 1.2132, "grad_norm": 0.7905671000480652, "learning_rate": 0.0002, "epoch": 3.6143790849673203, "step": 5530}, {"loss": 1.2731, "grad_norm": 0.8457717299461365, "learning_rate": 0.0002, "epoch": 3.6209150326797386, "step": 5540}, {"loss": 1.1799, "grad_norm": 0.7102677822113037, "learning_rate": 0.0002, "epoch": 3.627450980392157, "step": 5550}, {"loss": 1.2394, "grad_norm": 0.7179514765739441, "learning_rate": 0.0002, "epoch": 3.633986928104575, "step": 5560}, {"loss": 1.2019, "grad_norm": 1.0854148864746094, "learning_rate": 0.0002, "epoch": 3.6405228758169934, "step": 5570}, {"loss": 1.1986, "grad_norm": 0.8209951519966125, "learning_rate": 0.0002, "epoch": 3.6470588235294117, "step": 5580}, {"loss": 1.2289, "grad_norm": 0.6944138407707214, "learning_rate": 0.0002, "epoch": 3.65359477124183, "step": 5590}, {"loss": 1.3226, "grad_norm": 0.7675473093986511, "learning_rate": 0.0002, "epoch": 3.6601307189542482, "step": 5600}, {"loss": 1.2866, "grad_norm": 0.6683364510536194, "learning_rate": 0.0002, "epoch": 3.6666666666666665, "step": 5610}, {"loss": 1.1099, "grad_norm": 0.7920727133750916, "learning_rate": 0.0002, "epoch": 3.6732026143790852, "step": 5620}, {"loss": 1.2287, "grad_norm": 0.9440218806266785, "learning_rate": 0.0002, "epoch": 3.6797385620915035, "step": 5630}, {"loss": 1.2444, "grad_norm": 0.6600824594497681, "learning_rate": 0.0002, "epoch": 3.686274509803922, "step": 5640}, {"loss": 1.191, "grad_norm": 0.6860619187355042, "learning_rate": 0.0002, "epoch": 3.69281045751634, "step": 5650}, {"loss": 1.1914, "grad_norm": 0.6579713225364685, "learning_rate": 0.0002, "epoch": 3.6993464052287583, "step": 5660}, {"loss": 1.1464, "grad_norm": 0.661081075668335, "learning_rate": 0.0002, "epoch": 3.7058823529411766, "step": 5670}, {"loss": 1.289, "grad_norm": 1.0968825817108154, "learning_rate": 0.0002, "epoch": 3.712418300653595, "step": 5680}, {"loss": 1.192, "grad_norm": 0.8066844940185547, "learning_rate": 0.0002, "epoch": 3.718954248366013, "step": 5690}, {"loss": 1.2322, "grad_norm": 0.8341682553291321, "learning_rate": 0.0002, "epoch": 3.7254901960784315, "step": 5700}, {"loss": 1.1473, "grad_norm": 0.6682852506637573, "learning_rate": 0.0002, "epoch": 3.7320261437908497, "step": 5710}, {"loss": 1.1566, "grad_norm": 0.898595929145813, "learning_rate": 0.0002, "epoch": 3.738562091503268, "step": 5720}, {"loss": 1.0919, "grad_norm": 0.6876054406166077, "learning_rate": 0.0002, "epoch": 3.7450980392156863, "step": 5730}, {"loss": 1.2302, "grad_norm": 0.7817103266716003, "learning_rate": 0.0002, "epoch": 3.7516339869281046, "step": 5740}, {"loss": 1.2439, "grad_norm": 0.5840168595314026, "learning_rate": 0.0002, "epoch": 3.758169934640523, "step": 5750}, {"loss": 1.1279, "grad_norm": 0.6263918876647949, "learning_rate": 0.0002, "epoch": 3.764705882352941, "step": 5760}, {"loss": 1.2023, "grad_norm": 0.7948952317237854, "learning_rate": 0.0002, "epoch": 3.7712418300653594, "step": 5770}, {"loss": 1.149, "grad_norm": 0.6700998544692993, "learning_rate": 0.0002, "epoch": 3.7777777777777777, "step": 5780}, {"loss": 1.3207, "grad_norm": 1.1169519424438477, "learning_rate": 0.0002, "epoch": 3.784313725490196, "step": 5790}, {"loss": 1.064, "grad_norm": 0.8354471325874329, "learning_rate": 0.0002, "epoch": 3.7908496732026142, "step": 5800}, {"loss": 1.2104, "grad_norm": 0.6304181814193726, "learning_rate": 0.0002, "epoch": 3.7973856209150325, "step": 5810}, {"loss": 1.2059, "grad_norm": 0.6919655799865723, "learning_rate": 0.0002, "epoch": 3.803921568627451, "step": 5820}, {"loss": 1.217, "grad_norm": 0.600385844707489, "learning_rate": 0.0002, "epoch": 3.810457516339869, "step": 5830}, {"loss": 1.2324, "grad_norm": 0.8406319618225098, "learning_rate": 0.0002, "epoch": 3.8169934640522873, "step": 5840}, {"loss": 1.2418, "grad_norm": 0.7594282031059265, "learning_rate": 0.0002, "epoch": 3.8235294117647056, "step": 5850}, {"loss": 1.1903, "grad_norm": 0.8179879784584045, "learning_rate": 0.0002, "epoch": 3.8300653594771243, "step": 5860}, {"loss": 1.255, "grad_norm": 1.141430377960205, "learning_rate": 0.0002, "epoch": 3.8366013071895426, "step": 5870}, {"loss": 1.1467, "grad_norm": 0.6595550775527954, "learning_rate": 0.0002, "epoch": 3.843137254901961, "step": 5880}, {"loss": 1.2378, "grad_norm": 0.7499435544013977, "learning_rate": 0.0002, "epoch": 3.849673202614379, "step": 5890}, {"loss": 1.217, "grad_norm": 0.7851517200469971, "learning_rate": 0.0002, "epoch": 3.8562091503267975, "step": 5900}, {"loss": 1.162, "grad_norm": 1.0533545017242432, "learning_rate": 0.0002, "epoch": 3.8627450980392157, "step": 5910}, {"loss": 1.3576, "grad_norm": 0.960086464881897, "learning_rate": 0.0002, "epoch": 3.869281045751634, "step": 5920}, {"loss": 1.151, "grad_norm": 0.9952049851417542, "learning_rate": 0.0002, "epoch": 3.8758169934640523, "step": 5930}, {"loss": 1.2027, "grad_norm": 0.7884191274642944, "learning_rate": 0.0002, "epoch": 3.8823529411764706, "step": 5940}, {"loss": 1.1796, "grad_norm": 0.7461766600608826, "learning_rate": 0.0002, "epoch": 3.888888888888889, "step": 5950}, {"loss": 1.2251, "grad_norm": 0.9594355821609497, "learning_rate": 0.0002, "epoch": 3.895424836601307, "step": 5960}, {"loss": 1.1164, "grad_norm": 0.8179471492767334, "learning_rate": 0.0002, "epoch": 3.9019607843137254, "step": 5970}, {"loss": 1.2421, "grad_norm": 0.8240267634391785, "learning_rate": 0.0002, "epoch": 3.9084967320261437, "step": 5980}, {"loss": 1.3076, "grad_norm": 0.7462618350982666, "learning_rate": 0.0002, "epoch": 3.915032679738562, "step": 5990}, {"loss": 1.2124, "grad_norm": 0.711207389831543, "learning_rate": 0.0002, "epoch": 3.9215686274509802, "step": 6000}, {"loss": 1.2119, "grad_norm": 0.6910956501960754, "learning_rate": 0.0002, "epoch": 3.928104575163399, "step": 6010}, {"loss": 1.2127, "grad_norm": 0.749093770980835, "learning_rate": 0.0002, "epoch": 3.9346405228758172, "step": 6020}, {"loss": 1.1542, "grad_norm": 1.3332762718200684, "learning_rate": 0.0002, "epoch": 3.9411764705882355, "step": 6030}, {"loss": 1.1442, "grad_norm": 0.71457439661026, "learning_rate": 0.0002, "epoch": 3.947712418300654, "step": 6040}, {"loss": 1.339, "grad_norm": 1.1205238103866577, "learning_rate": 0.0002, "epoch": 3.954248366013072, "step": 6050}, {"loss": 1.2962, "grad_norm": 0.6958928108215332, "learning_rate": 0.0002, "epoch": 3.9607843137254903, "step": 6060}, {"loss": 1.1802, "grad_norm": 0.7518056035041809, "learning_rate": 0.0002, "epoch": 3.9673202614379086, "step": 6070}, {"loss": 1.1179, "grad_norm": 0.8010755777359009, "learning_rate": 0.0002, "epoch": 3.973856209150327, "step": 6080}, {"loss": 1.2867, "grad_norm": 0.7492658495903015, "learning_rate": 0.0002, "epoch": 3.980392156862745, "step": 6090}, {"loss": 1.2113, "grad_norm": 0.900704562664032, "learning_rate": 0.0002, "epoch": 3.9869281045751634, "step": 6100}, {"loss": 1.1106, "grad_norm": 0.7997331619262695, "learning_rate": 0.0002, "epoch": 3.9934640522875817, "step": 6110}, {"loss": 1.1244, "grad_norm": 0.7163209319114685, "learning_rate": 0.0002, "epoch": 4.0, "step": 6120}, {"eval_loss": 1.4113320112228394, "eval_runtime": 33.7199, "eval_samples_per_second": 12.93, "eval_steps_per_second": 1.631, "epoch": 4.0, "step": 6120}, {"loss": 1.0423, "grad_norm": 0.9527022838592529, "learning_rate": 0.0002, "epoch": 4.006535947712418, "step": 6130}, {"loss": 1.101, "grad_norm": 0.7603210210800171, "learning_rate": 0.0002, "epoch": 4.0130718954248366, "step": 6140}, {"loss": 1.1834, "grad_norm": 1.127387523651123, "learning_rate": 0.0002, "epoch": 4.019607843137255, "step": 6150}, {"loss": 1.0734, "grad_norm": 0.8290133476257324, "learning_rate": 0.0002, "epoch": 4.026143790849673, "step": 6160}, {"loss": 1.0785, "grad_norm": 0.9912241101264954, "learning_rate": 0.0002, "epoch": 4.032679738562091, "step": 6170}, {"loss": 1.0719, "grad_norm": 0.947005033493042, "learning_rate": 0.0002, "epoch": 4.03921568627451, "step": 6180}, {"loss": 1.0835, "grad_norm": 0.707466185092926, "learning_rate": 0.0002, "epoch": 4.045751633986928, "step": 6190}, {"loss": 1.1079, "grad_norm": 1.0604327917099, "learning_rate": 0.0002, "epoch": 4.052287581699346, "step": 6200}, {"loss": 1.0375, "grad_norm": 0.7848685383796692, "learning_rate": 0.0002, "epoch": 4.0588235294117645, "step": 6210}, {"loss": 1.1167, "grad_norm": 0.8475256562232971, "learning_rate": 0.0002, "epoch": 4.065359477124183, "step": 6220}, {"loss": 1.1104, "grad_norm": 0.9759448766708374, "learning_rate": 0.0002, "epoch": 4.071895424836601, "step": 6230}, {"loss": 1.1538, "grad_norm": 0.9324519038200378, "learning_rate": 0.0002, "epoch": 4.078431372549019, "step": 6240}, {"loss": 1.0817, "grad_norm": 0.8723901510238647, "learning_rate": 0.0002, "epoch": 4.084967320261438, "step": 6250}, {"loss": 1.0977, "grad_norm": 0.8343415856361389, "learning_rate": 0.0002, "epoch": 4.091503267973856, "step": 6260}, {"loss": 0.9887, "grad_norm": 0.7490310072898865, "learning_rate": 0.0002, "epoch": 4.098039215686274, "step": 6270}, {"loss": 1.2084, "grad_norm": 0.8961182832717896, "learning_rate": 0.0002, "epoch": 4.104575163398692, "step": 6280}, {"loss": 1.1349, "grad_norm": 0.7124854922294617, "learning_rate": 0.0002, "epoch": 4.111111111111111, "step": 6290}, {"loss": 1.0081, "grad_norm": 0.8338138461112976, "learning_rate": 0.0002, "epoch": 4.117647058823529, "step": 6300}, {"loss": 1.1091, "grad_norm": 0.8075833320617676, "learning_rate": 0.0002, "epoch": 4.124183006535947, "step": 6310}, {"loss": 1.0193, "grad_norm": 0.8069391846656799, "learning_rate": 0.0002, "epoch": 4.130718954248366, "step": 6320}, {"loss": 0.948, "grad_norm": 0.9567893147468567, "learning_rate": 0.0002, "epoch": 4.137254901960785, "step": 6330}, {"loss": 1.0241, "grad_norm": 1.2184662818908691, "learning_rate": 0.0002, "epoch": 4.143790849673203, "step": 6340}, {"loss": 1.0756, "grad_norm": 1.030976414680481, "learning_rate": 0.0002, "epoch": 4.150326797385621, "step": 6350}, {"loss": 1.1124, "grad_norm": 0.9749957323074341, "learning_rate": 0.0002, "epoch": 4.1568627450980395, "step": 6360}, {"loss": 1.1038, "grad_norm": 0.7089483141899109, "learning_rate": 0.0002, "epoch": 4.163398692810458, "step": 6370}, {"loss": 1.2175, "grad_norm": 1.1084946393966675, "learning_rate": 0.0002, "epoch": 4.169934640522876, "step": 6380}, {"loss": 1.0274, "grad_norm": 0.7998497486114502, "learning_rate": 0.0002, "epoch": 4.176470588235294, "step": 6390}, {"loss": 1.005, "grad_norm": 0.8997811675071716, "learning_rate": 0.0002, "epoch": 4.183006535947713, "step": 6400}, {"loss": 1.0704, "grad_norm": 0.8359479904174805, "learning_rate": 0.0002, "epoch": 4.189542483660131, "step": 6410}, {"loss": 1.1056, "grad_norm": 0.9087472558021545, "learning_rate": 0.0002, "epoch": 4.196078431372549, "step": 6420}, {"loss": 1.0657, "grad_norm": 1.1100451946258545, "learning_rate": 0.0002, "epoch": 4.2026143790849675, "step": 6430}, {"loss": 1.1443, "grad_norm": 0.9376999735832214, "learning_rate": 0.0002, "epoch": 4.209150326797386, "step": 6440}, {"loss": 1.0862, "grad_norm": 0.8179266452789307, "learning_rate": 0.0002, "epoch": 4.215686274509804, "step": 6450}, {"loss": 1.0679, "grad_norm": 0.9953271746635437, "learning_rate": 0.0002, "epoch": 4.222222222222222, "step": 6460}, {"loss": 1.1034, "grad_norm": 0.8476650714874268, "learning_rate": 0.0002, "epoch": 4.228758169934641, "step": 6470}, {"loss": 1.2512, "grad_norm": 0.8406323194503784, "learning_rate": 0.0002, "epoch": 4.235294117647059, "step": 6480}, {"loss": 1.057, "grad_norm": 0.819134533405304, "learning_rate": 0.0002, "epoch": 4.241830065359477, "step": 6490}, {"loss": 1.1082, "grad_norm": 0.7764983773231506, "learning_rate": 0.0002, "epoch": 4.248366013071895, "step": 6500}, {"loss": 1.1593, "grad_norm": 0.8252112865447998, "learning_rate": 0.0002, "epoch": 4.254901960784314, "step": 6510}, {"loss": 1.1369, "grad_norm": 0.7941019535064697, "learning_rate": 0.0002, "epoch": 4.261437908496732, "step": 6520}, {"loss": 1.0296, "grad_norm": 0.7673905491828918, "learning_rate": 0.0002, "epoch": 4.26797385620915, "step": 6530}, {"loss": 1.1387, "grad_norm": 0.8749890327453613, "learning_rate": 0.0002, "epoch": 4.2745098039215685, "step": 6540}, {"loss": 1.0595, "grad_norm": 0.7343207597732544, "learning_rate": 0.0002, "epoch": 4.281045751633987, "step": 6550}, {"loss": 1.1715, "grad_norm": 1.2786651849746704, "learning_rate": 0.0002, "epoch": 4.287581699346405, "step": 6560}, {"loss": 1.0514, "grad_norm": 1.316875696182251, "learning_rate": 0.0002, "epoch": 4.294117647058823, "step": 6570}, {"loss": 1.1125, "grad_norm": 0.8349189162254333, "learning_rate": 0.0002, "epoch": 4.300653594771242, "step": 6580}, {"loss": 1.0732, "grad_norm": 0.7510647177696228, "learning_rate": 0.0002, "epoch": 4.30718954248366, "step": 6590}, {"loss": 1.1387, "grad_norm": 0.932420551776886, "learning_rate": 0.0002, "epoch": 4.313725490196078, "step": 6600}, {"loss": 1.1115, "grad_norm": 0.8510616421699524, "learning_rate": 0.0002, "epoch": 4.3202614379084965, "step": 6610}, {"loss": 1.0957, "grad_norm": 0.7661547064781189, "learning_rate": 0.0002, "epoch": 4.326797385620915, "step": 6620}, {"loss": 1.2064, "grad_norm": 1.0370930433273315, "learning_rate": 0.0002, "epoch": 4.333333333333333, "step": 6630}, {"loss": 1.1064, "grad_norm": 0.9302158951759338, "learning_rate": 0.0002, "epoch": 4.339869281045751, "step": 6640}, {"loss": 0.968, "grad_norm": 0.9203811883926392, "learning_rate": 0.0002, "epoch": 4.34640522875817, "step": 6650}, {"loss": 1.0123, "grad_norm": 0.9986332654953003, "learning_rate": 0.0002, "epoch": 4.352941176470588, "step": 6660}, {"loss": 1.1079, "grad_norm": 0.8001713156700134, "learning_rate": 0.0002, "epoch": 4.359477124183006, "step": 6670}, {"loss": 1.0248, "grad_norm": 0.829714298248291, "learning_rate": 0.0002, "epoch": 4.366013071895424, "step": 6680}, {"loss": 1.0389, "grad_norm": 0.8253079056739807, "learning_rate": 0.0002, "epoch": 4.372549019607844, "step": 6690}, {"loss": 1.1087, "grad_norm": 0.824666440486908, "learning_rate": 0.0002, "epoch": 4.379084967320262, "step": 6700}, {"loss": 1.1968, "grad_norm": 0.8872972130775452, "learning_rate": 0.0002, "epoch": 4.38562091503268, "step": 6710}, {"loss": 1.0474, "grad_norm": 0.8729761838912964, "learning_rate": 0.0002, "epoch": 4.392156862745098, "step": 6720}, {"loss": 1.0961, "grad_norm": 1.1367264986038208, "learning_rate": 0.0002, "epoch": 4.398692810457517, "step": 6730}, {"loss": 1.0184, "grad_norm": 0.9699058532714844, "learning_rate": 0.0002, "epoch": 4.405228758169935, "step": 6740}, {"loss": 1.006, "grad_norm": 0.8266763687133789, "learning_rate": 0.0002, "epoch": 4.411764705882353, "step": 6750}, {"loss": 1.0735, "grad_norm": 1.0249767303466797, "learning_rate": 0.0002, "epoch": 4.4183006535947715, "step": 6760}, {"loss": 1.1726, "grad_norm": 0.73606938123703, "learning_rate": 0.0002, "epoch": 4.42483660130719, "step": 6770}, {"loss": 1.1037, "grad_norm": 1.4050679206848145, "learning_rate": 0.0002, "epoch": 4.431372549019608, "step": 6780}, {"loss": 1.1418, "grad_norm": 1.1114081144332886, "learning_rate": 0.0002, "epoch": 4.437908496732026, "step": 6790}, {"loss": 0.9682, "grad_norm": 0.8031067848205566, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 6800}, {"loss": 1.0753, "grad_norm": 0.8513566851615906, "learning_rate": 0.0002, "epoch": 4.450980392156863, "step": 6810}, {"loss": 1.1852, "grad_norm": 1.332741379737854, "learning_rate": 0.0002, "epoch": 4.457516339869281, "step": 6820}, {"loss": 1.0966, "grad_norm": 1.5032578706741333, "learning_rate": 0.0002, "epoch": 4.4640522875816995, "step": 6830}, {"loss": 1.1124, "grad_norm": 0.7677283883094788, "learning_rate": 0.0002, "epoch": 4.470588235294118, "step": 6840}, {"loss": 1.1501, "grad_norm": 0.989148736000061, "learning_rate": 0.0002, "epoch": 4.477124183006536, "step": 6850}, {"loss": 1.2239, "grad_norm": 1.5316275358200073, "learning_rate": 0.0002, "epoch": 4.483660130718954, "step": 6860}, {"loss": 1.1171, "grad_norm": 0.9427124261856079, "learning_rate": 0.0002, "epoch": 4.490196078431373, "step": 6870}, {"loss": 1.1314, "grad_norm": 1.215287685394287, "learning_rate": 0.0002, "epoch": 4.496732026143791, "step": 6880}, {"loss": 1.0809, "grad_norm": 0.7286760210990906, "learning_rate": 0.0002, "epoch": 4.503267973856209, "step": 6890}, {"loss": 1.0179, "grad_norm": 0.874829888343811, "learning_rate": 0.0002, "epoch": 4.509803921568627, "step": 6900}, {"loss": 1.0233, "grad_norm": 0.8058359622955322, "learning_rate": 0.0002, "epoch": 4.516339869281046, "step": 6910}, {"loss": 1.0463, "grad_norm": 1.248195767402649, "learning_rate": 0.0002, "epoch": 4.522875816993464, "step": 6920}, {"loss": 1.0347, "grad_norm": 0.8033645749092102, "learning_rate": 0.0002, "epoch": 4.529411764705882, "step": 6930}, {"loss": 1.1068, "grad_norm": 1.7361950874328613, "learning_rate": 0.0002, "epoch": 4.5359477124183005, "step": 6940}, {"loss": 0.9856, "grad_norm": 0.8058095574378967, "learning_rate": 0.0002, "epoch": 4.542483660130719, "step": 6950}, {"loss": 1.0057, "grad_norm": 1.254089593887329, "learning_rate": 0.0002, "epoch": 4.549019607843137, "step": 6960}, {"loss": 1.1723, "grad_norm": 0.9180455803871155, "learning_rate": 0.0002, "epoch": 4.555555555555555, "step": 6970}, {"loss": 1.0559, "grad_norm": 0.6677682399749756, "learning_rate": 0.0002, "epoch": 4.562091503267974, "step": 6980}, {"loss": 1.0453, "grad_norm": 0.8127354383468628, "learning_rate": 0.0002, "epoch": 4.568627450980392, "step": 6990}, {"loss": 1.0828, "grad_norm": 1.0263001918792725, "learning_rate": 0.0002, "epoch": 4.57516339869281, "step": 7000}, {"loss": 1.0703, "grad_norm": 0.9641909003257751, "learning_rate": 0.0002, "epoch": 4.5816993464052285, "step": 7010}, {"loss": 1.179, "grad_norm": 0.9440861344337463, "learning_rate": 0.0002, "epoch": 4.588235294117647, "step": 7020}, {"loss": 1.0931, "grad_norm": 0.9539011716842651, "learning_rate": 0.0002, "epoch": 4.594771241830065, "step": 7030}, {"loss": 1.0963, "grad_norm": 1.0449910163879395, "learning_rate": 0.0002, "epoch": 4.601307189542483, "step": 7040}, {"loss": 0.9944, "grad_norm": 0.8766893744468689, "learning_rate": 0.0002, "epoch": 4.607843137254902, "step": 7050}, {"loss": 1.0169, "grad_norm": 0.6983462572097778, "learning_rate": 0.0002, "epoch": 4.61437908496732, "step": 7060}, {"loss": 1.1778, "grad_norm": 0.9505505561828613, "learning_rate": 0.0002, "epoch": 4.620915032679738, "step": 7070}, {"loss": 1.121, "grad_norm": 1.2506657838821411, "learning_rate": 0.0002, "epoch": 4.627450980392156, "step": 7080}, {"loss": 1.1329, "grad_norm": 0.9602801203727722, "learning_rate": 0.0002, "epoch": 4.633986928104575, "step": 7090}, {"loss": 1.1499, "grad_norm": 0.7398977875709534, "learning_rate": 0.0002, "epoch": 4.640522875816993, "step": 7100}, {"loss": 1.0769, "grad_norm": 1.3862425088882446, "learning_rate": 0.0002, "epoch": 4.647058823529412, "step": 7110}, {"loss": 1.0571, "grad_norm": 1.1451990604400635, "learning_rate": 0.0002, "epoch": 4.65359477124183, "step": 7120}, {"loss": 1.1271, "grad_norm": 0.9010422229766846, "learning_rate": 0.0002, "epoch": 4.660130718954249, "step": 7130}, {"loss": 1.0165, "grad_norm": 0.7102518081665039, "learning_rate": 0.0002, "epoch": 4.666666666666667, "step": 7140}, {"loss": 1.0819, "grad_norm": 0.7963796257972717, "learning_rate": 0.0002, "epoch": 4.673202614379085, "step": 7150}, {"loss": 1.1114, "grad_norm": 0.7726007699966431, "learning_rate": 0.0002, "epoch": 4.6797385620915035, "step": 7160}, {"loss": 1.2088, "grad_norm": 0.8097564578056335, "learning_rate": 0.0002, "epoch": 4.686274509803922, "step": 7170}, {"loss": 1.1386, "grad_norm": 0.9070925116539001, "learning_rate": 0.0002, "epoch": 4.69281045751634, "step": 7180}, {"loss": 1.0315, "grad_norm": 0.7543528079986572, "learning_rate": 0.0002, "epoch": 4.699346405228758, "step": 7190}, {"loss": 1.0984, "grad_norm": 0.9900904893875122, "learning_rate": 0.0002, "epoch": 4.705882352941177, "step": 7200}, {"loss": 1.1552, "grad_norm": 0.8033412098884583, "learning_rate": 0.0002, "epoch": 4.712418300653595, "step": 7210}, {"loss": 1.1773, "grad_norm": 0.8440839052200317, "learning_rate": 0.0002, "epoch": 4.718954248366013, "step": 7220}, {"loss": 1.1258, "grad_norm": 0.9325555562973022, "learning_rate": 0.0002, "epoch": 4.7254901960784315, "step": 7230}, {"loss": 1.1384, "grad_norm": 0.7881146669387817, "learning_rate": 0.0002, "epoch": 4.73202614379085, "step": 7240}, {"loss": 1.1219, "grad_norm": 0.884453296661377, "learning_rate": 0.0002, "epoch": 4.738562091503268, "step": 7250}, {"loss": 1.1036, "grad_norm": 0.9274539351463318, "learning_rate": 0.0002, "epoch": 4.745098039215686, "step": 7260}, {"loss": 1.0906, "grad_norm": 1.2367479801177979, "learning_rate": 0.0002, "epoch": 4.751633986928105, "step": 7270}, {"loss": 1.0741, "grad_norm": 0.9499821066856384, "learning_rate": 0.0002, "epoch": 4.758169934640523, "step": 7280}, {"loss": 1.1625, "grad_norm": 2.1918580532073975, "learning_rate": 0.0002, "epoch": 4.764705882352941, "step": 7290}, {"loss": 0.954, "grad_norm": 0.8221880793571472, "learning_rate": 0.0002, "epoch": 4.771241830065359, "step": 7300}, {"loss": 1.1358, "grad_norm": 0.871972918510437, "learning_rate": 0.0002, "epoch": 4.777777777777778, "step": 7310}, {"loss": 1.0599, "grad_norm": 0.8034510612487793, "learning_rate": 0.0002, "epoch": 4.784313725490196, "step": 7320}, {"loss": 1.1059, "grad_norm": 0.8959605693817139, "learning_rate": 0.0002, "epoch": 4.790849673202614, "step": 7330}, {"loss": 1.0176, "grad_norm": 1.2326215505599976, "learning_rate": 0.0002, "epoch": 4.7973856209150325, "step": 7340}, {"loss": 1.1095, "grad_norm": 0.9725791811943054, "learning_rate": 0.0002, "epoch": 4.803921568627451, "step": 7350}, {"loss": 1.1229, "grad_norm": 0.7240816354751587, "learning_rate": 0.0002, "epoch": 4.810457516339869, "step": 7360}, {"loss": 1.0669, "grad_norm": 0.8265769481658936, "learning_rate": 0.0002, "epoch": 4.816993464052287, "step": 7370}, {"loss": 1.042, "grad_norm": 0.8888696432113647, "learning_rate": 0.0002, "epoch": 4.823529411764706, "step": 7380}, {"loss": 1.0981, "grad_norm": 0.7776556015014648, "learning_rate": 0.0002, "epoch": 4.830065359477124, "step": 7390}, {"loss": 1.0819, "grad_norm": 0.8772371411323547, "learning_rate": 0.0002, "epoch": 4.836601307189542, "step": 7400}, {"loss": 1.0819, "grad_norm": 0.9786531925201416, "learning_rate": 0.0002, "epoch": 4.8431372549019605, "step": 7410}, {"loss": 1.1358, "grad_norm": 0.9059745073318481, "learning_rate": 0.0002, "epoch": 4.849673202614379, "step": 7420}, {"loss": 1.0324, "grad_norm": 0.7422552108764648, "learning_rate": 0.0002, "epoch": 4.856209150326797, "step": 7430}, {"loss": 1.0423, "grad_norm": 1.3040380477905273, "learning_rate": 0.0002, "epoch": 4.862745098039216, "step": 7440}, {"loss": 1.1161, "grad_norm": 1.3278473615646362, "learning_rate": 0.0002, "epoch": 4.8692810457516345, "step": 7450}, {"loss": 1.0713, "grad_norm": 1.2705849409103394, "learning_rate": 0.0002, "epoch": 4.875816993464053, "step": 7460}, {"loss": 1.0034, "grad_norm": 0.8837892413139343, "learning_rate": 0.0002, "epoch": 4.882352941176471, "step": 7470}, {"loss": 1.1716, "grad_norm": 0.8670691251754761, "learning_rate": 0.0002, "epoch": 4.888888888888889, "step": 7480}, {"loss": 1.1723, "grad_norm": 0.9662758111953735, "learning_rate": 0.0002, "epoch": 4.895424836601308, "step": 7490}, {"loss": 1.1056, "grad_norm": 0.8188302516937256, "learning_rate": 0.0002, "epoch": 4.901960784313726, "step": 7500}, {"loss": 1.0419, "grad_norm": 0.769442617893219, "learning_rate": 0.0002, "epoch": 4.908496732026144, "step": 7510}, {"loss": 1.1671, "grad_norm": 1.1465084552764893, "learning_rate": 0.0002, "epoch": 4.915032679738562, "step": 7520}, {"loss": 1.0768, "grad_norm": 1.253214955329895, "learning_rate": 0.0002, "epoch": 4.921568627450981, "step": 7530}, {"loss": 1.011, "grad_norm": 0.7922375202178955, "learning_rate": 0.0002, "epoch": 4.928104575163399, "step": 7540}, {"loss": 1.1256, "grad_norm": 0.8306851387023926, "learning_rate": 0.0002, "epoch": 4.934640522875817, "step": 7550}, {"loss": 1.206, "grad_norm": 0.8486151099205017, "learning_rate": 0.0002, "epoch": 4.9411764705882355, "step": 7560}, {"loss": 1.0161, "grad_norm": 1.2601467370986938, "learning_rate": 0.0002, "epoch": 4.947712418300654, "step": 7570}, {"loss": 1.1078, "grad_norm": 0.7980747818946838, "learning_rate": 0.0002, "epoch": 4.954248366013072, "step": 7580}, {"loss": 1.0607, "grad_norm": 0.8653254508972168, "learning_rate": 0.0002, "epoch": 4.96078431372549, "step": 7590}, {"loss": 1.0292, "grad_norm": 0.9680571556091309, "learning_rate": 0.0002, "epoch": 4.967320261437909, "step": 7600}, {"loss": 1.1795, "grad_norm": 0.9554466605186462, "learning_rate": 0.0002, "epoch": 4.973856209150327, "step": 7610}, {"loss": 1.0935, "grad_norm": 1.3693897724151611, "learning_rate": 0.0002, "epoch": 4.980392156862745, "step": 7620}, {"loss": 1.0838, "grad_norm": 0.7809282541275024, "learning_rate": 0.0002, "epoch": 4.9869281045751634, "step": 7630}, {"loss": 1.0844, "grad_norm": 0.7528006434440613, "learning_rate": 0.0002, "epoch": 4.993464052287582, "step": 7640}, {"loss": 0.9951, "grad_norm": 1.7491309642791748, "learning_rate": 0.0002, "epoch": 5.0, "step": 7650}, {"eval_loss": 1.4197258949279785, "eval_runtime": 33.6327, "eval_samples_per_second": 12.964, "eval_steps_per_second": 1.635, "epoch": 5.0, "step": 7650}, {"loss": 0.9744, "grad_norm": 0.8840063214302063, "learning_rate": 0.0002, "epoch": 5.006535947712418, "step": 7660}, {"loss": 1.0274, "grad_norm": 1.0118401050567627, "learning_rate": 0.0002, "epoch": 5.0130718954248366, "step": 7670}, {"loss": 1.1667, "grad_norm": 1.0040518045425415, "learning_rate": 0.0002, "epoch": 5.019607843137255, "step": 7680}, {"loss": 0.9426, "grad_norm": 0.7541199922561646, "learning_rate": 0.0002, "epoch": 5.026143790849673, "step": 7690}, {"loss": 1.0797, "grad_norm": 0.9106482863426208, "learning_rate": 0.0002, "epoch": 5.032679738562091, "step": 7700}, {"loss": 1.0096, "grad_norm": 1.3691469430923462, "learning_rate": 0.0002, "epoch": 5.03921568627451, "step": 7710}, {"loss": 0.9889, "grad_norm": 0.9449689388275146, "learning_rate": 0.0002, "epoch": 5.045751633986928, "step": 7720}, {"loss": 0.9087, "grad_norm": 1.1678508520126343, "learning_rate": 0.0002, "epoch": 5.052287581699346, "step": 7730}, {"loss": 1.0556, "grad_norm": 1.1296145915985107, "learning_rate": 0.0002, "epoch": 5.0588235294117645, "step": 7740}, {"loss": 0.9339, "grad_norm": 0.7863904237747192, "learning_rate": 0.0002, "epoch": 5.065359477124183, "step": 7750}, {"loss": 1.0135, "grad_norm": 0.8691433072090149, "learning_rate": 0.0002, "epoch": 5.071895424836601, "step": 7760}, {"loss": 0.9776, "grad_norm": 1.0722088813781738, "learning_rate": 0.0002, "epoch": 5.078431372549019, "step": 7770}, {"loss": 1.0595, "grad_norm": 0.9625038504600525, "learning_rate": 0.0002, "epoch": 5.084967320261438, "step": 7780}, {"loss": 1.0241, "grad_norm": 1.2618783712387085, "learning_rate": 0.0002, "epoch": 5.091503267973856, "step": 7790}, {"loss": 0.9396, "grad_norm": 0.9970650672912598, "learning_rate": 0.0002, "epoch": 5.098039215686274, "step": 7800}, {"loss": 0.9186, "grad_norm": 1.3946677446365356, "learning_rate": 0.0002, "epoch": 5.104575163398692, "step": 7810}, {"loss": 0.9957, "grad_norm": 1.0260052680969238, "learning_rate": 0.0002, "epoch": 5.111111111111111, "step": 7820}, {"loss": 0.9865, "grad_norm": 1.105521559715271, "learning_rate": 0.0002, "epoch": 5.117647058823529, "step": 7830}, {"loss": 0.9788, "grad_norm": 1.003641128540039, "learning_rate": 0.0002, "epoch": 5.124183006535947, "step": 7840}, {"loss": 0.9688, "grad_norm": 1.0315021276474, "learning_rate": 0.0002, "epoch": 5.130718954248366, "step": 7850}, {"loss": 1.0001, "grad_norm": 0.9469530582427979, "learning_rate": 0.0002, "epoch": 5.137254901960785, "step": 7860}, {"loss": 0.9659, "grad_norm": 1.3244667053222656, "learning_rate": 0.0002, "epoch": 5.143790849673203, "step": 7870}, {"loss": 0.9657, "grad_norm": 1.1732033491134644, "learning_rate": 0.0002, "epoch": 5.150326797385621, "step": 7880}, {"loss": 0.9978, "grad_norm": 1.3129149675369263, "learning_rate": 0.0002, "epoch": 5.1568627450980395, "step": 7890}, {"loss": 0.9894, "grad_norm": 0.8589454293251038, "learning_rate": 0.0002, "epoch": 5.163398692810458, "step": 7900}, {"loss": 1.0161, "grad_norm": 0.8954233527183533, "learning_rate": 0.0002, "epoch": 5.169934640522876, "step": 7910}, {"loss": 0.8741, "grad_norm": 0.7426522970199585, "learning_rate": 0.0002, "epoch": 5.176470588235294, "step": 7920}, {"loss": 1.0106, "grad_norm": 1.1990121603012085, "learning_rate": 0.0002, "epoch": 5.183006535947713, "step": 7930}, {"loss": 0.9453, "grad_norm": 0.8867580890655518, "learning_rate": 0.0002, "epoch": 5.189542483660131, "step": 7940}, {"loss": 0.9727, "grad_norm": 1.016276478767395, "learning_rate": 0.0002, "epoch": 5.196078431372549, "step": 7950}, {"loss": 0.9908, "grad_norm": 1.0210685729980469, "learning_rate": 0.0002, "epoch": 5.2026143790849675, "step": 7960}, {"loss": 1.0522, "grad_norm": 1.0093122720718384, "learning_rate": 0.0002, "epoch": 5.209150326797386, "step": 7970}, {"loss": 1.0055, "grad_norm": 0.9746801853179932, "learning_rate": 0.0002, "epoch": 5.215686274509804, "step": 7980}, {"loss": 1.0611, "grad_norm": 0.9113537073135376, "learning_rate": 0.0002, "epoch": 5.222222222222222, "step": 7990}, {"loss": 0.9167, "grad_norm": 1.2782206535339355, "learning_rate": 0.0002, "epoch": 5.228758169934641, "step": 8000}, {"loss": 1.0212, "grad_norm": 1.3223118782043457, "learning_rate": 0.0002, "epoch": 5.235294117647059, "step": 8010}, {"loss": 0.9244, "grad_norm": 0.7898629307746887, "learning_rate": 0.0002, "epoch": 5.241830065359477, "step": 8020}, {"loss": 1.0574, "grad_norm": 0.9822350740432739, "learning_rate": 0.0002, "epoch": 5.248366013071895, "step": 8030}, {"loss": 1.0102, "grad_norm": 1.5114340782165527, "learning_rate": 0.0002, "epoch": 5.254901960784314, "step": 8040}, {"loss": 0.9816, "grad_norm": 0.859006941318512, "learning_rate": 0.0002, "epoch": 5.261437908496732, "step": 8050}, {"loss": 0.9445, "grad_norm": 1.0495043992996216, "learning_rate": 0.0002, "epoch": 5.26797385620915, "step": 8060}, {"loss": 0.9724, "grad_norm": 1.329483151435852, "learning_rate": 0.0002, "epoch": 5.2745098039215685, "step": 8070}, {"loss": 0.9296, "grad_norm": 1.1333061456680298, "learning_rate": 0.0002, "epoch": 5.281045751633987, "step": 8080}, {"loss": 0.9577, "grad_norm": 0.8153108358383179, "learning_rate": 0.0002, "epoch": 5.287581699346405, "step": 8090}, {"loss": 0.9002, "grad_norm": 0.9395004510879517, "learning_rate": 0.0002, "epoch": 5.294117647058823, "step": 8100}, {"loss": 1.0371, "grad_norm": 0.8907593488693237, "learning_rate": 0.0002, "epoch": 5.300653594771242, "step": 8110}, {"loss": 0.9301, "grad_norm": 0.9808667898178101, "learning_rate": 0.0002, "epoch": 5.30718954248366, "step": 8120}, {"loss": 1.0136, "grad_norm": 0.984779417514801, "learning_rate": 0.0002, "epoch": 5.313725490196078, "step": 8130}, {"loss": 0.9621, "grad_norm": 0.9787270426750183, "learning_rate": 0.0002, "epoch": 5.3202614379084965, "step": 8140}, {"loss": 0.9336, "grad_norm": 0.9857710599899292, "learning_rate": 0.0002, "epoch": 5.326797385620915, "step": 8150}, {"loss": 0.9884, "grad_norm": 0.9774303436279297, "learning_rate": 0.0002, "epoch": 5.333333333333333, "step": 8160}, {"loss": 1.0561, "grad_norm": 0.677925169467926, "learning_rate": 0.0002, "epoch": 5.339869281045751, "step": 8170}, {"loss": 1.1345, "grad_norm": 0.9576456546783447, "learning_rate": 0.0002, "epoch": 5.34640522875817, "step": 8180}, {"loss": 0.9554, "grad_norm": 1.8970937728881836, "learning_rate": 0.0002, "epoch": 5.352941176470588, "step": 8190}, {"loss": 1.0474, "grad_norm": 0.9458389282226562, "learning_rate": 0.0002, "epoch": 5.359477124183006, "step": 8200}, {"loss": 1.0365, "grad_norm": 1.761794924736023, "learning_rate": 0.0002, "epoch": 5.366013071895424, "step": 8210}, {"loss": 0.9426, "grad_norm": 1.0693724155426025, "learning_rate": 0.0002, "epoch": 5.372549019607844, "step": 8220}, {"loss": 1.0299, "grad_norm": 0.9025877714157104, "learning_rate": 0.0002, "epoch": 5.379084967320262, "step": 8230}, {"loss": 0.9652, "grad_norm": 1.258857250213623, "learning_rate": 0.0002, "epoch": 5.38562091503268, "step": 8240}, {"loss": 0.9735, "grad_norm": 1.084849238395691, "learning_rate": 0.0002, "epoch": 5.392156862745098, "step": 8250}, {"loss": 0.9999, "grad_norm": 0.9530340433120728, "learning_rate": 0.0002, "epoch": 5.398692810457517, "step": 8260}, {"loss": 1.0268, "grad_norm": 0.830240786075592, "learning_rate": 0.0002, "epoch": 5.405228758169935, "step": 8270}, {"loss": 1.0332, "grad_norm": 1.5807015895843506, "learning_rate": 0.0002, "epoch": 5.411764705882353, "step": 8280}, {"loss": 0.9146, "grad_norm": 0.9486905336380005, "learning_rate": 0.0002, "epoch": 5.4183006535947715, "step": 8290}, {"loss": 1.0336, "grad_norm": 1.0415093898773193, "learning_rate": 0.0002, "epoch": 5.42483660130719, "step": 8300}, {"loss": 0.8933, "grad_norm": 1.0501102209091187, "learning_rate": 0.0002, "epoch": 5.431372549019608, "step": 8310}, {"loss": 0.9983, "grad_norm": 0.9751836061477661, "learning_rate": 0.0002, "epoch": 5.437908496732026, "step": 8320}, {"loss": 1.0755, "grad_norm": 1.5529173612594604, "learning_rate": 0.0002, "epoch": 5.444444444444445, "step": 8330}, {"loss": 0.9814, "grad_norm": 0.8314350247383118, "learning_rate": 0.0002, "epoch": 5.450980392156863, "step": 8340}, {"loss": 1.0596, "grad_norm": 1.2555103302001953, "learning_rate": 0.0002, "epoch": 5.457516339869281, "step": 8350}, {"loss": 1.0127, "grad_norm": 0.9408367872238159, "learning_rate": 0.0002, "epoch": 5.4640522875816995, "step": 8360}, {"loss": 0.9241, "grad_norm": 0.9483312964439392, "learning_rate": 0.0002, "epoch": 5.470588235294118, "step": 8370}, {"loss": 0.9678, "grad_norm": 0.957905650138855, "learning_rate": 0.0002, "epoch": 5.477124183006536, "step": 8380}, {"loss": 1.0985, "grad_norm": 1.4000147581100464, "learning_rate": 0.0002, "epoch": 5.483660130718954, "step": 8390}, {"loss": 0.9966, "grad_norm": 1.7032461166381836, "learning_rate": 0.0002, "epoch": 5.490196078431373, "step": 8400}, {"loss": 0.9539, "grad_norm": 0.8978716731071472, "learning_rate": 0.0002, "epoch": 5.496732026143791, "step": 8410}, {"loss": 0.9544, "grad_norm": 0.8659300804138184, "learning_rate": 0.0002, "epoch": 5.503267973856209, "step": 8420}, {"loss": 1.0526, "grad_norm": 1.3629727363586426, "learning_rate": 0.0002, "epoch": 5.509803921568627, "step": 8430}, {"loss": 0.9696, "grad_norm": 1.2741984128952026, "learning_rate": 0.0002, "epoch": 5.516339869281046, "step": 8440}, {"loss": 1.0191, "grad_norm": 1.3867180347442627, "learning_rate": 0.0002, "epoch": 5.522875816993464, "step": 8450}, {"loss": 1.0835, "grad_norm": 1.0662001371383667, "learning_rate": 0.0002, "epoch": 5.529411764705882, "step": 8460}, {"loss": 0.9779, "grad_norm": 1.7005380392074585, "learning_rate": 0.0002, "epoch": 5.5359477124183005, "step": 8470}, {"loss": 1.0221, "grad_norm": 1.3730385303497314, "learning_rate": 0.0002, "epoch": 5.542483660130719, "step": 8480}, {"loss": 0.9586, "grad_norm": 1.7737441062927246, "learning_rate": 0.0002, "epoch": 5.549019607843137, "step": 8490}, {"loss": 0.9729, "grad_norm": 0.907487690448761, "learning_rate": 0.0002, "epoch": 5.555555555555555, "step": 8500}, {"loss": 0.9891, "grad_norm": 0.8882441520690918, "learning_rate": 0.0002, "epoch": 5.562091503267974, "step": 8510}, {"loss": 0.973, "grad_norm": 0.8655388951301575, "learning_rate": 0.0002, "epoch": 5.568627450980392, "step": 8520}, {"loss": 0.9523, "grad_norm": 1.379992961883545, "learning_rate": 0.0002, "epoch": 5.57516339869281, "step": 8530}, {"loss": 1.0174, "grad_norm": 1.0021201372146606, "learning_rate": 0.0002, "epoch": 5.5816993464052285, "step": 8540}, {"loss": 1.0113, "grad_norm": 1.2636926174163818, "learning_rate": 0.0002, "epoch": 5.588235294117647, "step": 8550}, {"loss": 1.0243, "grad_norm": 1.279025912284851, "learning_rate": 0.0002, "epoch": 5.594771241830065, "step": 8560}, {"loss": 0.9917, "grad_norm": 0.8885834217071533, "learning_rate": 0.0002, "epoch": 5.601307189542483, "step": 8570}, {"loss": 0.9849, "grad_norm": 1.1975032091140747, "learning_rate": 0.0002, "epoch": 5.607843137254902, "step": 8580}, {"loss": 1.0363, "grad_norm": 1.005470871925354, "learning_rate": 0.0002, "epoch": 5.61437908496732, "step": 8590}, {"loss": 0.9947, "grad_norm": 1.104286551475525, "learning_rate": 0.0002, "epoch": 5.620915032679738, "step": 8600}, {"loss": 1.0585, "grad_norm": 1.435445785522461, "learning_rate": 0.0002, "epoch": 5.627450980392156, "step": 8610}, {"loss": 0.9156, "grad_norm": 1.0270172357559204, "learning_rate": 0.0002, "epoch": 5.633986928104575, "step": 8620}, {"loss": 1.0522, "grad_norm": 1.0929527282714844, "learning_rate": 0.0002, "epoch": 5.640522875816993, "step": 8630}, {"loss": 0.9694, "grad_norm": 1.1061221361160278, "learning_rate": 0.0002, "epoch": 5.647058823529412, "step": 8640}, {"loss": 1.0826, "grad_norm": 0.9563149213790894, "learning_rate": 0.0002, "epoch": 5.65359477124183, "step": 8650}, {"loss": 1.0042, "grad_norm": 1.0434954166412354, "learning_rate": 0.0002, "epoch": 5.660130718954249, "step": 8660}, {"loss": 0.9463, "grad_norm": 1.3695117235183716, "learning_rate": 0.0002, "epoch": 5.666666666666667, "step": 8670}, {"loss": 0.9441, "grad_norm": 1.0540564060211182, "learning_rate": 0.0002, "epoch": 5.673202614379085, "step": 8680}, {"loss": 0.9755, "grad_norm": 1.5942492485046387, "learning_rate": 0.0002, "epoch": 5.6797385620915035, "step": 8690}, {"loss": 1.0071, "grad_norm": 0.9485495090484619, "learning_rate": 0.0002, "epoch": 5.686274509803922, "step": 8700}, {"loss": 0.9998, "grad_norm": 1.1483162641525269, "learning_rate": 0.0002, "epoch": 5.69281045751634, "step": 8710}, {"loss": 0.9578, "grad_norm": 0.9075471758842468, "learning_rate": 0.0002, "epoch": 5.699346405228758, "step": 8720}, {"loss": 0.9488, "grad_norm": 1.7908551692962646, "learning_rate": 0.0002, "epoch": 5.705882352941177, "step": 8730}, {"loss": 1.0163, "grad_norm": 0.8867162466049194, "learning_rate": 0.0002, "epoch": 5.712418300653595, "step": 8740}, {"loss": 1.0041, "grad_norm": 1.7165148258209229, "learning_rate": 0.0002, "epoch": 5.718954248366013, "step": 8750}, {"loss": 1.1061, "grad_norm": 0.9529356956481934, "learning_rate": 0.0002, "epoch": 5.7254901960784315, "step": 8760}, {"loss": 1.1119, "grad_norm": 1.01852548122406, "learning_rate": 0.0002, "epoch": 5.73202614379085, "step": 8770}, {"loss": 1.0471, "grad_norm": 0.9538423418998718, "learning_rate": 0.0002, "epoch": 5.738562091503268, "step": 8780}, {"loss": 1.0913, "grad_norm": 0.9007737636566162, "learning_rate": 0.0002, "epoch": 5.745098039215686, "step": 8790}, {"loss": 0.9766, "grad_norm": 0.9107874035835266, "learning_rate": 0.0002, "epoch": 5.751633986928105, "step": 8800}, {"loss": 0.9212, "grad_norm": 0.7379238605499268, "learning_rate": 0.0002, "epoch": 5.758169934640523, "step": 8810}, {"loss": 1.0966, "grad_norm": 1.072645902633667, "learning_rate": 0.0002, "epoch": 5.764705882352941, "step": 8820}, {"loss": 1.0845, "grad_norm": 1.002008080482483, "learning_rate": 0.0002, "epoch": 5.771241830065359, "step": 8830}, {"loss": 0.9978, "grad_norm": 1.0435924530029297, "learning_rate": 0.0002, "epoch": 5.777777777777778, "step": 8840}, {"loss": 0.9458, "grad_norm": 0.9874551296234131, "learning_rate": 0.0002, "epoch": 5.784313725490196, "step": 8850}, {"loss": 1.1241, "grad_norm": 1.1729662418365479, "learning_rate": 0.0002, "epoch": 5.790849673202614, "step": 8860}, {"loss": 1.0451, "grad_norm": 1.3300775289535522, "learning_rate": 0.0002, "epoch": 5.7973856209150325, "step": 8870}, {"loss": 1.0989, "grad_norm": 1.612707257270813, "learning_rate": 0.0002, "epoch": 5.803921568627451, "step": 8880}, {"loss": 0.9119, "grad_norm": 0.9047797322273254, "learning_rate": 0.0002, "epoch": 5.810457516339869, "step": 8890}, {"loss": 0.989, "grad_norm": 1.0958741903305054, "learning_rate": 0.0002, "epoch": 5.816993464052287, "step": 8900}, {"loss": 1.1922, "grad_norm": 1.0099612474441528, "learning_rate": 0.0002, "epoch": 5.823529411764706, "step": 8910}, {"loss": 1.0623, "grad_norm": 0.8442328572273254, "learning_rate": 0.0002, "epoch": 5.830065359477124, "step": 8920}, {"loss": 0.9134, "grad_norm": 1.1388301849365234, "learning_rate": 0.0002, "epoch": 5.836601307189542, "step": 8930}, {"loss": 1.0019, "grad_norm": 0.8296026587486267, "learning_rate": 0.0002, "epoch": 5.8431372549019605, "step": 8940}, {"loss": 1.0363, "grad_norm": 1.0843533277511597, "learning_rate": 0.0002, "epoch": 5.849673202614379, "step": 8950}, {"loss": 1.0009, "grad_norm": 0.8496834635734558, "learning_rate": 0.0002, "epoch": 5.856209150326797, "step": 8960}, {"loss": 0.9927, "grad_norm": 1.6894690990447998, "learning_rate": 0.0002, "epoch": 5.862745098039216, "step": 8970}, {"loss": 1.0939, "grad_norm": 1.0012282133102417, "learning_rate": 0.0002, "epoch": 5.8692810457516345, "step": 8980}, {"loss": 0.9722, "grad_norm": 0.8521103262901306, "learning_rate": 0.0002, "epoch": 5.875816993464053, "step": 8990}, {"loss": 1.0885, "grad_norm": 1.246841311454773, "learning_rate": 0.0002, "epoch": 5.882352941176471, "step": 9000}, {"loss": 0.9702, "grad_norm": 0.9941892027854919, "learning_rate": 0.0002, "epoch": 5.888888888888889, "step": 9010}, {"loss": 0.8754, "grad_norm": 1.067413568496704, "learning_rate": 0.0002, "epoch": 5.895424836601308, "step": 9020}, {"loss": 1.0153, "grad_norm": 1.0045088529586792, "learning_rate": 0.0002, "epoch": 5.901960784313726, "step": 9030}, {"loss": 1.0134, "grad_norm": 1.383063554763794, "learning_rate": 0.0002, "epoch": 5.908496732026144, "step": 9040}, {"loss": 1.0845, "grad_norm": 0.8754428625106812, "learning_rate": 0.0002, "epoch": 5.915032679738562, "step": 9050}, {"loss": 0.9571, "grad_norm": 0.8577388525009155, "learning_rate": 0.0002, "epoch": 5.921568627450981, "step": 9060}, {"loss": 1.0532, "grad_norm": 0.8718975186347961, "learning_rate": 0.0002, "epoch": 5.928104575163399, "step": 9070}, {"loss": 1.0667, "grad_norm": 1.1762131452560425, "learning_rate": 0.0002, "epoch": 5.934640522875817, "step": 9080}, {"loss": 1.1114, "grad_norm": 1.1025866270065308, "learning_rate": 0.0002, "epoch": 5.9411764705882355, "step": 9090}, {"loss": 0.9155, "grad_norm": 1.0439870357513428, "learning_rate": 0.0002, "epoch": 5.947712418300654, "step": 9100}, {"loss": 1.0055, "grad_norm": 1.2411525249481201, "learning_rate": 0.0002, "epoch": 5.954248366013072, "step": 9110}, {"loss": 0.9747, "grad_norm": 1.0317714214324951, "learning_rate": 0.0002, "epoch": 5.96078431372549, "step": 9120}, {"loss": 1.0352, "grad_norm": 0.9880492091178894, "learning_rate": 0.0002, "epoch": 5.967320261437909, "step": 9130}, {"loss": 1.0459, "grad_norm": 0.9039815664291382, "learning_rate": 0.0002, "epoch": 5.973856209150327, "step": 9140}, {"loss": 1.0413, "grad_norm": 0.9049116373062134, "learning_rate": 0.0002, "epoch": 5.980392156862745, "step": 9150}, {"loss": 0.9792, "grad_norm": 0.996749222278595, "learning_rate": 0.0002, "epoch": 5.9869281045751634, "step": 9160}, {"loss": 0.8857, "grad_norm": 0.8716062307357788, "learning_rate": 0.0002, "epoch": 5.993464052287582, "step": 9170}, {"loss": 1.019, "grad_norm": 1.3081294298171997, "learning_rate": 0.0002, "epoch": 6.0, "step": 9180}, {"eval_loss": 1.45111083984375, "eval_runtime": 34.7121, "eval_samples_per_second": 12.56, "eval_steps_per_second": 1.584, "epoch": 6.0, "step": 9180}, {"loss": 0.9306, "grad_norm": 1.1378029584884644, "learning_rate": 0.0002, "epoch": 6.006535947712418, "step": 9190}, {"loss": 0.8794, "grad_norm": 1.2921233177185059, "learning_rate": 0.0002, "epoch": 6.0130718954248366, "step": 9200}, {"loss": 0.8145, "grad_norm": 1.039211630821228, "learning_rate": 0.0002, "epoch": 6.019607843137255, "step": 9210}, {"loss": 0.8524, "grad_norm": 0.9715196490287781, "learning_rate": 0.0002, "epoch": 6.026143790849673, "step": 9220}, {"loss": 1.035, "grad_norm": 1.220642328262329, "learning_rate": 0.0002, "epoch": 6.032679738562091, "step": 9230}, {"loss": 0.8468, "grad_norm": 0.854360044002533, "learning_rate": 0.0002, "epoch": 6.03921568627451, "step": 9240}, {"loss": 0.8534, "grad_norm": 0.8806933164596558, "learning_rate": 0.0002, "epoch": 6.045751633986928, "step": 9250}, {"loss": 0.8305, "grad_norm": 1.4315874576568604, "learning_rate": 0.0002, "epoch": 6.052287581699346, "step": 9260}, {"loss": 0.8462, "grad_norm": 0.9382007122039795, "learning_rate": 0.0002, "epoch": 6.0588235294117645, "step": 9270}, {"loss": 0.9653, "grad_norm": 1.2184561491012573, "learning_rate": 0.0002, "epoch": 6.065359477124183, "step": 9280}, {"loss": 0.8806, "grad_norm": 1.2331548929214478, "learning_rate": 0.0002, "epoch": 6.071895424836601, "step": 9290}, {"loss": 0.8354, "grad_norm": 1.1112796068191528, "learning_rate": 0.0002, "epoch": 6.078431372549019, "step": 9300}, {"loss": 0.8008, "grad_norm": 1.4753731489181519, "learning_rate": 0.0002, "epoch": 6.084967320261438, "step": 9310}, {"loss": 0.9198, "grad_norm": 1.2783401012420654, "learning_rate": 0.0002, "epoch": 6.091503267973856, "step": 9320}, {"loss": 0.8294, "grad_norm": 0.9916909337043762, "learning_rate": 0.0002, "epoch": 6.098039215686274, "step": 9330}, {"loss": 0.876, "grad_norm": 0.9300099015235901, "learning_rate": 0.0002, "epoch": 6.104575163398692, "step": 9340}, {"loss": 0.9064, "grad_norm": 1.4985264539718628, "learning_rate": 0.0002, "epoch": 6.111111111111111, "step": 9350}, {"loss": 1.0106, "grad_norm": 1.276380181312561, "learning_rate": 0.0002, "epoch": 6.117647058823529, "step": 9360}, {"loss": 0.9068, "grad_norm": 1.181113600730896, "learning_rate": 0.0002, "epoch": 6.124183006535947, "step": 9370}, {"loss": 0.9165, "grad_norm": 1.698729395866394, "learning_rate": 0.0002, "epoch": 6.130718954248366, "step": 9380}, {"loss": 0.7997, "grad_norm": 0.9793189764022827, "learning_rate": 0.0002, "epoch": 6.137254901960785, "step": 9390}, {"loss": 0.9731, "grad_norm": 1.1942132711410522, "learning_rate": 0.0002, "epoch": 6.143790849673203, "step": 9400}, {"loss": 0.8762, "grad_norm": 1.2160184383392334, "learning_rate": 0.0002, "epoch": 6.150326797385621, "step": 9410}, {"loss": 0.801, "grad_norm": 1.0802825689315796, "learning_rate": 0.0002, "epoch": 6.1568627450980395, "step": 9420}, {"loss": 0.9055, "grad_norm": 3.024529218673706, "learning_rate": 0.0002, "epoch": 6.163398692810458, "step": 9430}, {"loss": 0.8739, "grad_norm": 0.975062370300293, "learning_rate": 0.0002, "epoch": 6.169934640522876, "step": 9440}, {"loss": 0.8485, "grad_norm": 0.9243306517601013, "learning_rate": 0.0002, "epoch": 6.176470588235294, "step": 9450}, {"loss": 0.947, "grad_norm": 0.8892099857330322, "learning_rate": 0.0002, "epoch": 6.183006535947713, "step": 9460}, {"loss": 0.9165, "grad_norm": 1.4151731729507446, "learning_rate": 0.0002, "epoch": 6.189542483660131, "step": 9470}, {"loss": 1.022, "grad_norm": 1.064701795578003, "learning_rate": 0.0002, "epoch": 6.196078431372549, "step": 9480}, {"loss": 0.906, "grad_norm": 1.1104519367218018, "learning_rate": 0.0002, "epoch": 6.2026143790849675, "step": 9490}, {"loss": 0.9572, "grad_norm": 1.4788947105407715, "learning_rate": 0.0002, "epoch": 6.209150326797386, "step": 9500}, {"loss": 0.8014, "grad_norm": 0.7976077795028687, "learning_rate": 0.0002, "epoch": 6.215686274509804, "step": 9510}, {"loss": 0.886, "grad_norm": 1.256864070892334, "learning_rate": 0.0002, "epoch": 6.222222222222222, "step": 9520}, {"loss": 0.9104, "grad_norm": 1.3874554634094238, "learning_rate": 0.0002, "epoch": 6.228758169934641, "step": 9530}, {"loss": 0.8583, "grad_norm": 1.9012963771820068, "learning_rate": 0.0002, "epoch": 6.235294117647059, "step": 9540}, {"loss": 0.9585, "grad_norm": 1.275212287902832, "learning_rate": 0.0002, "epoch": 6.241830065359477, "step": 9550}, {"loss": 0.8416, "grad_norm": 1.1007417440414429, "learning_rate": 0.0002, "epoch": 6.248366013071895, "step": 9560}, {"loss": 0.9191, "grad_norm": 1.0602147579193115, "learning_rate": 0.0002, "epoch": 6.254901960784314, "step": 9570}, {"loss": 0.909, "grad_norm": 1.2276418209075928, "learning_rate": 0.0002, "epoch": 6.261437908496732, "step": 9580}, {"loss": 0.9363, "grad_norm": 1.0111924409866333, "learning_rate": 0.0002, "epoch": 6.26797385620915, "step": 9590}, {"loss": 0.9941, "grad_norm": 0.9031485915184021, "learning_rate": 0.0002, "epoch": 6.2745098039215685, "step": 9600}, {"loss": 0.9138, "grad_norm": 0.9893783926963806, "learning_rate": 0.0002, "epoch": 6.281045751633987, "step": 9610}, {"loss": 0.9114, "grad_norm": 1.1979725360870361, "learning_rate": 0.0002, "epoch": 6.287581699346405, "step": 9620}, {"loss": 0.8858, "grad_norm": 1.380516767501831, "learning_rate": 0.0002, "epoch": 6.294117647058823, "step": 9630}, {"loss": 0.8898, "grad_norm": 1.1370083093643188, "learning_rate": 0.0002, "epoch": 6.300653594771242, "step": 9640}, {"loss": 0.9073, "grad_norm": 1.4091558456420898, "learning_rate": 0.0002, "epoch": 6.30718954248366, "step": 9650}, {"loss": 0.9096, "grad_norm": 1.0670944452285767, "learning_rate": 0.0002, "epoch": 6.313725490196078, "step": 9660}, {"loss": 0.9376, "grad_norm": 0.9150263667106628, "learning_rate": 0.0002, "epoch": 6.3202614379084965, "step": 9670}, {"loss": 0.9169, "grad_norm": 1.1342853307724, "learning_rate": 0.0002, "epoch": 6.326797385620915, "step": 9680}, {"loss": 1.002, "grad_norm": 1.2733415365219116, "learning_rate": 0.0002, "epoch": 6.333333333333333, "step": 9690}, {"loss": 0.9579, "grad_norm": 1.3647292852401733, "learning_rate": 0.0002, "epoch": 6.339869281045751, "step": 9700}, {"loss": 0.87, "grad_norm": 1.0435094833374023, "learning_rate": 0.0002, "epoch": 6.34640522875817, "step": 9710}, {"loss": 0.8812, "grad_norm": 1.3641071319580078, "learning_rate": 0.0002, "epoch": 6.352941176470588, "step": 9720}, {"loss": 0.8888, "grad_norm": 1.2806159257888794, "learning_rate": 0.0002, "epoch": 6.359477124183006, "step": 9730}, {"loss": 0.9481, "grad_norm": 1.0193076133728027, "learning_rate": 0.0002, "epoch": 6.366013071895424, "step": 9740}, {"loss": 0.931, "grad_norm": 1.2349408864974976, "learning_rate": 0.0002, "epoch": 6.372549019607844, "step": 9750}, {"loss": 0.8837, "grad_norm": 1.2062549591064453, "learning_rate": 0.0002, "epoch": 6.379084967320262, "step": 9760}, {"loss": 0.8947, "grad_norm": 1.4402194023132324, "learning_rate": 0.0002, "epoch": 6.38562091503268, "step": 9770}, {"loss": 0.8724, "grad_norm": 1.1730891466140747, "learning_rate": 0.0002, "epoch": 6.392156862745098, "step": 9780}, {"loss": 0.9005, "grad_norm": 1.1481093168258667, "learning_rate": 0.0002, "epoch": 6.398692810457517, "step": 9790}, {"loss": 0.9431, "grad_norm": 1.0012723207473755, "learning_rate": 0.0002, "epoch": 6.405228758169935, "step": 9800}, {"loss": 0.8856, "grad_norm": 0.8839848041534424, "learning_rate": 0.0002, "epoch": 6.411764705882353, "step": 9810}, {"loss": 0.8147, "grad_norm": 1.096693992614746, "learning_rate": 0.0002, "epoch": 6.4183006535947715, "step": 9820}, {"loss": 0.846, "grad_norm": 1.4713369607925415, "learning_rate": 0.0002, "epoch": 6.42483660130719, "step": 9830}, {"loss": 0.9563, "grad_norm": 1.2529761791229248, "learning_rate": 0.0002, "epoch": 6.431372549019608, "step": 9840}, {"loss": 0.8551, "grad_norm": 1.5575600862503052, "learning_rate": 0.0002, "epoch": 6.437908496732026, "step": 9850}, {"loss": 0.836, "grad_norm": 1.2188916206359863, "learning_rate": 0.0002, "epoch": 6.444444444444445, "step": 9860}, {"loss": 0.9132, "grad_norm": 1.1558794975280762, "learning_rate": 0.0002, "epoch": 6.450980392156863, "step": 9870}, {"loss": 0.8632, "grad_norm": 1.1506937742233276, "learning_rate": 0.0002, "epoch": 6.457516339869281, "step": 9880}, {"loss": 1.0575, "grad_norm": 1.1168335676193237, "learning_rate": 0.0002, "epoch": 6.4640522875816995, "step": 9890}, {"loss": 0.99, "grad_norm": 1.192449688911438, "learning_rate": 0.0002, "epoch": 6.470588235294118, "step": 9900}, {"loss": 0.9478, "grad_norm": 1.0451104640960693, "learning_rate": 0.0002, "epoch": 6.477124183006536, "step": 9910}, {"loss": 0.9034, "grad_norm": 1.1111775636672974, "learning_rate": 0.0002, "epoch": 6.483660130718954, "step": 9920}, {"loss": 0.8971, "grad_norm": 1.2094531059265137, "learning_rate": 0.0002, "epoch": 6.490196078431373, "step": 9930}, {"loss": 0.9047, "grad_norm": 1.0547380447387695, "learning_rate": 0.0002, "epoch": 6.496732026143791, "step": 9940}, {"loss": 1.0727, "grad_norm": 1.5547202825546265, "learning_rate": 0.0002, "epoch": 6.503267973856209, "step": 9950}, {"loss": 0.9109, "grad_norm": 1.1917903423309326, "learning_rate": 0.0002, "epoch": 6.509803921568627, "step": 9960}, {"loss": 0.8708, "grad_norm": 1.0918153524398804, "learning_rate": 0.0002, "epoch": 6.516339869281046, "step": 9970}, {"loss": 0.8752, "grad_norm": 1.146968960762024, "learning_rate": 0.0002, "epoch": 6.522875816993464, "step": 9980}, {"loss": 0.9593, "grad_norm": 0.9899234771728516, "learning_rate": 0.0002, "epoch": 6.529411764705882, "step": 9990}, {"loss": 0.91, "grad_norm": 2.160924196243286, "learning_rate": 0.0002, "epoch": 6.5359477124183005, "step": 10000}, {"loss": 0.9683, "grad_norm": 1.6366891860961914, "learning_rate": 0.0002, "epoch": 6.542483660130719, "step": 10010}, {"loss": 0.8582, "grad_norm": 0.9876762628555298, "learning_rate": 0.0002, "epoch": 6.549019607843137, "step": 10020}, {"loss": 0.8385, "grad_norm": 1.5622549057006836, "learning_rate": 0.0002, "epoch": 6.555555555555555, "step": 10030}, {"loss": 0.8791, "grad_norm": 1.0108020305633545, "learning_rate": 0.0002, "epoch": 6.562091503267974, "step": 10040}, {"loss": 0.9574, "grad_norm": 1.0725725889205933, "learning_rate": 0.0002, "epoch": 6.568627450980392, "step": 10050}, {"loss": 0.8297, "grad_norm": 1.1551216840744019, "learning_rate": 0.0002, "epoch": 6.57516339869281, "step": 10060}, {"loss": 0.8199, "grad_norm": 1.5174646377563477, "learning_rate": 0.0002, "epoch": 6.5816993464052285, "step": 10070}, {"loss": 0.8203, "grad_norm": 1.041877031326294, "learning_rate": 0.0002, "epoch": 6.588235294117647, "step": 10080}, {"loss": 0.9684, "grad_norm": 0.9939621686935425, "learning_rate": 0.0002, "epoch": 6.594771241830065, "step": 10090}, {"loss": 0.9324, "grad_norm": 1.2706589698791504, "learning_rate": 0.0002, "epoch": 6.601307189542483, "step": 10100}, {"loss": 0.9614, "grad_norm": 1.1071467399597168, "learning_rate": 0.0002, "epoch": 6.607843137254902, "step": 10110}, {"loss": 0.9747, "grad_norm": 0.9449541568756104, "learning_rate": 0.0002, "epoch": 6.61437908496732, "step": 10120}, {"loss": 0.9557, "grad_norm": 1.0961830615997314, "learning_rate": 0.0002, "epoch": 6.620915032679738, "step": 10130}, {"loss": 0.9865, "grad_norm": 1.7726300954818726, "learning_rate": 0.0002, "epoch": 6.627450980392156, "step": 10140}, {"loss": 0.9657, "grad_norm": 1.2345516681671143, "learning_rate": 0.0002, "epoch": 6.633986928104575, "step": 10150}, {"loss": 0.9573, "grad_norm": 1.2062907218933105, "learning_rate": 0.0002, "epoch": 6.640522875816993, "step": 10160}, {"loss": 0.918, "grad_norm": 1.029327154159546, "learning_rate": 0.0002, "epoch": 6.647058823529412, "step": 10170}, {"loss": 0.9211, "grad_norm": 1.442307710647583, "learning_rate": 0.0002, "epoch": 6.65359477124183, "step": 10180}, {"loss": 0.8924, "grad_norm": 1.2579066753387451, "learning_rate": 0.0002, "epoch": 6.660130718954249, "step": 10190}, {"loss": 0.9836, "grad_norm": 1.4563188552856445, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 10200}, {"loss": 0.8876, "grad_norm": 0.9699450135231018, "learning_rate": 0.0002, "epoch": 6.673202614379085, "step": 10210}, {"loss": 0.9589, "grad_norm": 1.812523603439331, "learning_rate": 0.0002, "epoch": 6.6797385620915035, "step": 10220}, {"loss": 1.0241, "grad_norm": 1.124000906944275, "learning_rate": 0.0002, "epoch": 6.686274509803922, "step": 10230}, {"loss": 0.8924, "grad_norm": 1.0957475900650024, "learning_rate": 0.0002, "epoch": 6.69281045751634, "step": 10240}, {"loss": 0.8891, "grad_norm": 0.989689826965332, "learning_rate": 0.0002, "epoch": 6.699346405228758, "step": 10250}, {"loss": 0.9049, "grad_norm": 1.4353317022323608, "learning_rate": 0.0002, "epoch": 6.705882352941177, "step": 10260}, {"loss": 0.9311, "grad_norm": 1.0245451927185059, "learning_rate": 0.0002, "epoch": 6.712418300653595, "step": 10270}, {"loss": 0.8814, "grad_norm": 1.097334861755371, "learning_rate": 0.0002, "epoch": 6.718954248366013, "step": 10280}, {"loss": 0.9927, "grad_norm": 0.982356071472168, "learning_rate": 0.0002, "epoch": 6.7254901960784315, "step": 10290}, {"loss": 0.9909, "grad_norm": 1.8842819929122925, "learning_rate": 0.0002, "epoch": 6.73202614379085, "step": 10300}, {"loss": 0.9286, "grad_norm": 0.8648947477340698, "learning_rate": 0.0002, "epoch": 6.738562091503268, "step": 10310}, {"loss": 0.987, "grad_norm": 1.1510577201843262, "learning_rate": 0.0002, "epoch": 6.745098039215686, "step": 10320}, {"loss": 0.9217, "grad_norm": 1.874495506286621, "learning_rate": 0.0002, "epoch": 6.751633986928105, "step": 10330}, {"loss": 0.8914, "grad_norm": 1.1126408576965332, "learning_rate": 0.0002, "epoch": 6.758169934640523, "step": 10340}, {"loss": 0.8508, "grad_norm": 1.6654644012451172, "learning_rate": 0.0002, "epoch": 6.764705882352941, "step": 10350}, {"loss": 0.9653, "grad_norm": 1.0699580907821655, "learning_rate": 0.0002, "epoch": 6.771241830065359, "step": 10360}, {"loss": 0.882, "grad_norm": 0.9460757374763489, "learning_rate": 0.0002, "epoch": 6.777777777777778, "step": 10370}, {"loss": 0.9589, "grad_norm": 1.2553058862686157, "learning_rate": 0.0002, "epoch": 6.784313725490196, "step": 10380}, {"loss": 0.8782, "grad_norm": 1.0939891338348389, "learning_rate": 0.0002, "epoch": 6.790849673202614, "step": 10390}, {"loss": 0.9189, "grad_norm": 1.0647451877593994, "learning_rate": 0.0002, "epoch": 6.7973856209150325, "step": 10400}, {"loss": 0.9478, "grad_norm": 1.0954521894454956, "learning_rate": 0.0002, "epoch": 6.803921568627451, "step": 10410}, {"loss": 1.0385, "grad_norm": 1.4371392726898193, "learning_rate": 0.0002, "epoch": 6.810457516339869, "step": 10420}, {"loss": 1.0024, "grad_norm": 1.0063464641571045, "learning_rate": 0.0002, "epoch": 6.816993464052287, "step": 10430}, {"loss": 0.8737, "grad_norm": 1.5189263820648193, "learning_rate": 0.0002, "epoch": 6.823529411764706, "step": 10440}, {"loss": 0.9246, "grad_norm": 0.9715501070022583, "learning_rate": 0.0002, "epoch": 6.830065359477124, "step": 10450}, {"loss": 0.9659, "grad_norm": 1.114586353302002, "learning_rate": 0.0002, "epoch": 6.836601307189542, "step": 10460}, {"loss": 1.0081, "grad_norm": 1.2991431951522827, "learning_rate": 0.0002, "epoch": 6.8431372549019605, "step": 10470}, {"loss": 0.9323, "grad_norm": 1.203114628791809, "learning_rate": 0.0002, "epoch": 6.849673202614379, "step": 10480}, {"loss": 1.0032, "grad_norm": 1.476167917251587, "learning_rate": 0.0002, "epoch": 6.856209150326797, "step": 10490}, {"loss": 1.0275, "grad_norm": 1.0933326482772827, "learning_rate": 0.0002, "epoch": 6.862745098039216, "step": 10500}, {"loss": 1.0068, "grad_norm": 1.2831504344940186, "learning_rate": 0.0002, "epoch": 6.8692810457516345, "step": 10510}, {"loss": 0.9973, "grad_norm": 1.1967637538909912, "learning_rate": 0.0002, "epoch": 6.875816993464053, "step": 10520}, {"loss": 0.9549, "grad_norm": 1.1276888847351074, "learning_rate": 0.0002, "epoch": 6.882352941176471, "step": 10530}, {"loss": 0.9568, "grad_norm": 1.2680490016937256, "learning_rate": 0.0002, "epoch": 6.888888888888889, "step": 10540}, {"loss": 0.9177, "grad_norm": 1.5469038486480713, "learning_rate": 0.0002, "epoch": 6.895424836601308, "step": 10550}, {"loss": 0.8545, "grad_norm": 1.1731038093566895, "learning_rate": 0.0002, "epoch": 6.901960784313726, "step": 10560}, {"loss": 0.9795, "grad_norm": 0.968008816242218, "learning_rate": 0.0002, "epoch": 6.908496732026144, "step": 10570}, {"loss": 0.9439, "grad_norm": 0.9082416892051697, "learning_rate": 0.0002, "epoch": 6.915032679738562, "step": 10580}, {"loss": 0.9898, "grad_norm": 1.5816899538040161, "learning_rate": 0.0002, "epoch": 6.921568627450981, "step": 10590}, {"loss": 0.9692, "grad_norm": 0.9462234377861023, "learning_rate": 0.0002, "epoch": 6.928104575163399, "step": 10600}, {"loss": 1.0193, "grad_norm": 1.4950200319290161, "learning_rate": 0.0002, "epoch": 6.934640522875817, "step": 10610}, {"loss": 0.8888, "grad_norm": 1.2929182052612305, "learning_rate": 0.0002, "epoch": 6.9411764705882355, "step": 10620}, {"loss": 1.0141, "grad_norm": 1.2995754480361938, "learning_rate": 0.0002, "epoch": 6.947712418300654, "step": 10630}, {"loss": 0.9863, "grad_norm": 0.9407122135162354, "learning_rate": 0.0002, "epoch": 6.954248366013072, "step": 10640}, {"loss": 0.9041, "grad_norm": 1.1735378503799438, "learning_rate": 0.0002, "epoch": 6.96078431372549, "step": 10650}, {"loss": 0.936, "grad_norm": 0.9937344193458557, "learning_rate": 0.0002, "epoch": 6.967320261437909, "step": 10660}, {"loss": 0.9577, "grad_norm": 1.2498728036880493, "learning_rate": 0.0002, "epoch": 6.973856209150327, "step": 10670}, {"loss": 1.0504, "grad_norm": 1.0513341426849365, "learning_rate": 0.0002, "epoch": 6.980392156862745, "step": 10680}, {"loss": 0.9259, "grad_norm": 1.4611467123031616, "learning_rate": 0.0002, "epoch": 6.9869281045751634, "step": 10690}, {"loss": 0.9779, "grad_norm": 1.2924799919128418, "learning_rate": 0.0002, "epoch": 6.993464052287582, "step": 10700}, {"loss": 0.8953, "grad_norm": 1.2024929523468018, "learning_rate": 0.0002, "epoch": 7.0, "step": 10710}, {"eval_loss": 1.4972445964813232, "eval_runtime": 33.6225, "eval_samples_per_second": 12.967, "eval_steps_per_second": 1.636, "epoch": 7.0, "step": 10710}, {"loss": 0.8111, "grad_norm": 1.1302162408828735, "learning_rate": 0.0002, "epoch": 7.006535947712418, "step": 10720}, {"loss": 0.8584, "grad_norm": 1.2731552124023438, "learning_rate": 0.0002, "epoch": 7.0130718954248366, "step": 10730}, {"loss": 0.782, "grad_norm": 1.2694480419158936, "learning_rate": 0.0002, "epoch": 7.019607843137255, "step": 10740}, {"loss": 0.7621, "grad_norm": 1.1517360210418701, "learning_rate": 0.0002, "epoch": 7.026143790849673, "step": 10750}, {"loss": 0.9149, "grad_norm": 1.3649171590805054, "learning_rate": 0.0002, "epoch": 7.032679738562091, "step": 10760}, {"loss": 0.7411, "grad_norm": 1.1630656719207764, "learning_rate": 0.0002, "epoch": 7.03921568627451, "step": 10770}, {"loss": 0.8514, "grad_norm": 1.2658313512802124, "learning_rate": 0.0002, "epoch": 7.045751633986928, "step": 10780}, {"loss": 0.8125, "grad_norm": 1.5004769563674927, "learning_rate": 0.0002, "epoch": 7.052287581699346, "step": 10790}, {"loss": 0.754, "grad_norm": 1.052678108215332, "learning_rate": 0.0002, "epoch": 7.0588235294117645, "step": 10800}, {"loss": 0.8613, "grad_norm": 1.3461277484893799, "learning_rate": 0.0002, "epoch": 7.065359477124183, "step": 10810}, {"loss": 0.843, "grad_norm": 1.3074769973754883, "learning_rate": 0.0002, "epoch": 7.071895424836601, "step": 10820}, {"loss": 0.7433, "grad_norm": 1.8454785346984863, "learning_rate": 0.0002, "epoch": 7.078431372549019, "step": 10830}, {"loss": 0.7899, "grad_norm": 0.9786653518676758, "learning_rate": 0.0002, "epoch": 7.084967320261438, "step": 10840}, {"loss": 0.7689, "grad_norm": 1.2760838270187378, "learning_rate": 0.0002, "epoch": 7.091503267973856, "step": 10850}, {"loss": 0.7715, "grad_norm": 1.1340841054916382, "learning_rate": 0.0002, "epoch": 7.098039215686274, "step": 10860}, {"loss": 0.8111, "grad_norm": 1.3808159828186035, "learning_rate": 0.0002, "epoch": 7.104575163398692, "step": 10870}, {"loss": 0.8311, "grad_norm": 1.147668719291687, "learning_rate": 0.0002, "epoch": 7.111111111111111, "step": 10880}, {"loss": 0.8408, "grad_norm": 1.3183035850524902, "learning_rate": 0.0002, "epoch": 7.117647058823529, "step": 10890}, {"loss": 0.7801, "grad_norm": 1.3882936239242554, "learning_rate": 0.0002, "epoch": 7.124183006535947, "step": 10900}, {"loss": 0.7745, "grad_norm": 0.9495398998260498, "learning_rate": 0.0002, "epoch": 7.130718954248366, "step": 10910}, {"loss": 0.8438, "grad_norm": 1.3810124397277832, "learning_rate": 0.0002, "epoch": 7.137254901960785, "step": 10920}, {"loss": 0.8028, "grad_norm": 1.563207745552063, "learning_rate": 0.0002, "epoch": 7.143790849673203, "step": 10930}, {"loss": 0.7562, "grad_norm": 1.2633056640625, "learning_rate": 0.0002, "epoch": 7.150326797385621, "step": 10940}, {"loss": 0.8592, "grad_norm": 1.2398860454559326, "learning_rate": 0.0002, "epoch": 7.1568627450980395, "step": 10950}, {"loss": 0.8467, "grad_norm": 1.166763186454773, "learning_rate": 0.0002, "epoch": 7.163398692810458, "step": 10960}, {"loss": 0.8346, "grad_norm": 1.5083234310150146, "learning_rate": 0.0002, "epoch": 7.169934640522876, "step": 10970}, {"loss": 0.8323, "grad_norm": 1.6927601099014282, "learning_rate": 0.0002, "epoch": 7.176470588235294, "step": 10980}, {"loss": 0.7434, "grad_norm": 1.090780258178711, "learning_rate": 0.0002, "epoch": 7.183006535947713, "step": 10990}, {"loss": 0.784, "grad_norm": 1.0077793598175049, "learning_rate": 0.0002, "epoch": 7.189542483660131, "step": 11000}, {"loss": 0.831, "grad_norm": 1.8293051719665527, "learning_rate": 0.0002, "epoch": 7.196078431372549, "step": 11010}, {"loss": 0.7654, "grad_norm": 1.0761457681655884, "learning_rate": 0.0002, "epoch": 7.2026143790849675, "step": 11020}, {"loss": 0.8395, "grad_norm": 1.0681469440460205, "learning_rate": 0.0002, "epoch": 7.209150326797386, "step": 11030}, {"loss": 0.7983, "grad_norm": 1.961199402809143, "learning_rate": 0.0002, "epoch": 7.215686274509804, "step": 11040}, {"loss": 0.8631, "grad_norm": 1.3750165700912476, "learning_rate": 0.0002, "epoch": 7.222222222222222, "step": 11050}, {"loss": 0.7425, "grad_norm": 1.647005319595337, "learning_rate": 0.0002, "epoch": 7.228758169934641, "step": 11060}, {"loss": 0.844, "grad_norm": 1.1073668003082275, "learning_rate": 0.0002, "epoch": 7.235294117647059, "step": 11070}, {"loss": 0.8292, "grad_norm": 1.450289011001587, "learning_rate": 0.0002, "epoch": 7.241830065359477, "step": 11080}, {"loss": 0.8505, "grad_norm": 1.191163420677185, "learning_rate": 0.0002, "epoch": 7.248366013071895, "step": 11090}, {"loss": 0.8331, "grad_norm": 1.6975404024124146, "learning_rate": 0.0002, "epoch": 7.254901960784314, "step": 11100}, {"loss": 0.8189, "grad_norm": 1.159091830253601, "learning_rate": 0.0002, "epoch": 7.261437908496732, "step": 11110}, {"loss": 0.8612, "grad_norm": 0.9952927827835083, "learning_rate": 0.0002, "epoch": 7.26797385620915, "step": 11120}, {"loss": 0.8299, "grad_norm": 1.4122034311294556, "learning_rate": 0.0002, "epoch": 7.2745098039215685, "step": 11130}, {"loss": 0.8274, "grad_norm": 1.4299325942993164, "learning_rate": 0.0002, "epoch": 7.281045751633987, "step": 11140}, {"loss": 0.9358, "grad_norm": 1.26812744140625, "learning_rate": 0.0002, "epoch": 7.287581699346405, "step": 11150}, {"loss": 0.8036, "grad_norm": 1.0740736722946167, "learning_rate": 0.0002, "epoch": 7.294117647058823, "step": 11160}, {"loss": 0.9005, "grad_norm": 1.1293542385101318, "learning_rate": 0.0002, "epoch": 7.300653594771242, "step": 11170}, {"loss": 0.8775, "grad_norm": 1.3161042928695679, "learning_rate": 0.0002, "epoch": 7.30718954248366, "step": 11180}, {"loss": 0.9586, "grad_norm": 1.5637391805648804, "learning_rate": 0.0002, "epoch": 7.313725490196078, "step": 11190}, {"loss": 0.8704, "grad_norm": 1.3164077997207642, "learning_rate": 0.0002, "epoch": 7.3202614379084965, "step": 11200}, {"loss": 0.8552, "grad_norm": 0.9268870949745178, "learning_rate": 0.0002, "epoch": 7.326797385620915, "step": 11210}, {"loss": 0.8719, "grad_norm": 1.164515733718872, "learning_rate": 0.0002, "epoch": 7.333333333333333, "step": 11220}, {"loss": 0.9011, "grad_norm": 1.6878753900527954, "learning_rate": 0.0002, "epoch": 7.339869281045751, "step": 11230}, {"loss": 0.8417, "grad_norm": 1.1870672702789307, "learning_rate": 0.0002, "epoch": 7.34640522875817, "step": 11240}, {"loss": 0.886, "grad_norm": 1.2923716306686401, "learning_rate": 0.0002, "epoch": 7.352941176470588, "step": 11250}, {"loss": 0.7763, "grad_norm": 1.2006791830062866, "learning_rate": 0.0002, "epoch": 7.359477124183006, "step": 11260}, {"loss": 0.7859, "grad_norm": 1.0424097776412964, "learning_rate": 0.0002, "epoch": 7.366013071895424, "step": 11270}, {"loss": 0.9427, "grad_norm": 1.349094033241272, "learning_rate": 0.0002, "epoch": 7.372549019607844, "step": 11280}, {"loss": 0.8384, "grad_norm": 1.4128005504608154, "learning_rate": 0.0002, "epoch": 7.379084967320262, "step": 11290}, {"loss": 0.9219, "grad_norm": 1.3647041320800781, "learning_rate": 0.0002, "epoch": 7.38562091503268, "step": 11300}, {"loss": 0.7865, "grad_norm": 1.0561704635620117, "learning_rate": 0.0002, "epoch": 7.392156862745098, "step": 11310}, {"loss": 0.9151, "grad_norm": 1.2405760288238525, "learning_rate": 0.0002, "epoch": 7.398692810457517, "step": 11320}, {"loss": 0.9108, "grad_norm": 1.0932328701019287, "learning_rate": 0.0002, "epoch": 7.405228758169935, "step": 11330}, {"loss": 0.873, "grad_norm": 1.206778883934021, "learning_rate": 0.0002, "epoch": 7.411764705882353, "step": 11340}, {"loss": 0.9062, "grad_norm": 1.5261255502700806, "learning_rate": 0.0002, "epoch": 7.4183006535947715, "step": 11350}, {"loss": 0.7795, "grad_norm": 1.4928070306777954, "learning_rate": 0.0002, "epoch": 7.42483660130719, "step": 11360}, {"loss": 0.9027, "grad_norm": 1.331190586090088, "learning_rate": 0.0002, "epoch": 7.431372549019608, "step": 11370}, {"loss": 0.8547, "grad_norm": 1.0745981931686401, "learning_rate": 0.0002, "epoch": 7.437908496732026, "step": 11380}, {"loss": 0.7755, "grad_norm": 1.3070036172866821, "learning_rate": 0.0002, "epoch": 7.444444444444445, "step": 11390}, {"loss": 0.7703, "grad_norm": 0.8743805885314941, "learning_rate": 0.0002, "epoch": 7.450980392156863, "step": 11400}, {"loss": 0.9331, "grad_norm": 1.2747994661331177, "learning_rate": 0.0002, "epoch": 7.457516339869281, "step": 11410}, {"loss": 0.8506, "grad_norm": 1.3688995838165283, "learning_rate": 0.0002, "epoch": 7.4640522875816995, "step": 11420}, {"loss": 0.783, "grad_norm": 1.1788195371627808, "learning_rate": 0.0002, "epoch": 7.470588235294118, "step": 11430}, {"loss": 0.8841, "grad_norm": 2.0186705589294434, "learning_rate": 0.0002, "epoch": 7.477124183006536, "step": 11440}, {"loss": 0.9182, "grad_norm": 1.1707696914672852, "learning_rate": 0.0002, "epoch": 7.483660130718954, "step": 11450}, {"loss": 0.9019, "grad_norm": 1.26426100730896, "learning_rate": 0.0002, "epoch": 7.490196078431373, "step": 11460}, {"loss": 0.8114, "grad_norm": 1.2673691511154175, "learning_rate": 0.0002, "epoch": 7.496732026143791, "step": 11470}, {"loss": 0.9716, "grad_norm": 1.038956642150879, "learning_rate": 0.0002, "epoch": 7.503267973856209, "step": 11480}, {"loss": 0.8066, "grad_norm": 1.216252326965332, "learning_rate": 0.0002, "epoch": 7.509803921568627, "step": 11490}, {"loss": 0.8546, "grad_norm": 1.1520167589187622, "learning_rate": 0.0002, "epoch": 7.516339869281046, "step": 11500}, {"loss": 0.8117, "grad_norm": 1.3962451219558716, "learning_rate": 0.0002, "epoch": 7.522875816993464, "step": 11510}, {"loss": 0.8636, "grad_norm": 1.2226953506469727, "learning_rate": 0.0002, "epoch": 7.529411764705882, "step": 11520}, {"loss": 0.8943, "grad_norm": 1.2891474962234497, "learning_rate": 0.0002, "epoch": 7.5359477124183005, "step": 11530}, {"loss": 0.8787, "grad_norm": 1.3372766971588135, "learning_rate": 0.0002, "epoch": 7.542483660130719, "step": 11540}, {"loss": 0.8103, "grad_norm": 1.4196370840072632, "learning_rate": 0.0002, "epoch": 7.549019607843137, "step": 11550}, {"loss": 0.8722, "grad_norm": 1.0041396617889404, "learning_rate": 0.0002, "epoch": 7.555555555555555, "step": 11560}, {"loss": 0.8512, "grad_norm": 1.3470606803894043, "learning_rate": 0.0002, "epoch": 7.562091503267974, "step": 11570}, {"loss": 0.8472, "grad_norm": 1.1738601922988892, "learning_rate": 0.0002, "epoch": 7.568627450980392, "step": 11580}, {"loss": 0.8816, "grad_norm": 1.1629133224487305, "learning_rate": 0.0002, "epoch": 7.57516339869281, "step": 11590}, {"loss": 0.8502, "grad_norm": 1.2859786748886108, "learning_rate": 0.0002, "epoch": 7.5816993464052285, "step": 11600}, {"loss": 0.8356, "grad_norm": 1.429398775100708, "learning_rate": 0.0002, "epoch": 7.588235294117647, "step": 11610}, {"loss": 0.7914, "grad_norm": 1.3300801515579224, "learning_rate": 0.0002, "epoch": 7.594771241830065, "step": 11620}, {"loss": 0.8671, "grad_norm": 1.3261592388153076, "learning_rate": 0.0002, "epoch": 7.601307189542483, "step": 11630}, {"loss": 0.7651, "grad_norm": 1.8779925107955933, "learning_rate": 0.0002, "epoch": 7.607843137254902, "step": 11640}, {"loss": 0.7666, "grad_norm": 1.7839158773422241, "learning_rate": 0.0002, "epoch": 7.61437908496732, "step": 11650}, {"loss": 0.9163, "grad_norm": 1.6469435691833496, "learning_rate": 0.0002, "epoch": 7.620915032679738, "step": 11660}, {"loss": 0.8589, "grad_norm": 1.5416018962860107, "learning_rate": 0.0002, "epoch": 7.627450980392156, "step": 11670}, {"loss": 0.7112, "grad_norm": 1.5173335075378418, "learning_rate": 0.0002, "epoch": 7.633986928104575, "step": 11680}, {"loss": 0.8958, "grad_norm": 1.1372658014297485, "learning_rate": 0.0002, "epoch": 7.640522875816993, "step": 11690}, {"loss": 0.8852, "grad_norm": 1.233030915260315, "learning_rate": 0.0002, "epoch": 7.647058823529412, "step": 11700}, {"loss": 0.9205, "grad_norm": 1.3100069761276245, "learning_rate": 0.0002, "epoch": 7.65359477124183, "step": 11710}, {"loss": 0.8895, "grad_norm": 0.9770023226737976, "learning_rate": 0.0002, "epoch": 7.660130718954249, "step": 11720}, {"loss": 0.8658, "grad_norm": 1.240946650505066, "learning_rate": 0.0002, "epoch": 7.666666666666667, "step": 11730}, {"loss": 0.9527, "grad_norm": 1.444226861000061, "learning_rate": 0.0002, "epoch": 7.673202614379085, "step": 11740}, {"loss": 0.8448, "grad_norm": 1.2667231559753418, "learning_rate": 0.0002, "epoch": 7.6797385620915035, "step": 11750}, {"loss": 0.9195, "grad_norm": 1.340754747390747, "learning_rate": 0.0002, "epoch": 7.686274509803922, "step": 11760}, {"loss": 0.8829, "grad_norm": 1.181988000869751, "learning_rate": 0.0002, "epoch": 7.69281045751634, "step": 11770}, {"loss": 0.8609, "grad_norm": 1.0623301267623901, "learning_rate": 0.0002, "epoch": 7.699346405228758, "step": 11780}, {"loss": 0.7726, "grad_norm": 1.1917353868484497, "learning_rate": 0.0002, "epoch": 7.705882352941177, "step": 11790}, {"loss": 0.8075, "grad_norm": 1.7202110290527344, "learning_rate": 0.0002, "epoch": 7.712418300653595, "step": 11800}, {"loss": 0.8112, "grad_norm": 1.1121439933776855, "learning_rate": 0.0002, "epoch": 7.718954248366013, "step": 11810}, {"loss": 0.7831, "grad_norm": 0.956794261932373, "learning_rate": 0.0002, "epoch": 7.7254901960784315, "step": 11820}, {"loss": 0.9056, "grad_norm": 1.2524380683898926, "learning_rate": 0.0002, "epoch": 7.73202614379085, "step": 11830}, {"loss": 0.8337, "grad_norm": 1.1095308065414429, "learning_rate": 0.0002, "epoch": 7.738562091503268, "step": 11840}, {"loss": 0.8921, "grad_norm": 1.631195068359375, "learning_rate": 0.0002, "epoch": 7.745098039215686, "step": 11850}, {"loss": 0.8493, "grad_norm": 1.2265965938568115, "learning_rate": 0.0002, "epoch": 7.751633986928105, "step": 11860}, {"loss": 0.8875, "grad_norm": 1.080328106880188, "learning_rate": 0.0002, "epoch": 7.758169934640523, "step": 11870}, {"loss": 0.8732, "grad_norm": 1.5570356845855713, "learning_rate": 0.0002, "epoch": 7.764705882352941, "step": 11880}, {"loss": 0.8293, "grad_norm": 1.3791661262512207, "learning_rate": 0.0002, "epoch": 7.771241830065359, "step": 11890}, {"loss": 0.8333, "grad_norm": 1.1457891464233398, "learning_rate": 0.0002, "epoch": 7.777777777777778, "step": 11900}, {"loss": 0.8546, "grad_norm": 1.6357585191726685, "learning_rate": 0.0002, "epoch": 7.784313725490196, "step": 11910}, {"loss": 0.9041, "grad_norm": 1.1845953464508057, "learning_rate": 0.0002, "epoch": 7.790849673202614, "step": 11920}, {"loss": 0.8364, "grad_norm": 1.2255016565322876, "learning_rate": 0.0002, "epoch": 7.7973856209150325, "step": 11930}, {"loss": 0.913, "grad_norm": 1.2113513946533203, "learning_rate": 0.0002, "epoch": 7.803921568627451, "step": 11940}, {"loss": 0.8553, "grad_norm": 1.0834609270095825, "learning_rate": 0.0002, "epoch": 7.810457516339869, "step": 11950}, {"loss": 0.8125, "grad_norm": 1.0127689838409424, "learning_rate": 0.0002, "epoch": 7.816993464052287, "step": 11960}, {"loss": 0.9468, "grad_norm": 1.1124000549316406, "learning_rate": 0.0002, "epoch": 7.823529411764706, "step": 11970}, {"loss": 0.8345, "grad_norm": 1.3440804481506348, "learning_rate": 0.0002, "epoch": 7.830065359477124, "step": 11980}, {"loss": 0.8939, "grad_norm": 1.8478741645812988, "learning_rate": 0.0002, "epoch": 7.836601307189542, "step": 11990}, {"loss": 0.9708, "grad_norm": 1.1202499866485596, "learning_rate": 0.0002, "epoch": 7.8431372549019605, "step": 12000}, {"loss": 0.8437, "grad_norm": 1.735700249671936, "learning_rate": 0.0002, "epoch": 7.849673202614379, "step": 12010}, {"loss": 0.8333, "grad_norm": 1.2994014024734497, "learning_rate": 0.0002, "epoch": 7.856209150326797, "step": 12020}, {"loss": 0.8656, "grad_norm": 1.8655444383621216, "learning_rate": 0.0002, "epoch": 7.862745098039216, "step": 12030}, {"loss": 0.8919, "grad_norm": 1.0460877418518066, "learning_rate": 0.0002, "epoch": 7.8692810457516345, "step": 12040}, {"loss": 0.8603, "grad_norm": 1.5241339206695557, "learning_rate": 0.0002, "epoch": 7.875816993464053, "step": 12050}, {"loss": 0.8179, "grad_norm": 1.171849250793457, "learning_rate": 0.0002, "epoch": 7.882352941176471, "step": 12060}, {"loss": 0.8577, "grad_norm": 1.0957022905349731, "learning_rate": 0.0002, "epoch": 7.888888888888889, "step": 12070}, {"loss": 0.9212, "grad_norm": 1.4121248722076416, "learning_rate": 0.0002, "epoch": 7.895424836601308, "step": 12080}, {"loss": 1.0002, "grad_norm": 1.3393208980560303, "learning_rate": 0.0002, "epoch": 7.901960784313726, "step": 12090}, {"loss": 0.8959, "grad_norm": 1.1252245903015137, "learning_rate": 0.0002, "epoch": 7.908496732026144, "step": 12100}, {"loss": 0.8494, "grad_norm": 1.4131813049316406, "learning_rate": 0.0002, "epoch": 7.915032679738562, "step": 12110}, {"loss": 0.9106, "grad_norm": 1.2392992973327637, "learning_rate": 0.0002, "epoch": 7.921568627450981, "step": 12120}, {"loss": 0.7951, "grad_norm": 1.3233672380447388, "learning_rate": 0.0002, "epoch": 7.928104575163399, "step": 12130}, {"loss": 0.7853, "grad_norm": 1.2547026872634888, "learning_rate": 0.0002, "epoch": 7.934640522875817, "step": 12140}, {"loss": 0.9203, "grad_norm": 1.1143239736557007, "learning_rate": 0.0002, "epoch": 7.9411764705882355, "step": 12150}, {"loss": 0.8059, "grad_norm": 1.030006766319275, "learning_rate": 0.0002, "epoch": 7.947712418300654, "step": 12160}, {"loss": 0.8076, "grad_norm": 1.1070104837417603, "learning_rate": 0.0002, "epoch": 7.954248366013072, "step": 12170}, {"loss": 0.8191, "grad_norm": 1.3011643886566162, "learning_rate": 0.0002, "epoch": 7.96078431372549, "step": 12180}, {"loss": 0.7951, "grad_norm": 1.134848713874817, "learning_rate": 0.0002, "epoch": 7.967320261437909, "step": 12190}, {"loss": 0.9318, "grad_norm": 1.7021794319152832, "learning_rate": 0.0002, "epoch": 7.973856209150327, "step": 12200}, {"loss": 0.9159, "grad_norm": 1.0190330743789673, "learning_rate": 0.0002, "epoch": 7.980392156862745, "step": 12210}, {"loss": 0.9586, "grad_norm": 1.6083006858825684, "learning_rate": 0.0002, "epoch": 7.9869281045751634, "step": 12220}, {"loss": 0.915, "grad_norm": 0.8929536938667297, "learning_rate": 0.0002, "epoch": 7.993464052287582, "step": 12230}, {"loss": 0.8706, "grad_norm": 0.9928004145622253, "learning_rate": 0.0002, "epoch": 8.0, "step": 12240}]}