diff --git a/.gitattributes b/.gitattributes index 5004930cb5655e4ae59473d80bd07245d91f85b6..b35593da2da780c0c3ac827a66630d73015a0af8 100644 --- a/.gitattributes +++ b/.gitattributes @@ -875,3 +875,12 @@ gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora- gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-7650/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/checkpoint-9180/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e99bbcd43df1c19d98706c7e3be95c93844c5349 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c79d70858ce055c6f1bb883cb35180f5113e2f20 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdd71ca55a2af1b8facc7c698205b6599fe0e892ac0b224b8ae93a917b4ed891 +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e99bbcd43df1c19d98706c7e3be95c93844c5349 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ccfaeeea929000371803ae9ca50bb1c7cb765a56 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:452c77583957da6b4293899c95b5be9701d7480c4d5a2b0f2fa68c4fb0b9731b +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..506d13db3f66eaa07cd88dde5a9dab148d81affa --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f3b9dc5674e22ec76147af6a69b9bbabc1f0e2f49564c5b73267a13abf7667b +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e1c8f330a7ead0c006626ce05530312ba87cf1f --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ac61707428e46dc6875dc3f37d3153002d158b4d52f392a06aae074add7ec80 +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..465ee81cfb83902b3701aefcc3a37d798ae3014f --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85b06ab96c3fe86bdf5decd66ad11232808aad800149c581e0ee95fa30ae0ec6 +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..41f964935b2ae34fb55e28a330bf42fc43159ddb --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/trainer_state.json @@ -0,0 +1,7825 @@ +{ + "best_metric": 1.4217946529388428, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144", + "epoch": 7.997104596453131, + "eval_steps": 10, + "global_step": 11048, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007238508867173362, + "grad_norm": 1.2523442506790161, + "learning_rate": 0.0002, + "loss": 4.7061, + "step": 10 + }, + { + "epoch": 0.014477017734346724, + "grad_norm": 1.8887330293655396, + "learning_rate": 0.0002, + "loss": 3.3493, + "step": 20 + }, + { + "epoch": 0.021715526601520086, + "grad_norm": 0.9668035507202148, + "learning_rate": 0.0002, + "loss": 2.7585, + "step": 30 + }, + { + "epoch": 0.028954035468693448, + "grad_norm": 2.9167306423187256, + "learning_rate": 0.0002, + "loss": 2.3699, + "step": 40 + }, + { + "epoch": 0.036192544335866814, + "grad_norm": 2.649867296218872, + "learning_rate": 0.0002, + "loss": 2.2679, + "step": 50 + }, + { + "epoch": 0.04343105320304017, + "grad_norm": 1.5120655298233032, + "learning_rate": 0.0002, + "loss": 2.2202, + "step": 60 + }, + { + "epoch": 0.05066956207021354, + "grad_norm": 0.7879868149757385, + "learning_rate": 0.0002, + "loss": 2.2026, + "step": 70 + }, + { + "epoch": 0.057908070937386896, + "grad_norm": 0.7616953253746033, + "learning_rate": 0.0002, + "loss": 1.9447, + "step": 80 + }, + { + "epoch": 0.06514657980456026, + "grad_norm": 1.8809149265289307, + "learning_rate": 0.0002, + "loss": 2.0112, + "step": 90 + }, + { + "epoch": 0.07238508867173363, + "grad_norm": 0.9294016361236572, + "learning_rate": 0.0002, + "loss": 1.8337, + "step": 100 + }, + { + "epoch": 0.07962359753890698, + "grad_norm": 0.7145281434059143, + "learning_rate": 0.0002, + "loss": 1.8419, + "step": 110 + }, + { + "epoch": 0.08686210640608034, + "grad_norm": 0.7564446330070496, + "learning_rate": 0.0002, + "loss": 2.0036, + "step": 120 + }, + { + "epoch": 0.09410061527325371, + "grad_norm": 1.1681925058364868, + "learning_rate": 0.0002, + "loss": 1.9306, + "step": 130 + }, + { + "epoch": 0.10133912414042708, + "grad_norm": 0.6708641648292542, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 140 + }, + { + "epoch": 0.10857763300760044, + "grad_norm": 0.7625647783279419, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 150 + }, + { + "epoch": 0.11581614187477379, + "grad_norm": 0.8463464975357056, + "learning_rate": 0.0002, + "loss": 1.6687, + "step": 160 + }, + { + "epoch": 0.12305465074194716, + "grad_norm": 0.7502335906028748, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 170 + }, + { + "epoch": 0.13029315960912052, + "grad_norm": 0.6929958462715149, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 180 + }, + { + "epoch": 0.1375316684762939, + "grad_norm": 0.6798707842826843, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 190 + }, + { + "epoch": 0.14477017734346725, + "grad_norm": 0.7566508650779724, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 200 + }, + { + "epoch": 0.15200868621064062, + "grad_norm": 0.7196869850158691, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 210 + }, + { + "epoch": 0.15924719507781396, + "grad_norm": 0.8401045799255371, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 220 + }, + { + "epoch": 0.16648570394498732, + "grad_norm": 0.8503773212432861, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 230 + }, + { + "epoch": 0.1737242128121607, + "grad_norm": 0.7183733582496643, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 240 + }, + { + "epoch": 0.18096272167933405, + "grad_norm": 0.7082605957984924, + "learning_rate": 0.0002, + "loss": 1.6693, + "step": 250 + }, + { + "epoch": 0.18820123054650742, + "grad_norm": 0.9386326670646667, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 260 + }, + { + "epoch": 0.19543973941368079, + "grad_norm": 0.7332451939582825, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 270 + }, + { + "epoch": 0.20267824828085415, + "grad_norm": 0.7092869877815247, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 280 + }, + { + "epoch": 0.20991675714802752, + "grad_norm": 0.7256413698196411, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 290 + }, + { + "epoch": 0.21715526601520088, + "grad_norm": 0.6398681402206421, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 300 + }, + { + "epoch": 0.22439377488237422, + "grad_norm": 0.6273287534713745, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 310 + }, + { + "epoch": 0.23163228374954759, + "grad_norm": 0.511648416519165, + "learning_rate": 0.0002, + "loss": 1.5115, + "step": 320 + }, + { + "epoch": 0.23887079261672095, + "grad_norm": 0.8677352070808411, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 330 + }, + { + "epoch": 0.24610930148389432, + "grad_norm": 0.6270743012428284, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 340 + }, + { + "epoch": 0.2533478103510677, + "grad_norm": 0.7980281114578247, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 350 + }, + { + "epoch": 0.26058631921824105, + "grad_norm": 0.632486879825592, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 360 + }, + { + "epoch": 0.2678248280854144, + "grad_norm": 0.6527034640312195, + "learning_rate": 0.0002, + "loss": 1.5175, + "step": 370 + }, + { + "epoch": 0.2750633369525878, + "grad_norm": 0.7672118544578552, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 380 + }, + { + "epoch": 0.28230184581976114, + "grad_norm": 0.6035117506980896, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 390 + }, + { + "epoch": 0.2895403546869345, + "grad_norm": 0.5955103039741516, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 400 + }, + { + "epoch": 0.2967788635541079, + "grad_norm": 0.6015191674232483, + "learning_rate": 0.0002, + "loss": 1.558, + "step": 410 + }, + { + "epoch": 0.30401737242128124, + "grad_norm": 0.6380982398986816, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 420 + }, + { + "epoch": 0.3112558812884546, + "grad_norm": 0.6707863211631775, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 430 + }, + { + "epoch": 0.3184943901556279, + "grad_norm": 0.7010176777839661, + "learning_rate": 0.0002, + "loss": 1.4426, + "step": 440 + }, + { + "epoch": 0.3257328990228013, + "grad_norm": 0.8263739943504333, + "learning_rate": 0.0002, + "loss": 1.5572, + "step": 450 + }, + { + "epoch": 0.33297140788997465, + "grad_norm": 0.7253276109695435, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 460 + }, + { + "epoch": 0.340209916757148, + "grad_norm": 0.5238934755325317, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 470 + }, + { + "epoch": 0.3474484256243214, + "grad_norm": 0.7869495749473572, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 480 + }, + { + "epoch": 0.35468693449149474, + "grad_norm": 0.7485215663909912, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 490 + }, + { + "epoch": 0.3619254433586681, + "grad_norm": 0.5413193106651306, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 500 + }, + { + "epoch": 0.3691639522258415, + "grad_norm": 0.7615048885345459, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 510 + }, + { + "epoch": 0.37640246109301484, + "grad_norm": 0.7685340046882629, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 520 + }, + { + "epoch": 0.3836409699601882, + "grad_norm": 0.6379081010818481, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 530 + }, + { + "epoch": 0.39087947882736157, + "grad_norm": 0.7946939468383789, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 540 + }, + { + "epoch": 0.39811798769453494, + "grad_norm": 0.6287278532981873, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 550 + }, + { + "epoch": 0.4053564965617083, + "grad_norm": 0.6811642646789551, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 560 + }, + { + "epoch": 0.41259500542888167, + "grad_norm": 0.671073317527771, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 570 + }, + { + "epoch": 0.41983351429605503, + "grad_norm": 0.6313900351524353, + "learning_rate": 0.0002, + "loss": 1.6753, + "step": 580 + }, + { + "epoch": 0.4270720231632284, + "grad_norm": 0.5291772484779358, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 590 + }, + { + "epoch": 0.43431053203040176, + "grad_norm": 0.62503582239151, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 600 + }, + { + "epoch": 0.4415490408975751, + "grad_norm": 0.5777305364608765, + "learning_rate": 0.0002, + "loss": 1.6276, + "step": 610 + }, + { + "epoch": 0.44878754976474844, + "grad_norm": 0.7013497352600098, + "learning_rate": 0.0002, + "loss": 1.4758, + "step": 620 + }, + { + "epoch": 0.4560260586319218, + "grad_norm": 0.8044822216033936, + "learning_rate": 0.0002, + "loss": 1.4029, + "step": 630 + }, + { + "epoch": 0.46326456749909517, + "grad_norm": 0.672531247138977, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 640 + }, + { + "epoch": 0.47050307636626854, + "grad_norm": 0.6233910322189331, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 650 + }, + { + "epoch": 0.4777415852334419, + "grad_norm": 0.651524543762207, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 660 + }, + { + "epoch": 0.48498009410061527, + "grad_norm": 0.7213939428329468, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 670 + }, + { + "epoch": 0.49221860296778863, + "grad_norm": 0.6541454792022705, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 680 + }, + { + "epoch": 0.499457111834962, + "grad_norm": 0.6568936109542847, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 690 + }, + { + "epoch": 0.5066956207021354, + "grad_norm": 0.7176415324211121, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 700 + }, + { + "epoch": 0.5139341295693087, + "grad_norm": 0.6553855538368225, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 710 + }, + { + "epoch": 0.5211726384364821, + "grad_norm": 0.5654335618019104, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 720 + }, + { + "epoch": 0.5284111473036555, + "grad_norm": 0.5671001672744751, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 730 + }, + { + "epoch": 0.5356496561708288, + "grad_norm": 0.7914412021636963, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 740 + }, + { + "epoch": 0.5428881650380022, + "grad_norm": 0.6172138452529907, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 750 + }, + { + "epoch": 0.5501266739051756, + "grad_norm": 0.6132623553276062, + "learning_rate": 0.0002, + "loss": 1.4018, + "step": 760 + }, + { + "epoch": 0.5573651827723489, + "grad_norm": 0.654000461101532, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 770 + }, + { + "epoch": 0.5646036916395223, + "grad_norm": 0.5691370964050293, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 780 + }, + { + "epoch": 0.5718422005066957, + "grad_norm": 0.7922580242156982, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 790 + }, + { + "epoch": 0.579080709373869, + "grad_norm": 0.6831880211830139, + "learning_rate": 0.0002, + "loss": 1.4521, + "step": 800 + }, + { + "epoch": 0.5863192182410424, + "grad_norm": 0.6740124821662903, + "learning_rate": 0.0002, + "loss": 1.4734, + "step": 810 + }, + { + "epoch": 0.5935577271082157, + "grad_norm": 1.380016803741455, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 820 + }, + { + "epoch": 0.6007962359753891, + "grad_norm": 0.6552878022193909, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 830 + }, + { + "epoch": 0.6080347448425625, + "grad_norm": 0.6649535298347473, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 840 + }, + { + "epoch": 0.6152732537097358, + "grad_norm": 0.561738133430481, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 850 + }, + { + "epoch": 0.6225117625769092, + "grad_norm": 0.6133047938346863, + "learning_rate": 0.0002, + "loss": 1.5364, + "step": 860 + }, + { + "epoch": 0.6297502714440825, + "grad_norm": 0.559843122959137, + "learning_rate": 0.0002, + "loss": 1.3489, + "step": 870 + }, + { + "epoch": 0.6369887803112558, + "grad_norm": 0.6117811799049377, + "learning_rate": 0.0002, + "loss": 1.4878, + "step": 880 + }, + { + "epoch": 0.6442272891784292, + "grad_norm": 0.6209776401519775, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 890 + }, + { + "epoch": 0.6514657980456026, + "grad_norm": 0.6234082579612732, + "learning_rate": 0.0002, + "loss": 1.6747, + "step": 900 + }, + { + "epoch": 0.6587043069127759, + "grad_norm": 0.7623258233070374, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 910 + }, + { + "epoch": 0.6659428157799493, + "grad_norm": 0.6148061752319336, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 920 + }, + { + "epoch": 0.6731813246471227, + "grad_norm": 0.6682973504066467, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 930 + }, + { + "epoch": 0.680419833514296, + "grad_norm": 0.5513041615486145, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 940 + }, + { + "epoch": 0.6876583423814694, + "grad_norm": 0.5197525024414062, + "learning_rate": 0.0002, + "loss": 1.3991, + "step": 950 + }, + { + "epoch": 0.6948968512486428, + "grad_norm": 0.6490758061408997, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 960 + }, + { + "epoch": 0.7021353601158161, + "grad_norm": 0.6450682878494263, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 970 + }, + { + "epoch": 0.7093738689829895, + "grad_norm": 0.6203766465187073, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 980 + }, + { + "epoch": 0.7166123778501629, + "grad_norm": 0.6023609638214111, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 990 + }, + { + "epoch": 0.7238508867173362, + "grad_norm": 0.5765255093574524, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1000 + }, + { + "epoch": 0.7310893955845096, + "grad_norm": 0.6650075316429138, + "learning_rate": 0.0002, + "loss": 1.483, + "step": 1010 + }, + { + "epoch": 0.738327904451683, + "grad_norm": 0.5610854029655457, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 1020 + }, + { + "epoch": 0.7455664133188563, + "grad_norm": 0.7072813510894775, + "learning_rate": 0.0002, + "loss": 1.5248, + "step": 1030 + }, + { + "epoch": 0.7528049221860297, + "grad_norm": 0.6815407872200012, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 1040 + }, + { + "epoch": 0.760043431053203, + "grad_norm": 0.7932390570640564, + "learning_rate": 0.0002, + "loss": 1.4577, + "step": 1050 + }, + { + "epoch": 0.7672819399203764, + "grad_norm": 0.5798183083534241, + "learning_rate": 0.0002, + "loss": 1.4515, + "step": 1060 + }, + { + "epoch": 0.7745204487875498, + "grad_norm": 0.7898504137992859, + "learning_rate": 0.0002, + "loss": 1.5053, + "step": 1070 + }, + { + "epoch": 0.7817589576547231, + "grad_norm": 0.4983280301094055, + "learning_rate": 0.0002, + "loss": 1.4776, + "step": 1080 + }, + { + "epoch": 0.7889974665218965, + "grad_norm": 0.691403329372406, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 1090 + }, + { + "epoch": 0.7962359753890699, + "grad_norm": 0.5394481420516968, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 1100 + }, + { + "epoch": 0.8034744842562432, + "grad_norm": 0.5136822462081909, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 1110 + }, + { + "epoch": 0.8107129931234166, + "grad_norm": 0.6828126907348633, + "learning_rate": 0.0002, + "loss": 1.4902, + "step": 1120 + }, + { + "epoch": 0.81795150199059, + "grad_norm": 0.6799656748771667, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 1130 + }, + { + "epoch": 0.8251900108577633, + "grad_norm": 0.5428406000137329, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 1140 + }, + { + "epoch": 0.8324285197249367, + "grad_norm": 0.4811290502548218, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1150 + }, + { + "epoch": 0.8396670285921101, + "grad_norm": 0.5519434809684753, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 1160 + }, + { + "epoch": 0.8469055374592834, + "grad_norm": 0.9748060703277588, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1170 + }, + { + "epoch": 0.8541440463264568, + "grad_norm": 0.712609589099884, + "learning_rate": 0.0002, + "loss": 1.4309, + "step": 1180 + }, + { + "epoch": 0.8613825551936302, + "grad_norm": 0.6866157054901123, + "learning_rate": 0.0002, + "loss": 1.434, + "step": 1190 + }, + { + "epoch": 0.8686210640608035, + "grad_norm": 0.5068854093551636, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1200 + }, + { + "epoch": 0.8758595729279768, + "grad_norm": 0.6333245038986206, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1210 + }, + { + "epoch": 0.8830980817951501, + "grad_norm": 0.6424421072006226, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 1220 + }, + { + "epoch": 0.8903365906623235, + "grad_norm": 0.4771921932697296, + "learning_rate": 0.0002, + "loss": 1.4186, + "step": 1230 + }, + { + "epoch": 0.8975750995294969, + "grad_norm": 0.5191764235496521, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1240 + }, + { + "epoch": 0.9048136083966702, + "grad_norm": 0.756222128868103, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 1250 + }, + { + "epoch": 0.9120521172638436, + "grad_norm": 0.623823881149292, + "learning_rate": 0.0002, + "loss": 1.4396, + "step": 1260 + }, + { + "epoch": 0.919290626131017, + "grad_norm": 0.8166571259498596, + "learning_rate": 0.0002, + "loss": 1.3097, + "step": 1270 + }, + { + "epoch": 0.9265291349981903, + "grad_norm": 0.6059346795082092, + "learning_rate": 0.0002, + "loss": 1.4625, + "step": 1280 + }, + { + "epoch": 0.9337676438653637, + "grad_norm": 0.5842690467834473, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 1290 + }, + { + "epoch": 0.9410061527325371, + "grad_norm": 0.7649800777435303, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1300 + }, + { + "epoch": 0.9482446615997104, + "grad_norm": 0.6420919895172119, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 1310 + }, + { + "epoch": 0.9554831704668838, + "grad_norm": 0.7011452913284302, + "learning_rate": 0.0002, + "loss": 1.453, + "step": 1320 + }, + { + "epoch": 0.9627216793340572, + "grad_norm": 0.5783746242523193, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1330 + }, + { + "epoch": 0.9699601882012305, + "grad_norm": 0.5973192453384399, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1340 + }, + { + "epoch": 0.9771986970684039, + "grad_norm": 0.6181833744049072, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 1350 + }, + { + "epoch": 0.9844372059355773, + "grad_norm": 0.5563396215438843, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 1360 + }, + { + "epoch": 0.9916757148027506, + "grad_norm": 0.45723360776901245, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1370 + }, + { + "epoch": 0.998914223669924, + "grad_norm": 0.5947498679161072, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 1380 + }, + { + "epoch": 0.9996380745566413, + "eval_loss": 1.480796456336975, + "eval_runtime": 27.3103, + "eval_samples_per_second": 15.965, + "eval_steps_per_second": 2.014, + "step": 1381 + }, + { + "epoch": 1.0061527325370974, + "grad_norm": 0.5599952936172485, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 1390 + }, + { + "epoch": 1.0133912414042707, + "grad_norm": 0.5932008028030396, + "learning_rate": 0.0002, + "loss": 1.4991, + "step": 1400 + }, + { + "epoch": 1.020629750271444, + "grad_norm": 0.6194121837615967, + "learning_rate": 0.0002, + "loss": 1.4506, + "step": 1410 + }, + { + "epoch": 1.0278682591386175, + "grad_norm": 0.6995621919631958, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 1420 + }, + { + "epoch": 1.0351067680057908, + "grad_norm": 0.7905810475349426, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1430 + }, + { + "epoch": 1.0423452768729642, + "grad_norm": 0.7221615314483643, + "learning_rate": 0.0002, + "loss": 1.4414, + "step": 1440 + }, + { + "epoch": 1.0495837857401376, + "grad_norm": 0.6170642375946045, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 1450 + }, + { + "epoch": 1.056822294607311, + "grad_norm": 0.5844094753265381, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 1460 + }, + { + "epoch": 1.0640608034744843, + "grad_norm": 0.7731822729110718, + "learning_rate": 0.0002, + "loss": 1.4871, + "step": 1470 + }, + { + "epoch": 1.0712993123416577, + "grad_norm": 0.4554748237133026, + "learning_rate": 0.0002, + "loss": 1.4286, + "step": 1480 + }, + { + "epoch": 1.078537821208831, + "grad_norm": 0.6923259496688843, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 1490 + }, + { + "epoch": 1.0857763300760044, + "grad_norm": 0.6008219122886658, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 1500 + }, + { + "epoch": 1.0930148389431777, + "grad_norm": 0.6450045704841614, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 1510 + }, + { + "epoch": 1.1002533478103511, + "grad_norm": 0.7833753824234009, + "learning_rate": 0.0002, + "loss": 1.3295, + "step": 1520 + }, + { + "epoch": 1.1074918566775245, + "grad_norm": 0.5076758861541748, + "learning_rate": 0.0002, + "loss": 1.3424, + "step": 1530 + }, + { + "epoch": 1.1147303655446978, + "grad_norm": 0.5661332011222839, + "learning_rate": 0.0002, + "loss": 1.4043, + "step": 1540 + }, + { + "epoch": 1.1219688744118712, + "grad_norm": 0.6526919603347778, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 1550 + }, + { + "epoch": 1.1292073832790446, + "grad_norm": 0.5613082647323608, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1560 + }, + { + "epoch": 1.136445892146218, + "grad_norm": 0.6113885641098022, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 1570 + }, + { + "epoch": 1.1436844010133913, + "grad_norm": 0.6732510328292847, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 1580 + }, + { + "epoch": 1.1509229098805647, + "grad_norm": 0.6146392226219177, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 1590 + }, + { + "epoch": 1.158161418747738, + "grad_norm": 0.6766974329948425, + "learning_rate": 0.0002, + "loss": 1.411, + "step": 1600 + }, + { + "epoch": 1.1653999276149114, + "grad_norm": 0.7621957659721375, + "learning_rate": 0.0002, + "loss": 1.2401, + "step": 1610 + }, + { + "epoch": 1.1726384364820848, + "grad_norm": 0.6959581971168518, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 1620 + }, + { + "epoch": 1.1798769453492581, + "grad_norm": 0.6691278219223022, + "learning_rate": 0.0002, + "loss": 1.382, + "step": 1630 + }, + { + "epoch": 1.1871154542164315, + "grad_norm": 0.4927774965763092, + "learning_rate": 0.0002, + "loss": 1.4147, + "step": 1640 + }, + { + "epoch": 1.1943539630836049, + "grad_norm": 0.7724234461784363, + "learning_rate": 0.0002, + "loss": 1.449, + "step": 1650 + }, + { + "epoch": 1.2015924719507782, + "grad_norm": 0.6817787885665894, + "learning_rate": 0.0002, + "loss": 1.4778, + "step": 1660 + }, + { + "epoch": 1.2088309808179516, + "grad_norm": 0.6500699520111084, + "learning_rate": 0.0002, + "loss": 1.3776, + "step": 1670 + }, + { + "epoch": 1.216069489685125, + "grad_norm": 0.5703568458557129, + "learning_rate": 0.0002, + "loss": 1.3875, + "step": 1680 + }, + { + "epoch": 1.2233079985522983, + "grad_norm": 0.6261579990386963, + "learning_rate": 0.0002, + "loss": 1.4735, + "step": 1690 + }, + { + "epoch": 1.2305465074194717, + "grad_norm": 0.651713490486145, + "learning_rate": 0.0002, + "loss": 1.3898, + "step": 1700 + }, + { + "epoch": 1.237785016286645, + "grad_norm": 0.684399425983429, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 1710 + }, + { + "epoch": 1.2450235251538184, + "grad_norm": 0.6996857523918152, + "learning_rate": 0.0002, + "loss": 1.5027, + "step": 1720 + }, + { + "epoch": 1.2522620340209918, + "grad_norm": 0.7102537751197815, + "learning_rate": 0.0002, + "loss": 1.3326, + "step": 1730 + }, + { + "epoch": 1.2595005428881652, + "grad_norm": 0.45809897780418396, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 1740 + }, + { + "epoch": 1.2667390517553385, + "grad_norm": 0.6377046704292297, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 1750 + }, + { + "epoch": 1.2739775606225119, + "grad_norm": 0.6965704560279846, + "learning_rate": 0.0002, + "loss": 1.3479, + "step": 1760 + }, + { + "epoch": 1.2812160694896852, + "grad_norm": 0.5688214302062988, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 1770 + }, + { + "epoch": 1.2884545783568586, + "grad_norm": 0.6384190320968628, + "learning_rate": 0.0002, + "loss": 1.3967, + "step": 1780 + }, + { + "epoch": 1.295693087224032, + "grad_norm": 0.5629363656044006, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1790 + }, + { + "epoch": 1.3029315960912053, + "grad_norm": 0.6148255467414856, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 1800 + }, + { + "epoch": 1.3101701049583787, + "grad_norm": 0.655580997467041, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 1810 + }, + { + "epoch": 1.3174086138255519, + "grad_norm": 0.5642657279968262, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 1820 + }, + { + "epoch": 1.3246471226927252, + "grad_norm": 0.59607994556427, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 1830 + }, + { + "epoch": 1.3318856315598986, + "grad_norm": 0.5564199090003967, + "learning_rate": 0.0002, + "loss": 1.3274, + "step": 1840 + }, + { + "epoch": 1.339124140427072, + "grad_norm": 0.6949955821037292, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1850 + }, + { + "epoch": 1.3463626492942453, + "grad_norm": 0.7036856412887573, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 1860 + }, + { + "epoch": 1.3536011581614187, + "grad_norm": 0.722062885761261, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 1870 + }, + { + "epoch": 1.360839667028592, + "grad_norm": 0.6098677515983582, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 1880 + }, + { + "epoch": 1.3680781758957654, + "grad_norm": 0.5376402735710144, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 1890 + }, + { + "epoch": 1.3753166847629388, + "grad_norm": 0.6974610090255737, + "learning_rate": 0.0002, + "loss": 1.5071, + "step": 1900 + }, + { + "epoch": 1.3825551936301121, + "grad_norm": 0.6520763635635376, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 1910 + }, + { + "epoch": 1.3897937024972855, + "grad_norm": 0.6604374647140503, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 1920 + }, + { + "epoch": 1.3970322113644589, + "grad_norm": 0.7364398241043091, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1930 + }, + { + "epoch": 1.4042707202316322, + "grad_norm": 0.6849475502967834, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 1940 + }, + { + "epoch": 1.4115092290988056, + "grad_norm": 0.6562670469284058, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 1950 + }, + { + "epoch": 1.418747737965979, + "grad_norm": 0.5695616006851196, + "learning_rate": 0.0002, + "loss": 1.4725, + "step": 1960 + }, + { + "epoch": 1.4259862468331523, + "grad_norm": 0.5244464874267578, + "learning_rate": 0.0002, + "loss": 1.3088, + "step": 1970 + }, + { + "epoch": 1.4332247557003257, + "grad_norm": 0.6347293257713318, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 1980 + }, + { + "epoch": 1.440463264567499, + "grad_norm": 0.5528361201286316, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 1990 + }, + { + "epoch": 1.4477017734346724, + "grad_norm": 0.6987585425376892, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2000 + }, + { + "epoch": 1.4549402823018458, + "grad_norm": 0.6568987369537354, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 2010 + }, + { + "epoch": 1.4621787911690192, + "grad_norm": 0.7665994763374329, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 2020 + }, + { + "epoch": 1.4694173000361925, + "grad_norm": 0.5127707123756409, + "learning_rate": 0.0002, + "loss": 1.244, + "step": 2030 + }, + { + "epoch": 1.476655808903366, + "grad_norm": 0.5406824946403503, + "learning_rate": 0.0002, + "loss": 1.3699, + "step": 2040 + }, + { + "epoch": 1.4838943177705393, + "grad_norm": 0.5990166664123535, + "learning_rate": 0.0002, + "loss": 1.3353, + "step": 2050 + }, + { + "epoch": 1.4911328266377126, + "grad_norm": 0.6186193823814392, + "learning_rate": 0.0002, + "loss": 1.2454, + "step": 2060 + }, + { + "epoch": 1.498371335504886, + "grad_norm": 0.6154307126998901, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2070 + }, + { + "epoch": 1.5056098443720594, + "grad_norm": 0.5606056451797485, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2080 + }, + { + "epoch": 1.5128483532392327, + "grad_norm": 0.5006417036056519, + "learning_rate": 0.0002, + "loss": 1.2405, + "step": 2090 + }, + { + "epoch": 1.520086862106406, + "grad_norm": 0.5968486070632935, + "learning_rate": 0.0002, + "loss": 1.4258, + "step": 2100 + }, + { + "epoch": 1.5273253709735795, + "grad_norm": 0.5835496187210083, + "learning_rate": 0.0002, + "loss": 1.2752, + "step": 2110 + }, + { + "epoch": 1.5345638798407528, + "grad_norm": 0.6753535270690918, + "learning_rate": 0.0002, + "loss": 1.5443, + "step": 2120 + }, + { + "epoch": 1.5418023887079262, + "grad_norm": 0.7299720644950867, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 2130 + }, + { + "epoch": 1.5490408975750996, + "grad_norm": 0.5105988383293152, + "learning_rate": 0.0002, + "loss": 1.2364, + "step": 2140 + }, + { + "epoch": 1.556279406442273, + "grad_norm": 0.5675431489944458, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2150 + }, + { + "epoch": 1.5635179153094463, + "grad_norm": 0.6246723532676697, + "learning_rate": 0.0002, + "loss": 1.4563, + "step": 2160 + }, + { + "epoch": 1.5707564241766196, + "grad_norm": 0.7291720509529114, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 2170 + }, + { + "epoch": 1.577994933043793, + "grad_norm": 0.678114116191864, + "learning_rate": 0.0002, + "loss": 1.5432, + "step": 2180 + }, + { + "epoch": 1.5852334419109664, + "grad_norm": 0.5136260986328125, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 2190 + }, + { + "epoch": 1.5924719507781397, + "grad_norm": 0.6359935998916626, + "learning_rate": 0.0002, + "loss": 1.3271, + "step": 2200 + }, + { + "epoch": 1.599710459645313, + "grad_norm": 0.7650278806686401, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 2210 + }, + { + "epoch": 1.6069489685124865, + "grad_norm": 0.7256110906600952, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 2220 + }, + { + "epoch": 1.6141874773796598, + "grad_norm": 0.688689649105072, + "learning_rate": 0.0002, + "loss": 1.4387, + "step": 2230 + }, + { + "epoch": 1.6214259862468332, + "grad_norm": 0.6045311093330383, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 2240 + }, + { + "epoch": 1.6286644951140063, + "grad_norm": 0.7064604163169861, + "learning_rate": 0.0002, + "loss": 1.4097, + "step": 2250 + }, + { + "epoch": 1.6359030039811797, + "grad_norm": 0.5309562087059021, + "learning_rate": 0.0002, + "loss": 1.3477, + "step": 2260 + }, + { + "epoch": 1.643141512848353, + "grad_norm": 0.5687053203582764, + "learning_rate": 0.0002, + "loss": 1.4022, + "step": 2270 + }, + { + "epoch": 1.6503800217155264, + "grad_norm": 0.535872757434845, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 2280 + }, + { + "epoch": 1.6576185305826998, + "grad_norm": 0.5502381920814514, + "learning_rate": 0.0002, + "loss": 1.3844, + "step": 2290 + }, + { + "epoch": 1.6648570394498732, + "grad_norm": 0.6158602237701416, + "learning_rate": 0.0002, + "loss": 1.3764, + "step": 2300 + }, + { + "epoch": 1.6720955483170465, + "grad_norm": 0.5804675817489624, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 2310 + }, + { + "epoch": 1.67933405718422, + "grad_norm": 0.600742757320404, + "learning_rate": 0.0002, + "loss": 1.2532, + "step": 2320 + }, + { + "epoch": 1.6865725660513933, + "grad_norm": 0.7101941108703613, + "learning_rate": 0.0002, + "loss": 1.477, + "step": 2330 + }, + { + "epoch": 1.6938110749185666, + "grad_norm": 0.7507809996604919, + "learning_rate": 0.0002, + "loss": 1.4849, + "step": 2340 + }, + { + "epoch": 1.70104958378574, + "grad_norm": 0.768502414226532, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 2350 + }, + { + "epoch": 1.7082880926529134, + "grad_norm": 0.4801851212978363, + "learning_rate": 0.0002, + "loss": 1.3332, + "step": 2360 + }, + { + "epoch": 1.7155266015200867, + "grad_norm": 0.5322122573852539, + "learning_rate": 0.0002, + "loss": 1.4158, + "step": 2370 + }, + { + "epoch": 1.72276511038726, + "grad_norm": 0.587661862373352, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2380 + }, + { + "epoch": 1.7300036192544335, + "grad_norm": 0.6073525547981262, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 2390 + }, + { + "epoch": 1.7372421281216068, + "grad_norm": 0.6950460076332092, + "learning_rate": 0.0002, + "loss": 1.2754, + "step": 2400 + }, + { + "epoch": 1.7444806369887802, + "grad_norm": 0.5981102585792542, + "learning_rate": 0.0002, + "loss": 1.3858, + "step": 2410 + }, + { + "epoch": 1.7517191458559536, + "grad_norm": 0.544570803642273, + "learning_rate": 0.0002, + "loss": 1.4075, + "step": 2420 + }, + { + "epoch": 1.758957654723127, + "grad_norm": 0.5304399728775024, + "learning_rate": 0.0002, + "loss": 1.3861, + "step": 2430 + }, + { + "epoch": 1.7661961635903003, + "grad_norm": 0.7921594977378845, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 2440 + }, + { + "epoch": 1.7734346724574737, + "grad_norm": 0.6084808707237244, + "learning_rate": 0.0002, + "loss": 1.3053, + "step": 2450 + }, + { + "epoch": 1.780673181324647, + "grad_norm": 0.8844701051712036, + "learning_rate": 0.0002, + "loss": 1.3781, + "step": 2460 + }, + { + "epoch": 1.7879116901918204, + "grad_norm": 0.5729258060455322, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 2470 + }, + { + "epoch": 1.7951501990589938, + "grad_norm": 0.6303611993789673, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 2480 + }, + { + "epoch": 1.8023887079261671, + "grad_norm": 0.5627942085266113, + "learning_rate": 0.0002, + "loss": 1.3926, + "step": 2490 + }, + { + "epoch": 1.8096272167933405, + "grad_norm": 0.6724274158477783, + "learning_rate": 0.0002, + "loss": 1.3816, + "step": 2500 + }, + { + "epoch": 1.8168657256605139, + "grad_norm": 0.5030826330184937, + "learning_rate": 0.0002, + "loss": 1.2951, + "step": 2510 + }, + { + "epoch": 1.8241042345276872, + "grad_norm": 0.5504099130630493, + "learning_rate": 0.0002, + "loss": 1.2839, + "step": 2520 + }, + { + "epoch": 1.8313427433948606, + "grad_norm": 0.6338945627212524, + "learning_rate": 0.0002, + "loss": 1.4264, + "step": 2530 + }, + { + "epoch": 1.838581252262034, + "grad_norm": 0.5902037620544434, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 2540 + }, + { + "epoch": 1.8458197611292073, + "grad_norm": 0.48814457654953003, + "learning_rate": 0.0002, + "loss": 1.2961, + "step": 2550 + }, + { + "epoch": 1.8530582699963807, + "grad_norm": 0.6216312646865845, + "learning_rate": 0.0002, + "loss": 1.466, + "step": 2560 + }, + { + "epoch": 1.860296778863554, + "grad_norm": 0.635603666305542, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 2570 + }, + { + "epoch": 1.8675352877307274, + "grad_norm": 0.6938216090202332, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2580 + }, + { + "epoch": 1.8747737965979008, + "grad_norm": 0.599557638168335, + "learning_rate": 0.0002, + "loss": 1.5011, + "step": 2590 + }, + { + "epoch": 1.8820123054650741, + "grad_norm": 0.564424455165863, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 2600 + }, + { + "epoch": 1.8892508143322475, + "grad_norm": 0.5430700182914734, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 2610 + }, + { + "epoch": 1.8964893231994209, + "grad_norm": 0.6150169372558594, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 2620 + }, + { + "epoch": 1.9037278320665942, + "grad_norm": 0.48159119486808777, + "learning_rate": 0.0002, + "loss": 1.2474, + "step": 2630 + }, + { + "epoch": 1.9109663409337676, + "grad_norm": 0.5608997941017151, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 2640 + }, + { + "epoch": 1.918204849800941, + "grad_norm": 0.6454501748085022, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 2650 + }, + { + "epoch": 1.9254433586681143, + "grad_norm": 0.5458073616027832, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 2660 + }, + { + "epoch": 1.9326818675352877, + "grad_norm": 0.5328490734100342, + "learning_rate": 0.0002, + "loss": 1.3208, + "step": 2670 + }, + { + "epoch": 1.939920376402461, + "grad_norm": 0.6444696187973022, + "learning_rate": 0.0002, + "loss": 1.4971, + "step": 2680 + }, + { + "epoch": 1.9471588852696344, + "grad_norm": 0.7126023769378662, + "learning_rate": 0.0002, + "loss": 1.5387, + "step": 2690 + }, + { + "epoch": 1.9543973941368078, + "grad_norm": 0.5164045095443726, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 2700 + }, + { + "epoch": 1.9616359030039812, + "grad_norm": 0.5347061157226562, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 2710 + }, + { + "epoch": 1.9688744118711545, + "grad_norm": 0.5297950506210327, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 2720 + }, + { + "epoch": 1.976112920738328, + "grad_norm": 0.6537790298461914, + "learning_rate": 0.0002, + "loss": 1.3566, + "step": 2730 + }, + { + "epoch": 1.9833514296055013, + "grad_norm": 0.5536222457885742, + "learning_rate": 0.0002, + "loss": 1.332, + "step": 2740 + }, + { + "epoch": 1.9905899384726746, + "grad_norm": 0.4856105446815491, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 2750 + }, + { + "epoch": 1.997828447339848, + "grad_norm": 0.6642730832099915, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 2760 + }, + { + "epoch": 2.0, + "eval_loss": 1.4366681575775146, + "eval_runtime": 27.3729, + "eval_samples_per_second": 15.928, + "eval_steps_per_second": 2.009, + "step": 2763 + }, + { + "epoch": 2.0050669562070214, + "grad_norm": 0.740253210067749, + "learning_rate": 0.0002, + "loss": 1.4322, + "step": 2770 + }, + { + "epoch": 2.0123054650741947, + "grad_norm": 0.5826276540756226, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 2780 + }, + { + "epoch": 2.019543973941368, + "grad_norm": 0.607356071472168, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 2790 + }, + { + "epoch": 2.0267824828085415, + "grad_norm": 0.5918063521385193, + "learning_rate": 0.0002, + "loss": 1.2601, + "step": 2800 + }, + { + "epoch": 2.034020991675715, + "grad_norm": 0.5610089898109436, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 2810 + }, + { + "epoch": 2.041259500542888, + "grad_norm": 0.5869926810264587, + "learning_rate": 0.0002, + "loss": 1.2092, + "step": 2820 + }, + { + "epoch": 2.0484980094100615, + "grad_norm": 0.5753467679023743, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 2830 + }, + { + "epoch": 2.055736518277235, + "grad_norm": 0.7096508145332336, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 2840 + }, + { + "epoch": 2.0629750271444083, + "grad_norm": 0.7653635144233704, + "learning_rate": 0.0002, + "loss": 1.1766, + "step": 2850 + }, + { + "epoch": 2.0702135360115816, + "grad_norm": 0.6202841997146606, + "learning_rate": 0.0002, + "loss": 1.2331, + "step": 2860 + }, + { + "epoch": 2.077452044878755, + "grad_norm": 0.6810227632522583, + "learning_rate": 0.0002, + "loss": 1.3298, + "step": 2870 + }, + { + "epoch": 2.0846905537459284, + "grad_norm": 0.7481493353843689, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 2880 + }, + { + "epoch": 2.0919290626131017, + "grad_norm": 0.7089637517929077, + "learning_rate": 0.0002, + "loss": 1.2484, + "step": 2890 + }, + { + "epoch": 2.099167571480275, + "grad_norm": 0.7472923398017883, + "learning_rate": 0.0002, + "loss": 1.3095, + "step": 2900 + }, + { + "epoch": 2.1064060803474485, + "grad_norm": 0.8135465979576111, + "learning_rate": 0.0002, + "loss": 1.304, + "step": 2910 + }, + { + "epoch": 2.113644589214622, + "grad_norm": 0.6097133159637451, + "learning_rate": 0.0002, + "loss": 1.273, + "step": 2920 + }, + { + "epoch": 2.120883098081795, + "grad_norm": 0.5970117449760437, + "learning_rate": 0.0002, + "loss": 1.3384, + "step": 2930 + }, + { + "epoch": 2.1281216069489686, + "grad_norm": 0.6169309616088867, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 2940 + }, + { + "epoch": 2.135360115816142, + "grad_norm": 0.9428738355636597, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 2950 + }, + { + "epoch": 2.1425986246833153, + "grad_norm": 0.5671679973602295, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 2960 + }, + { + "epoch": 2.1498371335504887, + "grad_norm": 0.7007262110710144, + "learning_rate": 0.0002, + "loss": 1.1375, + "step": 2970 + }, + { + "epoch": 2.157075642417662, + "grad_norm": 0.6294044256210327, + "learning_rate": 0.0002, + "loss": 1.2015, + "step": 2980 + }, + { + "epoch": 2.1643141512848354, + "grad_norm": 0.6105241775512695, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 2990 + }, + { + "epoch": 2.1715526601520088, + "grad_norm": 0.557124137878418, + "learning_rate": 0.0002, + "loss": 1.2065, + "step": 3000 + }, + { + "epoch": 2.178791169019182, + "grad_norm": 0.6250392198562622, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 3010 + }, + { + "epoch": 2.1860296778863555, + "grad_norm": 0.645218551158905, + "learning_rate": 0.0002, + "loss": 1.385, + "step": 3020 + }, + { + "epoch": 2.193268186753529, + "grad_norm": 0.9033605456352234, + "learning_rate": 0.0002, + "loss": 1.3928, + "step": 3030 + }, + { + "epoch": 2.2005066956207022, + "grad_norm": 0.5325747132301331, + "learning_rate": 0.0002, + "loss": 1.2458, + "step": 3040 + }, + { + "epoch": 2.2077452044878756, + "grad_norm": 0.6334700584411621, + "learning_rate": 0.0002, + "loss": 1.261, + "step": 3050 + }, + { + "epoch": 2.214983713355049, + "grad_norm": 0.5206325054168701, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 3060 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5987200140953064, + "learning_rate": 0.0002, + "loss": 1.3103, + "step": 3070 + }, + { + "epoch": 2.2294607310893957, + "grad_norm": 0.5893264412879944, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 3080 + }, + { + "epoch": 2.236699239956569, + "grad_norm": 0.6869237422943115, + "learning_rate": 0.0002, + "loss": 1.235, + "step": 3090 + }, + { + "epoch": 2.2439377488237424, + "grad_norm": 0.5040048360824585, + "learning_rate": 0.0002, + "loss": 1.3285, + "step": 3100 + }, + { + "epoch": 2.251176257690916, + "grad_norm": 0.6660613417625427, + "learning_rate": 0.0002, + "loss": 1.3316, + "step": 3110 + }, + { + "epoch": 2.258414766558089, + "grad_norm": 0.5890918970108032, + "learning_rate": 0.0002, + "loss": 1.3108, + "step": 3120 + }, + { + "epoch": 2.2656532754252625, + "grad_norm": 0.6458896994590759, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 3130 + }, + { + "epoch": 2.272891784292436, + "grad_norm": 0.6832690834999084, + "learning_rate": 0.0002, + "loss": 1.4151, + "step": 3140 + }, + { + "epoch": 2.2801302931596092, + "grad_norm": 0.833908200263977, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 3150 + }, + { + "epoch": 2.2873688020267826, + "grad_norm": 0.4596034586429596, + "learning_rate": 0.0002, + "loss": 1.2931, + "step": 3160 + }, + { + "epoch": 2.294607310893956, + "grad_norm": 0.9130966067314148, + "learning_rate": 0.0002, + "loss": 1.449, + "step": 3170 + }, + { + "epoch": 2.3018458197611293, + "grad_norm": 0.7143292427062988, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 3180 + }, + { + "epoch": 2.3090843286283027, + "grad_norm": 0.5388900637626648, + "learning_rate": 0.0002, + "loss": 1.2692, + "step": 3190 + }, + { + "epoch": 2.316322837495476, + "grad_norm": 0.5607513189315796, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 3200 + }, + { + "epoch": 2.3235613463626494, + "grad_norm": 0.6795142292976379, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 3210 + }, + { + "epoch": 2.330799855229823, + "grad_norm": 0.6561070680618286, + "learning_rate": 0.0002, + "loss": 1.3042, + "step": 3220 + }, + { + "epoch": 2.338038364096996, + "grad_norm": 0.8858118057250977, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 3230 + }, + { + "epoch": 2.3452768729641695, + "grad_norm": 0.6604151725769043, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3240 + }, + { + "epoch": 2.352515381831343, + "grad_norm": 0.6755785346031189, + "learning_rate": 0.0002, + "loss": 1.4004, + "step": 3250 + }, + { + "epoch": 2.3597538906985163, + "grad_norm": 0.6981677412986755, + "learning_rate": 0.0002, + "loss": 1.2503, + "step": 3260 + }, + { + "epoch": 2.3669923995656896, + "grad_norm": 0.6338568329811096, + "learning_rate": 0.0002, + "loss": 1.3078, + "step": 3270 + }, + { + "epoch": 2.374230908432863, + "grad_norm": 0.5754265785217285, + "learning_rate": 0.0002, + "loss": 1.285, + "step": 3280 + }, + { + "epoch": 2.3814694173000364, + "grad_norm": 0.7533153295516968, + "learning_rate": 0.0002, + "loss": 1.2924, + "step": 3290 + }, + { + "epoch": 2.3887079261672097, + "grad_norm": 0.675065279006958, + "learning_rate": 0.0002, + "loss": 1.3711, + "step": 3300 + }, + { + "epoch": 2.395946435034383, + "grad_norm": 0.5686452984809875, + "learning_rate": 0.0002, + "loss": 1.3548, + "step": 3310 + }, + { + "epoch": 2.4031849439015565, + "grad_norm": 0.8129481673240662, + "learning_rate": 0.0002, + "loss": 1.1998, + "step": 3320 + }, + { + "epoch": 2.41042345276873, + "grad_norm": 0.6615934371948242, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 3330 + }, + { + "epoch": 2.417661961635903, + "grad_norm": 0.6678834557533264, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 3340 + }, + { + "epoch": 2.4249004705030766, + "grad_norm": 0.5581308007240295, + "learning_rate": 0.0002, + "loss": 1.2381, + "step": 3350 + }, + { + "epoch": 2.43213897937025, + "grad_norm": 0.6098920106887817, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 3360 + }, + { + "epoch": 2.4393774882374233, + "grad_norm": 0.8101736903190613, + "learning_rate": 0.0002, + "loss": 1.3692, + "step": 3370 + }, + { + "epoch": 2.4466159971045967, + "grad_norm": 0.6621488928794861, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 3380 + }, + { + "epoch": 2.45385450597177, + "grad_norm": 0.8693289160728455, + "learning_rate": 0.0002, + "loss": 1.4579, + "step": 3390 + }, + { + "epoch": 2.4610930148389434, + "grad_norm": 0.6724580526351929, + "learning_rate": 0.0002, + "loss": 1.3644, + "step": 3400 + }, + { + "epoch": 2.4683315237061167, + "grad_norm": 0.6776891946792603, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 3410 + }, + { + "epoch": 2.47557003257329, + "grad_norm": 0.7214453816413879, + "learning_rate": 0.0002, + "loss": 1.2937, + "step": 3420 + }, + { + "epoch": 2.4828085414404635, + "grad_norm": 0.8390451073646545, + "learning_rate": 0.0002, + "loss": 1.4051, + "step": 3430 + }, + { + "epoch": 2.490047050307637, + "grad_norm": 0.7130982279777527, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 3440 + }, + { + "epoch": 2.49728555917481, + "grad_norm": 0.8873937129974365, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 3450 + }, + { + "epoch": 2.5045240680419836, + "grad_norm": 0.725185751914978, + "learning_rate": 0.0002, + "loss": 1.1429, + "step": 3460 + }, + { + "epoch": 2.511762576909157, + "grad_norm": 0.6120352149009705, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 3470 + }, + { + "epoch": 2.5190010857763303, + "grad_norm": 0.7713613510131836, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 3480 + }, + { + "epoch": 2.5262395946435037, + "grad_norm": 0.895309567451477, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 3490 + }, + { + "epoch": 2.533478103510677, + "grad_norm": 0.9631021022796631, + "learning_rate": 0.0002, + "loss": 1.3043, + "step": 3500 + }, + { + "epoch": 2.5407166123778504, + "grad_norm": 0.7475683093070984, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 3510 + }, + { + "epoch": 2.5479551212450238, + "grad_norm": 0.7271341681480408, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 3520 + }, + { + "epoch": 2.555193630112197, + "grad_norm": 0.6979510188102722, + "learning_rate": 0.0002, + "loss": 1.304, + "step": 3530 + }, + { + "epoch": 2.5624321389793705, + "grad_norm": 0.6504196524620056, + "learning_rate": 0.0002, + "loss": 1.2353, + "step": 3540 + }, + { + "epoch": 2.569670647846544, + "grad_norm": 0.7226675748825073, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 3550 + }, + { + "epoch": 2.5769091567137172, + "grad_norm": 0.6143222451210022, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 3560 + }, + { + "epoch": 2.5841476655808906, + "grad_norm": 0.7245154976844788, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 3570 + }, + { + "epoch": 2.591386174448064, + "grad_norm": 0.943540632724762, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 3580 + }, + { + "epoch": 2.5986246833152373, + "grad_norm": 0.7707241773605347, + "learning_rate": 0.0002, + "loss": 1.3034, + "step": 3590 + }, + { + "epoch": 2.6058631921824107, + "grad_norm": 0.6705001592636108, + "learning_rate": 0.0002, + "loss": 1.3063, + "step": 3600 + }, + { + "epoch": 2.613101701049584, + "grad_norm": 0.6360933780670166, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 3610 + }, + { + "epoch": 2.6203402099167574, + "grad_norm": 0.5846424698829651, + "learning_rate": 0.0002, + "loss": 1.1844, + "step": 3620 + }, + { + "epoch": 2.6275787187839303, + "grad_norm": 0.5958625674247742, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3630 + }, + { + "epoch": 2.6348172276511037, + "grad_norm": 0.6819243431091309, + "learning_rate": 0.0002, + "loss": 1.3599, + "step": 3640 + }, + { + "epoch": 2.642055736518277, + "grad_norm": 0.7033445835113525, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 3650 + }, + { + "epoch": 2.6492942453854504, + "grad_norm": 0.6134849786758423, + "learning_rate": 0.0002, + "loss": 1.3392, + "step": 3660 + }, + { + "epoch": 2.656532754252624, + "grad_norm": 0.658009946346283, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 3670 + }, + { + "epoch": 2.663771263119797, + "grad_norm": 0.6280999779701233, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 3680 + }, + { + "epoch": 2.6710097719869705, + "grad_norm": 0.5536085963249207, + "learning_rate": 0.0002, + "loss": 1.2995, + "step": 3690 + }, + { + "epoch": 2.678248280854144, + "grad_norm": 0.8603981733322144, + "learning_rate": 0.0002, + "loss": 1.2044, + "step": 3700 + }, + { + "epoch": 2.6854867897213173, + "grad_norm": 0.5509994626045227, + "learning_rate": 0.0002, + "loss": 1.3879, + "step": 3710 + }, + { + "epoch": 2.6927252985884906, + "grad_norm": 0.9093621969223022, + "learning_rate": 0.0002, + "loss": 1.3253, + "step": 3720 + }, + { + "epoch": 2.699963807455664, + "grad_norm": 0.7525952458381653, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 3730 + }, + { + "epoch": 2.7072023163228374, + "grad_norm": 0.6737023591995239, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 3740 + }, + { + "epoch": 2.7144408251900107, + "grad_norm": 0.8656924962997437, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 3750 + }, + { + "epoch": 2.721679334057184, + "grad_norm": 0.7494133114814758, + "learning_rate": 0.0002, + "loss": 1.2342, + "step": 3760 + }, + { + "epoch": 2.7289178429243575, + "grad_norm": 0.5725520849227905, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 3770 + }, + { + "epoch": 2.736156351791531, + "grad_norm": 0.836412787437439, + "learning_rate": 0.0002, + "loss": 1.28, + "step": 3780 + }, + { + "epoch": 2.743394860658704, + "grad_norm": 0.6893242597579956, + "learning_rate": 0.0002, + "loss": 1.3784, + "step": 3790 + }, + { + "epoch": 2.7506333695258776, + "grad_norm": 0.6696223020553589, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 3800 + }, + { + "epoch": 2.757871878393051, + "grad_norm": 0.6483015418052673, + "learning_rate": 0.0002, + "loss": 1.2449, + "step": 3810 + }, + { + "epoch": 2.7651103872602243, + "grad_norm": 0.8084456920623779, + "learning_rate": 0.0002, + "loss": 1.3282, + "step": 3820 + }, + { + "epoch": 2.7723488961273977, + "grad_norm": 0.6601949334144592, + "learning_rate": 0.0002, + "loss": 1.3694, + "step": 3830 + }, + { + "epoch": 2.779587404994571, + "grad_norm": 0.6905533671379089, + "learning_rate": 0.0002, + "loss": 1.3568, + "step": 3840 + }, + { + "epoch": 2.7868259138617444, + "grad_norm": 0.619318425655365, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 3850 + }, + { + "epoch": 2.7940644227289178, + "grad_norm": 0.5994023084640503, + "learning_rate": 0.0002, + "loss": 1.2551, + "step": 3860 + }, + { + "epoch": 2.801302931596091, + "grad_norm": 0.5627168416976929, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 3870 + }, + { + "epoch": 2.8085414404632645, + "grad_norm": 0.6001605987548828, + "learning_rate": 0.0002, + "loss": 1.3921, + "step": 3880 + }, + { + "epoch": 2.815779949330438, + "grad_norm": 0.6022412776947021, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 3890 + }, + { + "epoch": 2.823018458197611, + "grad_norm": 0.6832426190376282, + "learning_rate": 0.0002, + "loss": 1.2765, + "step": 3900 + }, + { + "epoch": 2.8302569670647846, + "grad_norm": 0.5936811566352844, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 3910 + }, + { + "epoch": 2.837495475931958, + "grad_norm": 0.6960572600364685, + "learning_rate": 0.0002, + "loss": 1.1707, + "step": 3920 + }, + { + "epoch": 2.8447339847991313, + "grad_norm": 0.5913406610488892, + "learning_rate": 0.0002, + "loss": 1.4063, + "step": 3930 + }, + { + "epoch": 2.8519724936663047, + "grad_norm": 0.678154706954956, + "learning_rate": 0.0002, + "loss": 1.3245, + "step": 3940 + }, + { + "epoch": 2.859211002533478, + "grad_norm": 0.7898936867713928, + "learning_rate": 0.0002, + "loss": 1.366, + "step": 3950 + }, + { + "epoch": 2.8664495114006514, + "grad_norm": 0.9234195351600647, + "learning_rate": 0.0002, + "loss": 1.3948, + "step": 3960 + }, + { + "epoch": 2.8736880202678248, + "grad_norm": 0.5960825085639954, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 3970 + }, + { + "epoch": 2.880926529134998, + "grad_norm": 0.677118182182312, + "learning_rate": 0.0002, + "loss": 1.3127, + "step": 3980 + }, + { + "epoch": 2.8881650380021715, + "grad_norm": 0.6505142450332642, + "learning_rate": 0.0002, + "loss": 1.2652, + "step": 3990 + }, + { + "epoch": 2.895403546869345, + "grad_norm": 0.550826907157898, + "learning_rate": 0.0002, + "loss": 1.2078, + "step": 4000 + }, + { + "epoch": 2.9026420557365182, + "grad_norm": 0.6209215521812439, + "learning_rate": 0.0002, + "loss": 1.1811, + "step": 4010 + }, + { + "epoch": 2.9098805646036916, + "grad_norm": 0.6549018025398254, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 4020 + }, + { + "epoch": 2.917119073470865, + "grad_norm": 0.570682168006897, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 4030 + }, + { + "epoch": 2.9243575823380383, + "grad_norm": 1.1807632446289062, + "learning_rate": 0.0002, + "loss": 1.0832, + "step": 4040 + }, + { + "epoch": 2.9315960912052117, + "grad_norm": 0.7058857679367065, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 4050 + }, + { + "epoch": 2.938834600072385, + "grad_norm": 0.5542812943458557, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 4060 + }, + { + "epoch": 2.9460731089395584, + "grad_norm": 0.63167804479599, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 4070 + }, + { + "epoch": 2.953311617806732, + "grad_norm": 0.5702962279319763, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 4080 + }, + { + "epoch": 2.960550126673905, + "grad_norm": 0.620944082736969, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 4090 + }, + { + "epoch": 2.9677886355410785, + "grad_norm": 0.5866289734840393, + "learning_rate": 0.0002, + "loss": 1.2864, + "step": 4100 + }, + { + "epoch": 2.975027144408252, + "grad_norm": 0.560170590877533, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 4110 + }, + { + "epoch": 2.9822656532754253, + "grad_norm": 0.675082802772522, + "learning_rate": 0.0002, + "loss": 1.2071, + "step": 4120 + }, + { + "epoch": 2.9895041621425986, + "grad_norm": 0.62708580493927, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 4130 + }, + { + "epoch": 2.996742671009772, + "grad_norm": 0.7893929481506348, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 4140 + }, + { + "epoch": 2.9996380745566413, + "eval_loss": 1.4217946529388428, + "eval_runtime": 27.1596, + "eval_samples_per_second": 16.053, + "eval_steps_per_second": 2.025, + "step": 4144 + }, + { + "epoch": 3.0039811798769454, + "grad_norm": 0.7043836116790771, + "learning_rate": 0.0002, + "loss": 1.2152, + "step": 4150 + }, + { + "epoch": 3.0112196887441187, + "grad_norm": 0.6806283593177795, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 4160 + }, + { + "epoch": 3.018458197611292, + "grad_norm": 0.7684550285339355, + "learning_rate": 0.0002, + "loss": 1.292, + "step": 4170 + }, + { + "epoch": 3.0256967064784654, + "grad_norm": 0.7895237803459167, + "learning_rate": 0.0002, + "loss": 1.3467, + "step": 4180 + }, + { + "epoch": 3.032935215345639, + "grad_norm": 0.7464531064033508, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 4190 + }, + { + "epoch": 3.040173724212812, + "grad_norm": 0.9358500838279724, + "learning_rate": 0.0002, + "loss": 1.1614, + "step": 4200 + }, + { + "epoch": 3.0474122330799855, + "grad_norm": 1.1066628694534302, + "learning_rate": 0.0002, + "loss": 1.1834, + "step": 4210 + }, + { + "epoch": 3.054650741947159, + "grad_norm": 0.6663267612457275, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 4220 + }, + { + "epoch": 3.0618892508143323, + "grad_norm": 0.6669464707374573, + "learning_rate": 0.0002, + "loss": 1.1707, + "step": 4230 + }, + { + "epoch": 3.0691277596815056, + "grad_norm": 0.7052164077758789, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 4240 + }, + { + "epoch": 3.076366268548679, + "grad_norm": 0.6118432879447937, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 4250 + }, + { + "epoch": 3.0836047774158524, + "grad_norm": 0.6915903687477112, + "learning_rate": 0.0002, + "loss": 1.1526, + "step": 4260 + }, + { + "epoch": 3.0908432862830257, + "grad_norm": 0.7441644668579102, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 4270 + }, + { + "epoch": 3.098081795150199, + "grad_norm": 0.823850691318512, + "learning_rate": 0.0002, + "loss": 1.1672, + "step": 4280 + }, + { + "epoch": 3.1053203040173725, + "grad_norm": 0.9677883386611938, + "learning_rate": 0.0002, + "loss": 1.2655, + "step": 4290 + }, + { + "epoch": 3.112558812884546, + "grad_norm": 0.7002579569816589, + "learning_rate": 0.0002, + "loss": 1.1794, + "step": 4300 + }, + { + "epoch": 3.119797321751719, + "grad_norm": 0.778789758682251, + "learning_rate": 0.0002, + "loss": 1.135, + "step": 4310 + }, + { + "epoch": 3.1270358306188926, + "grad_norm": 0.7236007452011108, + "learning_rate": 0.0002, + "loss": 1.0818, + "step": 4320 + }, + { + "epoch": 3.134274339486066, + "grad_norm": 0.8809133768081665, + "learning_rate": 0.0002, + "loss": 1.1803, + "step": 4330 + }, + { + "epoch": 3.1415128483532393, + "grad_norm": 0.7924913167953491, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 4340 + }, + { + "epoch": 3.1487513572204127, + "grad_norm": 0.7437422275543213, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 4350 + }, + { + "epoch": 3.155989866087586, + "grad_norm": 0.6428450345993042, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 4360 + }, + { + "epoch": 3.1632283749547594, + "grad_norm": 0.7922873497009277, + "learning_rate": 0.0002, + "loss": 1.3032, + "step": 4370 + }, + { + "epoch": 3.1704668838219328, + "grad_norm": 0.5252506732940674, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 4380 + }, + { + "epoch": 3.177705392689106, + "grad_norm": 0.8570457696914673, + "learning_rate": 0.0002, + "loss": 1.1297, + "step": 4390 + }, + { + "epoch": 3.1849439015562795, + "grad_norm": 0.7218987345695496, + "learning_rate": 0.0002, + "loss": 1.0994, + "step": 4400 + }, + { + "epoch": 3.192182410423453, + "grad_norm": 0.6921393275260925, + "learning_rate": 0.0002, + "loss": 1.2891, + "step": 4410 + }, + { + "epoch": 3.199420919290626, + "grad_norm": 0.7386137843132019, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 4420 + }, + { + "epoch": 3.2066594281577996, + "grad_norm": 0.6227759122848511, + "learning_rate": 0.0002, + "loss": 1.1654, + "step": 4430 + }, + { + "epoch": 3.213897937024973, + "grad_norm": 0.7180278897285461, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 4440 + }, + { + "epoch": 3.2211364458921463, + "grad_norm": 0.745830774307251, + "learning_rate": 0.0002, + "loss": 1.1757, + "step": 4450 + }, + { + "epoch": 3.2283749547593197, + "grad_norm": 0.6766072511672974, + "learning_rate": 0.0002, + "loss": 1.234, + "step": 4460 + }, + { + "epoch": 3.235613463626493, + "grad_norm": 0.8325067162513733, + "learning_rate": 0.0002, + "loss": 1.1999, + "step": 4470 + }, + { + "epoch": 3.2428519724936664, + "grad_norm": 0.7148305177688599, + "learning_rate": 0.0002, + "loss": 1.1606, + "step": 4480 + }, + { + "epoch": 3.25009048136084, + "grad_norm": 0.7752676010131836, + "learning_rate": 0.0002, + "loss": 1.1383, + "step": 4490 + }, + { + "epoch": 3.257328990228013, + "grad_norm": 0.6776860952377319, + "learning_rate": 0.0002, + "loss": 1.3006, + "step": 4500 + }, + { + "epoch": 3.2645674990951865, + "grad_norm": 0.704359769821167, + "learning_rate": 0.0002, + "loss": 1.0796, + "step": 4510 + }, + { + "epoch": 3.27180600796236, + "grad_norm": 0.6880282163619995, + "learning_rate": 0.0002, + "loss": 1.2496, + "step": 4520 + }, + { + "epoch": 3.2790445168295332, + "grad_norm": 0.8179270029067993, + "learning_rate": 0.0002, + "loss": 1.0947, + "step": 4530 + }, + { + "epoch": 3.2862830256967066, + "grad_norm": 0.6718448996543884, + "learning_rate": 0.0002, + "loss": 1.1909, + "step": 4540 + }, + { + "epoch": 3.29352153456388, + "grad_norm": 0.8300657868385315, + "learning_rate": 0.0002, + "loss": 1.2708, + "step": 4550 + }, + { + "epoch": 3.3007600434310533, + "grad_norm": 0.6433690786361694, + "learning_rate": 0.0002, + "loss": 1.2594, + "step": 4560 + }, + { + "epoch": 3.3079985522982267, + "grad_norm": 0.690262496471405, + "learning_rate": 0.0002, + "loss": 1.2479, + "step": 4570 + }, + { + "epoch": 3.3152370611654, + "grad_norm": 0.7022852301597595, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 4580 + }, + { + "epoch": 3.3224755700325734, + "grad_norm": 0.6438387632369995, + "learning_rate": 0.0002, + "loss": 1.0844, + "step": 4590 + }, + { + "epoch": 3.329714078899747, + "grad_norm": 0.6866899132728577, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 4600 + }, + { + "epoch": 3.33695258776692, + "grad_norm": 0.8233968019485474, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 4610 + }, + { + "epoch": 3.3441910966340935, + "grad_norm": 0.7251574993133545, + "learning_rate": 0.0002, + "loss": 1.1855, + "step": 4620 + }, + { + "epoch": 3.351429605501267, + "grad_norm": 0.7855110168457031, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 4630 + }, + { + "epoch": 3.3586681143684403, + "grad_norm": 0.8487356305122375, + "learning_rate": 0.0002, + "loss": 1.2922, + "step": 4640 + }, + { + "epoch": 3.3659066232356136, + "grad_norm": 0.6429011225700378, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 4650 + }, + { + "epoch": 3.373145132102787, + "grad_norm": 0.7095270156860352, + "learning_rate": 0.0002, + "loss": 1.129, + "step": 4660 + }, + { + "epoch": 3.3803836409699604, + "grad_norm": 0.6792303323745728, + "learning_rate": 0.0002, + "loss": 1.262, + "step": 4670 + }, + { + "epoch": 3.3876221498371337, + "grad_norm": 0.6784825921058655, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 4680 + }, + { + "epoch": 3.394860658704307, + "grad_norm": 0.6362888216972351, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 4690 + }, + { + "epoch": 3.4020991675714805, + "grad_norm": 0.7794778943061829, + "learning_rate": 0.0002, + "loss": 1.2165, + "step": 4700 + }, + { + "epoch": 3.409337676438654, + "grad_norm": 0.7287485003471375, + "learning_rate": 0.0002, + "loss": 1.0644, + "step": 4710 + }, + { + "epoch": 3.416576185305827, + "grad_norm": 0.6481451392173767, + "learning_rate": 0.0002, + "loss": 1.2925, + "step": 4720 + }, + { + "epoch": 3.4238146941730006, + "grad_norm": 0.9200371503829956, + "learning_rate": 0.0002, + "loss": 1.2121, + "step": 4730 + }, + { + "epoch": 3.431053203040174, + "grad_norm": 1.074180245399475, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 4740 + }, + { + "epoch": 3.438291711907347, + "grad_norm": 0.6722986698150635, + "learning_rate": 0.0002, + "loss": 1.0421, + "step": 4750 + }, + { + "epoch": 3.44553022077452, + "grad_norm": 0.7945933938026428, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 4760 + }, + { + "epoch": 3.4527687296416936, + "grad_norm": 0.7624640464782715, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 4770 + }, + { + "epoch": 3.460007238508867, + "grad_norm": 0.7763656377792358, + "learning_rate": 0.0002, + "loss": 1.2428, + "step": 4780 + }, + { + "epoch": 3.4672457473760403, + "grad_norm": 0.7736947536468506, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 4790 + }, + { + "epoch": 3.4744842562432137, + "grad_norm": 0.8450354933738708, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 4800 + }, + { + "epoch": 3.481722765110387, + "grad_norm": 0.6480133533477783, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 4810 + }, + { + "epoch": 3.4889612739775604, + "grad_norm": 0.8437445759773254, + "learning_rate": 0.0002, + "loss": 1.1882, + "step": 4820 + }, + { + "epoch": 3.4961997828447338, + "grad_norm": 0.7781730890274048, + "learning_rate": 0.0002, + "loss": 1.1519, + "step": 4830 + }, + { + "epoch": 3.503438291711907, + "grad_norm": 0.8523228168487549, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 4840 + }, + { + "epoch": 3.5106768005790805, + "grad_norm": 0.6236732006072998, + "learning_rate": 0.0002, + "loss": 1.1672, + "step": 4850 + }, + { + "epoch": 3.517915309446254, + "grad_norm": 0.7500787377357483, + "learning_rate": 0.0002, + "loss": 1.1926, + "step": 4860 + }, + { + "epoch": 3.5251538183134272, + "grad_norm": 0.7665374875068665, + "learning_rate": 0.0002, + "loss": 1.1998, + "step": 4870 + }, + { + "epoch": 3.5323923271806006, + "grad_norm": 0.787857711315155, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 4880 + }, + { + "epoch": 3.539630836047774, + "grad_norm": 0.970595121383667, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 4890 + }, + { + "epoch": 3.5468693449149473, + "grad_norm": 0.6409347057342529, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 4900 + }, + { + "epoch": 3.5541078537821207, + "grad_norm": 0.888551652431488, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 4910 + }, + { + "epoch": 3.561346362649294, + "grad_norm": 1.0808377265930176, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 4920 + }, + { + "epoch": 3.5685848715164674, + "grad_norm": 0.7501053214073181, + "learning_rate": 0.0002, + "loss": 1.2564, + "step": 4930 + }, + { + "epoch": 3.575823380383641, + "grad_norm": 0.7375240325927734, + "learning_rate": 0.0002, + "loss": 1.2351, + "step": 4940 + }, + { + "epoch": 3.583061889250814, + "grad_norm": 0.7075039744377136, + "learning_rate": 0.0002, + "loss": 1.3568, + "step": 4950 + }, + { + "epoch": 3.5903003981179875, + "grad_norm": 0.939337432384491, + "learning_rate": 0.0002, + "loss": 1.3355, + "step": 4960 + }, + { + "epoch": 3.597538906985161, + "grad_norm": 0.6717396974563599, + "learning_rate": 0.0002, + "loss": 1.1722, + "step": 4970 + }, + { + "epoch": 3.6047774158523342, + "grad_norm": 0.7141643762588501, + "learning_rate": 0.0002, + "loss": 1.1186, + "step": 4980 + }, + { + "epoch": 3.6120159247195076, + "grad_norm": 0.7109216451644897, + "learning_rate": 0.0002, + "loss": 1.1011, + "step": 4990 + }, + { + "epoch": 3.619254433586681, + "grad_norm": 0.7020776867866516, + "learning_rate": 0.0002, + "loss": 1.2178, + "step": 5000 + }, + { + "epoch": 3.6264929424538543, + "grad_norm": 0.7158873677253723, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 5010 + }, + { + "epoch": 3.6337314513210277, + "grad_norm": 0.7062035202980042, + "learning_rate": 0.0002, + "loss": 1.2624, + "step": 5020 + }, + { + "epoch": 3.640969960188201, + "grad_norm": 0.7081155776977539, + "learning_rate": 0.0002, + "loss": 1.0224, + "step": 5030 + }, + { + "epoch": 3.6482084690553744, + "grad_norm": 1.2210607528686523, + "learning_rate": 0.0002, + "loss": 1.2195, + "step": 5040 + }, + { + "epoch": 3.655446977922548, + "grad_norm": 0.6650236248970032, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 5050 + }, + { + "epoch": 3.662685486789721, + "grad_norm": 0.6884829998016357, + "learning_rate": 0.0002, + "loss": 1.1072, + "step": 5060 + }, + { + "epoch": 3.6699239956568945, + "grad_norm": 0.7317819595336914, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 5070 + }, + { + "epoch": 3.677162504524068, + "grad_norm": 0.7406691908836365, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 5080 + }, + { + "epoch": 3.6844010133912413, + "grad_norm": 0.9009454250335693, + "learning_rate": 0.0002, + "loss": 1.2949, + "step": 5090 + }, + { + "epoch": 3.6916395222584146, + "grad_norm": 0.8189385533332825, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 5100 + }, + { + "epoch": 3.698878031125588, + "grad_norm": 1.0793628692626953, + "learning_rate": 0.0002, + "loss": 1.3408, + "step": 5110 + }, + { + "epoch": 3.7061165399927614, + "grad_norm": 0.8593027591705322, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 5120 + }, + { + "epoch": 3.7133550488599347, + "grad_norm": 0.8481812477111816, + "learning_rate": 0.0002, + "loss": 1.2141, + "step": 5130 + }, + { + "epoch": 3.720593557727108, + "grad_norm": 0.6527451276779175, + "learning_rate": 0.0002, + "loss": 1.125, + "step": 5140 + }, + { + "epoch": 3.7278320665942815, + "grad_norm": 0.9220114350318909, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 5150 + }, + { + "epoch": 3.735070575461455, + "grad_norm": 1.0842019319534302, + "learning_rate": 0.0002, + "loss": 1.2267, + "step": 5160 + }, + { + "epoch": 3.742309084328628, + "grad_norm": 0.965453565120697, + "learning_rate": 0.0002, + "loss": 1.3083, + "step": 5170 + }, + { + "epoch": 3.7495475931958016, + "grad_norm": 0.9903319478034973, + "learning_rate": 0.0002, + "loss": 1.1772, + "step": 5180 + }, + { + "epoch": 3.756786102062975, + "grad_norm": 0.7434818148612976, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 5190 + }, + { + "epoch": 3.7640246109301483, + "grad_norm": 0.6717280745506287, + "learning_rate": 0.0002, + "loss": 1.2631, + "step": 5200 + }, + { + "epoch": 3.7712631197973217, + "grad_norm": 0.7754665613174438, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 5210 + }, + { + "epoch": 3.778501628664495, + "grad_norm": 1.028374433517456, + "learning_rate": 0.0002, + "loss": 1.305, + "step": 5220 + }, + { + "epoch": 3.7857401375316684, + "grad_norm": 0.6026996374130249, + "learning_rate": 0.0002, + "loss": 1.1866, + "step": 5230 + }, + { + "epoch": 3.7929786463988417, + "grad_norm": 0.6978490948677063, + "learning_rate": 0.0002, + "loss": 1.1901, + "step": 5240 + }, + { + "epoch": 3.800217155266015, + "grad_norm": 0.7303446531295776, + "learning_rate": 0.0002, + "loss": 1.2576, + "step": 5250 + }, + { + "epoch": 3.8074556641331885, + "grad_norm": 1.0734210014343262, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 5260 + }, + { + "epoch": 3.814694173000362, + "grad_norm": 0.6383201479911804, + "learning_rate": 0.0002, + "loss": 1.1137, + "step": 5270 + }, + { + "epoch": 3.821932681867535, + "grad_norm": 0.7742630243301392, + "learning_rate": 0.0002, + "loss": 1.0904, + "step": 5280 + }, + { + "epoch": 3.8291711907347086, + "grad_norm": 0.8477074503898621, + "learning_rate": 0.0002, + "loss": 1.2232, + "step": 5290 + }, + { + "epoch": 3.836409699601882, + "grad_norm": 0.6675317883491516, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 5300 + }, + { + "epoch": 3.8436482084690553, + "grad_norm": 0.7515445351600647, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 5310 + }, + { + "epoch": 3.8508867173362287, + "grad_norm": 1.1441220045089722, + "learning_rate": 0.0002, + "loss": 1.2569, + "step": 5320 + }, + { + "epoch": 3.858125226203402, + "grad_norm": 0.7968795895576477, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 5330 + }, + { + "epoch": 3.8653637350705754, + "grad_norm": 0.7842824459075928, + "learning_rate": 0.0002, + "loss": 1.232, + "step": 5340 + }, + { + "epoch": 3.8726022439377488, + "grad_norm": 0.8272225260734558, + "learning_rate": 0.0002, + "loss": 1.1847, + "step": 5350 + }, + { + "epoch": 3.879840752804922, + "grad_norm": 0.8413397669792175, + "learning_rate": 0.0002, + "loss": 1.1381, + "step": 5360 + }, + { + "epoch": 3.8870792616720955, + "grad_norm": 1.141764760017395, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 5370 + }, + { + "epoch": 3.894317770539269, + "grad_norm": 0.9826975464820862, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 5380 + }, + { + "epoch": 3.9015562794064422, + "grad_norm": 0.8598255515098572, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 5390 + }, + { + "epoch": 3.9087947882736156, + "grad_norm": 0.6271058320999146, + "learning_rate": 0.0002, + "loss": 1.1247, + "step": 5400 + }, + { + "epoch": 3.916033297140789, + "grad_norm": 0.6379870772361755, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 5410 + }, + { + "epoch": 3.9232718060079623, + "grad_norm": 1.0313376188278198, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 5420 + }, + { + "epoch": 3.9305103148751357, + "grad_norm": 0.8220619559288025, + "learning_rate": 0.0002, + "loss": 1.1872, + "step": 5430 + }, + { + "epoch": 3.937748823742309, + "grad_norm": 0.7576116919517517, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 5440 + }, + { + "epoch": 3.9449873326094824, + "grad_norm": 1.226235032081604, + "learning_rate": 0.0002, + "loss": 1.1969, + "step": 5450 + }, + { + "epoch": 3.952225841476656, + "grad_norm": 0.7979229688644409, + "learning_rate": 0.0002, + "loss": 1.2945, + "step": 5460 + }, + { + "epoch": 3.959464350343829, + "grad_norm": 0.9911929965019226, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 5470 + }, + { + "epoch": 3.9667028592110025, + "grad_norm": 0.643738865852356, + "learning_rate": 0.0002, + "loss": 1.0924, + "step": 5480 + }, + { + "epoch": 3.973941368078176, + "grad_norm": 0.682305634021759, + "learning_rate": 0.0002, + "loss": 1.0607, + "step": 5490 + }, + { + "epoch": 3.9811798769453492, + "grad_norm": 1.18373441696167, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 5500 + }, + { + "epoch": 3.9884183858125226, + "grad_norm": 0.7190203070640564, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 5510 + }, + { + "epoch": 3.995656894679696, + "grad_norm": 0.7516948580741882, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 5520 + }, + { + "epoch": 4.0, + "eval_loss": 1.4252897500991821, + "eval_runtime": 27.235, + "eval_samples_per_second": 16.009, + "eval_steps_per_second": 2.019, + "step": 5526 + }, + { + "epoch": 4.002895403546869, + "grad_norm": 0.6353074312210083, + "learning_rate": 0.0002, + "loss": 1.0088, + "step": 5530 + }, + { + "epoch": 4.010133912414043, + "grad_norm": 0.7424906492233276, + "learning_rate": 0.0002, + "loss": 1.0326, + "step": 5540 + }, + { + "epoch": 4.017372421281216, + "grad_norm": 0.8856638073921204, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 5550 + }, + { + "epoch": 4.024610930148389, + "grad_norm": 0.9627974033355713, + "learning_rate": 0.0002, + "loss": 1.0905, + "step": 5560 + }, + { + "epoch": 4.031849439015563, + "grad_norm": 0.9048978686332703, + "learning_rate": 0.0002, + "loss": 1.0965, + "step": 5570 + }, + { + "epoch": 4.039087947882736, + "grad_norm": 0.921119213104248, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 5580 + }, + { + "epoch": 4.0463264567499095, + "grad_norm": 0.8654361963272095, + "learning_rate": 0.0002, + "loss": 1.1235, + "step": 5590 + }, + { + "epoch": 4.053564965617083, + "grad_norm": 0.7947945594787598, + "learning_rate": 0.0002, + "loss": 1.0794, + "step": 5600 + }, + { + "epoch": 4.060803474484256, + "grad_norm": 0.8307326436042786, + "learning_rate": 0.0002, + "loss": 1.0674, + "step": 5610 + }, + { + "epoch": 4.06804198335143, + "grad_norm": 0.793273389339447, + "learning_rate": 0.0002, + "loss": 1.0076, + "step": 5620 + }, + { + "epoch": 4.075280492218603, + "grad_norm": 0.8748673796653748, + "learning_rate": 0.0002, + "loss": 1.0651, + "step": 5630 + }, + { + "epoch": 4.082519001085776, + "grad_norm": 0.7926856279373169, + "learning_rate": 0.0002, + "loss": 1.111, + "step": 5640 + }, + { + "epoch": 4.08975750995295, + "grad_norm": 0.922645092010498, + "learning_rate": 0.0002, + "loss": 1.044, + "step": 5650 + }, + { + "epoch": 4.096996018820123, + "grad_norm": 0.9539641737937927, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 5660 + }, + { + "epoch": 4.1042345276872965, + "grad_norm": 0.8674443364143372, + "learning_rate": 0.0002, + "loss": 1.0788, + "step": 5670 + }, + { + "epoch": 4.11147303655447, + "grad_norm": 0.7097609043121338, + "learning_rate": 0.0002, + "loss": 0.9867, + "step": 5680 + }, + { + "epoch": 4.118711545421643, + "grad_norm": 0.8875522613525391, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 5690 + }, + { + "epoch": 4.125950054288817, + "grad_norm": 0.8583634495735168, + "learning_rate": 0.0002, + "loss": 1.1217, + "step": 5700 + }, + { + "epoch": 4.13318856315599, + "grad_norm": 0.6736377477645874, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 5710 + }, + { + "epoch": 4.140427072023163, + "grad_norm": 0.9349062442779541, + "learning_rate": 0.0002, + "loss": 1.1199, + "step": 5720 + }, + { + "epoch": 4.147665580890337, + "grad_norm": 1.0610365867614746, + "learning_rate": 0.0002, + "loss": 1.0508, + "step": 5730 + }, + { + "epoch": 4.15490408975751, + "grad_norm": 1.5838189125061035, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 5740 + }, + { + "epoch": 4.162142598624683, + "grad_norm": 0.747522234916687, + "learning_rate": 0.0002, + "loss": 1.0222, + "step": 5750 + }, + { + "epoch": 4.169381107491857, + "grad_norm": 1.3247915506362915, + "learning_rate": 0.0002, + "loss": 1.1328, + "step": 5760 + }, + { + "epoch": 4.17661961635903, + "grad_norm": 0.8750247955322266, + "learning_rate": 0.0002, + "loss": 1.1655, + "step": 5770 + }, + { + "epoch": 4.1838581252262035, + "grad_norm": 0.7914144992828369, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 5780 + }, + { + "epoch": 4.191096634093377, + "grad_norm": 0.9493299126625061, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 5790 + }, + { + "epoch": 4.19833514296055, + "grad_norm": 0.7802295088768005, + "learning_rate": 0.0002, + "loss": 1.1515, + "step": 5800 + }, + { + "epoch": 4.205573651827724, + "grad_norm": 0.6987314820289612, + "learning_rate": 0.0002, + "loss": 1.0704, + "step": 5810 + }, + { + "epoch": 4.212812160694897, + "grad_norm": 0.9220341444015503, + "learning_rate": 0.0002, + "loss": 1.1699, + "step": 5820 + }, + { + "epoch": 4.22005066956207, + "grad_norm": 0.8932939767837524, + "learning_rate": 0.0002, + "loss": 1.1394, + "step": 5830 + }, + { + "epoch": 4.227289178429244, + "grad_norm": 0.920002818107605, + "learning_rate": 0.0002, + "loss": 1.0048, + "step": 5840 + }, + { + "epoch": 4.234527687296417, + "grad_norm": 0.6662752032279968, + "learning_rate": 0.0002, + "loss": 0.964, + "step": 5850 + }, + { + "epoch": 4.24176619616359, + "grad_norm": 0.8679718971252441, + "learning_rate": 0.0002, + "loss": 0.986, + "step": 5860 + }, + { + "epoch": 4.249004705030764, + "grad_norm": 0.7020887732505798, + "learning_rate": 0.0002, + "loss": 0.8991, + "step": 5870 + }, + { + "epoch": 4.256243213897937, + "grad_norm": 0.869611382484436, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 5880 + }, + { + "epoch": 4.2634817227651105, + "grad_norm": 0.7796585559844971, + "learning_rate": 0.0002, + "loss": 1.1026, + "step": 5890 + }, + { + "epoch": 4.270720231632284, + "grad_norm": 0.8978819251060486, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 5900 + }, + { + "epoch": 4.277958740499457, + "grad_norm": 1.0837205648422241, + "learning_rate": 0.0002, + "loss": 1.1325, + "step": 5910 + }, + { + "epoch": 4.285197249366631, + "grad_norm": 0.7584353089332581, + "learning_rate": 0.0002, + "loss": 1.1279, + "step": 5920 + }, + { + "epoch": 4.292435758233804, + "grad_norm": 0.7313185334205627, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 5930 + }, + { + "epoch": 4.299674267100977, + "grad_norm": 0.8004671335220337, + "learning_rate": 0.0002, + "loss": 1.1101, + "step": 5940 + }, + { + "epoch": 4.306912775968151, + "grad_norm": 2.154958724975586, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 5950 + }, + { + "epoch": 4.314151284835324, + "grad_norm": 0.9163479804992676, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 5960 + }, + { + "epoch": 4.321389793702497, + "grad_norm": 0.9151589274406433, + "learning_rate": 0.0002, + "loss": 0.9941, + "step": 5970 + }, + { + "epoch": 4.328628302569671, + "grad_norm": 0.8624112010002136, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 5980 + }, + { + "epoch": 4.335866811436844, + "grad_norm": 0.9357741475105286, + "learning_rate": 0.0002, + "loss": 1.1625, + "step": 5990 + }, + { + "epoch": 4.3431053203040175, + "grad_norm": 1.3482335805892944, + "learning_rate": 0.0002, + "loss": 1.0712, + "step": 6000 + }, + { + "epoch": 4.350343829171191, + "grad_norm": 0.7156149744987488, + "learning_rate": 0.0002, + "loss": 1.1224, + "step": 6010 + }, + { + "epoch": 4.357582338038364, + "grad_norm": 0.8480049967765808, + "learning_rate": 0.0002, + "loss": 1.0753, + "step": 6020 + }, + { + "epoch": 4.364820846905538, + "grad_norm": 0.8262244462966919, + "learning_rate": 0.0002, + "loss": 1.051, + "step": 6030 + }, + { + "epoch": 4.372059355772711, + "grad_norm": 0.7733905911445618, + "learning_rate": 0.0002, + "loss": 0.9966, + "step": 6040 + }, + { + "epoch": 4.379297864639884, + "grad_norm": 0.8553919792175293, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 6050 + }, + { + "epoch": 4.386536373507058, + "grad_norm": 0.8666832447052002, + "learning_rate": 0.0002, + "loss": 1.1777, + "step": 6060 + }, + { + "epoch": 4.393774882374231, + "grad_norm": 0.9168295860290527, + "learning_rate": 0.0002, + "loss": 1.1934, + "step": 6070 + }, + { + "epoch": 4.4010133912414044, + "grad_norm": 0.7315238118171692, + "learning_rate": 0.0002, + "loss": 1.0988, + "step": 6080 + }, + { + "epoch": 4.408251900108578, + "grad_norm": 1.020263433456421, + "learning_rate": 0.0002, + "loss": 1.1599, + "step": 6090 + }, + { + "epoch": 4.415490408975751, + "grad_norm": 0.9978243708610535, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 6100 + }, + { + "epoch": 4.4227289178429245, + "grad_norm": 0.995453953742981, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 6110 + }, + { + "epoch": 4.429967426710098, + "grad_norm": 0.9360884428024292, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 6120 + }, + { + "epoch": 4.437205935577271, + "grad_norm": 0.8099448084831238, + "learning_rate": 0.0002, + "loss": 0.9506, + "step": 6130 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.8173841238021851, + "learning_rate": 0.0002, + "loss": 1.0887, + "step": 6140 + }, + { + "epoch": 4.451682953311618, + "grad_norm": 0.7972666025161743, + "learning_rate": 0.0002, + "loss": 1.1219, + "step": 6150 + }, + { + "epoch": 4.458921462178791, + "grad_norm": 0.7685779333114624, + "learning_rate": 0.0002, + "loss": 1.0226, + "step": 6160 + }, + { + "epoch": 4.466159971045965, + "grad_norm": 0.7872623801231384, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 6170 + }, + { + "epoch": 4.473398479913138, + "grad_norm": 0.7677070498466492, + "learning_rate": 0.0002, + "loss": 0.9911, + "step": 6180 + }, + { + "epoch": 4.4806369887803115, + "grad_norm": 0.7878316044807434, + "learning_rate": 0.0002, + "loss": 1.0919, + "step": 6190 + }, + { + "epoch": 4.487875497647485, + "grad_norm": 0.8178079724311829, + "learning_rate": 0.0002, + "loss": 1.018, + "step": 6200 + }, + { + "epoch": 4.495114006514658, + "grad_norm": 1.2820082902908325, + "learning_rate": 0.0002, + "loss": 1.0517, + "step": 6210 + }, + { + "epoch": 4.502352515381832, + "grad_norm": 0.9380832314491272, + "learning_rate": 0.0002, + "loss": 1.3101, + "step": 6220 + }, + { + "epoch": 4.509591024249005, + "grad_norm": 0.7810422778129578, + "learning_rate": 0.0002, + "loss": 0.9818, + "step": 6230 + }, + { + "epoch": 4.516829533116178, + "grad_norm": 1.1022917032241821, + "learning_rate": 0.0002, + "loss": 1.1677, + "step": 6240 + }, + { + "epoch": 4.524068041983352, + "grad_norm": 1.4275553226470947, + "learning_rate": 0.0002, + "loss": 1.1579, + "step": 6250 + }, + { + "epoch": 4.531306550850525, + "grad_norm": 0.7597777247428894, + "learning_rate": 0.0002, + "loss": 1.3237, + "step": 6260 + }, + { + "epoch": 4.538545059717698, + "grad_norm": 1.10992431640625, + "learning_rate": 0.0002, + "loss": 1.1529, + "step": 6270 + }, + { + "epoch": 4.545783568584872, + "grad_norm": 0.8981178998947144, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 6280 + }, + { + "epoch": 4.553022077452045, + "grad_norm": 0.7863979339599609, + "learning_rate": 0.0002, + "loss": 1.086, + "step": 6290 + }, + { + "epoch": 4.5602605863192185, + "grad_norm": 0.9071474671363831, + "learning_rate": 0.0002, + "loss": 1.2008, + "step": 6300 + }, + { + "epoch": 4.567499095186392, + "grad_norm": 0.7429424524307251, + "learning_rate": 0.0002, + "loss": 1.0916, + "step": 6310 + }, + { + "epoch": 4.574737604053565, + "grad_norm": 1.0767850875854492, + "learning_rate": 0.0002, + "loss": 1.095, + "step": 6320 + }, + { + "epoch": 4.581976112920739, + "grad_norm": 0.7885915637016296, + "learning_rate": 0.0002, + "loss": 1.1023, + "step": 6330 + }, + { + "epoch": 4.589214621787912, + "grad_norm": 0.8350457549095154, + "learning_rate": 0.0002, + "loss": 1.1131, + "step": 6340 + }, + { + "epoch": 4.596453130655085, + "grad_norm": 0.7853530645370483, + "learning_rate": 0.0002, + "loss": 1.0743, + "step": 6350 + }, + { + "epoch": 4.603691639522259, + "grad_norm": 1.1220661401748657, + "learning_rate": 0.0002, + "loss": 1.1912, + "step": 6360 + }, + { + "epoch": 4.610930148389432, + "grad_norm": 0.7959423065185547, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 6370 + }, + { + "epoch": 4.618168657256605, + "grad_norm": 0.7782652378082275, + "learning_rate": 0.0002, + "loss": 1.1542, + "step": 6380 + }, + { + "epoch": 4.625407166123779, + "grad_norm": 0.7882203459739685, + "learning_rate": 0.0002, + "loss": 1.0753, + "step": 6390 + }, + { + "epoch": 4.632645674990952, + "grad_norm": 0.8841899037361145, + "learning_rate": 0.0002, + "loss": 1.0676, + "step": 6400 + }, + { + "epoch": 4.6398841838581255, + "grad_norm": 0.7936127781867981, + "learning_rate": 0.0002, + "loss": 1.0815, + "step": 6410 + }, + { + "epoch": 4.647122692725299, + "grad_norm": 0.9213966131210327, + "learning_rate": 0.0002, + "loss": 1.0198, + "step": 6420 + }, + { + "epoch": 4.654361201592472, + "grad_norm": 0.9246473908424377, + "learning_rate": 0.0002, + "loss": 0.9872, + "step": 6430 + }, + { + "epoch": 4.661599710459646, + "grad_norm": 0.766572892665863, + "learning_rate": 0.0002, + "loss": 1.1309, + "step": 6440 + }, + { + "epoch": 4.668838219326819, + "grad_norm": 0.8596171736717224, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 6450 + }, + { + "epoch": 4.676076728193992, + "grad_norm": 0.8482751846313477, + "learning_rate": 0.0002, + "loss": 1.1869, + "step": 6460 + }, + { + "epoch": 4.683315237061166, + "grad_norm": 1.0826905965805054, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 6470 + }, + { + "epoch": 4.690553745928339, + "grad_norm": 1.1048457622528076, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 6480 + }, + { + "epoch": 4.697792254795512, + "grad_norm": 0.9429134726524353, + "learning_rate": 0.0002, + "loss": 1.0514, + "step": 6490 + }, + { + "epoch": 4.705030763662686, + "grad_norm": 0.8587502837181091, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 6500 + }, + { + "epoch": 4.712269272529859, + "grad_norm": 1.0387083292007446, + "learning_rate": 0.0002, + "loss": 1.0969, + "step": 6510 + }, + { + "epoch": 4.7195077813970325, + "grad_norm": 0.7471951842308044, + "learning_rate": 0.0002, + "loss": 1.0493, + "step": 6520 + }, + { + "epoch": 4.726746290264206, + "grad_norm": 0.8800424933433533, + "learning_rate": 0.0002, + "loss": 1.2632, + "step": 6530 + }, + { + "epoch": 4.733984799131379, + "grad_norm": 0.8136811852455139, + "learning_rate": 0.0002, + "loss": 1.2126, + "step": 6540 + }, + { + "epoch": 4.741223307998553, + "grad_norm": 0.9910339713096619, + "learning_rate": 0.0002, + "loss": 1.195, + "step": 6550 + }, + { + "epoch": 4.748461816865726, + "grad_norm": 1.0679163932800293, + "learning_rate": 0.0002, + "loss": 1.1201, + "step": 6560 + }, + { + "epoch": 4.755700325732899, + "grad_norm": 0.8468248248100281, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 6570 + }, + { + "epoch": 4.762938834600073, + "grad_norm": 0.8771235942840576, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 6580 + }, + { + "epoch": 4.770177343467246, + "grad_norm": 0.7024846076965332, + "learning_rate": 0.0002, + "loss": 1.077, + "step": 6590 + }, + { + "epoch": 4.7774158523344195, + "grad_norm": 0.7836683392524719, + "learning_rate": 0.0002, + "loss": 1.0876, + "step": 6600 + }, + { + "epoch": 4.784654361201593, + "grad_norm": 0.7717288136482239, + "learning_rate": 0.0002, + "loss": 1.1006, + "step": 6610 + }, + { + "epoch": 4.791892870068766, + "grad_norm": 0.884183943271637, + "learning_rate": 0.0002, + "loss": 1.0376, + "step": 6620 + }, + { + "epoch": 4.7991313789359396, + "grad_norm": 1.383867621421814, + "learning_rate": 0.0002, + "loss": 1.1757, + "step": 6630 + }, + { + "epoch": 4.806369887803113, + "grad_norm": 0.9741523861885071, + "learning_rate": 0.0002, + "loss": 1.0861, + "step": 6640 + }, + { + "epoch": 4.813608396670286, + "grad_norm": 0.9723693132400513, + "learning_rate": 0.0002, + "loss": 1.0884, + "step": 6650 + }, + { + "epoch": 4.82084690553746, + "grad_norm": 1.8324809074401855, + "learning_rate": 0.0002, + "loss": 1.2203, + "step": 6660 + }, + { + "epoch": 4.828085414404633, + "grad_norm": 0.904909074306488, + "learning_rate": 0.0002, + "loss": 1.0292, + "step": 6670 + }, + { + "epoch": 4.835323923271806, + "grad_norm": 0.7355411648750305, + "learning_rate": 0.0002, + "loss": 1.0349, + "step": 6680 + }, + { + "epoch": 4.84256243213898, + "grad_norm": 0.8934960961341858, + "learning_rate": 0.0002, + "loss": 1.0793, + "step": 6690 + }, + { + "epoch": 4.849800941006153, + "grad_norm": 1.4596954584121704, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 6700 + }, + { + "epoch": 4.8570394498733265, + "grad_norm": 0.8310341238975525, + "learning_rate": 0.0002, + "loss": 1.1065, + "step": 6710 + }, + { + "epoch": 4.8642779587405, + "grad_norm": 0.9709894061088562, + "learning_rate": 0.0002, + "loss": 1.1089, + "step": 6720 + }, + { + "epoch": 4.871516467607673, + "grad_norm": 0.852142333984375, + "learning_rate": 0.0002, + "loss": 1.0069, + "step": 6730 + }, + { + "epoch": 4.878754976474847, + "grad_norm": 1.0643625259399414, + "learning_rate": 0.0002, + "loss": 1.0507, + "step": 6740 + }, + { + "epoch": 4.88599348534202, + "grad_norm": 0.9419508576393127, + "learning_rate": 0.0002, + "loss": 1.056, + "step": 6750 + }, + { + "epoch": 4.893231994209193, + "grad_norm": 1.1818498373031616, + "learning_rate": 0.0002, + "loss": 1.1995, + "step": 6760 + }, + { + "epoch": 4.900470503076367, + "grad_norm": 0.9369569420814514, + "learning_rate": 0.0002, + "loss": 1.0925, + "step": 6770 + }, + { + "epoch": 4.90770901194354, + "grad_norm": 0.7012579441070557, + "learning_rate": 0.0002, + "loss": 1.1648, + "step": 6780 + }, + { + "epoch": 4.914947520810713, + "grad_norm": 0.9109319448471069, + "learning_rate": 0.0002, + "loss": 1.0926, + "step": 6790 + }, + { + "epoch": 4.922186029677887, + "grad_norm": 0.8077534437179565, + "learning_rate": 0.0002, + "loss": 1.0358, + "step": 6800 + }, + { + "epoch": 4.92942453854506, + "grad_norm": 0.7571148872375488, + "learning_rate": 0.0002, + "loss": 1.2549, + "step": 6810 + }, + { + "epoch": 4.9366630474122335, + "grad_norm": 0.7325633764266968, + "learning_rate": 0.0002, + "loss": 0.9638, + "step": 6820 + }, + { + "epoch": 4.943901556279407, + "grad_norm": 0.8465084433555603, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 6830 + }, + { + "epoch": 4.95114006514658, + "grad_norm": 0.8753737807273865, + "learning_rate": 0.0002, + "loss": 1.153, + "step": 6840 + }, + { + "epoch": 4.958378574013754, + "grad_norm": 0.9421748518943787, + "learning_rate": 0.0002, + "loss": 1.0247, + "step": 6850 + }, + { + "epoch": 4.965617082880927, + "grad_norm": 0.8245896697044373, + "learning_rate": 0.0002, + "loss": 1.1483, + "step": 6860 + }, + { + "epoch": 4.9728555917481, + "grad_norm": 0.8823089599609375, + "learning_rate": 0.0002, + "loss": 0.9905, + "step": 6870 + }, + { + "epoch": 4.980094100615274, + "grad_norm": 0.8406389355659485, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 6880 + }, + { + "epoch": 4.987332609482447, + "grad_norm": 0.9732868075370789, + "learning_rate": 0.0002, + "loss": 1.0944, + "step": 6890 + }, + { + "epoch": 4.99457111834962, + "grad_norm": 2.125141143798828, + "learning_rate": 0.0002, + "loss": 1.1776, + "step": 6900 + }, + { + "epoch": 4.999638074556641, + "eval_loss": 1.445176601409912, + "eval_runtime": 27.2351, + "eval_samples_per_second": 16.009, + "eval_steps_per_second": 2.019, + "step": 6907 + }, + { + "epoch": 5.001809627216793, + "grad_norm": 0.9465792775154114, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 6910 + }, + { + "epoch": 5.009048136083966, + "grad_norm": 1.2834891080856323, + "learning_rate": 0.0002, + "loss": 0.982, + "step": 6920 + }, + { + "epoch": 5.01628664495114, + "grad_norm": 1.0297378301620483, + "learning_rate": 0.0002, + "loss": 0.9803, + "step": 6930 + }, + { + "epoch": 5.023525153818313, + "grad_norm": 1.1705161333084106, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 6940 + }, + { + "epoch": 5.030763662685486, + "grad_norm": 0.8293961882591248, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 6950 + }, + { + "epoch": 5.03800217155266, + "grad_norm": 1.0422210693359375, + "learning_rate": 0.0002, + "loss": 0.9203, + "step": 6960 + }, + { + "epoch": 5.045240680419833, + "grad_norm": 1.116104245185852, + "learning_rate": 0.0002, + "loss": 1.0553, + "step": 6970 + }, + { + "epoch": 5.0524791892870065, + "grad_norm": 1.5118416547775269, + "learning_rate": 0.0002, + "loss": 0.9011, + "step": 6980 + }, + { + "epoch": 5.05971769815418, + "grad_norm": 0.8383979797363281, + "learning_rate": 0.0002, + "loss": 0.9969, + "step": 6990 + }, + { + "epoch": 5.066956207021353, + "grad_norm": 1.3378649950027466, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 7000 + }, + { + "epoch": 5.0741947158885266, + "grad_norm": 1.1840510368347168, + "learning_rate": 0.0002, + "loss": 1.0212, + "step": 7010 + }, + { + "epoch": 5.0814332247557, + "grad_norm": 1.2354751825332642, + "learning_rate": 0.0002, + "loss": 0.9939, + "step": 7020 + }, + { + "epoch": 5.088671733622873, + "grad_norm": 1.3830451965332031, + "learning_rate": 0.0002, + "loss": 0.9831, + "step": 7030 + }, + { + "epoch": 5.095910242490047, + "grad_norm": 0.8101674318313599, + "learning_rate": 0.0002, + "loss": 1.1827, + "step": 7040 + }, + { + "epoch": 5.10314875135722, + "grad_norm": 0.897982656955719, + "learning_rate": 0.0002, + "loss": 0.9255, + "step": 7050 + }, + { + "epoch": 5.110387260224393, + "grad_norm": 1.2049678564071655, + "learning_rate": 0.0002, + "loss": 0.8784, + "step": 7060 + }, + { + "epoch": 5.117625769091567, + "grad_norm": 1.5912116765975952, + "learning_rate": 0.0002, + "loss": 1.0182, + "step": 7070 + }, + { + "epoch": 5.12486427795874, + "grad_norm": 0.9261530041694641, + "learning_rate": 0.0002, + "loss": 1.0909, + "step": 7080 + }, + { + "epoch": 5.1321027868259135, + "grad_norm": 1.1454812288284302, + "learning_rate": 0.0002, + "loss": 0.9603, + "step": 7090 + }, + { + "epoch": 5.139341295693087, + "grad_norm": 1.0049978494644165, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 7100 + }, + { + "epoch": 5.14657980456026, + "grad_norm": 1.4513251781463623, + "learning_rate": 0.0002, + "loss": 0.9463, + "step": 7110 + }, + { + "epoch": 5.153818313427434, + "grad_norm": 0.9800849556922913, + "learning_rate": 0.0002, + "loss": 0.8995, + "step": 7120 + }, + { + "epoch": 5.161056822294607, + "grad_norm": 0.9698708653450012, + "learning_rate": 0.0002, + "loss": 0.9835, + "step": 7130 + }, + { + "epoch": 5.16829533116178, + "grad_norm": 1.1126646995544434, + "learning_rate": 0.0002, + "loss": 0.9672, + "step": 7140 + }, + { + "epoch": 5.175533840028954, + "grad_norm": 0.9248330593109131, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 7150 + }, + { + "epoch": 5.182772348896127, + "grad_norm": 0.7967255711555481, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 7160 + }, + { + "epoch": 5.1900108577633, + "grad_norm": 0.9933333992958069, + "learning_rate": 0.0002, + "loss": 1.0078, + "step": 7170 + }, + { + "epoch": 5.197249366630474, + "grad_norm": 1.0080649852752686, + "learning_rate": 0.0002, + "loss": 1.0276, + "step": 7180 + }, + { + "epoch": 5.204487875497647, + "grad_norm": 1.3954921960830688, + "learning_rate": 0.0002, + "loss": 1.0201, + "step": 7190 + }, + { + "epoch": 5.2117263843648205, + "grad_norm": 1.2386271953582764, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 7200 + }, + { + "epoch": 5.218964893231994, + "grad_norm": 1.2379488945007324, + "learning_rate": 0.0002, + "loss": 0.8863, + "step": 7210 + }, + { + "epoch": 5.226203402099167, + "grad_norm": 0.9882503747940063, + "learning_rate": 0.0002, + "loss": 1.0518, + "step": 7220 + }, + { + "epoch": 5.233441910966341, + "grad_norm": 1.1728729009628296, + "learning_rate": 0.0002, + "loss": 0.9834, + "step": 7230 + }, + { + "epoch": 5.240680419833514, + "grad_norm": 0.9849673509597778, + "learning_rate": 0.0002, + "loss": 0.9269, + "step": 7240 + }, + { + "epoch": 5.247918928700687, + "grad_norm": 1.177639365196228, + "learning_rate": 0.0002, + "loss": 0.9935, + "step": 7250 + }, + { + "epoch": 5.255157437567861, + "grad_norm": 1.2395055294036865, + "learning_rate": 0.0002, + "loss": 1.0639, + "step": 7260 + }, + { + "epoch": 5.262395946435034, + "grad_norm": 1.3999171257019043, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 7270 + }, + { + "epoch": 5.269634455302207, + "grad_norm": 0.7698732018470764, + "learning_rate": 0.0002, + "loss": 0.9745, + "step": 7280 + }, + { + "epoch": 5.276872964169381, + "grad_norm": 0.9167453646659851, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 7290 + }, + { + "epoch": 5.284111473036554, + "grad_norm": 1.113830804824829, + "learning_rate": 0.0002, + "loss": 0.9858, + "step": 7300 + }, + { + "epoch": 5.2913499819037275, + "grad_norm": 0.9644396901130676, + "learning_rate": 0.0002, + "loss": 0.9577, + "step": 7310 + }, + { + "epoch": 5.298588490770901, + "grad_norm": 1.462435007095337, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 7320 + }, + { + "epoch": 5.305826999638074, + "grad_norm": 0.9406287670135498, + "learning_rate": 0.0002, + "loss": 0.871, + "step": 7330 + }, + { + "epoch": 5.313065508505248, + "grad_norm": 0.9698247909545898, + "learning_rate": 0.0002, + "loss": 1.0022, + "step": 7340 + }, + { + "epoch": 5.320304017372421, + "grad_norm": 1.12003755569458, + "learning_rate": 0.0002, + "loss": 0.915, + "step": 7350 + }, + { + "epoch": 5.327542526239594, + "grad_norm": 1.598681926727295, + "learning_rate": 0.0002, + "loss": 0.9838, + "step": 7360 + }, + { + "epoch": 5.334781035106768, + "grad_norm": 1.0450010299682617, + "learning_rate": 0.0002, + "loss": 1.0, + "step": 7370 + }, + { + "epoch": 5.342019543973941, + "grad_norm": 0.8680008053779602, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 7380 + }, + { + "epoch": 5.349258052841114, + "grad_norm": 1.0115476846694946, + "learning_rate": 0.0002, + "loss": 0.9851, + "step": 7390 + }, + { + "epoch": 5.356496561708288, + "grad_norm": 0.9589748382568359, + "learning_rate": 0.0002, + "loss": 1.0702, + "step": 7400 + }, + { + "epoch": 5.363735070575461, + "grad_norm": 0.6729998588562012, + "learning_rate": 0.0002, + "loss": 0.9366, + "step": 7410 + }, + { + "epoch": 5.3709735794426345, + "grad_norm": 0.9246699213981628, + "learning_rate": 0.0002, + "loss": 1.0126, + "step": 7420 + }, + { + "epoch": 5.378212088309808, + "grad_norm": 1.1266791820526123, + "learning_rate": 0.0002, + "loss": 0.9815, + "step": 7430 + }, + { + "epoch": 5.385450597176981, + "grad_norm": 1.8056942224502563, + "learning_rate": 0.0002, + "loss": 1.1166, + "step": 7440 + }, + { + "epoch": 5.392689106044155, + "grad_norm": 0.9802932739257812, + "learning_rate": 0.0002, + "loss": 0.9604, + "step": 7450 + }, + { + "epoch": 5.399927614911328, + "grad_norm": 1.0504707098007202, + "learning_rate": 0.0002, + "loss": 0.9656, + "step": 7460 + }, + { + "epoch": 5.407166123778501, + "grad_norm": 1.1915022134780884, + "learning_rate": 0.0002, + "loss": 1.0132, + "step": 7470 + }, + { + "epoch": 5.414404632645675, + "grad_norm": 1.1856611967086792, + "learning_rate": 0.0002, + "loss": 1.0041, + "step": 7480 + }, + { + "epoch": 5.421643141512848, + "grad_norm": 1.292152762413025, + "learning_rate": 0.0002, + "loss": 0.9747, + "step": 7490 + }, + { + "epoch": 5.4288816503800215, + "grad_norm": 1.2675740718841553, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 7500 + }, + { + "epoch": 5.436120159247195, + "grad_norm": 1.4034695625305176, + "learning_rate": 0.0002, + "loss": 1.0271, + "step": 7510 + }, + { + "epoch": 5.443358668114368, + "grad_norm": 0.984588623046875, + "learning_rate": 0.0002, + "loss": 1.0318, + "step": 7520 + }, + { + "epoch": 5.450597176981542, + "grad_norm": 0.8419108390808105, + "learning_rate": 0.0002, + "loss": 1.0726, + "step": 7530 + }, + { + "epoch": 5.457835685848715, + "grad_norm": 1.0270143747329712, + "learning_rate": 0.0002, + "loss": 1.0499, + "step": 7540 + }, + { + "epoch": 5.465074194715888, + "grad_norm": 2.2158689498901367, + "learning_rate": 0.0002, + "loss": 0.9804, + "step": 7550 + }, + { + "epoch": 5.472312703583062, + "grad_norm": 1.0740524530410767, + "learning_rate": 0.0002, + "loss": 0.9856, + "step": 7560 + }, + { + "epoch": 5.479551212450235, + "grad_norm": 1.3804482221603394, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 7570 + }, + { + "epoch": 5.486789721317408, + "grad_norm": 0.9428979754447937, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 7580 + }, + { + "epoch": 5.494028230184582, + "grad_norm": 0.9548295736312866, + "learning_rate": 0.0002, + "loss": 1.0906, + "step": 7590 + }, + { + "epoch": 5.501266739051755, + "grad_norm": 1.0691065788269043, + "learning_rate": 0.0002, + "loss": 0.8853, + "step": 7600 + }, + { + "epoch": 5.5085052479189285, + "grad_norm": 1.0987380743026733, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 7610 + }, + { + "epoch": 5.515743756786102, + "grad_norm": 0.9483979344367981, + "learning_rate": 0.0002, + "loss": 1.0162, + "step": 7620 + }, + { + "epoch": 5.522982265653275, + "grad_norm": 1.16624915599823, + "learning_rate": 0.0002, + "loss": 1.105, + "step": 7630 + }, + { + "epoch": 5.530220774520449, + "grad_norm": 0.8563777208328247, + "learning_rate": 0.0002, + "loss": 0.8695, + "step": 7640 + }, + { + "epoch": 5.537459283387622, + "grad_norm": 1.268186092376709, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 7650 + }, + { + "epoch": 5.544697792254795, + "grad_norm": 1.0752092599868774, + "learning_rate": 0.0002, + "loss": 1.1152, + "step": 7660 + }, + { + "epoch": 5.551936301121969, + "grad_norm": 1.210389256477356, + "learning_rate": 0.0002, + "loss": 0.9344, + "step": 7670 + }, + { + "epoch": 5.559174809989142, + "grad_norm": 1.669063925743103, + "learning_rate": 0.0002, + "loss": 1.0349, + "step": 7680 + }, + { + "epoch": 5.566413318856315, + "grad_norm": 1.038020133972168, + "learning_rate": 0.0002, + "loss": 0.9833, + "step": 7690 + }, + { + "epoch": 5.573651827723489, + "grad_norm": 1.316673994064331, + "learning_rate": 0.0002, + "loss": 0.8907, + "step": 7700 + }, + { + "epoch": 5.580890336590662, + "grad_norm": 1.029935359954834, + "learning_rate": 0.0002, + "loss": 0.9614, + "step": 7710 + }, + { + "epoch": 5.5881288454578355, + "grad_norm": 0.9401940703392029, + "learning_rate": 0.0002, + "loss": 1.0409, + "step": 7720 + }, + { + "epoch": 5.595367354325009, + "grad_norm": 2.4811816215515137, + "learning_rate": 0.0002, + "loss": 0.9272, + "step": 7730 + }, + { + "epoch": 5.602605863192182, + "grad_norm": 1.0329105854034424, + "learning_rate": 0.0002, + "loss": 0.992, + "step": 7740 + }, + { + "epoch": 5.609844372059356, + "grad_norm": 1.479629635810852, + "learning_rate": 0.0002, + "loss": 0.9493, + "step": 7750 + }, + { + "epoch": 5.617082880926529, + "grad_norm": 1.9232319593429565, + "learning_rate": 0.0002, + "loss": 1.0727, + "step": 7760 + }, + { + "epoch": 5.624321389793702, + "grad_norm": 1.0055509805679321, + "learning_rate": 0.0002, + "loss": 1.0741, + "step": 7770 + }, + { + "epoch": 5.631559898660876, + "grad_norm": 1.0037437677383423, + "learning_rate": 0.0002, + "loss": 1.0731, + "step": 7780 + }, + { + "epoch": 5.638798407528049, + "grad_norm": 1.4245030879974365, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 7790 + }, + { + "epoch": 5.646036916395222, + "grad_norm": 1.080687403678894, + "learning_rate": 0.0002, + "loss": 0.9711, + "step": 7800 + }, + { + "epoch": 5.653275425262396, + "grad_norm": 1.354953408241272, + "learning_rate": 0.0002, + "loss": 1.0276, + "step": 7810 + }, + { + "epoch": 5.660513934129569, + "grad_norm": 0.8966761231422424, + "learning_rate": 0.0002, + "loss": 1.0534, + "step": 7820 + }, + { + "epoch": 5.6677524429967425, + "grad_norm": 1.0675480365753174, + "learning_rate": 0.0002, + "loss": 1.0662, + "step": 7830 + }, + { + "epoch": 5.674990951863916, + "grad_norm": 1.2104216814041138, + "learning_rate": 0.0002, + "loss": 1.1077, + "step": 7840 + }, + { + "epoch": 5.682229460731089, + "grad_norm": 1.105790376663208, + "learning_rate": 0.0002, + "loss": 0.9627, + "step": 7850 + }, + { + "epoch": 5.689467969598263, + "grad_norm": 1.0915391445159912, + "learning_rate": 0.0002, + "loss": 1.0483, + "step": 7860 + }, + { + "epoch": 5.696706478465436, + "grad_norm": 0.8957812786102295, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 7870 + }, + { + "epoch": 5.703944987332609, + "grad_norm": 1.9189311265945435, + "learning_rate": 0.0002, + "loss": 0.9785, + "step": 7880 + }, + { + "epoch": 5.711183496199783, + "grad_norm": 1.0867321491241455, + "learning_rate": 0.0002, + "loss": 1.0076, + "step": 7890 + }, + { + "epoch": 5.718422005066956, + "grad_norm": 1.0233147144317627, + "learning_rate": 0.0002, + "loss": 1.0236, + "step": 7900 + }, + { + "epoch": 5.7256605139341294, + "grad_norm": 1.16460382938385, + "learning_rate": 0.0002, + "loss": 0.9872, + "step": 7910 + }, + { + "epoch": 5.732899022801303, + "grad_norm": 1.1098358631134033, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 7920 + }, + { + "epoch": 5.740137531668476, + "grad_norm": 0.8555701375007629, + "learning_rate": 0.0002, + "loss": 0.9937, + "step": 7930 + }, + { + "epoch": 5.7473760405356495, + "grad_norm": 0.9885705709457397, + "learning_rate": 0.0002, + "loss": 1.0081, + "step": 7940 + }, + { + "epoch": 5.754614549402823, + "grad_norm": 0.9184203147888184, + "learning_rate": 0.0002, + "loss": 0.9909, + "step": 7950 + }, + { + "epoch": 5.761853058269996, + "grad_norm": 0.9653698205947876, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 7960 + }, + { + "epoch": 5.76909156713717, + "grad_norm": 1.0014251470565796, + "learning_rate": 0.0002, + "loss": 0.9317, + "step": 7970 + }, + { + "epoch": 5.776330076004343, + "grad_norm": 1.004701018333435, + "learning_rate": 0.0002, + "loss": 1.0271, + "step": 7980 + }, + { + "epoch": 5.783568584871516, + "grad_norm": 0.950577974319458, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 7990 + }, + { + "epoch": 5.79080709373869, + "grad_norm": 1.2986834049224854, + "learning_rate": 0.0002, + "loss": 0.9725, + "step": 8000 + }, + { + "epoch": 5.798045602605863, + "grad_norm": 1.3353424072265625, + "learning_rate": 0.0002, + "loss": 1.039, + "step": 8010 + }, + { + "epoch": 5.8052841114730365, + "grad_norm": 0.7650562524795532, + "learning_rate": 0.0002, + "loss": 1.0626, + "step": 8020 + }, + { + "epoch": 5.81252262034021, + "grad_norm": 1.0156235694885254, + "learning_rate": 0.0002, + "loss": 1.0802, + "step": 8030 + }, + { + "epoch": 5.819761129207383, + "grad_norm": 1.3092900514602661, + "learning_rate": 0.0002, + "loss": 1.0185, + "step": 8040 + }, + { + "epoch": 5.826999638074557, + "grad_norm": 1.184428095817566, + "learning_rate": 0.0002, + "loss": 0.9905, + "step": 8050 + }, + { + "epoch": 5.83423814694173, + "grad_norm": 0.979401707649231, + "learning_rate": 0.0002, + "loss": 1.0548, + "step": 8060 + }, + { + "epoch": 5.841476655808903, + "grad_norm": 1.3557400703430176, + "learning_rate": 0.0002, + "loss": 0.9721, + "step": 8070 + }, + { + "epoch": 5.848715164676077, + "grad_norm": 0.8429333567619324, + "learning_rate": 0.0002, + "loss": 1.0235, + "step": 8080 + }, + { + "epoch": 5.85595367354325, + "grad_norm": 1.3167692422866821, + "learning_rate": 0.0002, + "loss": 0.952, + "step": 8090 + }, + { + "epoch": 5.863192182410423, + "grad_norm": 0.9750998020172119, + "learning_rate": 0.0002, + "loss": 0.9609, + "step": 8100 + }, + { + "epoch": 5.870430691277597, + "grad_norm": 1.1869813203811646, + "learning_rate": 0.0002, + "loss": 1.0789, + "step": 8110 + }, + { + "epoch": 5.87766920014477, + "grad_norm": 1.508615255355835, + "learning_rate": 0.0002, + "loss": 1.0331, + "step": 8120 + }, + { + "epoch": 5.8849077090119435, + "grad_norm": 0.9439908266067505, + "learning_rate": 0.0002, + "loss": 1.0171, + "step": 8130 + }, + { + "epoch": 5.892146217879117, + "grad_norm": 0.910508930683136, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 8140 + }, + { + "epoch": 5.89938472674629, + "grad_norm": 1.111501932144165, + "learning_rate": 0.0002, + "loss": 1.0032, + "step": 8150 + }, + { + "epoch": 5.906623235613464, + "grad_norm": 0.726554274559021, + "learning_rate": 0.0002, + "loss": 1.0266, + "step": 8160 + }, + { + "epoch": 5.913861744480637, + "grad_norm": 1.1084556579589844, + "learning_rate": 0.0002, + "loss": 1.0681, + "step": 8170 + }, + { + "epoch": 5.92110025334781, + "grad_norm": 0.9695167541503906, + "learning_rate": 0.0002, + "loss": 0.969, + "step": 8180 + }, + { + "epoch": 5.928338762214984, + "grad_norm": 1.1169592142105103, + "learning_rate": 0.0002, + "loss": 0.9858, + "step": 8190 + }, + { + "epoch": 5.935577271082157, + "grad_norm": 1.5116780996322632, + "learning_rate": 0.0002, + "loss": 1.0924, + "step": 8200 + }, + { + "epoch": 5.94281577994933, + "grad_norm": 1.0073388814926147, + "learning_rate": 0.0002, + "loss": 0.878, + "step": 8210 + }, + { + "epoch": 5.950054288816504, + "grad_norm": 0.9323263168334961, + "learning_rate": 0.0002, + "loss": 1.0462, + "step": 8220 + }, + { + "epoch": 5.957292797683677, + "grad_norm": 0.9422887563705444, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 8230 + }, + { + "epoch": 5.9645313065508505, + "grad_norm": 0.9691047668457031, + "learning_rate": 0.0002, + "loss": 0.953, + "step": 8240 + }, + { + "epoch": 5.971769815418024, + "grad_norm": 0.9650622606277466, + "learning_rate": 0.0002, + "loss": 0.9842, + "step": 8250 + }, + { + "epoch": 5.979008324285197, + "grad_norm": 1.077958345413208, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 8260 + }, + { + "epoch": 5.986246833152371, + "grad_norm": 0.8946306109428406, + "learning_rate": 0.0002, + "loss": 0.9162, + "step": 8270 + }, + { + "epoch": 5.993485342019544, + "grad_norm": 1.34098219871521, + "learning_rate": 0.0002, + "loss": 1.0439, + "step": 8280 + }, + { + "epoch": 6.0, + "eval_loss": 1.4714229106903076, + "eval_runtime": 26.301, + "eval_samples_per_second": 16.577, + "eval_steps_per_second": 2.091, + "step": 8289 + }, + { + "epoch": 6.000723850886717, + "grad_norm": 0.9737564325332642, + "learning_rate": 0.0002, + "loss": 1.1403, + "step": 8290 + }, + { + "epoch": 6.007962359753891, + "grad_norm": 1.2205945253372192, + "learning_rate": 0.0002, + "loss": 0.8875, + "step": 8300 + }, + { + "epoch": 6.015200868621064, + "grad_norm": 1.3529434204101562, + "learning_rate": 0.0002, + "loss": 0.8623, + "step": 8310 + }, + { + "epoch": 6.022439377488237, + "grad_norm": 1.2300174236297607, + "learning_rate": 0.0002, + "loss": 0.9427, + "step": 8320 + }, + { + "epoch": 6.029677886355411, + "grad_norm": 0.9248194098472595, + "learning_rate": 0.0002, + "loss": 0.9322, + "step": 8330 + }, + { + "epoch": 6.036916395222584, + "grad_norm": 1.1140035390853882, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 8340 + }, + { + "epoch": 6.0441549040897575, + "grad_norm": 1.2097352743148804, + "learning_rate": 0.0002, + "loss": 0.8255, + "step": 8350 + }, + { + "epoch": 6.051393412956931, + "grad_norm": 0.9472483396530151, + "learning_rate": 0.0002, + "loss": 0.8792, + "step": 8360 + }, + { + "epoch": 6.058631921824104, + "grad_norm": 1.0195368528366089, + "learning_rate": 0.0002, + "loss": 0.8865, + "step": 8370 + }, + { + "epoch": 6.065870430691278, + "grad_norm": 1.182735562324524, + "learning_rate": 0.0002, + "loss": 0.8858, + "step": 8380 + }, + { + "epoch": 6.073108939558451, + "grad_norm": 1.1042858362197876, + "learning_rate": 0.0002, + "loss": 0.9455, + "step": 8390 + }, + { + "epoch": 6.080347448425624, + "grad_norm": 0.8606401085853577, + "learning_rate": 0.0002, + "loss": 0.9723, + "step": 8400 + }, + { + "epoch": 6.087585957292798, + "grad_norm": 1.1015676259994507, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 8410 + }, + { + "epoch": 6.094824466159971, + "grad_norm": 1.690224289894104, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 8420 + }, + { + "epoch": 6.1020629750271445, + "grad_norm": 1.1928749084472656, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 8430 + }, + { + "epoch": 6.109301483894318, + "grad_norm": 1.0816864967346191, + "learning_rate": 0.0002, + "loss": 0.9546, + "step": 8440 + }, + { + "epoch": 6.116539992761491, + "grad_norm": 1.1638226509094238, + "learning_rate": 0.0002, + "loss": 0.8286, + "step": 8450 + }, + { + "epoch": 6.1237785016286646, + "grad_norm": 1.3782968521118164, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 8460 + }, + { + "epoch": 6.131017010495838, + "grad_norm": 1.2030094861984253, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 8470 + }, + { + "epoch": 6.138255519363011, + "grad_norm": 1.3227659463882446, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 8480 + }, + { + "epoch": 6.145494028230185, + "grad_norm": 1.104384422302246, + "learning_rate": 0.0002, + "loss": 0.9175, + "step": 8490 + }, + { + "epoch": 6.152732537097358, + "grad_norm": 1.518805980682373, + "learning_rate": 0.0002, + "loss": 0.861, + "step": 8500 + }, + { + "epoch": 6.159971045964531, + "grad_norm": 1.2029093503952026, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 8510 + }, + { + "epoch": 6.167209554831705, + "grad_norm": 1.2991217374801636, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 8520 + }, + { + "epoch": 6.174448063698878, + "grad_norm": 1.7002956867218018, + "learning_rate": 0.0002, + "loss": 0.9748, + "step": 8530 + }, + { + "epoch": 6.1816865725660515, + "grad_norm": 1.6653581857681274, + "learning_rate": 0.0002, + "loss": 0.8881, + "step": 8540 + }, + { + "epoch": 6.188925081433225, + "grad_norm": 1.0493303537368774, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 8550 + }, + { + "epoch": 6.196163590300398, + "grad_norm": 1.539345622062683, + "learning_rate": 0.0002, + "loss": 0.8726, + "step": 8560 + }, + { + "epoch": 6.203402099167572, + "grad_norm": 1.2757070064544678, + "learning_rate": 0.0002, + "loss": 0.9452, + "step": 8570 + }, + { + "epoch": 6.210640608034745, + "grad_norm": 1.2416890859603882, + "learning_rate": 0.0002, + "loss": 0.8773, + "step": 8580 + }, + { + "epoch": 6.217879116901918, + "grad_norm": 1.617621898651123, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 8590 + }, + { + "epoch": 6.225117625769092, + "grad_norm": 1.058962106704712, + "learning_rate": 0.0002, + "loss": 0.9137, + "step": 8600 + }, + { + "epoch": 6.232356134636265, + "grad_norm": 1.1489088535308838, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 8610 + }, + { + "epoch": 6.239594643503438, + "grad_norm": 0.9391577243804932, + "learning_rate": 0.0002, + "loss": 0.9476, + "step": 8620 + }, + { + "epoch": 6.246833152370612, + "grad_norm": 1.363706111907959, + "learning_rate": 0.0002, + "loss": 0.932, + "step": 8630 + }, + { + "epoch": 6.254071661237785, + "grad_norm": 0.779502809047699, + "learning_rate": 0.0002, + "loss": 0.8917, + "step": 8640 + }, + { + "epoch": 6.2613101701049585, + "grad_norm": 2.000821590423584, + "learning_rate": 0.0002, + "loss": 0.9196, + "step": 8650 + }, + { + "epoch": 6.268548678972132, + "grad_norm": 1.1521023511886597, + "learning_rate": 0.0002, + "loss": 0.9794, + "step": 8660 + }, + { + "epoch": 6.275787187839305, + "grad_norm": 1.3734570741653442, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 8670 + }, + { + "epoch": 6.283025696706479, + "grad_norm": 0.9550670385360718, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 8680 + }, + { + "epoch": 6.290264205573652, + "grad_norm": 0.8937032222747803, + "learning_rate": 0.0002, + "loss": 0.9049, + "step": 8690 + }, + { + "epoch": 6.297502714440825, + "grad_norm": 1.3352779150009155, + "learning_rate": 0.0002, + "loss": 0.8526, + "step": 8700 + }, + { + "epoch": 6.304741223307999, + "grad_norm": 1.3057222366333008, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 8710 + }, + { + "epoch": 6.311979732175172, + "grad_norm": 0.9078314304351807, + "learning_rate": 0.0002, + "loss": 0.8825, + "step": 8720 + }, + { + "epoch": 6.319218241042345, + "grad_norm": 1.6663457155227661, + "learning_rate": 0.0002, + "loss": 0.8666, + "step": 8730 + }, + { + "epoch": 6.326456749909519, + "grad_norm": 1.2043739557266235, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 8740 + }, + { + "epoch": 6.333695258776692, + "grad_norm": 0.9165967702865601, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 8750 + }, + { + "epoch": 6.3409337676438655, + "grad_norm": 1.016452670097351, + "learning_rate": 0.0002, + "loss": 0.9761, + "step": 8760 + }, + { + "epoch": 6.348172276511039, + "grad_norm": 1.2209261655807495, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 8770 + }, + { + "epoch": 6.355410785378212, + "grad_norm": 1.3380663394927979, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 8780 + }, + { + "epoch": 6.362649294245386, + "grad_norm": 2.3311562538146973, + "learning_rate": 0.0002, + "loss": 0.9553, + "step": 8790 + }, + { + "epoch": 6.369887803112559, + "grad_norm": 1.0330604314804077, + "learning_rate": 0.0002, + "loss": 0.8676, + "step": 8800 + }, + { + "epoch": 6.377126311979732, + "grad_norm": 0.9655511975288391, + "learning_rate": 0.0002, + "loss": 0.98, + "step": 8810 + }, + { + "epoch": 6.384364820846906, + "grad_norm": 1.1065765619277954, + "learning_rate": 0.0002, + "loss": 1.0324, + "step": 8820 + }, + { + "epoch": 6.391603329714079, + "grad_norm": 1.2631285190582275, + "learning_rate": 0.0002, + "loss": 1.0078, + "step": 8830 + }, + { + "epoch": 6.398841838581252, + "grad_norm": 0.92459636926651, + "learning_rate": 0.0002, + "loss": 0.8989, + "step": 8840 + }, + { + "epoch": 6.406080347448426, + "grad_norm": 0.9982633590698242, + "learning_rate": 0.0002, + "loss": 0.8536, + "step": 8850 + }, + { + "epoch": 6.413318856315599, + "grad_norm": 1.0746768712997437, + "learning_rate": 0.0002, + "loss": 0.8949, + "step": 8860 + }, + { + "epoch": 6.4205573651827725, + "grad_norm": 1.3024073839187622, + "learning_rate": 0.0002, + "loss": 0.8547, + "step": 8870 + }, + { + "epoch": 6.427795874049946, + "grad_norm": 1.2764527797698975, + "learning_rate": 0.0002, + "loss": 0.9618, + "step": 8880 + }, + { + "epoch": 6.435034382917119, + "grad_norm": 0.8318809270858765, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 8890 + }, + { + "epoch": 6.442272891784293, + "grad_norm": 1.7350783348083496, + "learning_rate": 0.0002, + "loss": 0.917, + "step": 8900 + }, + { + "epoch": 6.449511400651466, + "grad_norm": 1.3430488109588623, + "learning_rate": 0.0002, + "loss": 1.0229, + "step": 8910 + }, + { + "epoch": 6.456749909518639, + "grad_norm": 1.5907495021820068, + "learning_rate": 0.0002, + "loss": 0.9678, + "step": 8920 + }, + { + "epoch": 6.463988418385813, + "grad_norm": 1.8579202890396118, + "learning_rate": 0.0002, + "loss": 0.9639, + "step": 8930 + }, + { + "epoch": 6.471226927252986, + "grad_norm": 1.2233413457870483, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 8940 + }, + { + "epoch": 6.4784654361201595, + "grad_norm": 1.009103775024414, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 8950 + }, + { + "epoch": 6.485703944987333, + "grad_norm": 1.1265181303024292, + "learning_rate": 0.0002, + "loss": 0.8969, + "step": 8960 + }, + { + "epoch": 6.492942453854506, + "grad_norm": 1.1733338832855225, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 8970 + }, + { + "epoch": 6.50018096272168, + "grad_norm": 1.0444518327713013, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 8980 + }, + { + "epoch": 6.507419471588853, + "grad_norm": 1.2296479940414429, + "learning_rate": 0.0002, + "loss": 0.9582, + "step": 8990 + }, + { + "epoch": 6.514657980456026, + "grad_norm": 1.370417833328247, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 9000 + }, + { + "epoch": 6.5218964893232, + "grad_norm": 1.4787620306015015, + "learning_rate": 0.0002, + "loss": 0.9787, + "step": 9010 + }, + { + "epoch": 6.529134998190373, + "grad_norm": 0.8550514578819275, + "learning_rate": 0.0002, + "loss": 0.967, + "step": 9020 + }, + { + "epoch": 6.536373507057546, + "grad_norm": 1.2327991724014282, + "learning_rate": 0.0002, + "loss": 0.9755, + "step": 9030 + }, + { + "epoch": 6.54361201592472, + "grad_norm": 1.0915621519088745, + "learning_rate": 0.0002, + "loss": 0.9248, + "step": 9040 + }, + { + "epoch": 6.550850524791893, + "grad_norm": 1.7243309020996094, + "learning_rate": 0.0002, + "loss": 1.0024, + "step": 9050 + }, + { + "epoch": 6.5580890336590665, + "grad_norm": 0.954359769821167, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 9060 + }, + { + "epoch": 6.56532754252624, + "grad_norm": 1.066051959991455, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 9070 + }, + { + "epoch": 6.572566051393413, + "grad_norm": 1.200271487236023, + "learning_rate": 0.0002, + "loss": 0.944, + "step": 9080 + }, + { + "epoch": 6.579804560260587, + "grad_norm": 1.4331457614898682, + "learning_rate": 0.0002, + "loss": 0.9788, + "step": 9090 + }, + { + "epoch": 6.58704306912776, + "grad_norm": 1.0892444849014282, + "learning_rate": 0.0002, + "loss": 1.0216, + "step": 9100 + }, + { + "epoch": 6.594281577994933, + "grad_norm": 1.849726915359497, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 9110 + }, + { + "epoch": 6.601520086862107, + "grad_norm": 1.1228708028793335, + "learning_rate": 0.0002, + "loss": 0.9495, + "step": 9120 + }, + { + "epoch": 6.60875859572928, + "grad_norm": 1.0928595066070557, + "learning_rate": 0.0002, + "loss": 1.0169, + "step": 9130 + }, + { + "epoch": 6.615997104596453, + "grad_norm": 1.2138155698776245, + "learning_rate": 0.0002, + "loss": 0.9342, + "step": 9140 + }, + { + "epoch": 6.623235613463627, + "grad_norm": 1.5155235528945923, + "learning_rate": 0.0002, + "loss": 0.8715, + "step": 9150 + }, + { + "epoch": 6.6304741223308, + "grad_norm": 1.3194212913513184, + "learning_rate": 0.0002, + "loss": 0.9806, + "step": 9160 + }, + { + "epoch": 6.6377126311979735, + "grad_norm": 1.045623779296875, + "learning_rate": 0.0002, + "loss": 0.8958, + "step": 9170 + }, + { + "epoch": 6.644951140065147, + "grad_norm": 0.9647570252418518, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 9180 + }, + { + "epoch": 6.65218964893232, + "grad_norm": 1.0818220376968384, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 9190 + }, + { + "epoch": 6.659428157799494, + "grad_norm": 1.2792822122573853, + "learning_rate": 0.0002, + "loss": 0.9745, + "step": 9200 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.2764191627502441, + "learning_rate": 0.0002, + "loss": 0.8854, + "step": 9210 + }, + { + "epoch": 6.67390517553384, + "grad_norm": 1.0552066564559937, + "learning_rate": 0.0002, + "loss": 0.9709, + "step": 9220 + }, + { + "epoch": 6.681143684401014, + "grad_norm": 1.082476019859314, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 9230 + }, + { + "epoch": 6.688382193268187, + "grad_norm": 1.3313323259353638, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 9240 + }, + { + "epoch": 6.69562070213536, + "grad_norm": 1.130048394203186, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 9250 + }, + { + "epoch": 6.702859211002534, + "grad_norm": 1.1997296810150146, + "learning_rate": 0.0002, + "loss": 0.9969, + "step": 9260 + }, + { + "epoch": 6.710097719869707, + "grad_norm": 1.0591834783554077, + "learning_rate": 0.0002, + "loss": 0.8691, + "step": 9270 + }, + { + "epoch": 6.7173362287368805, + "grad_norm": 1.2722901105880737, + "learning_rate": 0.0002, + "loss": 0.9603, + "step": 9280 + }, + { + "epoch": 6.724574737604054, + "grad_norm": 1.1150950193405151, + "learning_rate": 0.0002, + "loss": 0.9227, + "step": 9290 + }, + { + "epoch": 6.731813246471227, + "grad_norm": 1.1575992107391357, + "learning_rate": 0.0002, + "loss": 0.95, + "step": 9300 + }, + { + "epoch": 6.739051755338401, + "grad_norm": 0.9371691346168518, + "learning_rate": 0.0002, + "loss": 0.9822, + "step": 9310 + }, + { + "epoch": 6.746290264205574, + "grad_norm": 1.4924226999282837, + "learning_rate": 0.0002, + "loss": 0.9773, + "step": 9320 + }, + { + "epoch": 6.753528773072747, + "grad_norm": 1.1524218320846558, + "learning_rate": 0.0002, + "loss": 0.969, + "step": 9330 + }, + { + "epoch": 6.760767281939921, + "grad_norm": 0.9500471949577332, + "learning_rate": 0.0002, + "loss": 0.9271, + "step": 9340 + }, + { + "epoch": 6.768005790807094, + "grad_norm": 1.2062290906906128, + "learning_rate": 0.0002, + "loss": 0.9029, + "step": 9350 + }, + { + "epoch": 6.7752442996742674, + "grad_norm": 1.212631106376648, + "learning_rate": 0.0002, + "loss": 0.9121, + "step": 9360 + }, + { + "epoch": 6.782482808541441, + "grad_norm": 1.9135472774505615, + "learning_rate": 0.0002, + "loss": 0.8486, + "step": 9370 + }, + { + "epoch": 6.789721317408614, + "grad_norm": 0.9682775139808655, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 9380 + }, + { + "epoch": 6.7969598262757875, + "grad_norm": 1.1405237913131714, + "learning_rate": 0.0002, + "loss": 0.8548, + "step": 9390 + }, + { + "epoch": 6.804198335142961, + "grad_norm": 1.6855751276016235, + "learning_rate": 0.0002, + "loss": 0.8922, + "step": 9400 + }, + { + "epoch": 6.811436844010134, + "grad_norm": 1.6590169668197632, + "learning_rate": 0.0002, + "loss": 0.9417, + "step": 9410 + }, + { + "epoch": 6.818675352877308, + "grad_norm": 1.8795170783996582, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 9420 + }, + { + "epoch": 6.825913861744481, + "grad_norm": 1.1087183952331543, + "learning_rate": 0.0002, + "loss": 0.9142, + "step": 9430 + }, + { + "epoch": 6.833152370611654, + "grad_norm": 1.4178446531295776, + "learning_rate": 0.0002, + "loss": 1.1427, + "step": 9440 + }, + { + "epoch": 6.840390879478828, + "grad_norm": 1.0792350769042969, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 9450 + }, + { + "epoch": 6.847629388346001, + "grad_norm": 1.2159196138381958, + "learning_rate": 0.0002, + "loss": 1.0078, + "step": 9460 + }, + { + "epoch": 6.8548678972131745, + "grad_norm": 0.9998821020126343, + "learning_rate": 0.0002, + "loss": 0.9536, + "step": 9470 + }, + { + "epoch": 6.862106406080348, + "grad_norm": 0.7940687537193298, + "learning_rate": 0.0002, + "loss": 0.9277, + "step": 9480 + }, + { + "epoch": 6.869344914947521, + "grad_norm": 0.9572826027870178, + "learning_rate": 0.0002, + "loss": 0.8612, + "step": 9490 + }, + { + "epoch": 6.876583423814694, + "grad_norm": 1.1086537837982178, + "learning_rate": 0.0002, + "loss": 0.9611, + "step": 9500 + }, + { + "epoch": 6.883821932681867, + "grad_norm": 1.1934887170791626, + "learning_rate": 0.0002, + "loss": 0.9276, + "step": 9510 + }, + { + "epoch": 6.89106044154904, + "grad_norm": 1.207324504852295, + "learning_rate": 0.0002, + "loss": 0.8416, + "step": 9520 + }, + { + "epoch": 6.898298950416214, + "grad_norm": 1.1303677558898926, + "learning_rate": 0.0002, + "loss": 0.9378, + "step": 9530 + }, + { + "epoch": 6.905537459283387, + "grad_norm": 1.4958926439285278, + "learning_rate": 0.0002, + "loss": 0.9599, + "step": 9540 + }, + { + "epoch": 6.9127759681505605, + "grad_norm": 1.2141553163528442, + "learning_rate": 0.0002, + "loss": 0.9365, + "step": 9550 + }, + { + "epoch": 6.920014477017734, + "grad_norm": 1.6544346809387207, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 9560 + }, + { + "epoch": 6.927252985884907, + "grad_norm": 1.0540320873260498, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 9570 + }, + { + "epoch": 6.934491494752081, + "grad_norm": 1.3095581531524658, + "learning_rate": 0.0002, + "loss": 0.9831, + "step": 9580 + }, + { + "epoch": 6.941730003619254, + "grad_norm": 1.4509341716766357, + "learning_rate": 0.0002, + "loss": 0.8694, + "step": 9590 + }, + { + "epoch": 6.948968512486427, + "grad_norm": 1.1091740131378174, + "learning_rate": 0.0002, + "loss": 0.983, + "step": 9600 + }, + { + "epoch": 6.956207021353601, + "grad_norm": 1.102929949760437, + "learning_rate": 0.0002, + "loss": 0.9126, + "step": 9610 + }, + { + "epoch": 6.963445530220774, + "grad_norm": 1.1377743482589722, + "learning_rate": 0.0002, + "loss": 0.9622, + "step": 9620 + }, + { + "epoch": 6.970684039087947, + "grad_norm": 1.2070361375808716, + "learning_rate": 0.0002, + "loss": 0.9045, + "step": 9630 + }, + { + "epoch": 6.977922547955121, + "grad_norm": 1.30153489112854, + "learning_rate": 0.0002, + "loss": 0.9714, + "step": 9640 + }, + { + "epoch": 6.985161056822294, + "grad_norm": 1.4641543626785278, + "learning_rate": 0.0002, + "loss": 0.9555, + "step": 9650 + }, + { + "epoch": 6.9923995656894675, + "grad_norm": 1.0497819185256958, + "learning_rate": 0.0002, + "loss": 0.9177, + "step": 9660 + }, + { + "epoch": 6.999638074556641, + "grad_norm": 1.2500354051589966, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 9670 + }, + { + "epoch": 6.999638074556641, + "eval_loss": 1.518465518951416, + "eval_runtime": 26.4525, + "eval_samples_per_second": 16.482, + "eval_steps_per_second": 2.079, + "step": 9670 + }, + { + "epoch": 7.006876583423814, + "grad_norm": 1.0240943431854248, + "learning_rate": 0.0002, + "loss": 0.7792, + "step": 9680 + }, + { + "epoch": 7.014115092290988, + "grad_norm": 1.2250308990478516, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 9690 + }, + { + "epoch": 7.021353601158161, + "grad_norm": 1.397510290145874, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 9700 + }, + { + "epoch": 7.028592110025334, + "grad_norm": 1.9754822254180908, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 9710 + }, + { + "epoch": 7.035830618892508, + "grad_norm": 1.7932360172271729, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 9720 + }, + { + "epoch": 7.043069127759681, + "grad_norm": 0.8552590608596802, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 9730 + }, + { + "epoch": 7.0503076366268544, + "grad_norm": 1.758886694908142, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 9740 + }, + { + "epoch": 7.057546145494028, + "grad_norm": 1.5239284038543701, + "learning_rate": 0.0002, + "loss": 0.8444, + "step": 9750 + }, + { + "epoch": 7.064784654361201, + "grad_norm": 1.2506821155548096, + "learning_rate": 0.0002, + "loss": 0.9078, + "step": 9760 + }, + { + "epoch": 7.0720231632283745, + "grad_norm": 1.274202823638916, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 9770 + }, + { + "epoch": 7.079261672095548, + "grad_norm": 1.296419620513916, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 9780 + }, + { + "epoch": 7.086500180962721, + "grad_norm": 1.3418468236923218, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 9790 + }, + { + "epoch": 7.093738689829895, + "grad_norm": 1.1935685873031616, + "learning_rate": 0.0002, + "loss": 0.8302, + "step": 9800 + }, + { + "epoch": 7.100977198697068, + "grad_norm": 1.2341830730438232, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 9810 + }, + { + "epoch": 7.108215707564241, + "grad_norm": 1.3398581743240356, + "learning_rate": 0.0002, + "loss": 0.7887, + "step": 9820 + }, + { + "epoch": 7.115454216431415, + "grad_norm": 1.1919665336608887, + "learning_rate": 0.0002, + "loss": 0.824, + "step": 9830 + }, + { + "epoch": 7.122692725298588, + "grad_norm": 0.9331274032592773, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 9840 + }, + { + "epoch": 7.1299312341657615, + "grad_norm": 1.0933221578598022, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 9850 + }, + { + "epoch": 7.137169743032935, + "grad_norm": 1.0350896120071411, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 9860 + }, + { + "epoch": 7.144408251900108, + "grad_norm": 1.334342360496521, + "learning_rate": 0.0002, + "loss": 0.903, + "step": 9870 + }, + { + "epoch": 7.151646760767282, + "grad_norm": 1.5271754264831543, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 9880 + }, + { + "epoch": 7.158885269634455, + "grad_norm": 1.0453001260757446, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 9890 + }, + { + "epoch": 7.166123778501628, + "grad_norm": 1.204174518585205, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 9900 + }, + { + "epoch": 7.173362287368802, + "grad_norm": 1.0774344205856323, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 9910 + }, + { + "epoch": 7.180600796235975, + "grad_norm": 1.282188892364502, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 9920 + }, + { + "epoch": 7.187839305103148, + "grad_norm": 1.1413695812225342, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 9930 + }, + { + "epoch": 7.195077813970322, + "grad_norm": 1.2970763444900513, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 9940 + }, + { + "epoch": 7.202316322837495, + "grad_norm": 1.2535417079925537, + "learning_rate": 0.0002, + "loss": 0.857, + "step": 9950 + }, + { + "epoch": 7.2095548317046685, + "grad_norm": 1.3520581722259521, + "learning_rate": 0.0002, + "loss": 0.839, + "step": 9960 + }, + { + "epoch": 7.216793340571842, + "grad_norm": 1.288572072982788, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 9970 + }, + { + "epoch": 7.224031849439015, + "grad_norm": 1.4298021793365479, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 9980 + }, + { + "epoch": 7.231270358306189, + "grad_norm": 1.4797194004058838, + "learning_rate": 0.0002, + "loss": 0.9437, + "step": 9990 + }, + { + "epoch": 7.238508867173362, + "grad_norm": 1.5020978450775146, + "learning_rate": 0.0002, + "loss": 0.8486, + "step": 10000 + }, + { + "epoch": 7.245747376040535, + "grad_norm": 1.1417840719223022, + "learning_rate": 0.0002, + "loss": 0.8584, + "step": 10010 + }, + { + "epoch": 7.252985884907709, + "grad_norm": 1.746782898902893, + "learning_rate": 0.0002, + "loss": 0.8547, + "step": 10020 + }, + { + "epoch": 7.260224393774882, + "grad_norm": 1.019222617149353, + "learning_rate": 0.0002, + "loss": 0.8721, + "step": 10030 + }, + { + "epoch": 7.267462902642055, + "grad_norm": 1.3712849617004395, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 10040 + }, + { + "epoch": 7.274701411509229, + "grad_norm": 1.5264288187026978, + "learning_rate": 0.0002, + "loss": 0.9228, + "step": 10050 + }, + { + "epoch": 7.281939920376402, + "grad_norm": 1.2784953117370605, + "learning_rate": 0.0002, + "loss": 0.8803, + "step": 10060 + }, + { + "epoch": 7.2891784292435755, + "grad_norm": 1.0246731042861938, + "learning_rate": 0.0002, + "loss": 0.9144, + "step": 10070 + }, + { + "epoch": 7.296416938110749, + "grad_norm": 1.2060108184814453, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10080 + }, + { + "epoch": 7.303655446977922, + "grad_norm": 1.0908410549163818, + "learning_rate": 0.0002, + "loss": 0.8715, + "step": 10090 + }, + { + "epoch": 7.310893955845096, + "grad_norm": 1.2308661937713623, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 10100 + }, + { + "epoch": 7.318132464712269, + "grad_norm": 1.185610055923462, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 10110 + }, + { + "epoch": 7.325370973579442, + "grad_norm": 1.0026527643203735, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 10120 + }, + { + "epoch": 7.332609482446616, + "grad_norm": 1.3346470594406128, + "learning_rate": 0.0002, + "loss": 0.9155, + "step": 10130 + }, + { + "epoch": 7.339847991313789, + "grad_norm": 1.5946321487426758, + "learning_rate": 0.0002, + "loss": 0.975, + "step": 10140 + }, + { + "epoch": 7.347086500180962, + "grad_norm": 1.3622175455093384, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 10150 + }, + { + "epoch": 7.354325009048136, + "grad_norm": 1.0937085151672363, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 10160 + }, + { + "epoch": 7.361563517915309, + "grad_norm": 1.6057474613189697, + "learning_rate": 0.0002, + "loss": 0.8548, + "step": 10170 + }, + { + "epoch": 7.3688020267824825, + "grad_norm": 1.234887719154358, + "learning_rate": 0.0002, + "loss": 0.8345, + "step": 10180 + }, + { + "epoch": 7.376040535649656, + "grad_norm": 1.2238616943359375, + "learning_rate": 0.0002, + "loss": 0.9041, + "step": 10190 + }, + { + "epoch": 7.383279044516829, + "grad_norm": 1.2640055418014526, + "learning_rate": 0.0002, + "loss": 0.8568, + "step": 10200 + }, + { + "epoch": 7.390517553384003, + "grad_norm": 1.2917805910110474, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 10210 + }, + { + "epoch": 7.397756062251176, + "grad_norm": 1.096583366394043, + "learning_rate": 0.0002, + "loss": 0.8748, + "step": 10220 + }, + { + "epoch": 7.404994571118349, + "grad_norm": 1.1854201555252075, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 10230 + }, + { + "epoch": 7.412233079985523, + "grad_norm": 1.2318766117095947, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 10240 + }, + { + "epoch": 7.419471588852696, + "grad_norm": 1.395302414894104, + "learning_rate": 0.0002, + "loss": 0.9144, + "step": 10250 + }, + { + "epoch": 7.4267100977198695, + "grad_norm": 1.118148922920227, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 10260 + }, + { + "epoch": 7.433948606587043, + "grad_norm": 1.1969468593597412, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 10270 + }, + { + "epoch": 7.441187115454216, + "grad_norm": 1.434050440788269, + "learning_rate": 0.0002, + "loss": 0.8955, + "step": 10280 + }, + { + "epoch": 7.4484256243213895, + "grad_norm": 1.2344770431518555, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 10290 + }, + { + "epoch": 7.455664133188563, + "grad_norm": 1.2186434268951416, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 10300 + }, + { + "epoch": 7.462902642055736, + "grad_norm": 1.482475757598877, + "learning_rate": 0.0002, + "loss": 0.8937, + "step": 10310 + }, + { + "epoch": 7.47014115092291, + "grad_norm": 1.8391777276992798, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 10320 + }, + { + "epoch": 7.477379659790083, + "grad_norm": 1.9489128589630127, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 10330 + }, + { + "epoch": 7.484618168657256, + "grad_norm": 1.369743824005127, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 10340 + }, + { + "epoch": 7.49185667752443, + "grad_norm": 1.3188602924346924, + "learning_rate": 0.0002, + "loss": 0.9501, + "step": 10350 + }, + { + "epoch": 7.499095186391603, + "grad_norm": 1.1885292530059814, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 10360 + }, + { + "epoch": 7.5063336952587765, + "grad_norm": 1.4873403310775757, + "learning_rate": 0.0002, + "loss": 0.8832, + "step": 10370 + }, + { + "epoch": 7.51357220412595, + "grad_norm": 1.8681598901748657, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 10380 + }, + { + "epoch": 7.520810712993123, + "grad_norm": 1.398186445236206, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 10390 + }, + { + "epoch": 7.528049221860297, + "grad_norm": 1.272192358970642, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 10400 + }, + { + "epoch": 7.53528773072747, + "grad_norm": 0.9671797752380371, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 10410 + }, + { + "epoch": 7.542526239594643, + "grad_norm": 0.9752382040023804, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 10420 + }, + { + "epoch": 7.549764748461817, + "grad_norm": 1.2241966724395752, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 10430 + }, + { + "epoch": 7.55700325732899, + "grad_norm": 1.4615166187286377, + "learning_rate": 0.0002, + "loss": 0.8497, + "step": 10440 + }, + { + "epoch": 7.564241766196163, + "grad_norm": 1.123205542564392, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 10450 + }, + { + "epoch": 7.571480275063337, + "grad_norm": 1.3798918724060059, + "learning_rate": 0.0002, + "loss": 0.893, + "step": 10460 + }, + { + "epoch": 7.57871878393051, + "grad_norm": 1.3772553205490112, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 10470 + }, + { + "epoch": 7.5859572927976835, + "grad_norm": 1.4591912031173706, + "learning_rate": 0.0002, + "loss": 0.7907, + "step": 10480 + }, + { + "epoch": 7.593195801664857, + "grad_norm": 1.4248491525650024, + "learning_rate": 0.0002, + "loss": 0.878, + "step": 10490 + }, + { + "epoch": 7.60043431053203, + "grad_norm": 1.2663065195083618, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 10500 + }, + { + "epoch": 7.607672819399204, + "grad_norm": 1.1095938682556152, + "learning_rate": 0.0002, + "loss": 0.8349, + "step": 10510 + }, + { + "epoch": 7.614911328266377, + "grad_norm": 1.8462796211242676, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 10520 + }, + { + "epoch": 7.62214983713355, + "grad_norm": 1.1936118602752686, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 10530 + }, + { + "epoch": 7.629388346000724, + "grad_norm": 1.3520885705947876, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 10540 + }, + { + "epoch": 7.636626854867897, + "grad_norm": 1.2915338277816772, + "learning_rate": 0.0002, + "loss": 0.8589, + "step": 10550 + }, + { + "epoch": 7.64386536373507, + "grad_norm": 1.125656008720398, + "learning_rate": 0.0002, + "loss": 0.8932, + "step": 10560 + }, + { + "epoch": 7.651103872602244, + "grad_norm": 1.419791579246521, + "learning_rate": 0.0002, + "loss": 0.8689, + "step": 10570 + }, + { + "epoch": 7.658342381469417, + "grad_norm": 1.2106866836547852, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 10580 + }, + { + "epoch": 7.6655808903365905, + "grad_norm": 1.359818458557129, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 10590 + }, + { + "epoch": 7.672819399203764, + "grad_norm": 1.3971713781356812, + "learning_rate": 0.0002, + "loss": 0.8986, + "step": 10600 + }, + { + "epoch": 7.680057908070937, + "grad_norm": 1.287888765335083, + "learning_rate": 0.0002, + "loss": 0.942, + "step": 10610 + }, + { + "epoch": 7.687296416938111, + "grad_norm": 0.9856569766998291, + "learning_rate": 0.0002, + "loss": 0.8835, + "step": 10620 + }, + { + "epoch": 7.694534925805284, + "grad_norm": 1.5403797626495361, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 10630 + }, + { + "epoch": 7.701773434672457, + "grad_norm": 1.204551339149475, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 10640 + }, + { + "epoch": 7.709011943539631, + "grad_norm": 1.3801014423370361, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 10650 + }, + { + "epoch": 7.716250452406804, + "grad_norm": 1.3335949182510376, + "learning_rate": 0.0002, + "loss": 0.8715, + "step": 10660 + }, + { + "epoch": 7.723488961273977, + "grad_norm": 1.1740102767944336, + "learning_rate": 0.0002, + "loss": 0.9412, + "step": 10670 + }, + { + "epoch": 7.730727470141151, + "grad_norm": 1.1663082838058472, + "learning_rate": 0.0002, + "loss": 0.832, + "step": 10680 + }, + { + "epoch": 7.737965979008324, + "grad_norm": 1.3149393796920776, + "learning_rate": 0.0002, + "loss": 0.9191, + "step": 10690 + }, + { + "epoch": 7.7452044878754975, + "grad_norm": 1.3169108629226685, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 10700 + }, + { + "epoch": 7.752442996742671, + "grad_norm": 1.4583594799041748, + "learning_rate": 0.0002, + "loss": 0.8623, + "step": 10710 + }, + { + "epoch": 7.759681505609844, + "grad_norm": 1.1077126264572144, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 10720 + }, + { + "epoch": 7.766920014477018, + "grad_norm": 1.5475820302963257, + "learning_rate": 0.0002, + "loss": 0.9109, + "step": 10730 + }, + { + "epoch": 7.774158523344191, + "grad_norm": 1.2319282293319702, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 10740 + }, + { + "epoch": 7.781397032211364, + "grad_norm": 0.9938047528266907, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 10750 + }, + { + "epoch": 7.788635541078538, + "grad_norm": 1.2498962879180908, + "learning_rate": 0.0002, + "loss": 0.9356, + "step": 10760 + }, + { + "epoch": 7.795874049945711, + "grad_norm": 2.192695379257202, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 10770 + }, + { + "epoch": 7.8031125588128845, + "grad_norm": 1.1851826906204224, + "learning_rate": 0.0002, + "loss": 0.9072, + "step": 10780 + }, + { + "epoch": 7.810351067680058, + "grad_norm": 1.0591034889221191, + "learning_rate": 0.0002, + "loss": 0.8933, + "step": 10790 + }, + { + "epoch": 7.817589576547231, + "grad_norm": 0.9350354671478271, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 10800 + }, + { + "epoch": 7.8248280854144046, + "grad_norm": 1.5080015659332275, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 10810 + }, + { + "epoch": 7.832066594281578, + "grad_norm": 2.136425495147705, + "learning_rate": 0.0002, + "loss": 0.9315, + "step": 10820 + }, + { + "epoch": 7.839305103148751, + "grad_norm": 1.5646673440933228, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 10830 + }, + { + "epoch": 7.846543612015925, + "grad_norm": 1.381301999092102, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 10840 + }, + { + "epoch": 7.853782120883098, + "grad_norm": 1.9323210716247559, + "learning_rate": 0.0002, + "loss": 0.9088, + "step": 10850 + }, + { + "epoch": 7.861020629750271, + "grad_norm": 1.020809531211853, + "learning_rate": 0.0002, + "loss": 0.8854, + "step": 10860 + }, + { + "epoch": 7.868259138617445, + "grad_norm": 1.1488909721374512, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 10870 + }, + { + "epoch": 7.875497647484618, + "grad_norm": 1.4068763256072998, + "learning_rate": 0.0002, + "loss": 0.883, + "step": 10880 + }, + { + "epoch": 7.8827361563517915, + "grad_norm": 0.9201020002365112, + "learning_rate": 0.0002, + "loss": 0.7431, + "step": 10890 + }, + { + "epoch": 7.889974665218965, + "grad_norm": 1.3163132667541504, + "learning_rate": 0.0002, + "loss": 0.8358, + "step": 10900 + }, + { + "epoch": 7.897213174086138, + "grad_norm": 1.65055513381958, + "learning_rate": 0.0002, + "loss": 0.9908, + "step": 10910 + }, + { + "epoch": 7.904451682953312, + "grad_norm": 1.1068748235702515, + "learning_rate": 0.0002, + "loss": 0.9883, + "step": 10920 + }, + { + "epoch": 7.911690191820485, + "grad_norm": 1.8744254112243652, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 10930 + }, + { + "epoch": 7.918928700687658, + "grad_norm": 1.3279157876968384, + "learning_rate": 0.0002, + "loss": 0.8969, + "step": 10940 + }, + { + "epoch": 7.926167209554832, + "grad_norm": 1.0890769958496094, + "learning_rate": 0.0002, + "loss": 0.8642, + "step": 10950 + }, + { + "epoch": 7.933405718422005, + "grad_norm": 1.3951836824417114, + "learning_rate": 0.0002, + "loss": 0.8742, + "step": 10960 + }, + { + "epoch": 7.940644227289178, + "grad_norm": 1.2761356830596924, + "learning_rate": 0.0002, + "loss": 0.895, + "step": 10970 + }, + { + "epoch": 7.947882736156352, + "grad_norm": 1.2073882818222046, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 10980 + }, + { + "epoch": 7.955121245023525, + "grad_norm": 1.1899374723434448, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 10990 + }, + { + "epoch": 7.9623597538906985, + "grad_norm": 1.3041194677352905, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 11000 + }, + { + "epoch": 7.969598262757872, + "grad_norm": 1.3564491271972656, + "learning_rate": 0.0002, + "loss": 0.8207, + "step": 11010 + }, + { + "epoch": 7.976836771625045, + "grad_norm": 1.1411082744598389, + "learning_rate": 0.0002, + "loss": 0.895, + "step": 11020 + }, + { + "epoch": 7.984075280492219, + "grad_norm": 1.1378493309020996, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 11030 + }, + { + "epoch": 7.991313789359392, + "grad_norm": 1.5072855949401855, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 11040 + }, + { + "epoch": 7.997104596453131, + "eval_loss": 1.5476553440093994, + "eval_runtime": 26.3225, + "eval_samples_per_second": 16.564, + "eval_steps_per_second": 2.089, + "step": 11048 + } + ], + "logging_steps": 10, + "max_steps": 11048, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.35028615510229e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4d7b7431bbbe8c9bf29b925bca391a558af5ff8c --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-11048/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad613885e4f267fc04125f1a836d42cfa796bbe12e536f9ee60c955de02cdb5a +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e99bbcd43df1c19d98706c7e3be95c93844c5349 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3892c93516f4e1f5762f7c87679d013348b5051f --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c71d459969d8bed507ce9131e6643b65ab1d862f073a373123b5b5b6fb91eb26 +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8205864177320ea73a81c1d8515f6139843e2e0d --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f78e3a9f40d3b81ab4f3ca6497abb80b966ab129bf8b0e5327f7142e8c0d421 +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..81321b6bce511583b7000f2a0b7a954ce14c7d68 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e317f41062559ca5eb1af54e847c5d7cf85da67608a9312cf4265eb0c1c2611 +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..93d93154c7de3957b2ca32bd9583ae698db6750f --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:568cc02dab4cab04c964cedf1df761956570e2e74c904d82b8bb30340ddb6861 +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..96436c688664ec881c52140e90adcc41b536d363 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/trainer_state.json @@ -0,0 +1,1007 @@ +{ + "best_metric": 1.480796456336975, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381", + "epoch": 0.9996380745566413, + "eval_steps": 10, + "global_step": 1381, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007238508867173362, + "grad_norm": 1.2523442506790161, + "learning_rate": 0.0002, + "loss": 4.7061, + "step": 10 + }, + { + "epoch": 0.014477017734346724, + "grad_norm": 1.8887330293655396, + "learning_rate": 0.0002, + "loss": 3.3493, + "step": 20 + }, + { + "epoch": 0.021715526601520086, + "grad_norm": 0.9668035507202148, + "learning_rate": 0.0002, + "loss": 2.7585, + "step": 30 + }, + { + "epoch": 0.028954035468693448, + "grad_norm": 2.9167306423187256, + "learning_rate": 0.0002, + "loss": 2.3699, + "step": 40 + }, + { + "epoch": 0.036192544335866814, + "grad_norm": 2.649867296218872, + "learning_rate": 0.0002, + "loss": 2.2679, + "step": 50 + }, + { + "epoch": 0.04343105320304017, + "grad_norm": 1.5120655298233032, + "learning_rate": 0.0002, + "loss": 2.2202, + "step": 60 + }, + { + "epoch": 0.05066956207021354, + "grad_norm": 0.7879868149757385, + "learning_rate": 0.0002, + "loss": 2.2026, + "step": 70 + }, + { + "epoch": 0.057908070937386896, + "grad_norm": 0.7616953253746033, + "learning_rate": 0.0002, + "loss": 1.9447, + "step": 80 + }, + { + "epoch": 0.06514657980456026, + "grad_norm": 1.8809149265289307, + "learning_rate": 0.0002, + "loss": 2.0112, + "step": 90 + }, + { + "epoch": 0.07238508867173363, + "grad_norm": 0.9294016361236572, + "learning_rate": 0.0002, + "loss": 1.8337, + "step": 100 + }, + { + "epoch": 0.07962359753890698, + "grad_norm": 0.7145281434059143, + "learning_rate": 0.0002, + "loss": 1.8419, + "step": 110 + }, + { + "epoch": 0.08686210640608034, + "grad_norm": 0.7564446330070496, + "learning_rate": 0.0002, + "loss": 2.0036, + "step": 120 + }, + { + "epoch": 0.09410061527325371, + "grad_norm": 1.1681925058364868, + "learning_rate": 0.0002, + "loss": 1.9306, + "step": 130 + }, + { + "epoch": 0.10133912414042708, + "grad_norm": 0.6708641648292542, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 140 + }, + { + "epoch": 0.10857763300760044, + "grad_norm": 0.7625647783279419, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 150 + }, + { + "epoch": 0.11581614187477379, + "grad_norm": 0.8463464975357056, + "learning_rate": 0.0002, + "loss": 1.6687, + "step": 160 + }, + { + "epoch": 0.12305465074194716, + "grad_norm": 0.7502335906028748, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 170 + }, + { + "epoch": 0.13029315960912052, + "grad_norm": 0.6929958462715149, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 180 + }, + { + "epoch": 0.1375316684762939, + "grad_norm": 0.6798707842826843, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 190 + }, + { + "epoch": 0.14477017734346725, + "grad_norm": 0.7566508650779724, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 200 + }, + { + "epoch": 0.15200868621064062, + "grad_norm": 0.7196869850158691, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 210 + }, + { + "epoch": 0.15924719507781396, + "grad_norm": 0.8401045799255371, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 220 + }, + { + "epoch": 0.16648570394498732, + "grad_norm": 0.8503773212432861, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 230 + }, + { + "epoch": 0.1737242128121607, + "grad_norm": 0.7183733582496643, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 240 + }, + { + "epoch": 0.18096272167933405, + "grad_norm": 0.7082605957984924, + "learning_rate": 0.0002, + "loss": 1.6693, + "step": 250 + }, + { + "epoch": 0.18820123054650742, + "grad_norm": 0.9386326670646667, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 260 + }, + { + "epoch": 0.19543973941368079, + "grad_norm": 0.7332451939582825, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 270 + }, + { + "epoch": 0.20267824828085415, + "grad_norm": 0.7092869877815247, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 280 + }, + { + "epoch": 0.20991675714802752, + "grad_norm": 0.7256413698196411, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 290 + }, + { + "epoch": 0.21715526601520088, + "grad_norm": 0.6398681402206421, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 300 + }, + { + "epoch": 0.22439377488237422, + "grad_norm": 0.6273287534713745, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 310 + }, + { + "epoch": 0.23163228374954759, + "grad_norm": 0.511648416519165, + "learning_rate": 0.0002, + "loss": 1.5115, + "step": 320 + }, + { + "epoch": 0.23887079261672095, + "grad_norm": 0.8677352070808411, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 330 + }, + { + "epoch": 0.24610930148389432, + "grad_norm": 0.6270743012428284, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 340 + }, + { + "epoch": 0.2533478103510677, + "grad_norm": 0.7980281114578247, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 350 + }, + { + "epoch": 0.26058631921824105, + "grad_norm": 0.632486879825592, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 360 + }, + { + "epoch": 0.2678248280854144, + "grad_norm": 0.6527034640312195, + "learning_rate": 0.0002, + "loss": 1.5175, + "step": 370 + }, + { + "epoch": 0.2750633369525878, + "grad_norm": 0.7672118544578552, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 380 + }, + { + "epoch": 0.28230184581976114, + "grad_norm": 0.6035117506980896, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 390 + }, + { + "epoch": 0.2895403546869345, + "grad_norm": 0.5955103039741516, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 400 + }, + { + "epoch": 0.2967788635541079, + "grad_norm": 0.6015191674232483, + "learning_rate": 0.0002, + "loss": 1.558, + "step": 410 + }, + { + "epoch": 0.30401737242128124, + "grad_norm": 0.6380982398986816, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 420 + }, + { + "epoch": 0.3112558812884546, + "grad_norm": 0.6707863211631775, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 430 + }, + { + "epoch": 0.3184943901556279, + "grad_norm": 0.7010176777839661, + "learning_rate": 0.0002, + "loss": 1.4426, + "step": 440 + }, + { + "epoch": 0.3257328990228013, + "grad_norm": 0.8263739943504333, + "learning_rate": 0.0002, + "loss": 1.5572, + "step": 450 + }, + { + "epoch": 0.33297140788997465, + "grad_norm": 0.7253276109695435, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 460 + }, + { + "epoch": 0.340209916757148, + "grad_norm": 0.5238934755325317, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 470 + }, + { + "epoch": 0.3474484256243214, + "grad_norm": 0.7869495749473572, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 480 + }, + { + "epoch": 0.35468693449149474, + "grad_norm": 0.7485215663909912, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 490 + }, + { + "epoch": 0.3619254433586681, + "grad_norm": 0.5413193106651306, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 500 + }, + { + "epoch": 0.3691639522258415, + "grad_norm": 0.7615048885345459, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 510 + }, + { + "epoch": 0.37640246109301484, + "grad_norm": 0.7685340046882629, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 520 + }, + { + "epoch": 0.3836409699601882, + "grad_norm": 0.6379081010818481, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 530 + }, + { + "epoch": 0.39087947882736157, + "grad_norm": 0.7946939468383789, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 540 + }, + { + "epoch": 0.39811798769453494, + "grad_norm": 0.6287278532981873, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 550 + }, + { + "epoch": 0.4053564965617083, + "grad_norm": 0.6811642646789551, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 560 + }, + { + "epoch": 0.41259500542888167, + "grad_norm": 0.671073317527771, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 570 + }, + { + "epoch": 0.41983351429605503, + "grad_norm": 0.6313900351524353, + "learning_rate": 0.0002, + "loss": 1.6753, + "step": 580 + }, + { + "epoch": 0.4270720231632284, + "grad_norm": 0.5291772484779358, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 590 + }, + { + "epoch": 0.43431053203040176, + "grad_norm": 0.62503582239151, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 600 + }, + { + "epoch": 0.4415490408975751, + "grad_norm": 0.5777305364608765, + "learning_rate": 0.0002, + "loss": 1.6276, + "step": 610 + }, + { + "epoch": 0.44878754976474844, + "grad_norm": 0.7013497352600098, + "learning_rate": 0.0002, + "loss": 1.4758, + "step": 620 + }, + { + "epoch": 0.4560260586319218, + "grad_norm": 0.8044822216033936, + "learning_rate": 0.0002, + "loss": 1.4029, + "step": 630 + }, + { + "epoch": 0.46326456749909517, + "grad_norm": 0.672531247138977, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 640 + }, + { + "epoch": 0.47050307636626854, + "grad_norm": 0.6233910322189331, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 650 + }, + { + "epoch": 0.4777415852334419, + "grad_norm": 0.651524543762207, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 660 + }, + { + "epoch": 0.48498009410061527, + "grad_norm": 0.7213939428329468, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 670 + }, + { + "epoch": 0.49221860296778863, + "grad_norm": 0.6541454792022705, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 680 + }, + { + "epoch": 0.499457111834962, + "grad_norm": 0.6568936109542847, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 690 + }, + { + "epoch": 0.5066956207021354, + "grad_norm": 0.7176415324211121, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 700 + }, + { + "epoch": 0.5139341295693087, + "grad_norm": 0.6553855538368225, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 710 + }, + { + "epoch": 0.5211726384364821, + "grad_norm": 0.5654335618019104, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 720 + }, + { + "epoch": 0.5284111473036555, + "grad_norm": 0.5671001672744751, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 730 + }, + { + "epoch": 0.5356496561708288, + "grad_norm": 0.7914412021636963, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 740 + }, + { + "epoch": 0.5428881650380022, + "grad_norm": 0.6172138452529907, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 750 + }, + { + "epoch": 0.5501266739051756, + "grad_norm": 0.6132623553276062, + "learning_rate": 0.0002, + "loss": 1.4018, + "step": 760 + }, + { + "epoch": 0.5573651827723489, + "grad_norm": 0.654000461101532, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 770 + }, + { + "epoch": 0.5646036916395223, + "grad_norm": 0.5691370964050293, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 780 + }, + { + "epoch": 0.5718422005066957, + "grad_norm": 0.7922580242156982, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 790 + }, + { + "epoch": 0.579080709373869, + "grad_norm": 0.6831880211830139, + "learning_rate": 0.0002, + "loss": 1.4521, + "step": 800 + }, + { + "epoch": 0.5863192182410424, + "grad_norm": 0.6740124821662903, + "learning_rate": 0.0002, + "loss": 1.4734, + "step": 810 + }, + { + "epoch": 0.5935577271082157, + "grad_norm": 1.380016803741455, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 820 + }, + { + "epoch": 0.6007962359753891, + "grad_norm": 0.6552878022193909, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 830 + }, + { + "epoch": 0.6080347448425625, + "grad_norm": 0.6649535298347473, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 840 + }, + { + "epoch": 0.6152732537097358, + "grad_norm": 0.561738133430481, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 850 + }, + { + "epoch": 0.6225117625769092, + "grad_norm": 0.6133047938346863, + "learning_rate": 0.0002, + "loss": 1.5364, + "step": 860 + }, + { + "epoch": 0.6297502714440825, + "grad_norm": 0.559843122959137, + "learning_rate": 0.0002, + "loss": 1.3489, + "step": 870 + }, + { + "epoch": 0.6369887803112558, + "grad_norm": 0.6117811799049377, + "learning_rate": 0.0002, + "loss": 1.4878, + "step": 880 + }, + { + "epoch": 0.6442272891784292, + "grad_norm": 0.6209776401519775, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 890 + }, + { + "epoch": 0.6514657980456026, + "grad_norm": 0.6234082579612732, + "learning_rate": 0.0002, + "loss": 1.6747, + "step": 900 + }, + { + "epoch": 0.6587043069127759, + "grad_norm": 0.7623258233070374, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 910 + }, + { + "epoch": 0.6659428157799493, + "grad_norm": 0.6148061752319336, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 920 + }, + { + "epoch": 0.6731813246471227, + "grad_norm": 0.6682973504066467, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 930 + }, + { + "epoch": 0.680419833514296, + "grad_norm": 0.5513041615486145, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 940 + }, + { + "epoch": 0.6876583423814694, + "grad_norm": 0.5197525024414062, + "learning_rate": 0.0002, + "loss": 1.3991, + "step": 950 + }, + { + "epoch": 0.6948968512486428, + "grad_norm": 0.6490758061408997, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 960 + }, + { + "epoch": 0.7021353601158161, + "grad_norm": 0.6450682878494263, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 970 + }, + { + "epoch": 0.7093738689829895, + "grad_norm": 0.6203766465187073, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 980 + }, + { + "epoch": 0.7166123778501629, + "grad_norm": 0.6023609638214111, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 990 + }, + { + "epoch": 0.7238508867173362, + "grad_norm": 0.5765255093574524, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1000 + }, + { + "epoch": 0.7310893955845096, + "grad_norm": 0.6650075316429138, + "learning_rate": 0.0002, + "loss": 1.483, + "step": 1010 + }, + { + "epoch": 0.738327904451683, + "grad_norm": 0.5610854029655457, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 1020 + }, + { + "epoch": 0.7455664133188563, + "grad_norm": 0.7072813510894775, + "learning_rate": 0.0002, + "loss": 1.5248, + "step": 1030 + }, + { + "epoch": 0.7528049221860297, + "grad_norm": 0.6815407872200012, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 1040 + }, + { + "epoch": 0.760043431053203, + "grad_norm": 0.7932390570640564, + "learning_rate": 0.0002, + "loss": 1.4577, + "step": 1050 + }, + { + "epoch": 0.7672819399203764, + "grad_norm": 0.5798183083534241, + "learning_rate": 0.0002, + "loss": 1.4515, + "step": 1060 + }, + { + "epoch": 0.7745204487875498, + "grad_norm": 0.7898504137992859, + "learning_rate": 0.0002, + "loss": 1.5053, + "step": 1070 + }, + { + "epoch": 0.7817589576547231, + "grad_norm": 0.4983280301094055, + "learning_rate": 0.0002, + "loss": 1.4776, + "step": 1080 + }, + { + "epoch": 0.7889974665218965, + "grad_norm": 0.691403329372406, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 1090 + }, + { + "epoch": 0.7962359753890699, + "grad_norm": 0.5394481420516968, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 1100 + }, + { + "epoch": 0.8034744842562432, + "grad_norm": 0.5136822462081909, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 1110 + }, + { + "epoch": 0.8107129931234166, + "grad_norm": 0.6828126907348633, + "learning_rate": 0.0002, + "loss": 1.4902, + "step": 1120 + }, + { + "epoch": 0.81795150199059, + "grad_norm": 0.6799656748771667, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 1130 + }, + { + "epoch": 0.8251900108577633, + "grad_norm": 0.5428406000137329, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 1140 + }, + { + "epoch": 0.8324285197249367, + "grad_norm": 0.4811290502548218, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1150 + }, + { + "epoch": 0.8396670285921101, + "grad_norm": 0.5519434809684753, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 1160 + }, + { + "epoch": 0.8469055374592834, + "grad_norm": 0.9748060703277588, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1170 + }, + { + "epoch": 0.8541440463264568, + "grad_norm": 0.712609589099884, + "learning_rate": 0.0002, + "loss": 1.4309, + "step": 1180 + }, + { + "epoch": 0.8613825551936302, + "grad_norm": 0.6866157054901123, + "learning_rate": 0.0002, + "loss": 1.434, + "step": 1190 + }, + { + "epoch": 0.8686210640608035, + "grad_norm": 0.5068854093551636, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1200 + }, + { + "epoch": 0.8758595729279768, + "grad_norm": 0.6333245038986206, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1210 + }, + { + "epoch": 0.8830980817951501, + "grad_norm": 0.6424421072006226, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 1220 + }, + { + "epoch": 0.8903365906623235, + "grad_norm": 0.4771921932697296, + "learning_rate": 0.0002, + "loss": 1.4186, + "step": 1230 + }, + { + "epoch": 0.8975750995294969, + "grad_norm": 0.5191764235496521, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1240 + }, + { + "epoch": 0.9048136083966702, + "grad_norm": 0.756222128868103, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 1250 + }, + { + "epoch": 0.9120521172638436, + "grad_norm": 0.623823881149292, + "learning_rate": 0.0002, + "loss": 1.4396, + "step": 1260 + }, + { + "epoch": 0.919290626131017, + "grad_norm": 0.8166571259498596, + "learning_rate": 0.0002, + "loss": 1.3097, + "step": 1270 + }, + { + "epoch": 0.9265291349981903, + "grad_norm": 0.6059346795082092, + "learning_rate": 0.0002, + "loss": 1.4625, + "step": 1280 + }, + { + "epoch": 0.9337676438653637, + "grad_norm": 0.5842690467834473, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 1290 + }, + { + "epoch": 0.9410061527325371, + "grad_norm": 0.7649800777435303, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1300 + }, + { + "epoch": 0.9482446615997104, + "grad_norm": 0.6420919895172119, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 1310 + }, + { + "epoch": 0.9554831704668838, + "grad_norm": 0.7011452913284302, + "learning_rate": 0.0002, + "loss": 1.453, + "step": 1320 + }, + { + "epoch": 0.9627216793340572, + "grad_norm": 0.5783746242523193, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1330 + }, + { + "epoch": 0.9699601882012305, + "grad_norm": 0.5973192453384399, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1340 + }, + { + "epoch": 0.9771986970684039, + "grad_norm": 0.6181833744049072, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 1350 + }, + { + "epoch": 0.9844372059355773, + "grad_norm": 0.5563396215438843, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 1360 + }, + { + "epoch": 0.9916757148027506, + "grad_norm": 0.45723360776901245, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1370 + }, + { + "epoch": 0.998914223669924, + "grad_norm": 0.5947498679161072, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 1380 + }, + { + "epoch": 0.9996380745566413, + "eval_loss": 1.480796456336975, + "eval_runtime": 27.3103, + "eval_samples_per_second": 15.965, + "eval_steps_per_second": 2.014, + "step": 1381 + } + ], + "logging_steps": 10, + "max_steps": 11048, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6884687936946176e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4d7b7431bbbe8c9bf29b925bca391a558af5ff8c --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad613885e4f267fc04125f1a836d42cfa796bbe12e536f9ee60c955de02cdb5a +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e99bbcd43df1c19d98706c7e3be95c93844c5349 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8e0a88f4ddbdcab4a213de1d48b25947c90523fe --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9da8a367d56151653f5ff9143fe9d10f3ec83a5a6610e31d53748ea75673f4b +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..32b375f4a1edfa10ae8c3dc8083d8bc0a68b984c --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e73a5b960c502949a09578886e280eade3328143ddecbd6edc6542b05162277f +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2dd55fc461ae4f6d9ef68bed1742e771b59bb42b --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6acfc6453c61add2025a81b7aeb4cca24d4c821e87be1d086631674d7f9c4a33 +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7709aa944d4094e1f170ebf504e64d0bf6ce6fd9 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfbb5e3cc39c3fb3bb05a3dbc06549de20a41e5f1432694f538f56fe06ec0d46 +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8aa433345b63380da5893753c4fc4b9a2f7dead2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/trainer_state.json @@ -0,0 +1,1981 @@ +{ + "best_metric": 1.4366681575775146, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 2763, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007238508867173362, + "grad_norm": 1.2523442506790161, + "learning_rate": 0.0002, + "loss": 4.7061, + "step": 10 + }, + { + "epoch": 0.014477017734346724, + "grad_norm": 1.8887330293655396, + "learning_rate": 0.0002, + "loss": 3.3493, + "step": 20 + }, + { + "epoch": 0.021715526601520086, + "grad_norm": 0.9668035507202148, + "learning_rate": 0.0002, + "loss": 2.7585, + "step": 30 + }, + { + "epoch": 0.028954035468693448, + "grad_norm": 2.9167306423187256, + "learning_rate": 0.0002, + "loss": 2.3699, + "step": 40 + }, + { + "epoch": 0.036192544335866814, + "grad_norm": 2.649867296218872, + "learning_rate": 0.0002, + "loss": 2.2679, + "step": 50 + }, + { + "epoch": 0.04343105320304017, + "grad_norm": 1.5120655298233032, + "learning_rate": 0.0002, + "loss": 2.2202, + "step": 60 + }, + { + "epoch": 0.05066956207021354, + "grad_norm": 0.7879868149757385, + "learning_rate": 0.0002, + "loss": 2.2026, + "step": 70 + }, + { + "epoch": 0.057908070937386896, + "grad_norm": 0.7616953253746033, + "learning_rate": 0.0002, + "loss": 1.9447, + "step": 80 + }, + { + "epoch": 0.06514657980456026, + "grad_norm": 1.8809149265289307, + "learning_rate": 0.0002, + "loss": 2.0112, + "step": 90 + }, + { + "epoch": 0.07238508867173363, + "grad_norm": 0.9294016361236572, + "learning_rate": 0.0002, + "loss": 1.8337, + "step": 100 + }, + { + "epoch": 0.07962359753890698, + "grad_norm": 0.7145281434059143, + "learning_rate": 0.0002, + "loss": 1.8419, + "step": 110 + }, + { + "epoch": 0.08686210640608034, + "grad_norm": 0.7564446330070496, + "learning_rate": 0.0002, + "loss": 2.0036, + "step": 120 + }, + { + "epoch": 0.09410061527325371, + "grad_norm": 1.1681925058364868, + "learning_rate": 0.0002, + "loss": 1.9306, + "step": 130 + }, + { + "epoch": 0.10133912414042708, + "grad_norm": 0.6708641648292542, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 140 + }, + { + "epoch": 0.10857763300760044, + "grad_norm": 0.7625647783279419, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 150 + }, + { + "epoch": 0.11581614187477379, + "grad_norm": 0.8463464975357056, + "learning_rate": 0.0002, + "loss": 1.6687, + "step": 160 + }, + { + "epoch": 0.12305465074194716, + "grad_norm": 0.7502335906028748, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 170 + }, + { + "epoch": 0.13029315960912052, + "grad_norm": 0.6929958462715149, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 180 + }, + { + "epoch": 0.1375316684762939, + "grad_norm": 0.6798707842826843, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 190 + }, + { + "epoch": 0.14477017734346725, + "grad_norm": 0.7566508650779724, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 200 + }, + { + "epoch": 0.15200868621064062, + "grad_norm": 0.7196869850158691, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 210 + }, + { + "epoch": 0.15924719507781396, + "grad_norm": 0.8401045799255371, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 220 + }, + { + "epoch": 0.16648570394498732, + "grad_norm": 0.8503773212432861, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 230 + }, + { + "epoch": 0.1737242128121607, + "grad_norm": 0.7183733582496643, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 240 + }, + { + "epoch": 0.18096272167933405, + "grad_norm": 0.7082605957984924, + "learning_rate": 0.0002, + "loss": 1.6693, + "step": 250 + }, + { + "epoch": 0.18820123054650742, + "grad_norm": 0.9386326670646667, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 260 + }, + { + "epoch": 0.19543973941368079, + "grad_norm": 0.7332451939582825, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 270 + }, + { + "epoch": 0.20267824828085415, + "grad_norm": 0.7092869877815247, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 280 + }, + { + "epoch": 0.20991675714802752, + "grad_norm": 0.7256413698196411, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 290 + }, + { + "epoch": 0.21715526601520088, + "grad_norm": 0.6398681402206421, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 300 + }, + { + "epoch": 0.22439377488237422, + "grad_norm": 0.6273287534713745, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 310 + }, + { + "epoch": 0.23163228374954759, + "grad_norm": 0.511648416519165, + "learning_rate": 0.0002, + "loss": 1.5115, + "step": 320 + }, + { + "epoch": 0.23887079261672095, + "grad_norm": 0.8677352070808411, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 330 + }, + { + "epoch": 0.24610930148389432, + "grad_norm": 0.6270743012428284, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 340 + }, + { + "epoch": 0.2533478103510677, + "grad_norm": 0.7980281114578247, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 350 + }, + { + "epoch": 0.26058631921824105, + "grad_norm": 0.632486879825592, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 360 + }, + { + "epoch": 0.2678248280854144, + "grad_norm": 0.6527034640312195, + "learning_rate": 0.0002, + "loss": 1.5175, + "step": 370 + }, + { + "epoch": 0.2750633369525878, + "grad_norm": 0.7672118544578552, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 380 + }, + { + "epoch": 0.28230184581976114, + "grad_norm": 0.6035117506980896, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 390 + }, + { + "epoch": 0.2895403546869345, + "grad_norm": 0.5955103039741516, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 400 + }, + { + "epoch": 0.2967788635541079, + "grad_norm": 0.6015191674232483, + "learning_rate": 0.0002, + "loss": 1.558, + "step": 410 + }, + { + "epoch": 0.30401737242128124, + "grad_norm": 0.6380982398986816, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 420 + }, + { + "epoch": 0.3112558812884546, + "grad_norm": 0.6707863211631775, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 430 + }, + { + "epoch": 0.3184943901556279, + "grad_norm": 0.7010176777839661, + "learning_rate": 0.0002, + "loss": 1.4426, + "step": 440 + }, + { + "epoch": 0.3257328990228013, + "grad_norm": 0.8263739943504333, + "learning_rate": 0.0002, + "loss": 1.5572, + "step": 450 + }, + { + "epoch": 0.33297140788997465, + "grad_norm": 0.7253276109695435, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 460 + }, + { + "epoch": 0.340209916757148, + "grad_norm": 0.5238934755325317, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 470 + }, + { + "epoch": 0.3474484256243214, + "grad_norm": 0.7869495749473572, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 480 + }, + { + "epoch": 0.35468693449149474, + "grad_norm": 0.7485215663909912, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 490 + }, + { + "epoch": 0.3619254433586681, + "grad_norm": 0.5413193106651306, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 500 + }, + { + "epoch": 0.3691639522258415, + "grad_norm": 0.7615048885345459, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 510 + }, + { + "epoch": 0.37640246109301484, + "grad_norm": 0.7685340046882629, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 520 + }, + { + "epoch": 0.3836409699601882, + "grad_norm": 0.6379081010818481, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 530 + }, + { + "epoch": 0.39087947882736157, + "grad_norm": 0.7946939468383789, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 540 + }, + { + "epoch": 0.39811798769453494, + "grad_norm": 0.6287278532981873, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 550 + }, + { + "epoch": 0.4053564965617083, + "grad_norm": 0.6811642646789551, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 560 + }, + { + "epoch": 0.41259500542888167, + "grad_norm": 0.671073317527771, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 570 + }, + { + "epoch": 0.41983351429605503, + "grad_norm": 0.6313900351524353, + "learning_rate": 0.0002, + "loss": 1.6753, + "step": 580 + }, + { + "epoch": 0.4270720231632284, + "grad_norm": 0.5291772484779358, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 590 + }, + { + "epoch": 0.43431053203040176, + "grad_norm": 0.62503582239151, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 600 + }, + { + "epoch": 0.4415490408975751, + "grad_norm": 0.5777305364608765, + "learning_rate": 0.0002, + "loss": 1.6276, + "step": 610 + }, + { + "epoch": 0.44878754976474844, + "grad_norm": 0.7013497352600098, + "learning_rate": 0.0002, + "loss": 1.4758, + "step": 620 + }, + { + "epoch": 0.4560260586319218, + "grad_norm": 0.8044822216033936, + "learning_rate": 0.0002, + "loss": 1.4029, + "step": 630 + }, + { + "epoch": 0.46326456749909517, + "grad_norm": 0.672531247138977, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 640 + }, + { + "epoch": 0.47050307636626854, + "grad_norm": 0.6233910322189331, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 650 + }, + { + "epoch": 0.4777415852334419, + "grad_norm": 0.651524543762207, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 660 + }, + { + "epoch": 0.48498009410061527, + "grad_norm": 0.7213939428329468, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 670 + }, + { + "epoch": 0.49221860296778863, + "grad_norm": 0.6541454792022705, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 680 + }, + { + "epoch": 0.499457111834962, + "grad_norm": 0.6568936109542847, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 690 + }, + { + "epoch": 0.5066956207021354, + "grad_norm": 0.7176415324211121, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 700 + }, + { + "epoch": 0.5139341295693087, + "grad_norm": 0.6553855538368225, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 710 + }, + { + "epoch": 0.5211726384364821, + "grad_norm": 0.5654335618019104, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 720 + }, + { + "epoch": 0.5284111473036555, + "grad_norm": 0.5671001672744751, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 730 + }, + { + "epoch": 0.5356496561708288, + "grad_norm": 0.7914412021636963, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 740 + }, + { + "epoch": 0.5428881650380022, + "grad_norm": 0.6172138452529907, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 750 + }, + { + "epoch": 0.5501266739051756, + "grad_norm": 0.6132623553276062, + "learning_rate": 0.0002, + "loss": 1.4018, + "step": 760 + }, + { + "epoch": 0.5573651827723489, + "grad_norm": 0.654000461101532, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 770 + }, + { + "epoch": 0.5646036916395223, + "grad_norm": 0.5691370964050293, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 780 + }, + { + "epoch": 0.5718422005066957, + "grad_norm": 0.7922580242156982, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 790 + }, + { + "epoch": 0.579080709373869, + "grad_norm": 0.6831880211830139, + "learning_rate": 0.0002, + "loss": 1.4521, + "step": 800 + }, + { + "epoch": 0.5863192182410424, + "grad_norm": 0.6740124821662903, + "learning_rate": 0.0002, + "loss": 1.4734, + "step": 810 + }, + { + "epoch": 0.5935577271082157, + "grad_norm": 1.380016803741455, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 820 + }, + { + "epoch": 0.6007962359753891, + "grad_norm": 0.6552878022193909, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 830 + }, + { + "epoch": 0.6080347448425625, + "grad_norm": 0.6649535298347473, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 840 + }, + { + "epoch": 0.6152732537097358, + "grad_norm": 0.561738133430481, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 850 + }, + { + "epoch": 0.6225117625769092, + "grad_norm": 0.6133047938346863, + "learning_rate": 0.0002, + "loss": 1.5364, + "step": 860 + }, + { + "epoch": 0.6297502714440825, + "grad_norm": 0.559843122959137, + "learning_rate": 0.0002, + "loss": 1.3489, + "step": 870 + }, + { + "epoch": 0.6369887803112558, + "grad_norm": 0.6117811799049377, + "learning_rate": 0.0002, + "loss": 1.4878, + "step": 880 + }, + { + "epoch": 0.6442272891784292, + "grad_norm": 0.6209776401519775, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 890 + }, + { + "epoch": 0.6514657980456026, + "grad_norm": 0.6234082579612732, + "learning_rate": 0.0002, + "loss": 1.6747, + "step": 900 + }, + { + "epoch": 0.6587043069127759, + "grad_norm": 0.7623258233070374, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 910 + }, + { + "epoch": 0.6659428157799493, + "grad_norm": 0.6148061752319336, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 920 + }, + { + "epoch": 0.6731813246471227, + "grad_norm": 0.6682973504066467, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 930 + }, + { + "epoch": 0.680419833514296, + "grad_norm": 0.5513041615486145, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 940 + }, + { + "epoch": 0.6876583423814694, + "grad_norm": 0.5197525024414062, + "learning_rate": 0.0002, + "loss": 1.3991, + "step": 950 + }, + { + "epoch": 0.6948968512486428, + "grad_norm": 0.6490758061408997, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 960 + }, + { + "epoch": 0.7021353601158161, + "grad_norm": 0.6450682878494263, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 970 + }, + { + "epoch": 0.7093738689829895, + "grad_norm": 0.6203766465187073, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 980 + }, + { + "epoch": 0.7166123778501629, + "grad_norm": 0.6023609638214111, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 990 + }, + { + "epoch": 0.7238508867173362, + "grad_norm": 0.5765255093574524, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1000 + }, + { + "epoch": 0.7310893955845096, + "grad_norm": 0.6650075316429138, + "learning_rate": 0.0002, + "loss": 1.483, + "step": 1010 + }, + { + "epoch": 0.738327904451683, + "grad_norm": 0.5610854029655457, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 1020 + }, + { + "epoch": 0.7455664133188563, + "grad_norm": 0.7072813510894775, + "learning_rate": 0.0002, + "loss": 1.5248, + "step": 1030 + }, + { + "epoch": 0.7528049221860297, + "grad_norm": 0.6815407872200012, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 1040 + }, + { + "epoch": 0.760043431053203, + "grad_norm": 0.7932390570640564, + "learning_rate": 0.0002, + "loss": 1.4577, + "step": 1050 + }, + { + "epoch": 0.7672819399203764, + "grad_norm": 0.5798183083534241, + "learning_rate": 0.0002, + "loss": 1.4515, + "step": 1060 + }, + { + "epoch": 0.7745204487875498, + "grad_norm": 0.7898504137992859, + "learning_rate": 0.0002, + "loss": 1.5053, + "step": 1070 + }, + { + "epoch": 0.7817589576547231, + "grad_norm": 0.4983280301094055, + "learning_rate": 0.0002, + "loss": 1.4776, + "step": 1080 + }, + { + "epoch": 0.7889974665218965, + "grad_norm": 0.691403329372406, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 1090 + }, + { + "epoch": 0.7962359753890699, + "grad_norm": 0.5394481420516968, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 1100 + }, + { + "epoch": 0.8034744842562432, + "grad_norm": 0.5136822462081909, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 1110 + }, + { + "epoch": 0.8107129931234166, + "grad_norm": 0.6828126907348633, + "learning_rate": 0.0002, + "loss": 1.4902, + "step": 1120 + }, + { + "epoch": 0.81795150199059, + "grad_norm": 0.6799656748771667, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 1130 + }, + { + "epoch": 0.8251900108577633, + "grad_norm": 0.5428406000137329, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 1140 + }, + { + "epoch": 0.8324285197249367, + "grad_norm": 0.4811290502548218, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1150 + }, + { + "epoch": 0.8396670285921101, + "grad_norm": 0.5519434809684753, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 1160 + }, + { + "epoch": 0.8469055374592834, + "grad_norm": 0.9748060703277588, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1170 + }, + { + "epoch": 0.8541440463264568, + "grad_norm": 0.712609589099884, + "learning_rate": 0.0002, + "loss": 1.4309, + "step": 1180 + }, + { + "epoch": 0.8613825551936302, + "grad_norm": 0.6866157054901123, + "learning_rate": 0.0002, + "loss": 1.434, + "step": 1190 + }, + { + "epoch": 0.8686210640608035, + "grad_norm": 0.5068854093551636, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1200 + }, + { + "epoch": 0.8758595729279768, + "grad_norm": 0.6333245038986206, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1210 + }, + { + "epoch": 0.8830980817951501, + "grad_norm": 0.6424421072006226, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 1220 + }, + { + "epoch": 0.8903365906623235, + "grad_norm": 0.4771921932697296, + "learning_rate": 0.0002, + "loss": 1.4186, + "step": 1230 + }, + { + "epoch": 0.8975750995294969, + "grad_norm": 0.5191764235496521, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1240 + }, + { + "epoch": 0.9048136083966702, + "grad_norm": 0.756222128868103, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 1250 + }, + { + "epoch": 0.9120521172638436, + "grad_norm": 0.623823881149292, + "learning_rate": 0.0002, + "loss": 1.4396, + "step": 1260 + }, + { + "epoch": 0.919290626131017, + "grad_norm": 0.8166571259498596, + "learning_rate": 0.0002, + "loss": 1.3097, + "step": 1270 + }, + { + "epoch": 0.9265291349981903, + "grad_norm": 0.6059346795082092, + "learning_rate": 0.0002, + "loss": 1.4625, + "step": 1280 + }, + { + "epoch": 0.9337676438653637, + "grad_norm": 0.5842690467834473, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 1290 + }, + { + "epoch": 0.9410061527325371, + "grad_norm": 0.7649800777435303, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1300 + }, + { + "epoch": 0.9482446615997104, + "grad_norm": 0.6420919895172119, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 1310 + }, + { + "epoch": 0.9554831704668838, + "grad_norm": 0.7011452913284302, + "learning_rate": 0.0002, + "loss": 1.453, + "step": 1320 + }, + { + "epoch": 0.9627216793340572, + "grad_norm": 0.5783746242523193, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1330 + }, + { + "epoch": 0.9699601882012305, + "grad_norm": 0.5973192453384399, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1340 + }, + { + "epoch": 0.9771986970684039, + "grad_norm": 0.6181833744049072, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 1350 + }, + { + "epoch": 0.9844372059355773, + "grad_norm": 0.5563396215438843, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 1360 + }, + { + "epoch": 0.9916757148027506, + "grad_norm": 0.45723360776901245, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1370 + }, + { + "epoch": 0.998914223669924, + "grad_norm": 0.5947498679161072, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 1380 + }, + { + "epoch": 0.9996380745566413, + "eval_loss": 1.480796456336975, + "eval_runtime": 27.3103, + "eval_samples_per_second": 15.965, + "eval_steps_per_second": 2.014, + "step": 1381 + }, + { + "epoch": 1.0061527325370974, + "grad_norm": 0.5599952936172485, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 1390 + }, + { + "epoch": 1.0133912414042707, + "grad_norm": 0.5932008028030396, + "learning_rate": 0.0002, + "loss": 1.4991, + "step": 1400 + }, + { + "epoch": 1.020629750271444, + "grad_norm": 0.6194121837615967, + "learning_rate": 0.0002, + "loss": 1.4506, + "step": 1410 + }, + { + "epoch": 1.0278682591386175, + "grad_norm": 0.6995621919631958, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 1420 + }, + { + "epoch": 1.0351067680057908, + "grad_norm": 0.7905810475349426, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1430 + }, + { + "epoch": 1.0423452768729642, + "grad_norm": 0.7221615314483643, + "learning_rate": 0.0002, + "loss": 1.4414, + "step": 1440 + }, + { + "epoch": 1.0495837857401376, + "grad_norm": 0.6170642375946045, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 1450 + }, + { + "epoch": 1.056822294607311, + "grad_norm": 0.5844094753265381, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 1460 + }, + { + "epoch": 1.0640608034744843, + "grad_norm": 0.7731822729110718, + "learning_rate": 0.0002, + "loss": 1.4871, + "step": 1470 + }, + { + "epoch": 1.0712993123416577, + "grad_norm": 0.4554748237133026, + "learning_rate": 0.0002, + "loss": 1.4286, + "step": 1480 + }, + { + "epoch": 1.078537821208831, + "grad_norm": 0.6923259496688843, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 1490 + }, + { + "epoch": 1.0857763300760044, + "grad_norm": 0.6008219122886658, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 1500 + }, + { + "epoch": 1.0930148389431777, + "grad_norm": 0.6450045704841614, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 1510 + }, + { + "epoch": 1.1002533478103511, + "grad_norm": 0.7833753824234009, + "learning_rate": 0.0002, + "loss": 1.3295, + "step": 1520 + }, + { + "epoch": 1.1074918566775245, + "grad_norm": 0.5076758861541748, + "learning_rate": 0.0002, + "loss": 1.3424, + "step": 1530 + }, + { + "epoch": 1.1147303655446978, + "grad_norm": 0.5661332011222839, + "learning_rate": 0.0002, + "loss": 1.4043, + "step": 1540 + }, + { + "epoch": 1.1219688744118712, + "grad_norm": 0.6526919603347778, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 1550 + }, + { + "epoch": 1.1292073832790446, + "grad_norm": 0.5613082647323608, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1560 + }, + { + "epoch": 1.136445892146218, + "grad_norm": 0.6113885641098022, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 1570 + }, + { + "epoch": 1.1436844010133913, + "grad_norm": 0.6732510328292847, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 1580 + }, + { + "epoch": 1.1509229098805647, + "grad_norm": 0.6146392226219177, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 1590 + }, + { + "epoch": 1.158161418747738, + "grad_norm": 0.6766974329948425, + "learning_rate": 0.0002, + "loss": 1.411, + "step": 1600 + }, + { + "epoch": 1.1653999276149114, + "grad_norm": 0.7621957659721375, + "learning_rate": 0.0002, + "loss": 1.2401, + "step": 1610 + }, + { + "epoch": 1.1726384364820848, + "grad_norm": 0.6959581971168518, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 1620 + }, + { + "epoch": 1.1798769453492581, + "grad_norm": 0.6691278219223022, + "learning_rate": 0.0002, + "loss": 1.382, + "step": 1630 + }, + { + "epoch": 1.1871154542164315, + "grad_norm": 0.4927774965763092, + "learning_rate": 0.0002, + "loss": 1.4147, + "step": 1640 + }, + { + "epoch": 1.1943539630836049, + "grad_norm": 0.7724234461784363, + "learning_rate": 0.0002, + "loss": 1.449, + "step": 1650 + }, + { + "epoch": 1.2015924719507782, + "grad_norm": 0.6817787885665894, + "learning_rate": 0.0002, + "loss": 1.4778, + "step": 1660 + }, + { + "epoch": 1.2088309808179516, + "grad_norm": 0.6500699520111084, + "learning_rate": 0.0002, + "loss": 1.3776, + "step": 1670 + }, + { + "epoch": 1.216069489685125, + "grad_norm": 0.5703568458557129, + "learning_rate": 0.0002, + "loss": 1.3875, + "step": 1680 + }, + { + "epoch": 1.2233079985522983, + "grad_norm": 0.6261579990386963, + "learning_rate": 0.0002, + "loss": 1.4735, + "step": 1690 + }, + { + "epoch": 1.2305465074194717, + "grad_norm": 0.651713490486145, + "learning_rate": 0.0002, + "loss": 1.3898, + "step": 1700 + }, + { + "epoch": 1.237785016286645, + "grad_norm": 0.684399425983429, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 1710 + }, + { + "epoch": 1.2450235251538184, + "grad_norm": 0.6996857523918152, + "learning_rate": 0.0002, + "loss": 1.5027, + "step": 1720 + }, + { + "epoch": 1.2522620340209918, + "grad_norm": 0.7102537751197815, + "learning_rate": 0.0002, + "loss": 1.3326, + "step": 1730 + }, + { + "epoch": 1.2595005428881652, + "grad_norm": 0.45809897780418396, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 1740 + }, + { + "epoch": 1.2667390517553385, + "grad_norm": 0.6377046704292297, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 1750 + }, + { + "epoch": 1.2739775606225119, + "grad_norm": 0.6965704560279846, + "learning_rate": 0.0002, + "loss": 1.3479, + "step": 1760 + }, + { + "epoch": 1.2812160694896852, + "grad_norm": 0.5688214302062988, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 1770 + }, + { + "epoch": 1.2884545783568586, + "grad_norm": 0.6384190320968628, + "learning_rate": 0.0002, + "loss": 1.3967, + "step": 1780 + }, + { + "epoch": 1.295693087224032, + "grad_norm": 0.5629363656044006, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1790 + }, + { + "epoch": 1.3029315960912053, + "grad_norm": 0.6148255467414856, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 1800 + }, + { + "epoch": 1.3101701049583787, + "grad_norm": 0.655580997467041, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 1810 + }, + { + "epoch": 1.3174086138255519, + "grad_norm": 0.5642657279968262, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 1820 + }, + { + "epoch": 1.3246471226927252, + "grad_norm": 0.59607994556427, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 1830 + }, + { + "epoch": 1.3318856315598986, + "grad_norm": 0.5564199090003967, + "learning_rate": 0.0002, + "loss": 1.3274, + "step": 1840 + }, + { + "epoch": 1.339124140427072, + "grad_norm": 0.6949955821037292, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1850 + }, + { + "epoch": 1.3463626492942453, + "grad_norm": 0.7036856412887573, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 1860 + }, + { + "epoch": 1.3536011581614187, + "grad_norm": 0.722062885761261, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 1870 + }, + { + "epoch": 1.360839667028592, + "grad_norm": 0.6098677515983582, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 1880 + }, + { + "epoch": 1.3680781758957654, + "grad_norm": 0.5376402735710144, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 1890 + }, + { + "epoch": 1.3753166847629388, + "grad_norm": 0.6974610090255737, + "learning_rate": 0.0002, + "loss": 1.5071, + "step": 1900 + }, + { + "epoch": 1.3825551936301121, + "grad_norm": 0.6520763635635376, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 1910 + }, + { + "epoch": 1.3897937024972855, + "grad_norm": 0.6604374647140503, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 1920 + }, + { + "epoch": 1.3970322113644589, + "grad_norm": 0.7364398241043091, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1930 + }, + { + "epoch": 1.4042707202316322, + "grad_norm": 0.6849475502967834, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 1940 + }, + { + "epoch": 1.4115092290988056, + "grad_norm": 0.6562670469284058, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 1950 + }, + { + "epoch": 1.418747737965979, + "grad_norm": 0.5695616006851196, + "learning_rate": 0.0002, + "loss": 1.4725, + "step": 1960 + }, + { + "epoch": 1.4259862468331523, + "grad_norm": 0.5244464874267578, + "learning_rate": 0.0002, + "loss": 1.3088, + "step": 1970 + }, + { + "epoch": 1.4332247557003257, + "grad_norm": 0.6347293257713318, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 1980 + }, + { + "epoch": 1.440463264567499, + "grad_norm": 0.5528361201286316, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 1990 + }, + { + "epoch": 1.4477017734346724, + "grad_norm": 0.6987585425376892, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2000 + }, + { + "epoch": 1.4549402823018458, + "grad_norm": 0.6568987369537354, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 2010 + }, + { + "epoch": 1.4621787911690192, + "grad_norm": 0.7665994763374329, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 2020 + }, + { + "epoch": 1.4694173000361925, + "grad_norm": 0.5127707123756409, + "learning_rate": 0.0002, + "loss": 1.244, + "step": 2030 + }, + { + "epoch": 1.476655808903366, + "grad_norm": 0.5406824946403503, + "learning_rate": 0.0002, + "loss": 1.3699, + "step": 2040 + }, + { + "epoch": 1.4838943177705393, + "grad_norm": 0.5990166664123535, + "learning_rate": 0.0002, + "loss": 1.3353, + "step": 2050 + }, + { + "epoch": 1.4911328266377126, + "grad_norm": 0.6186193823814392, + "learning_rate": 0.0002, + "loss": 1.2454, + "step": 2060 + }, + { + "epoch": 1.498371335504886, + "grad_norm": 0.6154307126998901, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2070 + }, + { + "epoch": 1.5056098443720594, + "grad_norm": 0.5606056451797485, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2080 + }, + { + "epoch": 1.5128483532392327, + "grad_norm": 0.5006417036056519, + "learning_rate": 0.0002, + "loss": 1.2405, + "step": 2090 + }, + { + "epoch": 1.520086862106406, + "grad_norm": 0.5968486070632935, + "learning_rate": 0.0002, + "loss": 1.4258, + "step": 2100 + }, + { + "epoch": 1.5273253709735795, + "grad_norm": 0.5835496187210083, + "learning_rate": 0.0002, + "loss": 1.2752, + "step": 2110 + }, + { + "epoch": 1.5345638798407528, + "grad_norm": 0.6753535270690918, + "learning_rate": 0.0002, + "loss": 1.5443, + "step": 2120 + }, + { + "epoch": 1.5418023887079262, + "grad_norm": 0.7299720644950867, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 2130 + }, + { + "epoch": 1.5490408975750996, + "grad_norm": 0.5105988383293152, + "learning_rate": 0.0002, + "loss": 1.2364, + "step": 2140 + }, + { + "epoch": 1.556279406442273, + "grad_norm": 0.5675431489944458, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2150 + }, + { + "epoch": 1.5635179153094463, + "grad_norm": 0.6246723532676697, + "learning_rate": 0.0002, + "loss": 1.4563, + "step": 2160 + }, + { + "epoch": 1.5707564241766196, + "grad_norm": 0.7291720509529114, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 2170 + }, + { + "epoch": 1.577994933043793, + "grad_norm": 0.678114116191864, + "learning_rate": 0.0002, + "loss": 1.5432, + "step": 2180 + }, + { + "epoch": 1.5852334419109664, + "grad_norm": 0.5136260986328125, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 2190 + }, + { + "epoch": 1.5924719507781397, + "grad_norm": 0.6359935998916626, + "learning_rate": 0.0002, + "loss": 1.3271, + "step": 2200 + }, + { + "epoch": 1.599710459645313, + "grad_norm": 0.7650278806686401, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 2210 + }, + { + "epoch": 1.6069489685124865, + "grad_norm": 0.7256110906600952, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 2220 + }, + { + "epoch": 1.6141874773796598, + "grad_norm": 0.688689649105072, + "learning_rate": 0.0002, + "loss": 1.4387, + "step": 2230 + }, + { + "epoch": 1.6214259862468332, + "grad_norm": 0.6045311093330383, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 2240 + }, + { + "epoch": 1.6286644951140063, + "grad_norm": 0.7064604163169861, + "learning_rate": 0.0002, + "loss": 1.4097, + "step": 2250 + }, + { + "epoch": 1.6359030039811797, + "grad_norm": 0.5309562087059021, + "learning_rate": 0.0002, + "loss": 1.3477, + "step": 2260 + }, + { + "epoch": 1.643141512848353, + "grad_norm": 0.5687053203582764, + "learning_rate": 0.0002, + "loss": 1.4022, + "step": 2270 + }, + { + "epoch": 1.6503800217155264, + "grad_norm": 0.535872757434845, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 2280 + }, + { + "epoch": 1.6576185305826998, + "grad_norm": 0.5502381920814514, + "learning_rate": 0.0002, + "loss": 1.3844, + "step": 2290 + }, + { + "epoch": 1.6648570394498732, + "grad_norm": 0.6158602237701416, + "learning_rate": 0.0002, + "loss": 1.3764, + "step": 2300 + }, + { + "epoch": 1.6720955483170465, + "grad_norm": 0.5804675817489624, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 2310 + }, + { + "epoch": 1.67933405718422, + "grad_norm": 0.600742757320404, + "learning_rate": 0.0002, + "loss": 1.2532, + "step": 2320 + }, + { + "epoch": 1.6865725660513933, + "grad_norm": 0.7101941108703613, + "learning_rate": 0.0002, + "loss": 1.477, + "step": 2330 + }, + { + "epoch": 1.6938110749185666, + "grad_norm": 0.7507809996604919, + "learning_rate": 0.0002, + "loss": 1.4849, + "step": 2340 + }, + { + "epoch": 1.70104958378574, + "grad_norm": 0.768502414226532, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 2350 + }, + { + "epoch": 1.7082880926529134, + "grad_norm": 0.4801851212978363, + "learning_rate": 0.0002, + "loss": 1.3332, + "step": 2360 + }, + { + "epoch": 1.7155266015200867, + "grad_norm": 0.5322122573852539, + "learning_rate": 0.0002, + "loss": 1.4158, + "step": 2370 + }, + { + "epoch": 1.72276511038726, + "grad_norm": 0.587661862373352, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2380 + }, + { + "epoch": 1.7300036192544335, + "grad_norm": 0.6073525547981262, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 2390 + }, + { + "epoch": 1.7372421281216068, + "grad_norm": 0.6950460076332092, + "learning_rate": 0.0002, + "loss": 1.2754, + "step": 2400 + }, + { + "epoch": 1.7444806369887802, + "grad_norm": 0.5981102585792542, + "learning_rate": 0.0002, + "loss": 1.3858, + "step": 2410 + }, + { + "epoch": 1.7517191458559536, + "grad_norm": 0.544570803642273, + "learning_rate": 0.0002, + "loss": 1.4075, + "step": 2420 + }, + { + "epoch": 1.758957654723127, + "grad_norm": 0.5304399728775024, + "learning_rate": 0.0002, + "loss": 1.3861, + "step": 2430 + }, + { + "epoch": 1.7661961635903003, + "grad_norm": 0.7921594977378845, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 2440 + }, + { + "epoch": 1.7734346724574737, + "grad_norm": 0.6084808707237244, + "learning_rate": 0.0002, + "loss": 1.3053, + "step": 2450 + }, + { + "epoch": 1.780673181324647, + "grad_norm": 0.8844701051712036, + "learning_rate": 0.0002, + "loss": 1.3781, + "step": 2460 + }, + { + "epoch": 1.7879116901918204, + "grad_norm": 0.5729258060455322, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 2470 + }, + { + "epoch": 1.7951501990589938, + "grad_norm": 0.6303611993789673, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 2480 + }, + { + "epoch": 1.8023887079261671, + "grad_norm": 0.5627942085266113, + "learning_rate": 0.0002, + "loss": 1.3926, + "step": 2490 + }, + { + "epoch": 1.8096272167933405, + "grad_norm": 0.6724274158477783, + "learning_rate": 0.0002, + "loss": 1.3816, + "step": 2500 + }, + { + "epoch": 1.8168657256605139, + "grad_norm": 0.5030826330184937, + "learning_rate": 0.0002, + "loss": 1.2951, + "step": 2510 + }, + { + "epoch": 1.8241042345276872, + "grad_norm": 0.5504099130630493, + "learning_rate": 0.0002, + "loss": 1.2839, + "step": 2520 + }, + { + "epoch": 1.8313427433948606, + "grad_norm": 0.6338945627212524, + "learning_rate": 0.0002, + "loss": 1.4264, + "step": 2530 + }, + { + "epoch": 1.838581252262034, + "grad_norm": 0.5902037620544434, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 2540 + }, + { + "epoch": 1.8458197611292073, + "grad_norm": 0.48814457654953003, + "learning_rate": 0.0002, + "loss": 1.2961, + "step": 2550 + }, + { + "epoch": 1.8530582699963807, + "grad_norm": 0.6216312646865845, + "learning_rate": 0.0002, + "loss": 1.466, + "step": 2560 + }, + { + "epoch": 1.860296778863554, + "grad_norm": 0.635603666305542, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 2570 + }, + { + "epoch": 1.8675352877307274, + "grad_norm": 0.6938216090202332, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2580 + }, + { + "epoch": 1.8747737965979008, + "grad_norm": 0.599557638168335, + "learning_rate": 0.0002, + "loss": 1.5011, + "step": 2590 + }, + { + "epoch": 1.8820123054650741, + "grad_norm": 0.564424455165863, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 2600 + }, + { + "epoch": 1.8892508143322475, + "grad_norm": 0.5430700182914734, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 2610 + }, + { + "epoch": 1.8964893231994209, + "grad_norm": 0.6150169372558594, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 2620 + }, + { + "epoch": 1.9037278320665942, + "grad_norm": 0.48159119486808777, + "learning_rate": 0.0002, + "loss": 1.2474, + "step": 2630 + }, + { + "epoch": 1.9109663409337676, + "grad_norm": 0.5608997941017151, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 2640 + }, + { + "epoch": 1.918204849800941, + "grad_norm": 0.6454501748085022, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 2650 + }, + { + "epoch": 1.9254433586681143, + "grad_norm": 0.5458073616027832, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 2660 + }, + { + "epoch": 1.9326818675352877, + "grad_norm": 0.5328490734100342, + "learning_rate": 0.0002, + "loss": 1.3208, + "step": 2670 + }, + { + "epoch": 1.939920376402461, + "grad_norm": 0.6444696187973022, + "learning_rate": 0.0002, + "loss": 1.4971, + "step": 2680 + }, + { + "epoch": 1.9471588852696344, + "grad_norm": 0.7126023769378662, + "learning_rate": 0.0002, + "loss": 1.5387, + "step": 2690 + }, + { + "epoch": 1.9543973941368078, + "grad_norm": 0.5164045095443726, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 2700 + }, + { + "epoch": 1.9616359030039812, + "grad_norm": 0.5347061157226562, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 2710 + }, + { + "epoch": 1.9688744118711545, + "grad_norm": 0.5297950506210327, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 2720 + }, + { + "epoch": 1.976112920738328, + "grad_norm": 0.6537790298461914, + "learning_rate": 0.0002, + "loss": 1.3566, + "step": 2730 + }, + { + "epoch": 1.9833514296055013, + "grad_norm": 0.5536222457885742, + "learning_rate": 0.0002, + "loss": 1.332, + "step": 2740 + }, + { + "epoch": 1.9905899384726746, + "grad_norm": 0.4856105446815491, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 2750 + }, + { + "epoch": 1.997828447339848, + "grad_norm": 0.6642730832099915, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 2760 + }, + { + "epoch": 2.0, + "eval_loss": 1.4366681575775146, + "eval_runtime": 27.3729, + "eval_samples_per_second": 15.928, + "eval_steps_per_second": 2.009, + "step": 2763 + } + ], + "logging_steps": 10, + "max_steps": 11048, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.376937587389235e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4d7b7431bbbe8c9bf29b925bca391a558af5ff8c --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad613885e4f267fc04125f1a836d42cfa796bbe12e536f9ee60c955de02cdb5a +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e99bbcd43df1c19d98706c7e3be95c93844c5349 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c79d70858ce055c6f1bb883cb35180f5113e2f20 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdd71ca55a2af1b8facc7c698205b6599fe0e892ac0b224b8ae93a917b4ed891 +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fefe026d77588a2fb04504506ae4b453fbff41e5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e6774e4d3087b746b73c316efd2376e80be183d49d96bbe52e95af6194cfc6b +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf374c934593037876e73e7d373f33b62283978a --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fc5b5b8871dcac290402ff2806fc4f44c48ede96564614f00630c0c35f35799 +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f4c2699ed751970f04911bcaaac08b3a60cf292 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bee20d46fc8ef8a7ae4eaca3f0c175688567933abd0069997d944e0d19b0cff +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3ab605fc65ec684264e9e3fdf98ee1ec92b78a8a --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/trainer_state.json @@ -0,0 +1,2955 @@ +{ + "best_metric": 1.4217946529388428, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144", + "epoch": 2.9996380745566413, + "eval_steps": 10, + "global_step": 4144, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007238508867173362, + "grad_norm": 1.2523442506790161, + "learning_rate": 0.0002, + "loss": 4.7061, + "step": 10 + }, + { + "epoch": 0.014477017734346724, + "grad_norm": 1.8887330293655396, + "learning_rate": 0.0002, + "loss": 3.3493, + "step": 20 + }, + { + "epoch": 0.021715526601520086, + "grad_norm": 0.9668035507202148, + "learning_rate": 0.0002, + "loss": 2.7585, + "step": 30 + }, + { + "epoch": 0.028954035468693448, + "grad_norm": 2.9167306423187256, + "learning_rate": 0.0002, + "loss": 2.3699, + "step": 40 + }, + { + "epoch": 0.036192544335866814, + "grad_norm": 2.649867296218872, + "learning_rate": 0.0002, + "loss": 2.2679, + "step": 50 + }, + { + "epoch": 0.04343105320304017, + "grad_norm": 1.5120655298233032, + "learning_rate": 0.0002, + "loss": 2.2202, + "step": 60 + }, + { + "epoch": 0.05066956207021354, + "grad_norm": 0.7879868149757385, + "learning_rate": 0.0002, + "loss": 2.2026, + "step": 70 + }, + { + "epoch": 0.057908070937386896, + "grad_norm": 0.7616953253746033, + "learning_rate": 0.0002, + "loss": 1.9447, + "step": 80 + }, + { + "epoch": 0.06514657980456026, + "grad_norm": 1.8809149265289307, + "learning_rate": 0.0002, + "loss": 2.0112, + "step": 90 + }, + { + "epoch": 0.07238508867173363, + "grad_norm": 0.9294016361236572, + "learning_rate": 0.0002, + "loss": 1.8337, + "step": 100 + }, + { + "epoch": 0.07962359753890698, + "grad_norm": 0.7145281434059143, + "learning_rate": 0.0002, + "loss": 1.8419, + "step": 110 + }, + { + "epoch": 0.08686210640608034, + "grad_norm": 0.7564446330070496, + "learning_rate": 0.0002, + "loss": 2.0036, + "step": 120 + }, + { + "epoch": 0.09410061527325371, + "grad_norm": 1.1681925058364868, + "learning_rate": 0.0002, + "loss": 1.9306, + "step": 130 + }, + { + "epoch": 0.10133912414042708, + "grad_norm": 0.6708641648292542, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 140 + }, + { + "epoch": 0.10857763300760044, + "grad_norm": 0.7625647783279419, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 150 + }, + { + "epoch": 0.11581614187477379, + "grad_norm": 0.8463464975357056, + "learning_rate": 0.0002, + "loss": 1.6687, + "step": 160 + }, + { + "epoch": 0.12305465074194716, + "grad_norm": 0.7502335906028748, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 170 + }, + { + "epoch": 0.13029315960912052, + "grad_norm": 0.6929958462715149, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 180 + }, + { + "epoch": 0.1375316684762939, + "grad_norm": 0.6798707842826843, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 190 + }, + { + "epoch": 0.14477017734346725, + "grad_norm": 0.7566508650779724, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 200 + }, + { + "epoch": 0.15200868621064062, + "grad_norm": 0.7196869850158691, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 210 + }, + { + "epoch": 0.15924719507781396, + "grad_norm": 0.8401045799255371, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 220 + }, + { + "epoch": 0.16648570394498732, + "grad_norm": 0.8503773212432861, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 230 + }, + { + "epoch": 0.1737242128121607, + "grad_norm": 0.7183733582496643, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 240 + }, + { + "epoch": 0.18096272167933405, + "grad_norm": 0.7082605957984924, + "learning_rate": 0.0002, + "loss": 1.6693, + "step": 250 + }, + { + "epoch": 0.18820123054650742, + "grad_norm": 0.9386326670646667, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 260 + }, + { + "epoch": 0.19543973941368079, + "grad_norm": 0.7332451939582825, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 270 + }, + { + "epoch": 0.20267824828085415, + "grad_norm": 0.7092869877815247, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 280 + }, + { + "epoch": 0.20991675714802752, + "grad_norm": 0.7256413698196411, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 290 + }, + { + "epoch": 0.21715526601520088, + "grad_norm": 0.6398681402206421, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 300 + }, + { + "epoch": 0.22439377488237422, + "grad_norm": 0.6273287534713745, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 310 + }, + { + "epoch": 0.23163228374954759, + "grad_norm": 0.511648416519165, + "learning_rate": 0.0002, + "loss": 1.5115, + "step": 320 + }, + { + "epoch": 0.23887079261672095, + "grad_norm": 0.8677352070808411, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 330 + }, + { + "epoch": 0.24610930148389432, + "grad_norm": 0.6270743012428284, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 340 + }, + { + "epoch": 0.2533478103510677, + "grad_norm": 0.7980281114578247, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 350 + }, + { + "epoch": 0.26058631921824105, + "grad_norm": 0.632486879825592, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 360 + }, + { + "epoch": 0.2678248280854144, + "grad_norm": 0.6527034640312195, + "learning_rate": 0.0002, + "loss": 1.5175, + "step": 370 + }, + { + "epoch": 0.2750633369525878, + "grad_norm": 0.7672118544578552, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 380 + }, + { + "epoch": 0.28230184581976114, + "grad_norm": 0.6035117506980896, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 390 + }, + { + "epoch": 0.2895403546869345, + "grad_norm": 0.5955103039741516, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 400 + }, + { + "epoch": 0.2967788635541079, + "grad_norm": 0.6015191674232483, + "learning_rate": 0.0002, + "loss": 1.558, + "step": 410 + }, + { + "epoch": 0.30401737242128124, + "grad_norm": 0.6380982398986816, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 420 + }, + { + "epoch": 0.3112558812884546, + "grad_norm": 0.6707863211631775, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 430 + }, + { + "epoch": 0.3184943901556279, + "grad_norm": 0.7010176777839661, + "learning_rate": 0.0002, + "loss": 1.4426, + "step": 440 + }, + { + "epoch": 0.3257328990228013, + "grad_norm": 0.8263739943504333, + "learning_rate": 0.0002, + "loss": 1.5572, + "step": 450 + }, + { + "epoch": 0.33297140788997465, + "grad_norm": 0.7253276109695435, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 460 + }, + { + "epoch": 0.340209916757148, + "grad_norm": 0.5238934755325317, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 470 + }, + { + "epoch": 0.3474484256243214, + "grad_norm": 0.7869495749473572, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 480 + }, + { + "epoch": 0.35468693449149474, + "grad_norm": 0.7485215663909912, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 490 + }, + { + "epoch": 0.3619254433586681, + "grad_norm": 0.5413193106651306, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 500 + }, + { + "epoch": 0.3691639522258415, + "grad_norm": 0.7615048885345459, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 510 + }, + { + "epoch": 0.37640246109301484, + "grad_norm": 0.7685340046882629, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 520 + }, + { + "epoch": 0.3836409699601882, + "grad_norm": 0.6379081010818481, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 530 + }, + { + "epoch": 0.39087947882736157, + "grad_norm": 0.7946939468383789, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 540 + }, + { + "epoch": 0.39811798769453494, + "grad_norm": 0.6287278532981873, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 550 + }, + { + "epoch": 0.4053564965617083, + "grad_norm": 0.6811642646789551, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 560 + }, + { + "epoch": 0.41259500542888167, + "grad_norm": 0.671073317527771, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 570 + }, + { + "epoch": 0.41983351429605503, + "grad_norm": 0.6313900351524353, + "learning_rate": 0.0002, + "loss": 1.6753, + "step": 580 + }, + { + "epoch": 0.4270720231632284, + "grad_norm": 0.5291772484779358, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 590 + }, + { + "epoch": 0.43431053203040176, + "grad_norm": 0.62503582239151, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 600 + }, + { + "epoch": 0.4415490408975751, + "grad_norm": 0.5777305364608765, + "learning_rate": 0.0002, + "loss": 1.6276, + "step": 610 + }, + { + "epoch": 0.44878754976474844, + "grad_norm": 0.7013497352600098, + "learning_rate": 0.0002, + "loss": 1.4758, + "step": 620 + }, + { + "epoch": 0.4560260586319218, + "grad_norm": 0.8044822216033936, + "learning_rate": 0.0002, + "loss": 1.4029, + "step": 630 + }, + { + "epoch": 0.46326456749909517, + "grad_norm": 0.672531247138977, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 640 + }, + { + "epoch": 0.47050307636626854, + "grad_norm": 0.6233910322189331, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 650 + }, + { + "epoch": 0.4777415852334419, + "grad_norm": 0.651524543762207, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 660 + }, + { + "epoch": 0.48498009410061527, + "grad_norm": 0.7213939428329468, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 670 + }, + { + "epoch": 0.49221860296778863, + "grad_norm": 0.6541454792022705, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 680 + }, + { + "epoch": 0.499457111834962, + "grad_norm": 0.6568936109542847, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 690 + }, + { + "epoch": 0.5066956207021354, + "grad_norm": 0.7176415324211121, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 700 + }, + { + "epoch": 0.5139341295693087, + "grad_norm": 0.6553855538368225, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 710 + }, + { + "epoch": 0.5211726384364821, + "grad_norm": 0.5654335618019104, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 720 + }, + { + "epoch": 0.5284111473036555, + "grad_norm": 0.5671001672744751, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 730 + }, + { + "epoch": 0.5356496561708288, + "grad_norm": 0.7914412021636963, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 740 + }, + { + "epoch": 0.5428881650380022, + "grad_norm": 0.6172138452529907, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 750 + }, + { + "epoch": 0.5501266739051756, + "grad_norm": 0.6132623553276062, + "learning_rate": 0.0002, + "loss": 1.4018, + "step": 760 + }, + { + "epoch": 0.5573651827723489, + "grad_norm": 0.654000461101532, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 770 + }, + { + "epoch": 0.5646036916395223, + "grad_norm": 0.5691370964050293, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 780 + }, + { + "epoch": 0.5718422005066957, + "grad_norm": 0.7922580242156982, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 790 + }, + { + "epoch": 0.579080709373869, + "grad_norm": 0.6831880211830139, + "learning_rate": 0.0002, + "loss": 1.4521, + "step": 800 + }, + { + "epoch": 0.5863192182410424, + "grad_norm": 0.6740124821662903, + "learning_rate": 0.0002, + "loss": 1.4734, + "step": 810 + }, + { + "epoch": 0.5935577271082157, + "grad_norm": 1.380016803741455, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 820 + }, + { + "epoch": 0.6007962359753891, + "grad_norm": 0.6552878022193909, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 830 + }, + { + "epoch": 0.6080347448425625, + "grad_norm": 0.6649535298347473, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 840 + }, + { + "epoch": 0.6152732537097358, + "grad_norm": 0.561738133430481, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 850 + }, + { + "epoch": 0.6225117625769092, + "grad_norm": 0.6133047938346863, + "learning_rate": 0.0002, + "loss": 1.5364, + "step": 860 + }, + { + "epoch": 0.6297502714440825, + "grad_norm": 0.559843122959137, + "learning_rate": 0.0002, + "loss": 1.3489, + "step": 870 + }, + { + "epoch": 0.6369887803112558, + "grad_norm": 0.6117811799049377, + "learning_rate": 0.0002, + "loss": 1.4878, + "step": 880 + }, + { + "epoch": 0.6442272891784292, + "grad_norm": 0.6209776401519775, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 890 + }, + { + "epoch": 0.6514657980456026, + "grad_norm": 0.6234082579612732, + "learning_rate": 0.0002, + "loss": 1.6747, + "step": 900 + }, + { + "epoch": 0.6587043069127759, + "grad_norm": 0.7623258233070374, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 910 + }, + { + "epoch": 0.6659428157799493, + "grad_norm": 0.6148061752319336, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 920 + }, + { + "epoch": 0.6731813246471227, + "grad_norm": 0.6682973504066467, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 930 + }, + { + "epoch": 0.680419833514296, + "grad_norm": 0.5513041615486145, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 940 + }, + { + "epoch": 0.6876583423814694, + "grad_norm": 0.5197525024414062, + "learning_rate": 0.0002, + "loss": 1.3991, + "step": 950 + }, + { + "epoch": 0.6948968512486428, + "grad_norm": 0.6490758061408997, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 960 + }, + { + "epoch": 0.7021353601158161, + "grad_norm": 0.6450682878494263, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 970 + }, + { + "epoch": 0.7093738689829895, + "grad_norm": 0.6203766465187073, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 980 + }, + { + "epoch": 0.7166123778501629, + "grad_norm": 0.6023609638214111, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 990 + }, + { + "epoch": 0.7238508867173362, + "grad_norm": 0.5765255093574524, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1000 + }, + { + "epoch": 0.7310893955845096, + "grad_norm": 0.6650075316429138, + "learning_rate": 0.0002, + "loss": 1.483, + "step": 1010 + }, + { + "epoch": 0.738327904451683, + "grad_norm": 0.5610854029655457, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 1020 + }, + { + "epoch": 0.7455664133188563, + "grad_norm": 0.7072813510894775, + "learning_rate": 0.0002, + "loss": 1.5248, + "step": 1030 + }, + { + "epoch": 0.7528049221860297, + "grad_norm": 0.6815407872200012, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 1040 + }, + { + "epoch": 0.760043431053203, + "grad_norm": 0.7932390570640564, + "learning_rate": 0.0002, + "loss": 1.4577, + "step": 1050 + }, + { + "epoch": 0.7672819399203764, + "grad_norm": 0.5798183083534241, + "learning_rate": 0.0002, + "loss": 1.4515, + "step": 1060 + }, + { + "epoch": 0.7745204487875498, + "grad_norm": 0.7898504137992859, + "learning_rate": 0.0002, + "loss": 1.5053, + "step": 1070 + }, + { + "epoch": 0.7817589576547231, + "grad_norm": 0.4983280301094055, + "learning_rate": 0.0002, + "loss": 1.4776, + "step": 1080 + }, + { + "epoch": 0.7889974665218965, + "grad_norm": 0.691403329372406, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 1090 + }, + { + "epoch": 0.7962359753890699, + "grad_norm": 0.5394481420516968, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 1100 + }, + { + "epoch": 0.8034744842562432, + "grad_norm": 0.5136822462081909, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 1110 + }, + { + "epoch": 0.8107129931234166, + "grad_norm": 0.6828126907348633, + "learning_rate": 0.0002, + "loss": 1.4902, + "step": 1120 + }, + { + "epoch": 0.81795150199059, + "grad_norm": 0.6799656748771667, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 1130 + }, + { + "epoch": 0.8251900108577633, + "grad_norm": 0.5428406000137329, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 1140 + }, + { + "epoch": 0.8324285197249367, + "grad_norm": 0.4811290502548218, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1150 + }, + { + "epoch": 0.8396670285921101, + "grad_norm": 0.5519434809684753, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 1160 + }, + { + "epoch": 0.8469055374592834, + "grad_norm": 0.9748060703277588, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1170 + }, + { + "epoch": 0.8541440463264568, + "grad_norm": 0.712609589099884, + "learning_rate": 0.0002, + "loss": 1.4309, + "step": 1180 + }, + { + "epoch": 0.8613825551936302, + "grad_norm": 0.6866157054901123, + "learning_rate": 0.0002, + "loss": 1.434, + "step": 1190 + }, + { + "epoch": 0.8686210640608035, + "grad_norm": 0.5068854093551636, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1200 + }, + { + "epoch": 0.8758595729279768, + "grad_norm": 0.6333245038986206, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1210 + }, + { + "epoch": 0.8830980817951501, + "grad_norm": 0.6424421072006226, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 1220 + }, + { + "epoch": 0.8903365906623235, + "grad_norm": 0.4771921932697296, + "learning_rate": 0.0002, + "loss": 1.4186, + "step": 1230 + }, + { + "epoch": 0.8975750995294969, + "grad_norm": 0.5191764235496521, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1240 + }, + { + "epoch": 0.9048136083966702, + "grad_norm": 0.756222128868103, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 1250 + }, + { + "epoch": 0.9120521172638436, + "grad_norm": 0.623823881149292, + "learning_rate": 0.0002, + "loss": 1.4396, + "step": 1260 + }, + { + "epoch": 0.919290626131017, + "grad_norm": 0.8166571259498596, + "learning_rate": 0.0002, + "loss": 1.3097, + "step": 1270 + }, + { + "epoch": 0.9265291349981903, + "grad_norm": 0.6059346795082092, + "learning_rate": 0.0002, + "loss": 1.4625, + "step": 1280 + }, + { + "epoch": 0.9337676438653637, + "grad_norm": 0.5842690467834473, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 1290 + }, + { + "epoch": 0.9410061527325371, + "grad_norm": 0.7649800777435303, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1300 + }, + { + "epoch": 0.9482446615997104, + "grad_norm": 0.6420919895172119, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 1310 + }, + { + "epoch": 0.9554831704668838, + "grad_norm": 0.7011452913284302, + "learning_rate": 0.0002, + "loss": 1.453, + "step": 1320 + }, + { + "epoch": 0.9627216793340572, + "grad_norm": 0.5783746242523193, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1330 + }, + { + "epoch": 0.9699601882012305, + "grad_norm": 0.5973192453384399, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1340 + }, + { + "epoch": 0.9771986970684039, + "grad_norm": 0.6181833744049072, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 1350 + }, + { + "epoch": 0.9844372059355773, + "grad_norm": 0.5563396215438843, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 1360 + }, + { + "epoch": 0.9916757148027506, + "grad_norm": 0.45723360776901245, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1370 + }, + { + "epoch": 0.998914223669924, + "grad_norm": 0.5947498679161072, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 1380 + }, + { + "epoch": 0.9996380745566413, + "eval_loss": 1.480796456336975, + "eval_runtime": 27.3103, + "eval_samples_per_second": 15.965, + "eval_steps_per_second": 2.014, + "step": 1381 + }, + { + "epoch": 1.0061527325370974, + "grad_norm": 0.5599952936172485, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 1390 + }, + { + "epoch": 1.0133912414042707, + "grad_norm": 0.5932008028030396, + "learning_rate": 0.0002, + "loss": 1.4991, + "step": 1400 + }, + { + "epoch": 1.020629750271444, + "grad_norm": 0.6194121837615967, + "learning_rate": 0.0002, + "loss": 1.4506, + "step": 1410 + }, + { + "epoch": 1.0278682591386175, + "grad_norm": 0.6995621919631958, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 1420 + }, + { + "epoch": 1.0351067680057908, + "grad_norm": 0.7905810475349426, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1430 + }, + { + "epoch": 1.0423452768729642, + "grad_norm": 0.7221615314483643, + "learning_rate": 0.0002, + "loss": 1.4414, + "step": 1440 + }, + { + "epoch": 1.0495837857401376, + "grad_norm": 0.6170642375946045, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 1450 + }, + { + "epoch": 1.056822294607311, + "grad_norm": 0.5844094753265381, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 1460 + }, + { + "epoch": 1.0640608034744843, + "grad_norm": 0.7731822729110718, + "learning_rate": 0.0002, + "loss": 1.4871, + "step": 1470 + }, + { + "epoch": 1.0712993123416577, + "grad_norm": 0.4554748237133026, + "learning_rate": 0.0002, + "loss": 1.4286, + "step": 1480 + }, + { + "epoch": 1.078537821208831, + "grad_norm": 0.6923259496688843, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 1490 + }, + { + "epoch": 1.0857763300760044, + "grad_norm": 0.6008219122886658, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 1500 + }, + { + "epoch": 1.0930148389431777, + "grad_norm": 0.6450045704841614, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 1510 + }, + { + "epoch": 1.1002533478103511, + "grad_norm": 0.7833753824234009, + "learning_rate": 0.0002, + "loss": 1.3295, + "step": 1520 + }, + { + "epoch": 1.1074918566775245, + "grad_norm": 0.5076758861541748, + "learning_rate": 0.0002, + "loss": 1.3424, + "step": 1530 + }, + { + "epoch": 1.1147303655446978, + "grad_norm": 0.5661332011222839, + "learning_rate": 0.0002, + "loss": 1.4043, + "step": 1540 + }, + { + "epoch": 1.1219688744118712, + "grad_norm": 0.6526919603347778, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 1550 + }, + { + "epoch": 1.1292073832790446, + "grad_norm": 0.5613082647323608, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1560 + }, + { + "epoch": 1.136445892146218, + "grad_norm": 0.6113885641098022, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 1570 + }, + { + "epoch": 1.1436844010133913, + "grad_norm": 0.6732510328292847, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 1580 + }, + { + "epoch": 1.1509229098805647, + "grad_norm": 0.6146392226219177, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 1590 + }, + { + "epoch": 1.158161418747738, + "grad_norm": 0.6766974329948425, + "learning_rate": 0.0002, + "loss": 1.411, + "step": 1600 + }, + { + "epoch": 1.1653999276149114, + "grad_norm": 0.7621957659721375, + "learning_rate": 0.0002, + "loss": 1.2401, + "step": 1610 + }, + { + "epoch": 1.1726384364820848, + "grad_norm": 0.6959581971168518, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 1620 + }, + { + "epoch": 1.1798769453492581, + "grad_norm": 0.6691278219223022, + "learning_rate": 0.0002, + "loss": 1.382, + "step": 1630 + }, + { + "epoch": 1.1871154542164315, + "grad_norm": 0.4927774965763092, + "learning_rate": 0.0002, + "loss": 1.4147, + "step": 1640 + }, + { + "epoch": 1.1943539630836049, + "grad_norm": 0.7724234461784363, + "learning_rate": 0.0002, + "loss": 1.449, + "step": 1650 + }, + { + "epoch": 1.2015924719507782, + "grad_norm": 0.6817787885665894, + "learning_rate": 0.0002, + "loss": 1.4778, + "step": 1660 + }, + { + "epoch": 1.2088309808179516, + "grad_norm": 0.6500699520111084, + "learning_rate": 0.0002, + "loss": 1.3776, + "step": 1670 + }, + { + "epoch": 1.216069489685125, + "grad_norm": 0.5703568458557129, + "learning_rate": 0.0002, + "loss": 1.3875, + "step": 1680 + }, + { + "epoch": 1.2233079985522983, + "grad_norm": 0.6261579990386963, + "learning_rate": 0.0002, + "loss": 1.4735, + "step": 1690 + }, + { + "epoch": 1.2305465074194717, + "grad_norm": 0.651713490486145, + "learning_rate": 0.0002, + "loss": 1.3898, + "step": 1700 + }, + { + "epoch": 1.237785016286645, + "grad_norm": 0.684399425983429, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 1710 + }, + { + "epoch": 1.2450235251538184, + "grad_norm": 0.6996857523918152, + "learning_rate": 0.0002, + "loss": 1.5027, + "step": 1720 + }, + { + "epoch": 1.2522620340209918, + "grad_norm": 0.7102537751197815, + "learning_rate": 0.0002, + "loss": 1.3326, + "step": 1730 + }, + { + "epoch": 1.2595005428881652, + "grad_norm": 0.45809897780418396, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 1740 + }, + { + "epoch": 1.2667390517553385, + "grad_norm": 0.6377046704292297, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 1750 + }, + { + "epoch": 1.2739775606225119, + "grad_norm": 0.6965704560279846, + "learning_rate": 0.0002, + "loss": 1.3479, + "step": 1760 + }, + { + "epoch": 1.2812160694896852, + "grad_norm": 0.5688214302062988, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 1770 + }, + { + "epoch": 1.2884545783568586, + "grad_norm": 0.6384190320968628, + "learning_rate": 0.0002, + "loss": 1.3967, + "step": 1780 + }, + { + "epoch": 1.295693087224032, + "grad_norm": 0.5629363656044006, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1790 + }, + { + "epoch": 1.3029315960912053, + "grad_norm": 0.6148255467414856, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 1800 + }, + { + "epoch": 1.3101701049583787, + "grad_norm": 0.655580997467041, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 1810 + }, + { + "epoch": 1.3174086138255519, + "grad_norm": 0.5642657279968262, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 1820 + }, + { + "epoch": 1.3246471226927252, + "grad_norm": 0.59607994556427, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 1830 + }, + { + "epoch": 1.3318856315598986, + "grad_norm": 0.5564199090003967, + "learning_rate": 0.0002, + "loss": 1.3274, + "step": 1840 + }, + { + "epoch": 1.339124140427072, + "grad_norm": 0.6949955821037292, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1850 + }, + { + "epoch": 1.3463626492942453, + "grad_norm": 0.7036856412887573, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 1860 + }, + { + "epoch": 1.3536011581614187, + "grad_norm": 0.722062885761261, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 1870 + }, + { + "epoch": 1.360839667028592, + "grad_norm": 0.6098677515983582, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 1880 + }, + { + "epoch": 1.3680781758957654, + "grad_norm": 0.5376402735710144, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 1890 + }, + { + "epoch": 1.3753166847629388, + "grad_norm": 0.6974610090255737, + "learning_rate": 0.0002, + "loss": 1.5071, + "step": 1900 + }, + { + "epoch": 1.3825551936301121, + "grad_norm": 0.6520763635635376, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 1910 + }, + { + "epoch": 1.3897937024972855, + "grad_norm": 0.6604374647140503, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 1920 + }, + { + "epoch": 1.3970322113644589, + "grad_norm": 0.7364398241043091, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1930 + }, + { + "epoch": 1.4042707202316322, + "grad_norm": 0.6849475502967834, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 1940 + }, + { + "epoch": 1.4115092290988056, + "grad_norm": 0.6562670469284058, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 1950 + }, + { + "epoch": 1.418747737965979, + "grad_norm": 0.5695616006851196, + "learning_rate": 0.0002, + "loss": 1.4725, + "step": 1960 + }, + { + "epoch": 1.4259862468331523, + "grad_norm": 0.5244464874267578, + "learning_rate": 0.0002, + "loss": 1.3088, + "step": 1970 + }, + { + "epoch": 1.4332247557003257, + "grad_norm": 0.6347293257713318, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 1980 + }, + { + "epoch": 1.440463264567499, + "grad_norm": 0.5528361201286316, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 1990 + }, + { + "epoch": 1.4477017734346724, + "grad_norm": 0.6987585425376892, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2000 + }, + { + "epoch": 1.4549402823018458, + "grad_norm": 0.6568987369537354, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 2010 + }, + { + "epoch": 1.4621787911690192, + "grad_norm": 0.7665994763374329, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 2020 + }, + { + "epoch": 1.4694173000361925, + "grad_norm": 0.5127707123756409, + "learning_rate": 0.0002, + "loss": 1.244, + "step": 2030 + }, + { + "epoch": 1.476655808903366, + "grad_norm": 0.5406824946403503, + "learning_rate": 0.0002, + "loss": 1.3699, + "step": 2040 + }, + { + "epoch": 1.4838943177705393, + "grad_norm": 0.5990166664123535, + "learning_rate": 0.0002, + "loss": 1.3353, + "step": 2050 + }, + { + "epoch": 1.4911328266377126, + "grad_norm": 0.6186193823814392, + "learning_rate": 0.0002, + "loss": 1.2454, + "step": 2060 + }, + { + "epoch": 1.498371335504886, + "grad_norm": 0.6154307126998901, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2070 + }, + { + "epoch": 1.5056098443720594, + "grad_norm": 0.5606056451797485, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2080 + }, + { + "epoch": 1.5128483532392327, + "grad_norm": 0.5006417036056519, + "learning_rate": 0.0002, + "loss": 1.2405, + "step": 2090 + }, + { + "epoch": 1.520086862106406, + "grad_norm": 0.5968486070632935, + "learning_rate": 0.0002, + "loss": 1.4258, + "step": 2100 + }, + { + "epoch": 1.5273253709735795, + "grad_norm": 0.5835496187210083, + "learning_rate": 0.0002, + "loss": 1.2752, + "step": 2110 + }, + { + "epoch": 1.5345638798407528, + "grad_norm": 0.6753535270690918, + "learning_rate": 0.0002, + "loss": 1.5443, + "step": 2120 + }, + { + "epoch": 1.5418023887079262, + "grad_norm": 0.7299720644950867, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 2130 + }, + { + "epoch": 1.5490408975750996, + "grad_norm": 0.5105988383293152, + "learning_rate": 0.0002, + "loss": 1.2364, + "step": 2140 + }, + { + "epoch": 1.556279406442273, + "grad_norm": 0.5675431489944458, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2150 + }, + { + "epoch": 1.5635179153094463, + "grad_norm": 0.6246723532676697, + "learning_rate": 0.0002, + "loss": 1.4563, + "step": 2160 + }, + { + "epoch": 1.5707564241766196, + "grad_norm": 0.7291720509529114, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 2170 + }, + { + "epoch": 1.577994933043793, + "grad_norm": 0.678114116191864, + "learning_rate": 0.0002, + "loss": 1.5432, + "step": 2180 + }, + { + "epoch": 1.5852334419109664, + "grad_norm": 0.5136260986328125, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 2190 + }, + { + "epoch": 1.5924719507781397, + "grad_norm": 0.6359935998916626, + "learning_rate": 0.0002, + "loss": 1.3271, + "step": 2200 + }, + { + "epoch": 1.599710459645313, + "grad_norm": 0.7650278806686401, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 2210 + }, + { + "epoch": 1.6069489685124865, + "grad_norm": 0.7256110906600952, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 2220 + }, + { + "epoch": 1.6141874773796598, + "grad_norm": 0.688689649105072, + "learning_rate": 0.0002, + "loss": 1.4387, + "step": 2230 + }, + { + "epoch": 1.6214259862468332, + "grad_norm": 0.6045311093330383, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 2240 + }, + { + "epoch": 1.6286644951140063, + "grad_norm": 0.7064604163169861, + "learning_rate": 0.0002, + "loss": 1.4097, + "step": 2250 + }, + { + "epoch": 1.6359030039811797, + "grad_norm": 0.5309562087059021, + "learning_rate": 0.0002, + "loss": 1.3477, + "step": 2260 + }, + { + "epoch": 1.643141512848353, + "grad_norm": 0.5687053203582764, + "learning_rate": 0.0002, + "loss": 1.4022, + "step": 2270 + }, + { + "epoch": 1.6503800217155264, + "grad_norm": 0.535872757434845, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 2280 + }, + { + "epoch": 1.6576185305826998, + "grad_norm": 0.5502381920814514, + "learning_rate": 0.0002, + "loss": 1.3844, + "step": 2290 + }, + { + "epoch": 1.6648570394498732, + "grad_norm": 0.6158602237701416, + "learning_rate": 0.0002, + "loss": 1.3764, + "step": 2300 + }, + { + "epoch": 1.6720955483170465, + "grad_norm": 0.5804675817489624, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 2310 + }, + { + "epoch": 1.67933405718422, + "grad_norm": 0.600742757320404, + "learning_rate": 0.0002, + "loss": 1.2532, + "step": 2320 + }, + { + "epoch": 1.6865725660513933, + "grad_norm": 0.7101941108703613, + "learning_rate": 0.0002, + "loss": 1.477, + "step": 2330 + }, + { + "epoch": 1.6938110749185666, + "grad_norm": 0.7507809996604919, + "learning_rate": 0.0002, + "loss": 1.4849, + "step": 2340 + }, + { + "epoch": 1.70104958378574, + "grad_norm": 0.768502414226532, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 2350 + }, + { + "epoch": 1.7082880926529134, + "grad_norm": 0.4801851212978363, + "learning_rate": 0.0002, + "loss": 1.3332, + "step": 2360 + }, + { + "epoch": 1.7155266015200867, + "grad_norm": 0.5322122573852539, + "learning_rate": 0.0002, + "loss": 1.4158, + "step": 2370 + }, + { + "epoch": 1.72276511038726, + "grad_norm": 0.587661862373352, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2380 + }, + { + "epoch": 1.7300036192544335, + "grad_norm": 0.6073525547981262, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 2390 + }, + { + "epoch": 1.7372421281216068, + "grad_norm": 0.6950460076332092, + "learning_rate": 0.0002, + "loss": 1.2754, + "step": 2400 + }, + { + "epoch": 1.7444806369887802, + "grad_norm": 0.5981102585792542, + "learning_rate": 0.0002, + "loss": 1.3858, + "step": 2410 + }, + { + "epoch": 1.7517191458559536, + "grad_norm": 0.544570803642273, + "learning_rate": 0.0002, + "loss": 1.4075, + "step": 2420 + }, + { + "epoch": 1.758957654723127, + "grad_norm": 0.5304399728775024, + "learning_rate": 0.0002, + "loss": 1.3861, + "step": 2430 + }, + { + "epoch": 1.7661961635903003, + "grad_norm": 0.7921594977378845, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 2440 + }, + { + "epoch": 1.7734346724574737, + "grad_norm": 0.6084808707237244, + "learning_rate": 0.0002, + "loss": 1.3053, + "step": 2450 + }, + { + "epoch": 1.780673181324647, + "grad_norm": 0.8844701051712036, + "learning_rate": 0.0002, + "loss": 1.3781, + "step": 2460 + }, + { + "epoch": 1.7879116901918204, + "grad_norm": 0.5729258060455322, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 2470 + }, + { + "epoch": 1.7951501990589938, + "grad_norm": 0.6303611993789673, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 2480 + }, + { + "epoch": 1.8023887079261671, + "grad_norm": 0.5627942085266113, + "learning_rate": 0.0002, + "loss": 1.3926, + "step": 2490 + }, + { + "epoch": 1.8096272167933405, + "grad_norm": 0.6724274158477783, + "learning_rate": 0.0002, + "loss": 1.3816, + "step": 2500 + }, + { + "epoch": 1.8168657256605139, + "grad_norm": 0.5030826330184937, + "learning_rate": 0.0002, + "loss": 1.2951, + "step": 2510 + }, + { + "epoch": 1.8241042345276872, + "grad_norm": 0.5504099130630493, + "learning_rate": 0.0002, + "loss": 1.2839, + "step": 2520 + }, + { + "epoch": 1.8313427433948606, + "grad_norm": 0.6338945627212524, + "learning_rate": 0.0002, + "loss": 1.4264, + "step": 2530 + }, + { + "epoch": 1.838581252262034, + "grad_norm": 0.5902037620544434, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 2540 + }, + { + "epoch": 1.8458197611292073, + "grad_norm": 0.48814457654953003, + "learning_rate": 0.0002, + "loss": 1.2961, + "step": 2550 + }, + { + "epoch": 1.8530582699963807, + "grad_norm": 0.6216312646865845, + "learning_rate": 0.0002, + "loss": 1.466, + "step": 2560 + }, + { + "epoch": 1.860296778863554, + "grad_norm": 0.635603666305542, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 2570 + }, + { + "epoch": 1.8675352877307274, + "grad_norm": 0.6938216090202332, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2580 + }, + { + "epoch": 1.8747737965979008, + "grad_norm": 0.599557638168335, + "learning_rate": 0.0002, + "loss": 1.5011, + "step": 2590 + }, + { + "epoch": 1.8820123054650741, + "grad_norm": 0.564424455165863, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 2600 + }, + { + "epoch": 1.8892508143322475, + "grad_norm": 0.5430700182914734, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 2610 + }, + { + "epoch": 1.8964893231994209, + "grad_norm": 0.6150169372558594, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 2620 + }, + { + "epoch": 1.9037278320665942, + "grad_norm": 0.48159119486808777, + "learning_rate": 0.0002, + "loss": 1.2474, + "step": 2630 + }, + { + "epoch": 1.9109663409337676, + "grad_norm": 0.5608997941017151, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 2640 + }, + { + "epoch": 1.918204849800941, + "grad_norm": 0.6454501748085022, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 2650 + }, + { + "epoch": 1.9254433586681143, + "grad_norm": 0.5458073616027832, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 2660 + }, + { + "epoch": 1.9326818675352877, + "grad_norm": 0.5328490734100342, + "learning_rate": 0.0002, + "loss": 1.3208, + "step": 2670 + }, + { + "epoch": 1.939920376402461, + "grad_norm": 0.6444696187973022, + "learning_rate": 0.0002, + "loss": 1.4971, + "step": 2680 + }, + { + "epoch": 1.9471588852696344, + "grad_norm": 0.7126023769378662, + "learning_rate": 0.0002, + "loss": 1.5387, + "step": 2690 + }, + { + "epoch": 1.9543973941368078, + "grad_norm": 0.5164045095443726, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 2700 + }, + { + "epoch": 1.9616359030039812, + "grad_norm": 0.5347061157226562, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 2710 + }, + { + "epoch": 1.9688744118711545, + "grad_norm": 0.5297950506210327, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 2720 + }, + { + "epoch": 1.976112920738328, + "grad_norm": 0.6537790298461914, + "learning_rate": 0.0002, + "loss": 1.3566, + "step": 2730 + }, + { + "epoch": 1.9833514296055013, + "grad_norm": 0.5536222457885742, + "learning_rate": 0.0002, + "loss": 1.332, + "step": 2740 + }, + { + "epoch": 1.9905899384726746, + "grad_norm": 0.4856105446815491, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 2750 + }, + { + "epoch": 1.997828447339848, + "grad_norm": 0.6642730832099915, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 2760 + }, + { + "epoch": 2.0, + "eval_loss": 1.4366681575775146, + "eval_runtime": 27.3729, + "eval_samples_per_second": 15.928, + "eval_steps_per_second": 2.009, + "step": 2763 + }, + { + "epoch": 2.0050669562070214, + "grad_norm": 0.740253210067749, + "learning_rate": 0.0002, + "loss": 1.4322, + "step": 2770 + }, + { + "epoch": 2.0123054650741947, + "grad_norm": 0.5826276540756226, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 2780 + }, + { + "epoch": 2.019543973941368, + "grad_norm": 0.607356071472168, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 2790 + }, + { + "epoch": 2.0267824828085415, + "grad_norm": 0.5918063521385193, + "learning_rate": 0.0002, + "loss": 1.2601, + "step": 2800 + }, + { + "epoch": 2.034020991675715, + "grad_norm": 0.5610089898109436, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 2810 + }, + { + "epoch": 2.041259500542888, + "grad_norm": 0.5869926810264587, + "learning_rate": 0.0002, + "loss": 1.2092, + "step": 2820 + }, + { + "epoch": 2.0484980094100615, + "grad_norm": 0.5753467679023743, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 2830 + }, + { + "epoch": 2.055736518277235, + "grad_norm": 0.7096508145332336, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 2840 + }, + { + "epoch": 2.0629750271444083, + "grad_norm": 0.7653635144233704, + "learning_rate": 0.0002, + "loss": 1.1766, + "step": 2850 + }, + { + "epoch": 2.0702135360115816, + "grad_norm": 0.6202841997146606, + "learning_rate": 0.0002, + "loss": 1.2331, + "step": 2860 + }, + { + "epoch": 2.077452044878755, + "grad_norm": 0.6810227632522583, + "learning_rate": 0.0002, + "loss": 1.3298, + "step": 2870 + }, + { + "epoch": 2.0846905537459284, + "grad_norm": 0.7481493353843689, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 2880 + }, + { + "epoch": 2.0919290626131017, + "grad_norm": 0.7089637517929077, + "learning_rate": 0.0002, + "loss": 1.2484, + "step": 2890 + }, + { + "epoch": 2.099167571480275, + "grad_norm": 0.7472923398017883, + "learning_rate": 0.0002, + "loss": 1.3095, + "step": 2900 + }, + { + "epoch": 2.1064060803474485, + "grad_norm": 0.8135465979576111, + "learning_rate": 0.0002, + "loss": 1.304, + "step": 2910 + }, + { + "epoch": 2.113644589214622, + "grad_norm": 0.6097133159637451, + "learning_rate": 0.0002, + "loss": 1.273, + "step": 2920 + }, + { + "epoch": 2.120883098081795, + "grad_norm": 0.5970117449760437, + "learning_rate": 0.0002, + "loss": 1.3384, + "step": 2930 + }, + { + "epoch": 2.1281216069489686, + "grad_norm": 0.6169309616088867, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 2940 + }, + { + "epoch": 2.135360115816142, + "grad_norm": 0.9428738355636597, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 2950 + }, + { + "epoch": 2.1425986246833153, + "grad_norm": 0.5671679973602295, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 2960 + }, + { + "epoch": 2.1498371335504887, + "grad_norm": 0.7007262110710144, + "learning_rate": 0.0002, + "loss": 1.1375, + "step": 2970 + }, + { + "epoch": 2.157075642417662, + "grad_norm": 0.6294044256210327, + "learning_rate": 0.0002, + "loss": 1.2015, + "step": 2980 + }, + { + "epoch": 2.1643141512848354, + "grad_norm": 0.6105241775512695, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 2990 + }, + { + "epoch": 2.1715526601520088, + "grad_norm": 0.557124137878418, + "learning_rate": 0.0002, + "loss": 1.2065, + "step": 3000 + }, + { + "epoch": 2.178791169019182, + "grad_norm": 0.6250392198562622, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 3010 + }, + { + "epoch": 2.1860296778863555, + "grad_norm": 0.645218551158905, + "learning_rate": 0.0002, + "loss": 1.385, + "step": 3020 + }, + { + "epoch": 2.193268186753529, + "grad_norm": 0.9033605456352234, + "learning_rate": 0.0002, + "loss": 1.3928, + "step": 3030 + }, + { + "epoch": 2.2005066956207022, + "grad_norm": 0.5325747132301331, + "learning_rate": 0.0002, + "loss": 1.2458, + "step": 3040 + }, + { + "epoch": 2.2077452044878756, + "grad_norm": 0.6334700584411621, + "learning_rate": 0.0002, + "loss": 1.261, + "step": 3050 + }, + { + "epoch": 2.214983713355049, + "grad_norm": 0.5206325054168701, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 3060 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5987200140953064, + "learning_rate": 0.0002, + "loss": 1.3103, + "step": 3070 + }, + { + "epoch": 2.2294607310893957, + "grad_norm": 0.5893264412879944, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 3080 + }, + { + "epoch": 2.236699239956569, + "grad_norm": 0.6869237422943115, + "learning_rate": 0.0002, + "loss": 1.235, + "step": 3090 + }, + { + "epoch": 2.2439377488237424, + "grad_norm": 0.5040048360824585, + "learning_rate": 0.0002, + "loss": 1.3285, + "step": 3100 + }, + { + "epoch": 2.251176257690916, + "grad_norm": 0.6660613417625427, + "learning_rate": 0.0002, + "loss": 1.3316, + "step": 3110 + }, + { + "epoch": 2.258414766558089, + "grad_norm": 0.5890918970108032, + "learning_rate": 0.0002, + "loss": 1.3108, + "step": 3120 + }, + { + "epoch": 2.2656532754252625, + "grad_norm": 0.6458896994590759, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 3130 + }, + { + "epoch": 2.272891784292436, + "grad_norm": 0.6832690834999084, + "learning_rate": 0.0002, + "loss": 1.4151, + "step": 3140 + }, + { + "epoch": 2.2801302931596092, + "grad_norm": 0.833908200263977, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 3150 + }, + { + "epoch": 2.2873688020267826, + "grad_norm": 0.4596034586429596, + "learning_rate": 0.0002, + "loss": 1.2931, + "step": 3160 + }, + { + "epoch": 2.294607310893956, + "grad_norm": 0.9130966067314148, + "learning_rate": 0.0002, + "loss": 1.449, + "step": 3170 + }, + { + "epoch": 2.3018458197611293, + "grad_norm": 0.7143292427062988, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 3180 + }, + { + "epoch": 2.3090843286283027, + "grad_norm": 0.5388900637626648, + "learning_rate": 0.0002, + "loss": 1.2692, + "step": 3190 + }, + { + "epoch": 2.316322837495476, + "grad_norm": 0.5607513189315796, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 3200 + }, + { + "epoch": 2.3235613463626494, + "grad_norm": 0.6795142292976379, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 3210 + }, + { + "epoch": 2.330799855229823, + "grad_norm": 0.6561070680618286, + "learning_rate": 0.0002, + "loss": 1.3042, + "step": 3220 + }, + { + "epoch": 2.338038364096996, + "grad_norm": 0.8858118057250977, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 3230 + }, + { + "epoch": 2.3452768729641695, + "grad_norm": 0.6604151725769043, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3240 + }, + { + "epoch": 2.352515381831343, + "grad_norm": 0.6755785346031189, + "learning_rate": 0.0002, + "loss": 1.4004, + "step": 3250 + }, + { + "epoch": 2.3597538906985163, + "grad_norm": 0.6981677412986755, + "learning_rate": 0.0002, + "loss": 1.2503, + "step": 3260 + }, + { + "epoch": 2.3669923995656896, + "grad_norm": 0.6338568329811096, + "learning_rate": 0.0002, + "loss": 1.3078, + "step": 3270 + }, + { + "epoch": 2.374230908432863, + "grad_norm": 0.5754265785217285, + "learning_rate": 0.0002, + "loss": 1.285, + "step": 3280 + }, + { + "epoch": 2.3814694173000364, + "grad_norm": 0.7533153295516968, + "learning_rate": 0.0002, + "loss": 1.2924, + "step": 3290 + }, + { + "epoch": 2.3887079261672097, + "grad_norm": 0.675065279006958, + "learning_rate": 0.0002, + "loss": 1.3711, + "step": 3300 + }, + { + "epoch": 2.395946435034383, + "grad_norm": 0.5686452984809875, + "learning_rate": 0.0002, + "loss": 1.3548, + "step": 3310 + }, + { + "epoch": 2.4031849439015565, + "grad_norm": 0.8129481673240662, + "learning_rate": 0.0002, + "loss": 1.1998, + "step": 3320 + }, + { + "epoch": 2.41042345276873, + "grad_norm": 0.6615934371948242, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 3330 + }, + { + "epoch": 2.417661961635903, + "grad_norm": 0.6678834557533264, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 3340 + }, + { + "epoch": 2.4249004705030766, + "grad_norm": 0.5581308007240295, + "learning_rate": 0.0002, + "loss": 1.2381, + "step": 3350 + }, + { + "epoch": 2.43213897937025, + "grad_norm": 0.6098920106887817, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 3360 + }, + { + "epoch": 2.4393774882374233, + "grad_norm": 0.8101736903190613, + "learning_rate": 0.0002, + "loss": 1.3692, + "step": 3370 + }, + { + "epoch": 2.4466159971045967, + "grad_norm": 0.6621488928794861, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 3380 + }, + { + "epoch": 2.45385450597177, + "grad_norm": 0.8693289160728455, + "learning_rate": 0.0002, + "loss": 1.4579, + "step": 3390 + }, + { + "epoch": 2.4610930148389434, + "grad_norm": 0.6724580526351929, + "learning_rate": 0.0002, + "loss": 1.3644, + "step": 3400 + }, + { + "epoch": 2.4683315237061167, + "grad_norm": 0.6776891946792603, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 3410 + }, + { + "epoch": 2.47557003257329, + "grad_norm": 0.7214453816413879, + "learning_rate": 0.0002, + "loss": 1.2937, + "step": 3420 + }, + { + "epoch": 2.4828085414404635, + "grad_norm": 0.8390451073646545, + "learning_rate": 0.0002, + "loss": 1.4051, + "step": 3430 + }, + { + "epoch": 2.490047050307637, + "grad_norm": 0.7130982279777527, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 3440 + }, + { + "epoch": 2.49728555917481, + "grad_norm": 0.8873937129974365, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 3450 + }, + { + "epoch": 2.5045240680419836, + "grad_norm": 0.725185751914978, + "learning_rate": 0.0002, + "loss": 1.1429, + "step": 3460 + }, + { + "epoch": 2.511762576909157, + "grad_norm": 0.6120352149009705, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 3470 + }, + { + "epoch": 2.5190010857763303, + "grad_norm": 0.7713613510131836, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 3480 + }, + { + "epoch": 2.5262395946435037, + "grad_norm": 0.895309567451477, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 3490 + }, + { + "epoch": 2.533478103510677, + "grad_norm": 0.9631021022796631, + "learning_rate": 0.0002, + "loss": 1.3043, + "step": 3500 + }, + { + "epoch": 2.5407166123778504, + "grad_norm": 0.7475683093070984, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 3510 + }, + { + "epoch": 2.5479551212450238, + "grad_norm": 0.7271341681480408, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 3520 + }, + { + "epoch": 2.555193630112197, + "grad_norm": 0.6979510188102722, + "learning_rate": 0.0002, + "loss": 1.304, + "step": 3530 + }, + { + "epoch": 2.5624321389793705, + "grad_norm": 0.6504196524620056, + "learning_rate": 0.0002, + "loss": 1.2353, + "step": 3540 + }, + { + "epoch": 2.569670647846544, + "grad_norm": 0.7226675748825073, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 3550 + }, + { + "epoch": 2.5769091567137172, + "grad_norm": 0.6143222451210022, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 3560 + }, + { + "epoch": 2.5841476655808906, + "grad_norm": 0.7245154976844788, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 3570 + }, + { + "epoch": 2.591386174448064, + "grad_norm": 0.943540632724762, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 3580 + }, + { + "epoch": 2.5986246833152373, + "grad_norm": 0.7707241773605347, + "learning_rate": 0.0002, + "loss": 1.3034, + "step": 3590 + }, + { + "epoch": 2.6058631921824107, + "grad_norm": 0.6705001592636108, + "learning_rate": 0.0002, + "loss": 1.3063, + "step": 3600 + }, + { + "epoch": 2.613101701049584, + "grad_norm": 0.6360933780670166, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 3610 + }, + { + "epoch": 2.6203402099167574, + "grad_norm": 0.5846424698829651, + "learning_rate": 0.0002, + "loss": 1.1844, + "step": 3620 + }, + { + "epoch": 2.6275787187839303, + "grad_norm": 0.5958625674247742, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3630 + }, + { + "epoch": 2.6348172276511037, + "grad_norm": 0.6819243431091309, + "learning_rate": 0.0002, + "loss": 1.3599, + "step": 3640 + }, + { + "epoch": 2.642055736518277, + "grad_norm": 0.7033445835113525, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 3650 + }, + { + "epoch": 2.6492942453854504, + "grad_norm": 0.6134849786758423, + "learning_rate": 0.0002, + "loss": 1.3392, + "step": 3660 + }, + { + "epoch": 2.656532754252624, + "grad_norm": 0.658009946346283, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 3670 + }, + { + "epoch": 2.663771263119797, + "grad_norm": 0.6280999779701233, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 3680 + }, + { + "epoch": 2.6710097719869705, + "grad_norm": 0.5536085963249207, + "learning_rate": 0.0002, + "loss": 1.2995, + "step": 3690 + }, + { + "epoch": 2.678248280854144, + "grad_norm": 0.8603981733322144, + "learning_rate": 0.0002, + "loss": 1.2044, + "step": 3700 + }, + { + "epoch": 2.6854867897213173, + "grad_norm": 0.5509994626045227, + "learning_rate": 0.0002, + "loss": 1.3879, + "step": 3710 + }, + { + "epoch": 2.6927252985884906, + "grad_norm": 0.9093621969223022, + "learning_rate": 0.0002, + "loss": 1.3253, + "step": 3720 + }, + { + "epoch": 2.699963807455664, + "grad_norm": 0.7525952458381653, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 3730 + }, + { + "epoch": 2.7072023163228374, + "grad_norm": 0.6737023591995239, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 3740 + }, + { + "epoch": 2.7144408251900107, + "grad_norm": 0.8656924962997437, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 3750 + }, + { + "epoch": 2.721679334057184, + "grad_norm": 0.7494133114814758, + "learning_rate": 0.0002, + "loss": 1.2342, + "step": 3760 + }, + { + "epoch": 2.7289178429243575, + "grad_norm": 0.5725520849227905, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 3770 + }, + { + "epoch": 2.736156351791531, + "grad_norm": 0.836412787437439, + "learning_rate": 0.0002, + "loss": 1.28, + "step": 3780 + }, + { + "epoch": 2.743394860658704, + "grad_norm": 0.6893242597579956, + "learning_rate": 0.0002, + "loss": 1.3784, + "step": 3790 + }, + { + "epoch": 2.7506333695258776, + "grad_norm": 0.6696223020553589, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 3800 + }, + { + "epoch": 2.757871878393051, + "grad_norm": 0.6483015418052673, + "learning_rate": 0.0002, + "loss": 1.2449, + "step": 3810 + }, + { + "epoch": 2.7651103872602243, + "grad_norm": 0.8084456920623779, + "learning_rate": 0.0002, + "loss": 1.3282, + "step": 3820 + }, + { + "epoch": 2.7723488961273977, + "grad_norm": 0.6601949334144592, + "learning_rate": 0.0002, + "loss": 1.3694, + "step": 3830 + }, + { + "epoch": 2.779587404994571, + "grad_norm": 0.6905533671379089, + "learning_rate": 0.0002, + "loss": 1.3568, + "step": 3840 + }, + { + "epoch": 2.7868259138617444, + "grad_norm": 0.619318425655365, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 3850 + }, + { + "epoch": 2.7940644227289178, + "grad_norm": 0.5994023084640503, + "learning_rate": 0.0002, + "loss": 1.2551, + "step": 3860 + }, + { + "epoch": 2.801302931596091, + "grad_norm": 0.5627168416976929, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 3870 + }, + { + "epoch": 2.8085414404632645, + "grad_norm": 0.6001605987548828, + "learning_rate": 0.0002, + "loss": 1.3921, + "step": 3880 + }, + { + "epoch": 2.815779949330438, + "grad_norm": 0.6022412776947021, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 3890 + }, + { + "epoch": 2.823018458197611, + "grad_norm": 0.6832426190376282, + "learning_rate": 0.0002, + "loss": 1.2765, + "step": 3900 + }, + { + "epoch": 2.8302569670647846, + "grad_norm": 0.5936811566352844, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 3910 + }, + { + "epoch": 2.837495475931958, + "grad_norm": 0.6960572600364685, + "learning_rate": 0.0002, + "loss": 1.1707, + "step": 3920 + }, + { + "epoch": 2.8447339847991313, + "grad_norm": 0.5913406610488892, + "learning_rate": 0.0002, + "loss": 1.4063, + "step": 3930 + }, + { + "epoch": 2.8519724936663047, + "grad_norm": 0.678154706954956, + "learning_rate": 0.0002, + "loss": 1.3245, + "step": 3940 + }, + { + "epoch": 2.859211002533478, + "grad_norm": 0.7898936867713928, + "learning_rate": 0.0002, + "loss": 1.366, + "step": 3950 + }, + { + "epoch": 2.8664495114006514, + "grad_norm": 0.9234195351600647, + "learning_rate": 0.0002, + "loss": 1.3948, + "step": 3960 + }, + { + "epoch": 2.8736880202678248, + "grad_norm": 0.5960825085639954, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 3970 + }, + { + "epoch": 2.880926529134998, + "grad_norm": 0.677118182182312, + "learning_rate": 0.0002, + "loss": 1.3127, + "step": 3980 + }, + { + "epoch": 2.8881650380021715, + "grad_norm": 0.6505142450332642, + "learning_rate": 0.0002, + "loss": 1.2652, + "step": 3990 + }, + { + "epoch": 2.895403546869345, + "grad_norm": 0.550826907157898, + "learning_rate": 0.0002, + "loss": 1.2078, + "step": 4000 + }, + { + "epoch": 2.9026420557365182, + "grad_norm": 0.6209215521812439, + "learning_rate": 0.0002, + "loss": 1.1811, + "step": 4010 + }, + { + "epoch": 2.9098805646036916, + "grad_norm": 0.6549018025398254, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 4020 + }, + { + "epoch": 2.917119073470865, + "grad_norm": 0.570682168006897, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 4030 + }, + { + "epoch": 2.9243575823380383, + "grad_norm": 1.1807632446289062, + "learning_rate": 0.0002, + "loss": 1.0832, + "step": 4040 + }, + { + "epoch": 2.9315960912052117, + "grad_norm": 0.7058857679367065, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 4050 + }, + { + "epoch": 2.938834600072385, + "grad_norm": 0.5542812943458557, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 4060 + }, + { + "epoch": 2.9460731089395584, + "grad_norm": 0.63167804479599, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 4070 + }, + { + "epoch": 2.953311617806732, + "grad_norm": 0.5702962279319763, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 4080 + }, + { + "epoch": 2.960550126673905, + "grad_norm": 0.620944082736969, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 4090 + }, + { + "epoch": 2.9677886355410785, + "grad_norm": 0.5866289734840393, + "learning_rate": 0.0002, + "loss": 1.2864, + "step": 4100 + }, + { + "epoch": 2.975027144408252, + "grad_norm": 0.560170590877533, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 4110 + }, + { + "epoch": 2.9822656532754253, + "grad_norm": 0.675082802772522, + "learning_rate": 0.0002, + "loss": 1.2071, + "step": 4120 + }, + { + "epoch": 2.9895041621425986, + "grad_norm": 0.62708580493927, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 4130 + }, + { + "epoch": 2.996742671009772, + "grad_norm": 0.7893929481506348, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 4140 + }, + { + "epoch": 2.9996380745566413, + "eval_loss": 1.4217946529388428, + "eval_runtime": 27.1596, + "eval_samples_per_second": 16.053, + "eval_steps_per_second": 2.025, + "step": 4144 + } + ], + "logging_steps": 10, + "max_steps": 11048, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.065406381083853e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4d7b7431bbbe8c9bf29b925bca391a558af5ff8c --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad613885e4f267fc04125f1a836d42cfa796bbe12e536f9ee60c955de02cdb5a +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e99bbcd43df1c19d98706c7e3be95c93844c5349 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..655eefd2eea5a13040896f1cd416fe42493622b4 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b98143cbd60c831e49f04f9f4ba0f3bbda73012695c7ecf5743a71cd76cf6e8 +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..072c0e2957726c5cc0328345945224af09bd3af6 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf5bcbebb674c668272572527b65c1fe1f6778897cf32e232c0882f74c57876e +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fcf8222e07bdd4ee70690af4b5eb95ac29ec2451 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b535b07a56c32c1db908d2c4e8439bc752debb1e73d647fe63d6867f729e8329 +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c966bca080cbb78186c7353ae4cb971df996eed --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4473c852e8ab8679db533b88b697f77385cc25d705112cb6582681e17a7e16da +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b4e41babb7ba651c8dc0ba6615b5de4aed6a5c51 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/trainer_state.json @@ -0,0 +1,3929 @@ +{ + "best_metric": 1.4217946529388428, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 5526, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007238508867173362, + "grad_norm": 1.2523442506790161, + "learning_rate": 0.0002, + "loss": 4.7061, + "step": 10 + }, + { + "epoch": 0.014477017734346724, + "grad_norm": 1.8887330293655396, + "learning_rate": 0.0002, + "loss": 3.3493, + "step": 20 + }, + { + "epoch": 0.021715526601520086, + "grad_norm": 0.9668035507202148, + "learning_rate": 0.0002, + "loss": 2.7585, + "step": 30 + }, + { + "epoch": 0.028954035468693448, + "grad_norm": 2.9167306423187256, + "learning_rate": 0.0002, + "loss": 2.3699, + "step": 40 + }, + { + "epoch": 0.036192544335866814, + "grad_norm": 2.649867296218872, + "learning_rate": 0.0002, + "loss": 2.2679, + "step": 50 + }, + { + "epoch": 0.04343105320304017, + "grad_norm": 1.5120655298233032, + "learning_rate": 0.0002, + "loss": 2.2202, + "step": 60 + }, + { + "epoch": 0.05066956207021354, + "grad_norm": 0.7879868149757385, + "learning_rate": 0.0002, + "loss": 2.2026, + "step": 70 + }, + { + "epoch": 0.057908070937386896, + "grad_norm": 0.7616953253746033, + "learning_rate": 0.0002, + "loss": 1.9447, + "step": 80 + }, + { + "epoch": 0.06514657980456026, + "grad_norm": 1.8809149265289307, + "learning_rate": 0.0002, + "loss": 2.0112, + "step": 90 + }, + { + "epoch": 0.07238508867173363, + "grad_norm": 0.9294016361236572, + "learning_rate": 0.0002, + "loss": 1.8337, + "step": 100 + }, + { + "epoch": 0.07962359753890698, + "grad_norm": 0.7145281434059143, + "learning_rate": 0.0002, + "loss": 1.8419, + "step": 110 + }, + { + "epoch": 0.08686210640608034, + "grad_norm": 0.7564446330070496, + "learning_rate": 0.0002, + "loss": 2.0036, + "step": 120 + }, + { + "epoch": 0.09410061527325371, + "grad_norm": 1.1681925058364868, + "learning_rate": 0.0002, + "loss": 1.9306, + "step": 130 + }, + { + "epoch": 0.10133912414042708, + "grad_norm": 0.6708641648292542, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 140 + }, + { + "epoch": 0.10857763300760044, + "grad_norm": 0.7625647783279419, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 150 + }, + { + "epoch": 0.11581614187477379, + "grad_norm": 0.8463464975357056, + "learning_rate": 0.0002, + "loss": 1.6687, + "step": 160 + }, + { + "epoch": 0.12305465074194716, + "grad_norm": 0.7502335906028748, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 170 + }, + { + "epoch": 0.13029315960912052, + "grad_norm": 0.6929958462715149, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 180 + }, + { + "epoch": 0.1375316684762939, + "grad_norm": 0.6798707842826843, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 190 + }, + { + "epoch": 0.14477017734346725, + "grad_norm": 0.7566508650779724, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 200 + }, + { + "epoch": 0.15200868621064062, + "grad_norm": 0.7196869850158691, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 210 + }, + { + "epoch": 0.15924719507781396, + "grad_norm": 0.8401045799255371, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 220 + }, + { + "epoch": 0.16648570394498732, + "grad_norm": 0.8503773212432861, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 230 + }, + { + "epoch": 0.1737242128121607, + "grad_norm": 0.7183733582496643, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 240 + }, + { + "epoch": 0.18096272167933405, + "grad_norm": 0.7082605957984924, + "learning_rate": 0.0002, + "loss": 1.6693, + "step": 250 + }, + { + "epoch": 0.18820123054650742, + "grad_norm": 0.9386326670646667, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 260 + }, + { + "epoch": 0.19543973941368079, + "grad_norm": 0.7332451939582825, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 270 + }, + { + "epoch": 0.20267824828085415, + "grad_norm": 0.7092869877815247, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 280 + }, + { + "epoch": 0.20991675714802752, + "grad_norm": 0.7256413698196411, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 290 + }, + { + "epoch": 0.21715526601520088, + "grad_norm": 0.6398681402206421, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 300 + }, + { + "epoch": 0.22439377488237422, + "grad_norm": 0.6273287534713745, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 310 + }, + { + "epoch": 0.23163228374954759, + "grad_norm": 0.511648416519165, + "learning_rate": 0.0002, + "loss": 1.5115, + "step": 320 + }, + { + "epoch": 0.23887079261672095, + "grad_norm": 0.8677352070808411, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 330 + }, + { + "epoch": 0.24610930148389432, + "grad_norm": 0.6270743012428284, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 340 + }, + { + "epoch": 0.2533478103510677, + "grad_norm": 0.7980281114578247, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 350 + }, + { + "epoch": 0.26058631921824105, + "grad_norm": 0.632486879825592, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 360 + }, + { + "epoch": 0.2678248280854144, + "grad_norm": 0.6527034640312195, + "learning_rate": 0.0002, + "loss": 1.5175, + "step": 370 + }, + { + "epoch": 0.2750633369525878, + "grad_norm": 0.7672118544578552, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 380 + }, + { + "epoch": 0.28230184581976114, + "grad_norm": 0.6035117506980896, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 390 + }, + { + "epoch": 0.2895403546869345, + "grad_norm": 0.5955103039741516, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 400 + }, + { + "epoch": 0.2967788635541079, + "grad_norm": 0.6015191674232483, + "learning_rate": 0.0002, + "loss": 1.558, + "step": 410 + }, + { + "epoch": 0.30401737242128124, + "grad_norm": 0.6380982398986816, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 420 + }, + { + "epoch": 0.3112558812884546, + "grad_norm": 0.6707863211631775, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 430 + }, + { + "epoch": 0.3184943901556279, + "grad_norm": 0.7010176777839661, + "learning_rate": 0.0002, + "loss": 1.4426, + "step": 440 + }, + { + "epoch": 0.3257328990228013, + "grad_norm": 0.8263739943504333, + "learning_rate": 0.0002, + "loss": 1.5572, + "step": 450 + }, + { + "epoch": 0.33297140788997465, + "grad_norm": 0.7253276109695435, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 460 + }, + { + "epoch": 0.340209916757148, + "grad_norm": 0.5238934755325317, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 470 + }, + { + "epoch": 0.3474484256243214, + "grad_norm": 0.7869495749473572, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 480 + }, + { + "epoch": 0.35468693449149474, + "grad_norm": 0.7485215663909912, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 490 + }, + { + "epoch": 0.3619254433586681, + "grad_norm": 0.5413193106651306, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 500 + }, + { + "epoch": 0.3691639522258415, + "grad_norm": 0.7615048885345459, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 510 + }, + { + "epoch": 0.37640246109301484, + "grad_norm": 0.7685340046882629, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 520 + }, + { + "epoch": 0.3836409699601882, + "grad_norm": 0.6379081010818481, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 530 + }, + { + "epoch": 0.39087947882736157, + "grad_norm": 0.7946939468383789, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 540 + }, + { + "epoch": 0.39811798769453494, + "grad_norm": 0.6287278532981873, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 550 + }, + { + "epoch": 0.4053564965617083, + "grad_norm": 0.6811642646789551, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 560 + }, + { + "epoch": 0.41259500542888167, + "grad_norm": 0.671073317527771, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 570 + }, + { + "epoch": 0.41983351429605503, + "grad_norm": 0.6313900351524353, + "learning_rate": 0.0002, + "loss": 1.6753, + "step": 580 + }, + { + "epoch": 0.4270720231632284, + "grad_norm": 0.5291772484779358, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 590 + }, + { + "epoch": 0.43431053203040176, + "grad_norm": 0.62503582239151, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 600 + }, + { + "epoch": 0.4415490408975751, + "grad_norm": 0.5777305364608765, + "learning_rate": 0.0002, + "loss": 1.6276, + "step": 610 + }, + { + "epoch": 0.44878754976474844, + "grad_norm": 0.7013497352600098, + "learning_rate": 0.0002, + "loss": 1.4758, + "step": 620 + }, + { + "epoch": 0.4560260586319218, + "grad_norm": 0.8044822216033936, + "learning_rate": 0.0002, + "loss": 1.4029, + "step": 630 + }, + { + "epoch": 0.46326456749909517, + "grad_norm": 0.672531247138977, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 640 + }, + { + "epoch": 0.47050307636626854, + "grad_norm": 0.6233910322189331, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 650 + }, + { + "epoch": 0.4777415852334419, + "grad_norm": 0.651524543762207, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 660 + }, + { + "epoch": 0.48498009410061527, + "grad_norm": 0.7213939428329468, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 670 + }, + { + "epoch": 0.49221860296778863, + "grad_norm": 0.6541454792022705, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 680 + }, + { + "epoch": 0.499457111834962, + "grad_norm": 0.6568936109542847, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 690 + }, + { + "epoch": 0.5066956207021354, + "grad_norm": 0.7176415324211121, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 700 + }, + { + "epoch": 0.5139341295693087, + "grad_norm": 0.6553855538368225, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 710 + }, + { + "epoch": 0.5211726384364821, + "grad_norm": 0.5654335618019104, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 720 + }, + { + "epoch": 0.5284111473036555, + "grad_norm": 0.5671001672744751, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 730 + }, + { + "epoch": 0.5356496561708288, + "grad_norm": 0.7914412021636963, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 740 + }, + { + "epoch": 0.5428881650380022, + "grad_norm": 0.6172138452529907, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 750 + }, + { + "epoch": 0.5501266739051756, + "grad_norm": 0.6132623553276062, + "learning_rate": 0.0002, + "loss": 1.4018, + "step": 760 + }, + { + "epoch": 0.5573651827723489, + "grad_norm": 0.654000461101532, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 770 + }, + { + "epoch": 0.5646036916395223, + "grad_norm": 0.5691370964050293, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 780 + }, + { + "epoch": 0.5718422005066957, + "grad_norm": 0.7922580242156982, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 790 + }, + { + "epoch": 0.579080709373869, + "grad_norm": 0.6831880211830139, + "learning_rate": 0.0002, + "loss": 1.4521, + "step": 800 + }, + { + "epoch": 0.5863192182410424, + "grad_norm": 0.6740124821662903, + "learning_rate": 0.0002, + "loss": 1.4734, + "step": 810 + }, + { + "epoch": 0.5935577271082157, + "grad_norm": 1.380016803741455, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 820 + }, + { + "epoch": 0.6007962359753891, + "grad_norm": 0.6552878022193909, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 830 + }, + { + "epoch": 0.6080347448425625, + "grad_norm": 0.6649535298347473, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 840 + }, + { + "epoch": 0.6152732537097358, + "grad_norm": 0.561738133430481, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 850 + }, + { + "epoch": 0.6225117625769092, + "grad_norm": 0.6133047938346863, + "learning_rate": 0.0002, + "loss": 1.5364, + "step": 860 + }, + { + "epoch": 0.6297502714440825, + "grad_norm": 0.559843122959137, + "learning_rate": 0.0002, + "loss": 1.3489, + "step": 870 + }, + { + "epoch": 0.6369887803112558, + "grad_norm": 0.6117811799049377, + "learning_rate": 0.0002, + "loss": 1.4878, + "step": 880 + }, + { + "epoch": 0.6442272891784292, + "grad_norm": 0.6209776401519775, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 890 + }, + { + "epoch": 0.6514657980456026, + "grad_norm": 0.6234082579612732, + "learning_rate": 0.0002, + "loss": 1.6747, + "step": 900 + }, + { + "epoch": 0.6587043069127759, + "grad_norm": 0.7623258233070374, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 910 + }, + { + "epoch": 0.6659428157799493, + "grad_norm": 0.6148061752319336, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 920 + }, + { + "epoch": 0.6731813246471227, + "grad_norm": 0.6682973504066467, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 930 + }, + { + "epoch": 0.680419833514296, + "grad_norm": 0.5513041615486145, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 940 + }, + { + "epoch": 0.6876583423814694, + "grad_norm": 0.5197525024414062, + "learning_rate": 0.0002, + "loss": 1.3991, + "step": 950 + }, + { + "epoch": 0.6948968512486428, + "grad_norm": 0.6490758061408997, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 960 + }, + { + "epoch": 0.7021353601158161, + "grad_norm": 0.6450682878494263, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 970 + }, + { + "epoch": 0.7093738689829895, + "grad_norm": 0.6203766465187073, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 980 + }, + { + "epoch": 0.7166123778501629, + "grad_norm": 0.6023609638214111, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 990 + }, + { + "epoch": 0.7238508867173362, + "grad_norm": 0.5765255093574524, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1000 + }, + { + "epoch": 0.7310893955845096, + "grad_norm": 0.6650075316429138, + "learning_rate": 0.0002, + "loss": 1.483, + "step": 1010 + }, + { + "epoch": 0.738327904451683, + "grad_norm": 0.5610854029655457, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 1020 + }, + { + "epoch": 0.7455664133188563, + "grad_norm": 0.7072813510894775, + "learning_rate": 0.0002, + "loss": 1.5248, + "step": 1030 + }, + { + "epoch": 0.7528049221860297, + "grad_norm": 0.6815407872200012, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 1040 + }, + { + "epoch": 0.760043431053203, + "grad_norm": 0.7932390570640564, + "learning_rate": 0.0002, + "loss": 1.4577, + "step": 1050 + }, + { + "epoch": 0.7672819399203764, + "grad_norm": 0.5798183083534241, + "learning_rate": 0.0002, + "loss": 1.4515, + "step": 1060 + }, + { + "epoch": 0.7745204487875498, + "grad_norm": 0.7898504137992859, + "learning_rate": 0.0002, + "loss": 1.5053, + "step": 1070 + }, + { + "epoch": 0.7817589576547231, + "grad_norm": 0.4983280301094055, + "learning_rate": 0.0002, + "loss": 1.4776, + "step": 1080 + }, + { + "epoch": 0.7889974665218965, + "grad_norm": 0.691403329372406, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 1090 + }, + { + "epoch": 0.7962359753890699, + "grad_norm": 0.5394481420516968, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 1100 + }, + { + "epoch": 0.8034744842562432, + "grad_norm": 0.5136822462081909, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 1110 + }, + { + "epoch": 0.8107129931234166, + "grad_norm": 0.6828126907348633, + "learning_rate": 0.0002, + "loss": 1.4902, + "step": 1120 + }, + { + "epoch": 0.81795150199059, + "grad_norm": 0.6799656748771667, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 1130 + }, + { + "epoch": 0.8251900108577633, + "grad_norm": 0.5428406000137329, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 1140 + }, + { + "epoch": 0.8324285197249367, + "grad_norm": 0.4811290502548218, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1150 + }, + { + "epoch": 0.8396670285921101, + "grad_norm": 0.5519434809684753, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 1160 + }, + { + "epoch": 0.8469055374592834, + "grad_norm": 0.9748060703277588, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1170 + }, + { + "epoch": 0.8541440463264568, + "grad_norm": 0.712609589099884, + "learning_rate": 0.0002, + "loss": 1.4309, + "step": 1180 + }, + { + "epoch": 0.8613825551936302, + "grad_norm": 0.6866157054901123, + "learning_rate": 0.0002, + "loss": 1.434, + "step": 1190 + }, + { + "epoch": 0.8686210640608035, + "grad_norm": 0.5068854093551636, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1200 + }, + { + "epoch": 0.8758595729279768, + "grad_norm": 0.6333245038986206, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1210 + }, + { + "epoch": 0.8830980817951501, + "grad_norm": 0.6424421072006226, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 1220 + }, + { + "epoch": 0.8903365906623235, + "grad_norm": 0.4771921932697296, + "learning_rate": 0.0002, + "loss": 1.4186, + "step": 1230 + }, + { + "epoch": 0.8975750995294969, + "grad_norm": 0.5191764235496521, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1240 + }, + { + "epoch": 0.9048136083966702, + "grad_norm": 0.756222128868103, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 1250 + }, + { + "epoch": 0.9120521172638436, + "grad_norm": 0.623823881149292, + "learning_rate": 0.0002, + "loss": 1.4396, + "step": 1260 + }, + { + "epoch": 0.919290626131017, + "grad_norm": 0.8166571259498596, + "learning_rate": 0.0002, + "loss": 1.3097, + "step": 1270 + }, + { + "epoch": 0.9265291349981903, + "grad_norm": 0.6059346795082092, + "learning_rate": 0.0002, + "loss": 1.4625, + "step": 1280 + }, + { + "epoch": 0.9337676438653637, + "grad_norm": 0.5842690467834473, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 1290 + }, + { + "epoch": 0.9410061527325371, + "grad_norm": 0.7649800777435303, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1300 + }, + { + "epoch": 0.9482446615997104, + "grad_norm": 0.6420919895172119, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 1310 + }, + { + "epoch": 0.9554831704668838, + "grad_norm": 0.7011452913284302, + "learning_rate": 0.0002, + "loss": 1.453, + "step": 1320 + }, + { + "epoch": 0.9627216793340572, + "grad_norm": 0.5783746242523193, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1330 + }, + { + "epoch": 0.9699601882012305, + "grad_norm": 0.5973192453384399, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1340 + }, + { + "epoch": 0.9771986970684039, + "grad_norm": 0.6181833744049072, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 1350 + }, + { + "epoch": 0.9844372059355773, + "grad_norm": 0.5563396215438843, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 1360 + }, + { + "epoch": 0.9916757148027506, + "grad_norm": 0.45723360776901245, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1370 + }, + { + "epoch": 0.998914223669924, + "grad_norm": 0.5947498679161072, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 1380 + }, + { + "epoch": 0.9996380745566413, + "eval_loss": 1.480796456336975, + "eval_runtime": 27.3103, + "eval_samples_per_second": 15.965, + "eval_steps_per_second": 2.014, + "step": 1381 + }, + { + "epoch": 1.0061527325370974, + "grad_norm": 0.5599952936172485, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 1390 + }, + { + "epoch": 1.0133912414042707, + "grad_norm": 0.5932008028030396, + "learning_rate": 0.0002, + "loss": 1.4991, + "step": 1400 + }, + { + "epoch": 1.020629750271444, + "grad_norm": 0.6194121837615967, + "learning_rate": 0.0002, + "loss": 1.4506, + "step": 1410 + }, + { + "epoch": 1.0278682591386175, + "grad_norm": 0.6995621919631958, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 1420 + }, + { + "epoch": 1.0351067680057908, + "grad_norm": 0.7905810475349426, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1430 + }, + { + "epoch": 1.0423452768729642, + "grad_norm": 0.7221615314483643, + "learning_rate": 0.0002, + "loss": 1.4414, + "step": 1440 + }, + { + "epoch": 1.0495837857401376, + "grad_norm": 0.6170642375946045, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 1450 + }, + { + "epoch": 1.056822294607311, + "grad_norm": 0.5844094753265381, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 1460 + }, + { + "epoch": 1.0640608034744843, + "grad_norm": 0.7731822729110718, + "learning_rate": 0.0002, + "loss": 1.4871, + "step": 1470 + }, + { + "epoch": 1.0712993123416577, + "grad_norm": 0.4554748237133026, + "learning_rate": 0.0002, + "loss": 1.4286, + "step": 1480 + }, + { + "epoch": 1.078537821208831, + "grad_norm": 0.6923259496688843, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 1490 + }, + { + "epoch": 1.0857763300760044, + "grad_norm": 0.6008219122886658, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 1500 + }, + { + "epoch": 1.0930148389431777, + "grad_norm": 0.6450045704841614, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 1510 + }, + { + "epoch": 1.1002533478103511, + "grad_norm": 0.7833753824234009, + "learning_rate": 0.0002, + "loss": 1.3295, + "step": 1520 + }, + { + "epoch": 1.1074918566775245, + "grad_norm": 0.5076758861541748, + "learning_rate": 0.0002, + "loss": 1.3424, + "step": 1530 + }, + { + "epoch": 1.1147303655446978, + "grad_norm": 0.5661332011222839, + "learning_rate": 0.0002, + "loss": 1.4043, + "step": 1540 + }, + { + "epoch": 1.1219688744118712, + "grad_norm": 0.6526919603347778, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 1550 + }, + { + "epoch": 1.1292073832790446, + "grad_norm": 0.5613082647323608, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1560 + }, + { + "epoch": 1.136445892146218, + "grad_norm": 0.6113885641098022, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 1570 + }, + { + "epoch": 1.1436844010133913, + "grad_norm": 0.6732510328292847, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 1580 + }, + { + "epoch": 1.1509229098805647, + "grad_norm": 0.6146392226219177, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 1590 + }, + { + "epoch": 1.158161418747738, + "grad_norm": 0.6766974329948425, + "learning_rate": 0.0002, + "loss": 1.411, + "step": 1600 + }, + { + "epoch": 1.1653999276149114, + "grad_norm": 0.7621957659721375, + "learning_rate": 0.0002, + "loss": 1.2401, + "step": 1610 + }, + { + "epoch": 1.1726384364820848, + "grad_norm": 0.6959581971168518, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 1620 + }, + { + "epoch": 1.1798769453492581, + "grad_norm": 0.6691278219223022, + "learning_rate": 0.0002, + "loss": 1.382, + "step": 1630 + }, + { + "epoch": 1.1871154542164315, + "grad_norm": 0.4927774965763092, + "learning_rate": 0.0002, + "loss": 1.4147, + "step": 1640 + }, + { + "epoch": 1.1943539630836049, + "grad_norm": 0.7724234461784363, + "learning_rate": 0.0002, + "loss": 1.449, + "step": 1650 + }, + { + "epoch": 1.2015924719507782, + "grad_norm": 0.6817787885665894, + "learning_rate": 0.0002, + "loss": 1.4778, + "step": 1660 + }, + { + "epoch": 1.2088309808179516, + "grad_norm": 0.6500699520111084, + "learning_rate": 0.0002, + "loss": 1.3776, + "step": 1670 + }, + { + "epoch": 1.216069489685125, + "grad_norm": 0.5703568458557129, + "learning_rate": 0.0002, + "loss": 1.3875, + "step": 1680 + }, + { + "epoch": 1.2233079985522983, + "grad_norm": 0.6261579990386963, + "learning_rate": 0.0002, + "loss": 1.4735, + "step": 1690 + }, + { + "epoch": 1.2305465074194717, + "grad_norm": 0.651713490486145, + "learning_rate": 0.0002, + "loss": 1.3898, + "step": 1700 + }, + { + "epoch": 1.237785016286645, + "grad_norm": 0.684399425983429, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 1710 + }, + { + "epoch": 1.2450235251538184, + "grad_norm": 0.6996857523918152, + "learning_rate": 0.0002, + "loss": 1.5027, + "step": 1720 + }, + { + "epoch": 1.2522620340209918, + "grad_norm": 0.7102537751197815, + "learning_rate": 0.0002, + "loss": 1.3326, + "step": 1730 + }, + { + "epoch": 1.2595005428881652, + "grad_norm": 0.45809897780418396, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 1740 + }, + { + "epoch": 1.2667390517553385, + "grad_norm": 0.6377046704292297, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 1750 + }, + { + "epoch": 1.2739775606225119, + "grad_norm": 0.6965704560279846, + "learning_rate": 0.0002, + "loss": 1.3479, + "step": 1760 + }, + { + "epoch": 1.2812160694896852, + "grad_norm": 0.5688214302062988, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 1770 + }, + { + "epoch": 1.2884545783568586, + "grad_norm": 0.6384190320968628, + "learning_rate": 0.0002, + "loss": 1.3967, + "step": 1780 + }, + { + "epoch": 1.295693087224032, + "grad_norm": 0.5629363656044006, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1790 + }, + { + "epoch": 1.3029315960912053, + "grad_norm": 0.6148255467414856, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 1800 + }, + { + "epoch": 1.3101701049583787, + "grad_norm": 0.655580997467041, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 1810 + }, + { + "epoch": 1.3174086138255519, + "grad_norm": 0.5642657279968262, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 1820 + }, + { + "epoch": 1.3246471226927252, + "grad_norm": 0.59607994556427, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 1830 + }, + { + "epoch": 1.3318856315598986, + "grad_norm": 0.5564199090003967, + "learning_rate": 0.0002, + "loss": 1.3274, + "step": 1840 + }, + { + "epoch": 1.339124140427072, + "grad_norm": 0.6949955821037292, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1850 + }, + { + "epoch": 1.3463626492942453, + "grad_norm": 0.7036856412887573, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 1860 + }, + { + "epoch": 1.3536011581614187, + "grad_norm": 0.722062885761261, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 1870 + }, + { + "epoch": 1.360839667028592, + "grad_norm": 0.6098677515983582, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 1880 + }, + { + "epoch": 1.3680781758957654, + "grad_norm": 0.5376402735710144, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 1890 + }, + { + "epoch": 1.3753166847629388, + "grad_norm": 0.6974610090255737, + "learning_rate": 0.0002, + "loss": 1.5071, + "step": 1900 + }, + { + "epoch": 1.3825551936301121, + "grad_norm": 0.6520763635635376, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 1910 + }, + { + "epoch": 1.3897937024972855, + "grad_norm": 0.6604374647140503, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 1920 + }, + { + "epoch": 1.3970322113644589, + "grad_norm": 0.7364398241043091, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1930 + }, + { + "epoch": 1.4042707202316322, + "grad_norm": 0.6849475502967834, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 1940 + }, + { + "epoch": 1.4115092290988056, + "grad_norm": 0.6562670469284058, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 1950 + }, + { + "epoch": 1.418747737965979, + "grad_norm": 0.5695616006851196, + "learning_rate": 0.0002, + "loss": 1.4725, + "step": 1960 + }, + { + "epoch": 1.4259862468331523, + "grad_norm": 0.5244464874267578, + "learning_rate": 0.0002, + "loss": 1.3088, + "step": 1970 + }, + { + "epoch": 1.4332247557003257, + "grad_norm": 0.6347293257713318, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 1980 + }, + { + "epoch": 1.440463264567499, + "grad_norm": 0.5528361201286316, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 1990 + }, + { + "epoch": 1.4477017734346724, + "grad_norm": 0.6987585425376892, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2000 + }, + { + "epoch": 1.4549402823018458, + "grad_norm": 0.6568987369537354, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 2010 + }, + { + "epoch": 1.4621787911690192, + "grad_norm": 0.7665994763374329, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 2020 + }, + { + "epoch": 1.4694173000361925, + "grad_norm": 0.5127707123756409, + "learning_rate": 0.0002, + "loss": 1.244, + "step": 2030 + }, + { + "epoch": 1.476655808903366, + "grad_norm": 0.5406824946403503, + "learning_rate": 0.0002, + "loss": 1.3699, + "step": 2040 + }, + { + "epoch": 1.4838943177705393, + "grad_norm": 0.5990166664123535, + "learning_rate": 0.0002, + "loss": 1.3353, + "step": 2050 + }, + { + "epoch": 1.4911328266377126, + "grad_norm": 0.6186193823814392, + "learning_rate": 0.0002, + "loss": 1.2454, + "step": 2060 + }, + { + "epoch": 1.498371335504886, + "grad_norm": 0.6154307126998901, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2070 + }, + { + "epoch": 1.5056098443720594, + "grad_norm": 0.5606056451797485, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2080 + }, + { + "epoch": 1.5128483532392327, + "grad_norm": 0.5006417036056519, + "learning_rate": 0.0002, + "loss": 1.2405, + "step": 2090 + }, + { + "epoch": 1.520086862106406, + "grad_norm": 0.5968486070632935, + "learning_rate": 0.0002, + "loss": 1.4258, + "step": 2100 + }, + { + "epoch": 1.5273253709735795, + "grad_norm": 0.5835496187210083, + "learning_rate": 0.0002, + "loss": 1.2752, + "step": 2110 + }, + { + "epoch": 1.5345638798407528, + "grad_norm": 0.6753535270690918, + "learning_rate": 0.0002, + "loss": 1.5443, + "step": 2120 + }, + { + "epoch": 1.5418023887079262, + "grad_norm": 0.7299720644950867, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 2130 + }, + { + "epoch": 1.5490408975750996, + "grad_norm": 0.5105988383293152, + "learning_rate": 0.0002, + "loss": 1.2364, + "step": 2140 + }, + { + "epoch": 1.556279406442273, + "grad_norm": 0.5675431489944458, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2150 + }, + { + "epoch": 1.5635179153094463, + "grad_norm": 0.6246723532676697, + "learning_rate": 0.0002, + "loss": 1.4563, + "step": 2160 + }, + { + "epoch": 1.5707564241766196, + "grad_norm": 0.7291720509529114, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 2170 + }, + { + "epoch": 1.577994933043793, + "grad_norm": 0.678114116191864, + "learning_rate": 0.0002, + "loss": 1.5432, + "step": 2180 + }, + { + "epoch": 1.5852334419109664, + "grad_norm": 0.5136260986328125, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 2190 + }, + { + "epoch": 1.5924719507781397, + "grad_norm": 0.6359935998916626, + "learning_rate": 0.0002, + "loss": 1.3271, + "step": 2200 + }, + { + "epoch": 1.599710459645313, + "grad_norm": 0.7650278806686401, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 2210 + }, + { + "epoch": 1.6069489685124865, + "grad_norm": 0.7256110906600952, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 2220 + }, + { + "epoch": 1.6141874773796598, + "grad_norm": 0.688689649105072, + "learning_rate": 0.0002, + "loss": 1.4387, + "step": 2230 + }, + { + "epoch": 1.6214259862468332, + "grad_norm": 0.6045311093330383, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 2240 + }, + { + "epoch": 1.6286644951140063, + "grad_norm": 0.7064604163169861, + "learning_rate": 0.0002, + "loss": 1.4097, + "step": 2250 + }, + { + "epoch": 1.6359030039811797, + "grad_norm": 0.5309562087059021, + "learning_rate": 0.0002, + "loss": 1.3477, + "step": 2260 + }, + { + "epoch": 1.643141512848353, + "grad_norm": 0.5687053203582764, + "learning_rate": 0.0002, + "loss": 1.4022, + "step": 2270 + }, + { + "epoch": 1.6503800217155264, + "grad_norm": 0.535872757434845, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 2280 + }, + { + "epoch": 1.6576185305826998, + "grad_norm": 0.5502381920814514, + "learning_rate": 0.0002, + "loss": 1.3844, + "step": 2290 + }, + { + "epoch": 1.6648570394498732, + "grad_norm": 0.6158602237701416, + "learning_rate": 0.0002, + "loss": 1.3764, + "step": 2300 + }, + { + "epoch": 1.6720955483170465, + "grad_norm": 0.5804675817489624, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 2310 + }, + { + "epoch": 1.67933405718422, + "grad_norm": 0.600742757320404, + "learning_rate": 0.0002, + "loss": 1.2532, + "step": 2320 + }, + { + "epoch": 1.6865725660513933, + "grad_norm": 0.7101941108703613, + "learning_rate": 0.0002, + "loss": 1.477, + "step": 2330 + }, + { + "epoch": 1.6938110749185666, + "grad_norm": 0.7507809996604919, + "learning_rate": 0.0002, + "loss": 1.4849, + "step": 2340 + }, + { + "epoch": 1.70104958378574, + "grad_norm": 0.768502414226532, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 2350 + }, + { + "epoch": 1.7082880926529134, + "grad_norm": 0.4801851212978363, + "learning_rate": 0.0002, + "loss": 1.3332, + "step": 2360 + }, + { + "epoch": 1.7155266015200867, + "grad_norm": 0.5322122573852539, + "learning_rate": 0.0002, + "loss": 1.4158, + "step": 2370 + }, + { + "epoch": 1.72276511038726, + "grad_norm": 0.587661862373352, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2380 + }, + { + "epoch": 1.7300036192544335, + "grad_norm": 0.6073525547981262, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 2390 + }, + { + "epoch": 1.7372421281216068, + "grad_norm": 0.6950460076332092, + "learning_rate": 0.0002, + "loss": 1.2754, + "step": 2400 + }, + { + "epoch": 1.7444806369887802, + "grad_norm": 0.5981102585792542, + "learning_rate": 0.0002, + "loss": 1.3858, + "step": 2410 + }, + { + "epoch": 1.7517191458559536, + "grad_norm": 0.544570803642273, + "learning_rate": 0.0002, + "loss": 1.4075, + "step": 2420 + }, + { + "epoch": 1.758957654723127, + "grad_norm": 0.5304399728775024, + "learning_rate": 0.0002, + "loss": 1.3861, + "step": 2430 + }, + { + "epoch": 1.7661961635903003, + "grad_norm": 0.7921594977378845, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 2440 + }, + { + "epoch": 1.7734346724574737, + "grad_norm": 0.6084808707237244, + "learning_rate": 0.0002, + "loss": 1.3053, + "step": 2450 + }, + { + "epoch": 1.780673181324647, + "grad_norm": 0.8844701051712036, + "learning_rate": 0.0002, + "loss": 1.3781, + "step": 2460 + }, + { + "epoch": 1.7879116901918204, + "grad_norm": 0.5729258060455322, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 2470 + }, + { + "epoch": 1.7951501990589938, + "grad_norm": 0.6303611993789673, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 2480 + }, + { + "epoch": 1.8023887079261671, + "grad_norm": 0.5627942085266113, + "learning_rate": 0.0002, + "loss": 1.3926, + "step": 2490 + }, + { + "epoch": 1.8096272167933405, + "grad_norm": 0.6724274158477783, + "learning_rate": 0.0002, + "loss": 1.3816, + "step": 2500 + }, + { + "epoch": 1.8168657256605139, + "grad_norm": 0.5030826330184937, + "learning_rate": 0.0002, + "loss": 1.2951, + "step": 2510 + }, + { + "epoch": 1.8241042345276872, + "grad_norm": 0.5504099130630493, + "learning_rate": 0.0002, + "loss": 1.2839, + "step": 2520 + }, + { + "epoch": 1.8313427433948606, + "grad_norm": 0.6338945627212524, + "learning_rate": 0.0002, + "loss": 1.4264, + "step": 2530 + }, + { + "epoch": 1.838581252262034, + "grad_norm": 0.5902037620544434, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 2540 + }, + { + "epoch": 1.8458197611292073, + "grad_norm": 0.48814457654953003, + "learning_rate": 0.0002, + "loss": 1.2961, + "step": 2550 + }, + { + "epoch": 1.8530582699963807, + "grad_norm": 0.6216312646865845, + "learning_rate": 0.0002, + "loss": 1.466, + "step": 2560 + }, + { + "epoch": 1.860296778863554, + "grad_norm": 0.635603666305542, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 2570 + }, + { + "epoch": 1.8675352877307274, + "grad_norm": 0.6938216090202332, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2580 + }, + { + "epoch": 1.8747737965979008, + "grad_norm": 0.599557638168335, + "learning_rate": 0.0002, + "loss": 1.5011, + "step": 2590 + }, + { + "epoch": 1.8820123054650741, + "grad_norm": 0.564424455165863, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 2600 + }, + { + "epoch": 1.8892508143322475, + "grad_norm": 0.5430700182914734, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 2610 + }, + { + "epoch": 1.8964893231994209, + "grad_norm": 0.6150169372558594, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 2620 + }, + { + "epoch": 1.9037278320665942, + "grad_norm": 0.48159119486808777, + "learning_rate": 0.0002, + "loss": 1.2474, + "step": 2630 + }, + { + "epoch": 1.9109663409337676, + "grad_norm": 0.5608997941017151, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 2640 + }, + { + "epoch": 1.918204849800941, + "grad_norm": 0.6454501748085022, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 2650 + }, + { + "epoch": 1.9254433586681143, + "grad_norm": 0.5458073616027832, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 2660 + }, + { + "epoch": 1.9326818675352877, + "grad_norm": 0.5328490734100342, + "learning_rate": 0.0002, + "loss": 1.3208, + "step": 2670 + }, + { + "epoch": 1.939920376402461, + "grad_norm": 0.6444696187973022, + "learning_rate": 0.0002, + "loss": 1.4971, + "step": 2680 + }, + { + "epoch": 1.9471588852696344, + "grad_norm": 0.7126023769378662, + "learning_rate": 0.0002, + "loss": 1.5387, + "step": 2690 + }, + { + "epoch": 1.9543973941368078, + "grad_norm": 0.5164045095443726, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 2700 + }, + { + "epoch": 1.9616359030039812, + "grad_norm": 0.5347061157226562, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 2710 + }, + { + "epoch": 1.9688744118711545, + "grad_norm": 0.5297950506210327, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 2720 + }, + { + "epoch": 1.976112920738328, + "grad_norm": 0.6537790298461914, + "learning_rate": 0.0002, + "loss": 1.3566, + "step": 2730 + }, + { + "epoch": 1.9833514296055013, + "grad_norm": 0.5536222457885742, + "learning_rate": 0.0002, + "loss": 1.332, + "step": 2740 + }, + { + "epoch": 1.9905899384726746, + "grad_norm": 0.4856105446815491, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 2750 + }, + { + "epoch": 1.997828447339848, + "grad_norm": 0.6642730832099915, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 2760 + }, + { + "epoch": 2.0, + "eval_loss": 1.4366681575775146, + "eval_runtime": 27.3729, + "eval_samples_per_second": 15.928, + "eval_steps_per_second": 2.009, + "step": 2763 + }, + { + "epoch": 2.0050669562070214, + "grad_norm": 0.740253210067749, + "learning_rate": 0.0002, + "loss": 1.4322, + "step": 2770 + }, + { + "epoch": 2.0123054650741947, + "grad_norm": 0.5826276540756226, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 2780 + }, + { + "epoch": 2.019543973941368, + "grad_norm": 0.607356071472168, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 2790 + }, + { + "epoch": 2.0267824828085415, + "grad_norm": 0.5918063521385193, + "learning_rate": 0.0002, + "loss": 1.2601, + "step": 2800 + }, + { + "epoch": 2.034020991675715, + "grad_norm": 0.5610089898109436, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 2810 + }, + { + "epoch": 2.041259500542888, + "grad_norm": 0.5869926810264587, + "learning_rate": 0.0002, + "loss": 1.2092, + "step": 2820 + }, + { + "epoch": 2.0484980094100615, + "grad_norm": 0.5753467679023743, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 2830 + }, + { + "epoch": 2.055736518277235, + "grad_norm": 0.7096508145332336, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 2840 + }, + { + "epoch": 2.0629750271444083, + "grad_norm": 0.7653635144233704, + "learning_rate": 0.0002, + "loss": 1.1766, + "step": 2850 + }, + { + "epoch": 2.0702135360115816, + "grad_norm": 0.6202841997146606, + "learning_rate": 0.0002, + "loss": 1.2331, + "step": 2860 + }, + { + "epoch": 2.077452044878755, + "grad_norm": 0.6810227632522583, + "learning_rate": 0.0002, + "loss": 1.3298, + "step": 2870 + }, + { + "epoch": 2.0846905537459284, + "grad_norm": 0.7481493353843689, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 2880 + }, + { + "epoch": 2.0919290626131017, + "grad_norm": 0.7089637517929077, + "learning_rate": 0.0002, + "loss": 1.2484, + "step": 2890 + }, + { + "epoch": 2.099167571480275, + "grad_norm": 0.7472923398017883, + "learning_rate": 0.0002, + "loss": 1.3095, + "step": 2900 + }, + { + "epoch": 2.1064060803474485, + "grad_norm": 0.8135465979576111, + "learning_rate": 0.0002, + "loss": 1.304, + "step": 2910 + }, + { + "epoch": 2.113644589214622, + "grad_norm": 0.6097133159637451, + "learning_rate": 0.0002, + "loss": 1.273, + "step": 2920 + }, + { + "epoch": 2.120883098081795, + "grad_norm": 0.5970117449760437, + "learning_rate": 0.0002, + "loss": 1.3384, + "step": 2930 + }, + { + "epoch": 2.1281216069489686, + "grad_norm": 0.6169309616088867, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 2940 + }, + { + "epoch": 2.135360115816142, + "grad_norm": 0.9428738355636597, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 2950 + }, + { + "epoch": 2.1425986246833153, + "grad_norm": 0.5671679973602295, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 2960 + }, + { + "epoch": 2.1498371335504887, + "grad_norm": 0.7007262110710144, + "learning_rate": 0.0002, + "loss": 1.1375, + "step": 2970 + }, + { + "epoch": 2.157075642417662, + "grad_norm": 0.6294044256210327, + "learning_rate": 0.0002, + "loss": 1.2015, + "step": 2980 + }, + { + "epoch": 2.1643141512848354, + "grad_norm": 0.6105241775512695, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 2990 + }, + { + "epoch": 2.1715526601520088, + "grad_norm": 0.557124137878418, + "learning_rate": 0.0002, + "loss": 1.2065, + "step": 3000 + }, + { + "epoch": 2.178791169019182, + "grad_norm": 0.6250392198562622, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 3010 + }, + { + "epoch": 2.1860296778863555, + "grad_norm": 0.645218551158905, + "learning_rate": 0.0002, + "loss": 1.385, + "step": 3020 + }, + { + "epoch": 2.193268186753529, + "grad_norm": 0.9033605456352234, + "learning_rate": 0.0002, + "loss": 1.3928, + "step": 3030 + }, + { + "epoch": 2.2005066956207022, + "grad_norm": 0.5325747132301331, + "learning_rate": 0.0002, + "loss": 1.2458, + "step": 3040 + }, + { + "epoch": 2.2077452044878756, + "grad_norm": 0.6334700584411621, + "learning_rate": 0.0002, + "loss": 1.261, + "step": 3050 + }, + { + "epoch": 2.214983713355049, + "grad_norm": 0.5206325054168701, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 3060 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5987200140953064, + "learning_rate": 0.0002, + "loss": 1.3103, + "step": 3070 + }, + { + "epoch": 2.2294607310893957, + "grad_norm": 0.5893264412879944, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 3080 + }, + { + "epoch": 2.236699239956569, + "grad_norm": 0.6869237422943115, + "learning_rate": 0.0002, + "loss": 1.235, + "step": 3090 + }, + { + "epoch": 2.2439377488237424, + "grad_norm": 0.5040048360824585, + "learning_rate": 0.0002, + "loss": 1.3285, + "step": 3100 + }, + { + "epoch": 2.251176257690916, + "grad_norm": 0.6660613417625427, + "learning_rate": 0.0002, + "loss": 1.3316, + "step": 3110 + }, + { + "epoch": 2.258414766558089, + "grad_norm": 0.5890918970108032, + "learning_rate": 0.0002, + "loss": 1.3108, + "step": 3120 + }, + { + "epoch": 2.2656532754252625, + "grad_norm": 0.6458896994590759, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 3130 + }, + { + "epoch": 2.272891784292436, + "grad_norm": 0.6832690834999084, + "learning_rate": 0.0002, + "loss": 1.4151, + "step": 3140 + }, + { + "epoch": 2.2801302931596092, + "grad_norm": 0.833908200263977, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 3150 + }, + { + "epoch": 2.2873688020267826, + "grad_norm": 0.4596034586429596, + "learning_rate": 0.0002, + "loss": 1.2931, + "step": 3160 + }, + { + "epoch": 2.294607310893956, + "grad_norm": 0.9130966067314148, + "learning_rate": 0.0002, + "loss": 1.449, + "step": 3170 + }, + { + "epoch": 2.3018458197611293, + "grad_norm": 0.7143292427062988, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 3180 + }, + { + "epoch": 2.3090843286283027, + "grad_norm": 0.5388900637626648, + "learning_rate": 0.0002, + "loss": 1.2692, + "step": 3190 + }, + { + "epoch": 2.316322837495476, + "grad_norm": 0.5607513189315796, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 3200 + }, + { + "epoch": 2.3235613463626494, + "grad_norm": 0.6795142292976379, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 3210 + }, + { + "epoch": 2.330799855229823, + "grad_norm": 0.6561070680618286, + "learning_rate": 0.0002, + "loss": 1.3042, + "step": 3220 + }, + { + "epoch": 2.338038364096996, + "grad_norm": 0.8858118057250977, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 3230 + }, + { + "epoch": 2.3452768729641695, + "grad_norm": 0.6604151725769043, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3240 + }, + { + "epoch": 2.352515381831343, + "grad_norm": 0.6755785346031189, + "learning_rate": 0.0002, + "loss": 1.4004, + "step": 3250 + }, + { + "epoch": 2.3597538906985163, + "grad_norm": 0.6981677412986755, + "learning_rate": 0.0002, + "loss": 1.2503, + "step": 3260 + }, + { + "epoch": 2.3669923995656896, + "grad_norm": 0.6338568329811096, + "learning_rate": 0.0002, + "loss": 1.3078, + "step": 3270 + }, + { + "epoch": 2.374230908432863, + "grad_norm": 0.5754265785217285, + "learning_rate": 0.0002, + "loss": 1.285, + "step": 3280 + }, + { + "epoch": 2.3814694173000364, + "grad_norm": 0.7533153295516968, + "learning_rate": 0.0002, + "loss": 1.2924, + "step": 3290 + }, + { + "epoch": 2.3887079261672097, + "grad_norm": 0.675065279006958, + "learning_rate": 0.0002, + "loss": 1.3711, + "step": 3300 + }, + { + "epoch": 2.395946435034383, + "grad_norm": 0.5686452984809875, + "learning_rate": 0.0002, + "loss": 1.3548, + "step": 3310 + }, + { + "epoch": 2.4031849439015565, + "grad_norm": 0.8129481673240662, + "learning_rate": 0.0002, + "loss": 1.1998, + "step": 3320 + }, + { + "epoch": 2.41042345276873, + "grad_norm": 0.6615934371948242, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 3330 + }, + { + "epoch": 2.417661961635903, + "grad_norm": 0.6678834557533264, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 3340 + }, + { + "epoch": 2.4249004705030766, + "grad_norm": 0.5581308007240295, + "learning_rate": 0.0002, + "loss": 1.2381, + "step": 3350 + }, + { + "epoch": 2.43213897937025, + "grad_norm": 0.6098920106887817, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 3360 + }, + { + "epoch": 2.4393774882374233, + "grad_norm": 0.8101736903190613, + "learning_rate": 0.0002, + "loss": 1.3692, + "step": 3370 + }, + { + "epoch": 2.4466159971045967, + "grad_norm": 0.6621488928794861, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 3380 + }, + { + "epoch": 2.45385450597177, + "grad_norm": 0.8693289160728455, + "learning_rate": 0.0002, + "loss": 1.4579, + "step": 3390 + }, + { + "epoch": 2.4610930148389434, + "grad_norm": 0.6724580526351929, + "learning_rate": 0.0002, + "loss": 1.3644, + "step": 3400 + }, + { + "epoch": 2.4683315237061167, + "grad_norm": 0.6776891946792603, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 3410 + }, + { + "epoch": 2.47557003257329, + "grad_norm": 0.7214453816413879, + "learning_rate": 0.0002, + "loss": 1.2937, + "step": 3420 + }, + { + "epoch": 2.4828085414404635, + "grad_norm": 0.8390451073646545, + "learning_rate": 0.0002, + "loss": 1.4051, + "step": 3430 + }, + { + "epoch": 2.490047050307637, + "grad_norm": 0.7130982279777527, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 3440 + }, + { + "epoch": 2.49728555917481, + "grad_norm": 0.8873937129974365, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 3450 + }, + { + "epoch": 2.5045240680419836, + "grad_norm": 0.725185751914978, + "learning_rate": 0.0002, + "loss": 1.1429, + "step": 3460 + }, + { + "epoch": 2.511762576909157, + "grad_norm": 0.6120352149009705, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 3470 + }, + { + "epoch": 2.5190010857763303, + "grad_norm": 0.7713613510131836, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 3480 + }, + { + "epoch": 2.5262395946435037, + "grad_norm": 0.895309567451477, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 3490 + }, + { + "epoch": 2.533478103510677, + "grad_norm": 0.9631021022796631, + "learning_rate": 0.0002, + "loss": 1.3043, + "step": 3500 + }, + { + "epoch": 2.5407166123778504, + "grad_norm": 0.7475683093070984, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 3510 + }, + { + "epoch": 2.5479551212450238, + "grad_norm": 0.7271341681480408, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 3520 + }, + { + "epoch": 2.555193630112197, + "grad_norm": 0.6979510188102722, + "learning_rate": 0.0002, + "loss": 1.304, + "step": 3530 + }, + { + "epoch": 2.5624321389793705, + "grad_norm": 0.6504196524620056, + "learning_rate": 0.0002, + "loss": 1.2353, + "step": 3540 + }, + { + "epoch": 2.569670647846544, + "grad_norm": 0.7226675748825073, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 3550 + }, + { + "epoch": 2.5769091567137172, + "grad_norm": 0.6143222451210022, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 3560 + }, + { + "epoch": 2.5841476655808906, + "grad_norm": 0.7245154976844788, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 3570 + }, + { + "epoch": 2.591386174448064, + "grad_norm": 0.943540632724762, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 3580 + }, + { + "epoch": 2.5986246833152373, + "grad_norm": 0.7707241773605347, + "learning_rate": 0.0002, + "loss": 1.3034, + "step": 3590 + }, + { + "epoch": 2.6058631921824107, + "grad_norm": 0.6705001592636108, + "learning_rate": 0.0002, + "loss": 1.3063, + "step": 3600 + }, + { + "epoch": 2.613101701049584, + "grad_norm": 0.6360933780670166, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 3610 + }, + { + "epoch": 2.6203402099167574, + "grad_norm": 0.5846424698829651, + "learning_rate": 0.0002, + "loss": 1.1844, + "step": 3620 + }, + { + "epoch": 2.6275787187839303, + "grad_norm": 0.5958625674247742, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3630 + }, + { + "epoch": 2.6348172276511037, + "grad_norm": 0.6819243431091309, + "learning_rate": 0.0002, + "loss": 1.3599, + "step": 3640 + }, + { + "epoch": 2.642055736518277, + "grad_norm": 0.7033445835113525, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 3650 + }, + { + "epoch": 2.6492942453854504, + "grad_norm": 0.6134849786758423, + "learning_rate": 0.0002, + "loss": 1.3392, + "step": 3660 + }, + { + "epoch": 2.656532754252624, + "grad_norm": 0.658009946346283, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 3670 + }, + { + "epoch": 2.663771263119797, + "grad_norm": 0.6280999779701233, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 3680 + }, + { + "epoch": 2.6710097719869705, + "grad_norm": 0.5536085963249207, + "learning_rate": 0.0002, + "loss": 1.2995, + "step": 3690 + }, + { + "epoch": 2.678248280854144, + "grad_norm": 0.8603981733322144, + "learning_rate": 0.0002, + "loss": 1.2044, + "step": 3700 + }, + { + "epoch": 2.6854867897213173, + "grad_norm": 0.5509994626045227, + "learning_rate": 0.0002, + "loss": 1.3879, + "step": 3710 + }, + { + "epoch": 2.6927252985884906, + "grad_norm": 0.9093621969223022, + "learning_rate": 0.0002, + "loss": 1.3253, + "step": 3720 + }, + { + "epoch": 2.699963807455664, + "grad_norm": 0.7525952458381653, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 3730 + }, + { + "epoch": 2.7072023163228374, + "grad_norm": 0.6737023591995239, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 3740 + }, + { + "epoch": 2.7144408251900107, + "grad_norm": 0.8656924962997437, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 3750 + }, + { + "epoch": 2.721679334057184, + "grad_norm": 0.7494133114814758, + "learning_rate": 0.0002, + "loss": 1.2342, + "step": 3760 + }, + { + "epoch": 2.7289178429243575, + "grad_norm": 0.5725520849227905, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 3770 + }, + { + "epoch": 2.736156351791531, + "grad_norm": 0.836412787437439, + "learning_rate": 0.0002, + "loss": 1.28, + "step": 3780 + }, + { + "epoch": 2.743394860658704, + "grad_norm": 0.6893242597579956, + "learning_rate": 0.0002, + "loss": 1.3784, + "step": 3790 + }, + { + "epoch": 2.7506333695258776, + "grad_norm": 0.6696223020553589, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 3800 + }, + { + "epoch": 2.757871878393051, + "grad_norm": 0.6483015418052673, + "learning_rate": 0.0002, + "loss": 1.2449, + "step": 3810 + }, + { + "epoch": 2.7651103872602243, + "grad_norm": 0.8084456920623779, + "learning_rate": 0.0002, + "loss": 1.3282, + "step": 3820 + }, + { + "epoch": 2.7723488961273977, + "grad_norm": 0.6601949334144592, + "learning_rate": 0.0002, + "loss": 1.3694, + "step": 3830 + }, + { + "epoch": 2.779587404994571, + "grad_norm": 0.6905533671379089, + "learning_rate": 0.0002, + "loss": 1.3568, + "step": 3840 + }, + { + "epoch": 2.7868259138617444, + "grad_norm": 0.619318425655365, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 3850 + }, + { + "epoch": 2.7940644227289178, + "grad_norm": 0.5994023084640503, + "learning_rate": 0.0002, + "loss": 1.2551, + "step": 3860 + }, + { + "epoch": 2.801302931596091, + "grad_norm": 0.5627168416976929, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 3870 + }, + { + "epoch": 2.8085414404632645, + "grad_norm": 0.6001605987548828, + "learning_rate": 0.0002, + "loss": 1.3921, + "step": 3880 + }, + { + "epoch": 2.815779949330438, + "grad_norm": 0.6022412776947021, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 3890 + }, + { + "epoch": 2.823018458197611, + "grad_norm": 0.6832426190376282, + "learning_rate": 0.0002, + "loss": 1.2765, + "step": 3900 + }, + { + "epoch": 2.8302569670647846, + "grad_norm": 0.5936811566352844, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 3910 + }, + { + "epoch": 2.837495475931958, + "grad_norm": 0.6960572600364685, + "learning_rate": 0.0002, + "loss": 1.1707, + "step": 3920 + }, + { + "epoch": 2.8447339847991313, + "grad_norm": 0.5913406610488892, + "learning_rate": 0.0002, + "loss": 1.4063, + "step": 3930 + }, + { + "epoch": 2.8519724936663047, + "grad_norm": 0.678154706954956, + "learning_rate": 0.0002, + "loss": 1.3245, + "step": 3940 + }, + { + "epoch": 2.859211002533478, + "grad_norm": 0.7898936867713928, + "learning_rate": 0.0002, + "loss": 1.366, + "step": 3950 + }, + { + "epoch": 2.8664495114006514, + "grad_norm": 0.9234195351600647, + "learning_rate": 0.0002, + "loss": 1.3948, + "step": 3960 + }, + { + "epoch": 2.8736880202678248, + "grad_norm": 0.5960825085639954, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 3970 + }, + { + "epoch": 2.880926529134998, + "grad_norm": 0.677118182182312, + "learning_rate": 0.0002, + "loss": 1.3127, + "step": 3980 + }, + { + "epoch": 2.8881650380021715, + "grad_norm": 0.6505142450332642, + "learning_rate": 0.0002, + "loss": 1.2652, + "step": 3990 + }, + { + "epoch": 2.895403546869345, + "grad_norm": 0.550826907157898, + "learning_rate": 0.0002, + "loss": 1.2078, + "step": 4000 + }, + { + "epoch": 2.9026420557365182, + "grad_norm": 0.6209215521812439, + "learning_rate": 0.0002, + "loss": 1.1811, + "step": 4010 + }, + { + "epoch": 2.9098805646036916, + "grad_norm": 0.6549018025398254, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 4020 + }, + { + "epoch": 2.917119073470865, + "grad_norm": 0.570682168006897, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 4030 + }, + { + "epoch": 2.9243575823380383, + "grad_norm": 1.1807632446289062, + "learning_rate": 0.0002, + "loss": 1.0832, + "step": 4040 + }, + { + "epoch": 2.9315960912052117, + "grad_norm": 0.7058857679367065, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 4050 + }, + { + "epoch": 2.938834600072385, + "grad_norm": 0.5542812943458557, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 4060 + }, + { + "epoch": 2.9460731089395584, + "grad_norm": 0.63167804479599, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 4070 + }, + { + "epoch": 2.953311617806732, + "grad_norm": 0.5702962279319763, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 4080 + }, + { + "epoch": 2.960550126673905, + "grad_norm": 0.620944082736969, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 4090 + }, + { + "epoch": 2.9677886355410785, + "grad_norm": 0.5866289734840393, + "learning_rate": 0.0002, + "loss": 1.2864, + "step": 4100 + }, + { + "epoch": 2.975027144408252, + "grad_norm": 0.560170590877533, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 4110 + }, + { + "epoch": 2.9822656532754253, + "grad_norm": 0.675082802772522, + "learning_rate": 0.0002, + "loss": 1.2071, + "step": 4120 + }, + { + "epoch": 2.9895041621425986, + "grad_norm": 0.62708580493927, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 4130 + }, + { + "epoch": 2.996742671009772, + "grad_norm": 0.7893929481506348, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 4140 + }, + { + "epoch": 2.9996380745566413, + "eval_loss": 1.4217946529388428, + "eval_runtime": 27.1596, + "eval_samples_per_second": 16.053, + "eval_steps_per_second": 2.025, + "step": 4144 + }, + { + "epoch": 3.0039811798769454, + "grad_norm": 0.7043836116790771, + "learning_rate": 0.0002, + "loss": 1.2152, + "step": 4150 + }, + { + "epoch": 3.0112196887441187, + "grad_norm": 0.6806283593177795, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 4160 + }, + { + "epoch": 3.018458197611292, + "grad_norm": 0.7684550285339355, + "learning_rate": 0.0002, + "loss": 1.292, + "step": 4170 + }, + { + "epoch": 3.0256967064784654, + "grad_norm": 0.7895237803459167, + "learning_rate": 0.0002, + "loss": 1.3467, + "step": 4180 + }, + { + "epoch": 3.032935215345639, + "grad_norm": 0.7464531064033508, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 4190 + }, + { + "epoch": 3.040173724212812, + "grad_norm": 0.9358500838279724, + "learning_rate": 0.0002, + "loss": 1.1614, + "step": 4200 + }, + { + "epoch": 3.0474122330799855, + "grad_norm": 1.1066628694534302, + "learning_rate": 0.0002, + "loss": 1.1834, + "step": 4210 + }, + { + "epoch": 3.054650741947159, + "grad_norm": 0.6663267612457275, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 4220 + }, + { + "epoch": 3.0618892508143323, + "grad_norm": 0.6669464707374573, + "learning_rate": 0.0002, + "loss": 1.1707, + "step": 4230 + }, + { + "epoch": 3.0691277596815056, + "grad_norm": 0.7052164077758789, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 4240 + }, + { + "epoch": 3.076366268548679, + "grad_norm": 0.6118432879447937, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 4250 + }, + { + "epoch": 3.0836047774158524, + "grad_norm": 0.6915903687477112, + "learning_rate": 0.0002, + "loss": 1.1526, + "step": 4260 + }, + { + "epoch": 3.0908432862830257, + "grad_norm": 0.7441644668579102, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 4270 + }, + { + "epoch": 3.098081795150199, + "grad_norm": 0.823850691318512, + "learning_rate": 0.0002, + "loss": 1.1672, + "step": 4280 + }, + { + "epoch": 3.1053203040173725, + "grad_norm": 0.9677883386611938, + "learning_rate": 0.0002, + "loss": 1.2655, + "step": 4290 + }, + { + "epoch": 3.112558812884546, + "grad_norm": 0.7002579569816589, + "learning_rate": 0.0002, + "loss": 1.1794, + "step": 4300 + }, + { + "epoch": 3.119797321751719, + "grad_norm": 0.778789758682251, + "learning_rate": 0.0002, + "loss": 1.135, + "step": 4310 + }, + { + "epoch": 3.1270358306188926, + "grad_norm": 0.7236007452011108, + "learning_rate": 0.0002, + "loss": 1.0818, + "step": 4320 + }, + { + "epoch": 3.134274339486066, + "grad_norm": 0.8809133768081665, + "learning_rate": 0.0002, + "loss": 1.1803, + "step": 4330 + }, + { + "epoch": 3.1415128483532393, + "grad_norm": 0.7924913167953491, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 4340 + }, + { + "epoch": 3.1487513572204127, + "grad_norm": 0.7437422275543213, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 4350 + }, + { + "epoch": 3.155989866087586, + "grad_norm": 0.6428450345993042, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 4360 + }, + { + "epoch": 3.1632283749547594, + "grad_norm": 0.7922873497009277, + "learning_rate": 0.0002, + "loss": 1.3032, + "step": 4370 + }, + { + "epoch": 3.1704668838219328, + "grad_norm": 0.5252506732940674, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 4380 + }, + { + "epoch": 3.177705392689106, + "grad_norm": 0.8570457696914673, + "learning_rate": 0.0002, + "loss": 1.1297, + "step": 4390 + }, + { + "epoch": 3.1849439015562795, + "grad_norm": 0.7218987345695496, + "learning_rate": 0.0002, + "loss": 1.0994, + "step": 4400 + }, + { + "epoch": 3.192182410423453, + "grad_norm": 0.6921393275260925, + "learning_rate": 0.0002, + "loss": 1.2891, + "step": 4410 + }, + { + "epoch": 3.199420919290626, + "grad_norm": 0.7386137843132019, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 4420 + }, + { + "epoch": 3.2066594281577996, + "grad_norm": 0.6227759122848511, + "learning_rate": 0.0002, + "loss": 1.1654, + "step": 4430 + }, + { + "epoch": 3.213897937024973, + "grad_norm": 0.7180278897285461, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 4440 + }, + { + "epoch": 3.2211364458921463, + "grad_norm": 0.745830774307251, + "learning_rate": 0.0002, + "loss": 1.1757, + "step": 4450 + }, + { + "epoch": 3.2283749547593197, + "grad_norm": 0.6766072511672974, + "learning_rate": 0.0002, + "loss": 1.234, + "step": 4460 + }, + { + "epoch": 3.235613463626493, + "grad_norm": 0.8325067162513733, + "learning_rate": 0.0002, + "loss": 1.1999, + "step": 4470 + }, + { + "epoch": 3.2428519724936664, + "grad_norm": 0.7148305177688599, + "learning_rate": 0.0002, + "loss": 1.1606, + "step": 4480 + }, + { + "epoch": 3.25009048136084, + "grad_norm": 0.7752676010131836, + "learning_rate": 0.0002, + "loss": 1.1383, + "step": 4490 + }, + { + "epoch": 3.257328990228013, + "grad_norm": 0.6776860952377319, + "learning_rate": 0.0002, + "loss": 1.3006, + "step": 4500 + }, + { + "epoch": 3.2645674990951865, + "grad_norm": 0.704359769821167, + "learning_rate": 0.0002, + "loss": 1.0796, + "step": 4510 + }, + { + "epoch": 3.27180600796236, + "grad_norm": 0.6880282163619995, + "learning_rate": 0.0002, + "loss": 1.2496, + "step": 4520 + }, + { + "epoch": 3.2790445168295332, + "grad_norm": 0.8179270029067993, + "learning_rate": 0.0002, + "loss": 1.0947, + "step": 4530 + }, + { + "epoch": 3.2862830256967066, + "grad_norm": 0.6718448996543884, + "learning_rate": 0.0002, + "loss": 1.1909, + "step": 4540 + }, + { + "epoch": 3.29352153456388, + "grad_norm": 0.8300657868385315, + "learning_rate": 0.0002, + "loss": 1.2708, + "step": 4550 + }, + { + "epoch": 3.3007600434310533, + "grad_norm": 0.6433690786361694, + "learning_rate": 0.0002, + "loss": 1.2594, + "step": 4560 + }, + { + "epoch": 3.3079985522982267, + "grad_norm": 0.690262496471405, + "learning_rate": 0.0002, + "loss": 1.2479, + "step": 4570 + }, + { + "epoch": 3.3152370611654, + "grad_norm": 0.7022852301597595, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 4580 + }, + { + "epoch": 3.3224755700325734, + "grad_norm": 0.6438387632369995, + "learning_rate": 0.0002, + "loss": 1.0844, + "step": 4590 + }, + { + "epoch": 3.329714078899747, + "grad_norm": 0.6866899132728577, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 4600 + }, + { + "epoch": 3.33695258776692, + "grad_norm": 0.8233968019485474, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 4610 + }, + { + "epoch": 3.3441910966340935, + "grad_norm": 0.7251574993133545, + "learning_rate": 0.0002, + "loss": 1.1855, + "step": 4620 + }, + { + "epoch": 3.351429605501267, + "grad_norm": 0.7855110168457031, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 4630 + }, + { + "epoch": 3.3586681143684403, + "grad_norm": 0.8487356305122375, + "learning_rate": 0.0002, + "loss": 1.2922, + "step": 4640 + }, + { + "epoch": 3.3659066232356136, + "grad_norm": 0.6429011225700378, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 4650 + }, + { + "epoch": 3.373145132102787, + "grad_norm": 0.7095270156860352, + "learning_rate": 0.0002, + "loss": 1.129, + "step": 4660 + }, + { + "epoch": 3.3803836409699604, + "grad_norm": 0.6792303323745728, + "learning_rate": 0.0002, + "loss": 1.262, + "step": 4670 + }, + { + "epoch": 3.3876221498371337, + "grad_norm": 0.6784825921058655, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 4680 + }, + { + "epoch": 3.394860658704307, + "grad_norm": 0.6362888216972351, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 4690 + }, + { + "epoch": 3.4020991675714805, + "grad_norm": 0.7794778943061829, + "learning_rate": 0.0002, + "loss": 1.2165, + "step": 4700 + }, + { + "epoch": 3.409337676438654, + "grad_norm": 0.7287485003471375, + "learning_rate": 0.0002, + "loss": 1.0644, + "step": 4710 + }, + { + "epoch": 3.416576185305827, + "grad_norm": 0.6481451392173767, + "learning_rate": 0.0002, + "loss": 1.2925, + "step": 4720 + }, + { + "epoch": 3.4238146941730006, + "grad_norm": 0.9200371503829956, + "learning_rate": 0.0002, + "loss": 1.2121, + "step": 4730 + }, + { + "epoch": 3.431053203040174, + "grad_norm": 1.074180245399475, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 4740 + }, + { + "epoch": 3.438291711907347, + "grad_norm": 0.6722986698150635, + "learning_rate": 0.0002, + "loss": 1.0421, + "step": 4750 + }, + { + "epoch": 3.44553022077452, + "grad_norm": 0.7945933938026428, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 4760 + }, + { + "epoch": 3.4527687296416936, + "grad_norm": 0.7624640464782715, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 4770 + }, + { + "epoch": 3.460007238508867, + "grad_norm": 0.7763656377792358, + "learning_rate": 0.0002, + "loss": 1.2428, + "step": 4780 + }, + { + "epoch": 3.4672457473760403, + "grad_norm": 0.7736947536468506, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 4790 + }, + { + "epoch": 3.4744842562432137, + "grad_norm": 0.8450354933738708, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 4800 + }, + { + "epoch": 3.481722765110387, + "grad_norm": 0.6480133533477783, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 4810 + }, + { + "epoch": 3.4889612739775604, + "grad_norm": 0.8437445759773254, + "learning_rate": 0.0002, + "loss": 1.1882, + "step": 4820 + }, + { + "epoch": 3.4961997828447338, + "grad_norm": 0.7781730890274048, + "learning_rate": 0.0002, + "loss": 1.1519, + "step": 4830 + }, + { + "epoch": 3.503438291711907, + "grad_norm": 0.8523228168487549, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 4840 + }, + { + "epoch": 3.5106768005790805, + "grad_norm": 0.6236732006072998, + "learning_rate": 0.0002, + "loss": 1.1672, + "step": 4850 + }, + { + "epoch": 3.517915309446254, + "grad_norm": 0.7500787377357483, + "learning_rate": 0.0002, + "loss": 1.1926, + "step": 4860 + }, + { + "epoch": 3.5251538183134272, + "grad_norm": 0.7665374875068665, + "learning_rate": 0.0002, + "loss": 1.1998, + "step": 4870 + }, + { + "epoch": 3.5323923271806006, + "grad_norm": 0.787857711315155, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 4880 + }, + { + "epoch": 3.539630836047774, + "grad_norm": 0.970595121383667, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 4890 + }, + { + "epoch": 3.5468693449149473, + "grad_norm": 0.6409347057342529, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 4900 + }, + { + "epoch": 3.5541078537821207, + "grad_norm": 0.888551652431488, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 4910 + }, + { + "epoch": 3.561346362649294, + "grad_norm": 1.0808377265930176, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 4920 + }, + { + "epoch": 3.5685848715164674, + "grad_norm": 0.7501053214073181, + "learning_rate": 0.0002, + "loss": 1.2564, + "step": 4930 + }, + { + "epoch": 3.575823380383641, + "grad_norm": 0.7375240325927734, + "learning_rate": 0.0002, + "loss": 1.2351, + "step": 4940 + }, + { + "epoch": 3.583061889250814, + "grad_norm": 0.7075039744377136, + "learning_rate": 0.0002, + "loss": 1.3568, + "step": 4950 + }, + { + "epoch": 3.5903003981179875, + "grad_norm": 0.939337432384491, + "learning_rate": 0.0002, + "loss": 1.3355, + "step": 4960 + }, + { + "epoch": 3.597538906985161, + "grad_norm": 0.6717396974563599, + "learning_rate": 0.0002, + "loss": 1.1722, + "step": 4970 + }, + { + "epoch": 3.6047774158523342, + "grad_norm": 0.7141643762588501, + "learning_rate": 0.0002, + "loss": 1.1186, + "step": 4980 + }, + { + "epoch": 3.6120159247195076, + "grad_norm": 0.7109216451644897, + "learning_rate": 0.0002, + "loss": 1.1011, + "step": 4990 + }, + { + "epoch": 3.619254433586681, + "grad_norm": 0.7020776867866516, + "learning_rate": 0.0002, + "loss": 1.2178, + "step": 5000 + }, + { + "epoch": 3.6264929424538543, + "grad_norm": 0.7158873677253723, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 5010 + }, + { + "epoch": 3.6337314513210277, + "grad_norm": 0.7062035202980042, + "learning_rate": 0.0002, + "loss": 1.2624, + "step": 5020 + }, + { + "epoch": 3.640969960188201, + "grad_norm": 0.7081155776977539, + "learning_rate": 0.0002, + "loss": 1.0224, + "step": 5030 + }, + { + "epoch": 3.6482084690553744, + "grad_norm": 1.2210607528686523, + "learning_rate": 0.0002, + "loss": 1.2195, + "step": 5040 + }, + { + "epoch": 3.655446977922548, + "grad_norm": 0.6650236248970032, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 5050 + }, + { + "epoch": 3.662685486789721, + "grad_norm": 0.6884829998016357, + "learning_rate": 0.0002, + "loss": 1.1072, + "step": 5060 + }, + { + "epoch": 3.6699239956568945, + "grad_norm": 0.7317819595336914, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 5070 + }, + { + "epoch": 3.677162504524068, + "grad_norm": 0.7406691908836365, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 5080 + }, + { + "epoch": 3.6844010133912413, + "grad_norm": 0.9009454250335693, + "learning_rate": 0.0002, + "loss": 1.2949, + "step": 5090 + }, + { + "epoch": 3.6916395222584146, + "grad_norm": 0.8189385533332825, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 5100 + }, + { + "epoch": 3.698878031125588, + "grad_norm": 1.0793628692626953, + "learning_rate": 0.0002, + "loss": 1.3408, + "step": 5110 + }, + { + "epoch": 3.7061165399927614, + "grad_norm": 0.8593027591705322, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 5120 + }, + { + "epoch": 3.7133550488599347, + "grad_norm": 0.8481812477111816, + "learning_rate": 0.0002, + "loss": 1.2141, + "step": 5130 + }, + { + "epoch": 3.720593557727108, + "grad_norm": 0.6527451276779175, + "learning_rate": 0.0002, + "loss": 1.125, + "step": 5140 + }, + { + "epoch": 3.7278320665942815, + "grad_norm": 0.9220114350318909, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 5150 + }, + { + "epoch": 3.735070575461455, + "grad_norm": 1.0842019319534302, + "learning_rate": 0.0002, + "loss": 1.2267, + "step": 5160 + }, + { + "epoch": 3.742309084328628, + "grad_norm": 0.965453565120697, + "learning_rate": 0.0002, + "loss": 1.3083, + "step": 5170 + }, + { + "epoch": 3.7495475931958016, + "grad_norm": 0.9903319478034973, + "learning_rate": 0.0002, + "loss": 1.1772, + "step": 5180 + }, + { + "epoch": 3.756786102062975, + "grad_norm": 0.7434818148612976, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 5190 + }, + { + "epoch": 3.7640246109301483, + "grad_norm": 0.6717280745506287, + "learning_rate": 0.0002, + "loss": 1.2631, + "step": 5200 + }, + { + "epoch": 3.7712631197973217, + "grad_norm": 0.7754665613174438, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 5210 + }, + { + "epoch": 3.778501628664495, + "grad_norm": 1.028374433517456, + "learning_rate": 0.0002, + "loss": 1.305, + "step": 5220 + }, + { + "epoch": 3.7857401375316684, + "grad_norm": 0.6026996374130249, + "learning_rate": 0.0002, + "loss": 1.1866, + "step": 5230 + }, + { + "epoch": 3.7929786463988417, + "grad_norm": 0.6978490948677063, + "learning_rate": 0.0002, + "loss": 1.1901, + "step": 5240 + }, + { + "epoch": 3.800217155266015, + "grad_norm": 0.7303446531295776, + "learning_rate": 0.0002, + "loss": 1.2576, + "step": 5250 + }, + { + "epoch": 3.8074556641331885, + "grad_norm": 1.0734210014343262, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 5260 + }, + { + "epoch": 3.814694173000362, + "grad_norm": 0.6383201479911804, + "learning_rate": 0.0002, + "loss": 1.1137, + "step": 5270 + }, + { + "epoch": 3.821932681867535, + "grad_norm": 0.7742630243301392, + "learning_rate": 0.0002, + "loss": 1.0904, + "step": 5280 + }, + { + "epoch": 3.8291711907347086, + "grad_norm": 0.8477074503898621, + "learning_rate": 0.0002, + "loss": 1.2232, + "step": 5290 + }, + { + "epoch": 3.836409699601882, + "grad_norm": 0.6675317883491516, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 5300 + }, + { + "epoch": 3.8436482084690553, + "grad_norm": 0.7515445351600647, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 5310 + }, + { + "epoch": 3.8508867173362287, + "grad_norm": 1.1441220045089722, + "learning_rate": 0.0002, + "loss": 1.2569, + "step": 5320 + }, + { + "epoch": 3.858125226203402, + "grad_norm": 0.7968795895576477, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 5330 + }, + { + "epoch": 3.8653637350705754, + "grad_norm": 0.7842824459075928, + "learning_rate": 0.0002, + "loss": 1.232, + "step": 5340 + }, + { + "epoch": 3.8726022439377488, + "grad_norm": 0.8272225260734558, + "learning_rate": 0.0002, + "loss": 1.1847, + "step": 5350 + }, + { + "epoch": 3.879840752804922, + "grad_norm": 0.8413397669792175, + "learning_rate": 0.0002, + "loss": 1.1381, + "step": 5360 + }, + { + "epoch": 3.8870792616720955, + "grad_norm": 1.141764760017395, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 5370 + }, + { + "epoch": 3.894317770539269, + "grad_norm": 0.9826975464820862, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 5380 + }, + { + "epoch": 3.9015562794064422, + "grad_norm": 0.8598255515098572, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 5390 + }, + { + "epoch": 3.9087947882736156, + "grad_norm": 0.6271058320999146, + "learning_rate": 0.0002, + "loss": 1.1247, + "step": 5400 + }, + { + "epoch": 3.916033297140789, + "grad_norm": 0.6379870772361755, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 5410 + }, + { + "epoch": 3.9232718060079623, + "grad_norm": 1.0313376188278198, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 5420 + }, + { + "epoch": 3.9305103148751357, + "grad_norm": 0.8220619559288025, + "learning_rate": 0.0002, + "loss": 1.1872, + "step": 5430 + }, + { + "epoch": 3.937748823742309, + "grad_norm": 0.7576116919517517, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 5440 + }, + { + "epoch": 3.9449873326094824, + "grad_norm": 1.226235032081604, + "learning_rate": 0.0002, + "loss": 1.1969, + "step": 5450 + }, + { + "epoch": 3.952225841476656, + "grad_norm": 0.7979229688644409, + "learning_rate": 0.0002, + "loss": 1.2945, + "step": 5460 + }, + { + "epoch": 3.959464350343829, + "grad_norm": 0.9911929965019226, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 5470 + }, + { + "epoch": 3.9667028592110025, + "grad_norm": 0.643738865852356, + "learning_rate": 0.0002, + "loss": 1.0924, + "step": 5480 + }, + { + "epoch": 3.973941368078176, + "grad_norm": 0.682305634021759, + "learning_rate": 0.0002, + "loss": 1.0607, + "step": 5490 + }, + { + "epoch": 3.9811798769453492, + "grad_norm": 1.18373441696167, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 5500 + }, + { + "epoch": 3.9884183858125226, + "grad_norm": 0.7190203070640564, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 5510 + }, + { + "epoch": 3.995656894679696, + "grad_norm": 0.7516948580741882, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 5520 + }, + { + "epoch": 4.0, + "eval_loss": 1.4252897500991821, + "eval_runtime": 27.235, + "eval_samples_per_second": 16.009, + "eval_steps_per_second": 2.019, + "step": 5526 + } + ], + "logging_steps": 10, + "max_steps": 11048, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.75387517477847e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4d7b7431bbbe8c9bf29b925bca391a558af5ff8c --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-5526/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad613885e4f267fc04125f1a836d42cfa796bbe12e536f9ee60c955de02cdb5a +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e99bbcd43df1c19d98706c7e3be95c93844c5349 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..93f5b07d9711413b652e622cedfedc2581a14bad --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a1006d59ff8e2780d4cfe18bd27cdb3f74a23ccb598b0a59612a9fdafd67b6f +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba234c4a424a38b21457b82999e51477d6a086ae --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34c12c0218085fe8586d9e929d59be855fc8893669fdddf6a47dce5c47b438eb +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..330e758ffbee81e4ab59edde8112c68541222f1b --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdac20adea3f3dbdec3792643a10befb0d02333cfe6b985cab6bfc8304efd581 +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd1fb4610159d98fa0d65c1ddc6944cc68fd2f16 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8708b5027417181075c393e1f571c167fa59f46aebfcf6dfe740a8679bfb73b7 +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2bfc3a30615d20a2ab2f8a721f687253c4aa449a --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/trainer_state.json @@ -0,0 +1,4903 @@ +{ + "best_metric": 1.4217946529388428, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144", + "epoch": 4.999638074556641, + "eval_steps": 10, + "global_step": 6907, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007238508867173362, + "grad_norm": 1.2523442506790161, + "learning_rate": 0.0002, + "loss": 4.7061, + "step": 10 + }, + { + "epoch": 0.014477017734346724, + "grad_norm": 1.8887330293655396, + "learning_rate": 0.0002, + "loss": 3.3493, + "step": 20 + }, + { + "epoch": 0.021715526601520086, + "grad_norm": 0.9668035507202148, + "learning_rate": 0.0002, + "loss": 2.7585, + "step": 30 + }, + { + "epoch": 0.028954035468693448, + "grad_norm": 2.9167306423187256, + "learning_rate": 0.0002, + "loss": 2.3699, + "step": 40 + }, + { + "epoch": 0.036192544335866814, + "grad_norm": 2.649867296218872, + "learning_rate": 0.0002, + "loss": 2.2679, + "step": 50 + }, + { + "epoch": 0.04343105320304017, + "grad_norm": 1.5120655298233032, + "learning_rate": 0.0002, + "loss": 2.2202, + "step": 60 + }, + { + "epoch": 0.05066956207021354, + "grad_norm": 0.7879868149757385, + "learning_rate": 0.0002, + "loss": 2.2026, + "step": 70 + }, + { + "epoch": 0.057908070937386896, + "grad_norm": 0.7616953253746033, + "learning_rate": 0.0002, + "loss": 1.9447, + "step": 80 + }, + { + "epoch": 0.06514657980456026, + "grad_norm": 1.8809149265289307, + "learning_rate": 0.0002, + "loss": 2.0112, + "step": 90 + }, + { + "epoch": 0.07238508867173363, + "grad_norm": 0.9294016361236572, + "learning_rate": 0.0002, + "loss": 1.8337, + "step": 100 + }, + { + "epoch": 0.07962359753890698, + "grad_norm": 0.7145281434059143, + "learning_rate": 0.0002, + "loss": 1.8419, + "step": 110 + }, + { + "epoch": 0.08686210640608034, + "grad_norm": 0.7564446330070496, + "learning_rate": 0.0002, + "loss": 2.0036, + "step": 120 + }, + { + "epoch": 0.09410061527325371, + "grad_norm": 1.1681925058364868, + "learning_rate": 0.0002, + "loss": 1.9306, + "step": 130 + }, + { + "epoch": 0.10133912414042708, + "grad_norm": 0.6708641648292542, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 140 + }, + { + "epoch": 0.10857763300760044, + "grad_norm": 0.7625647783279419, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 150 + }, + { + "epoch": 0.11581614187477379, + "grad_norm": 0.8463464975357056, + "learning_rate": 0.0002, + "loss": 1.6687, + "step": 160 + }, + { + "epoch": 0.12305465074194716, + "grad_norm": 0.7502335906028748, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 170 + }, + { + "epoch": 0.13029315960912052, + "grad_norm": 0.6929958462715149, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 180 + }, + { + "epoch": 0.1375316684762939, + "grad_norm": 0.6798707842826843, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 190 + }, + { + "epoch": 0.14477017734346725, + "grad_norm": 0.7566508650779724, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 200 + }, + { + "epoch": 0.15200868621064062, + "grad_norm": 0.7196869850158691, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 210 + }, + { + "epoch": 0.15924719507781396, + "grad_norm": 0.8401045799255371, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 220 + }, + { + "epoch": 0.16648570394498732, + "grad_norm": 0.8503773212432861, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 230 + }, + { + "epoch": 0.1737242128121607, + "grad_norm": 0.7183733582496643, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 240 + }, + { + "epoch": 0.18096272167933405, + "grad_norm": 0.7082605957984924, + "learning_rate": 0.0002, + "loss": 1.6693, + "step": 250 + }, + { + "epoch": 0.18820123054650742, + "grad_norm": 0.9386326670646667, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 260 + }, + { + "epoch": 0.19543973941368079, + "grad_norm": 0.7332451939582825, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 270 + }, + { + "epoch": 0.20267824828085415, + "grad_norm": 0.7092869877815247, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 280 + }, + { + "epoch": 0.20991675714802752, + "grad_norm": 0.7256413698196411, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 290 + }, + { + "epoch": 0.21715526601520088, + "grad_norm": 0.6398681402206421, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 300 + }, + { + "epoch": 0.22439377488237422, + "grad_norm": 0.6273287534713745, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 310 + }, + { + "epoch": 0.23163228374954759, + "grad_norm": 0.511648416519165, + "learning_rate": 0.0002, + "loss": 1.5115, + "step": 320 + }, + { + "epoch": 0.23887079261672095, + "grad_norm": 0.8677352070808411, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 330 + }, + { + "epoch": 0.24610930148389432, + "grad_norm": 0.6270743012428284, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 340 + }, + { + "epoch": 0.2533478103510677, + "grad_norm": 0.7980281114578247, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 350 + }, + { + "epoch": 0.26058631921824105, + "grad_norm": 0.632486879825592, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 360 + }, + { + "epoch": 0.2678248280854144, + "grad_norm": 0.6527034640312195, + "learning_rate": 0.0002, + "loss": 1.5175, + "step": 370 + }, + { + "epoch": 0.2750633369525878, + "grad_norm": 0.7672118544578552, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 380 + }, + { + "epoch": 0.28230184581976114, + "grad_norm": 0.6035117506980896, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 390 + }, + { + "epoch": 0.2895403546869345, + "grad_norm": 0.5955103039741516, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 400 + }, + { + "epoch": 0.2967788635541079, + "grad_norm": 0.6015191674232483, + "learning_rate": 0.0002, + "loss": 1.558, + "step": 410 + }, + { + "epoch": 0.30401737242128124, + "grad_norm": 0.6380982398986816, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 420 + }, + { + "epoch": 0.3112558812884546, + "grad_norm": 0.6707863211631775, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 430 + }, + { + "epoch": 0.3184943901556279, + "grad_norm": 0.7010176777839661, + "learning_rate": 0.0002, + "loss": 1.4426, + "step": 440 + }, + { + "epoch": 0.3257328990228013, + "grad_norm": 0.8263739943504333, + "learning_rate": 0.0002, + "loss": 1.5572, + "step": 450 + }, + { + "epoch": 0.33297140788997465, + "grad_norm": 0.7253276109695435, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 460 + }, + { + "epoch": 0.340209916757148, + "grad_norm": 0.5238934755325317, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 470 + }, + { + "epoch": 0.3474484256243214, + "grad_norm": 0.7869495749473572, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 480 + }, + { + "epoch": 0.35468693449149474, + "grad_norm": 0.7485215663909912, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 490 + }, + { + "epoch": 0.3619254433586681, + "grad_norm": 0.5413193106651306, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 500 + }, + { + "epoch": 0.3691639522258415, + "grad_norm": 0.7615048885345459, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 510 + }, + { + "epoch": 0.37640246109301484, + "grad_norm": 0.7685340046882629, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 520 + }, + { + "epoch": 0.3836409699601882, + "grad_norm": 0.6379081010818481, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 530 + }, + { + "epoch": 0.39087947882736157, + "grad_norm": 0.7946939468383789, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 540 + }, + { + "epoch": 0.39811798769453494, + "grad_norm": 0.6287278532981873, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 550 + }, + { + "epoch": 0.4053564965617083, + "grad_norm": 0.6811642646789551, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 560 + }, + { + "epoch": 0.41259500542888167, + "grad_norm": 0.671073317527771, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 570 + }, + { + "epoch": 0.41983351429605503, + "grad_norm": 0.6313900351524353, + "learning_rate": 0.0002, + "loss": 1.6753, + "step": 580 + }, + { + "epoch": 0.4270720231632284, + "grad_norm": 0.5291772484779358, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 590 + }, + { + "epoch": 0.43431053203040176, + "grad_norm": 0.62503582239151, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 600 + }, + { + "epoch": 0.4415490408975751, + "grad_norm": 0.5777305364608765, + "learning_rate": 0.0002, + "loss": 1.6276, + "step": 610 + }, + { + "epoch": 0.44878754976474844, + "grad_norm": 0.7013497352600098, + "learning_rate": 0.0002, + "loss": 1.4758, + "step": 620 + }, + { + "epoch": 0.4560260586319218, + "grad_norm": 0.8044822216033936, + "learning_rate": 0.0002, + "loss": 1.4029, + "step": 630 + }, + { + "epoch": 0.46326456749909517, + "grad_norm": 0.672531247138977, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 640 + }, + { + "epoch": 0.47050307636626854, + "grad_norm": 0.6233910322189331, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 650 + }, + { + "epoch": 0.4777415852334419, + "grad_norm": 0.651524543762207, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 660 + }, + { + "epoch": 0.48498009410061527, + "grad_norm": 0.7213939428329468, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 670 + }, + { + "epoch": 0.49221860296778863, + "grad_norm": 0.6541454792022705, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 680 + }, + { + "epoch": 0.499457111834962, + "grad_norm": 0.6568936109542847, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 690 + }, + { + "epoch": 0.5066956207021354, + "grad_norm": 0.7176415324211121, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 700 + }, + { + "epoch": 0.5139341295693087, + "grad_norm": 0.6553855538368225, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 710 + }, + { + "epoch": 0.5211726384364821, + "grad_norm": 0.5654335618019104, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 720 + }, + { + "epoch": 0.5284111473036555, + "grad_norm": 0.5671001672744751, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 730 + }, + { + "epoch": 0.5356496561708288, + "grad_norm": 0.7914412021636963, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 740 + }, + { + "epoch": 0.5428881650380022, + "grad_norm": 0.6172138452529907, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 750 + }, + { + "epoch": 0.5501266739051756, + "grad_norm": 0.6132623553276062, + "learning_rate": 0.0002, + "loss": 1.4018, + "step": 760 + }, + { + "epoch": 0.5573651827723489, + "grad_norm": 0.654000461101532, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 770 + }, + { + "epoch": 0.5646036916395223, + "grad_norm": 0.5691370964050293, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 780 + }, + { + "epoch": 0.5718422005066957, + "grad_norm": 0.7922580242156982, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 790 + }, + { + "epoch": 0.579080709373869, + "grad_norm": 0.6831880211830139, + "learning_rate": 0.0002, + "loss": 1.4521, + "step": 800 + }, + { + "epoch": 0.5863192182410424, + "grad_norm": 0.6740124821662903, + "learning_rate": 0.0002, + "loss": 1.4734, + "step": 810 + }, + { + "epoch": 0.5935577271082157, + "grad_norm": 1.380016803741455, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 820 + }, + { + "epoch": 0.6007962359753891, + "grad_norm": 0.6552878022193909, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 830 + }, + { + "epoch": 0.6080347448425625, + "grad_norm": 0.6649535298347473, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 840 + }, + { + "epoch": 0.6152732537097358, + "grad_norm": 0.561738133430481, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 850 + }, + { + "epoch": 0.6225117625769092, + "grad_norm": 0.6133047938346863, + "learning_rate": 0.0002, + "loss": 1.5364, + "step": 860 + }, + { + "epoch": 0.6297502714440825, + "grad_norm": 0.559843122959137, + "learning_rate": 0.0002, + "loss": 1.3489, + "step": 870 + }, + { + "epoch": 0.6369887803112558, + "grad_norm": 0.6117811799049377, + "learning_rate": 0.0002, + "loss": 1.4878, + "step": 880 + }, + { + "epoch": 0.6442272891784292, + "grad_norm": 0.6209776401519775, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 890 + }, + { + "epoch": 0.6514657980456026, + "grad_norm": 0.6234082579612732, + "learning_rate": 0.0002, + "loss": 1.6747, + "step": 900 + }, + { + "epoch": 0.6587043069127759, + "grad_norm": 0.7623258233070374, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 910 + }, + { + "epoch": 0.6659428157799493, + "grad_norm": 0.6148061752319336, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 920 + }, + { + "epoch": 0.6731813246471227, + "grad_norm": 0.6682973504066467, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 930 + }, + { + "epoch": 0.680419833514296, + "grad_norm": 0.5513041615486145, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 940 + }, + { + "epoch": 0.6876583423814694, + "grad_norm": 0.5197525024414062, + "learning_rate": 0.0002, + "loss": 1.3991, + "step": 950 + }, + { + "epoch": 0.6948968512486428, + "grad_norm": 0.6490758061408997, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 960 + }, + { + "epoch": 0.7021353601158161, + "grad_norm": 0.6450682878494263, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 970 + }, + { + "epoch": 0.7093738689829895, + "grad_norm": 0.6203766465187073, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 980 + }, + { + "epoch": 0.7166123778501629, + "grad_norm": 0.6023609638214111, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 990 + }, + { + "epoch": 0.7238508867173362, + "grad_norm": 0.5765255093574524, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1000 + }, + { + "epoch": 0.7310893955845096, + "grad_norm": 0.6650075316429138, + "learning_rate": 0.0002, + "loss": 1.483, + "step": 1010 + }, + { + "epoch": 0.738327904451683, + "grad_norm": 0.5610854029655457, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 1020 + }, + { + "epoch": 0.7455664133188563, + "grad_norm": 0.7072813510894775, + "learning_rate": 0.0002, + "loss": 1.5248, + "step": 1030 + }, + { + "epoch": 0.7528049221860297, + "grad_norm": 0.6815407872200012, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 1040 + }, + { + "epoch": 0.760043431053203, + "grad_norm": 0.7932390570640564, + "learning_rate": 0.0002, + "loss": 1.4577, + "step": 1050 + }, + { + "epoch": 0.7672819399203764, + "grad_norm": 0.5798183083534241, + "learning_rate": 0.0002, + "loss": 1.4515, + "step": 1060 + }, + { + "epoch": 0.7745204487875498, + "grad_norm": 0.7898504137992859, + "learning_rate": 0.0002, + "loss": 1.5053, + "step": 1070 + }, + { + "epoch": 0.7817589576547231, + "grad_norm": 0.4983280301094055, + "learning_rate": 0.0002, + "loss": 1.4776, + "step": 1080 + }, + { + "epoch": 0.7889974665218965, + "grad_norm": 0.691403329372406, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 1090 + }, + { + "epoch": 0.7962359753890699, + "grad_norm": 0.5394481420516968, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 1100 + }, + { + "epoch": 0.8034744842562432, + "grad_norm": 0.5136822462081909, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 1110 + }, + { + "epoch": 0.8107129931234166, + "grad_norm": 0.6828126907348633, + "learning_rate": 0.0002, + "loss": 1.4902, + "step": 1120 + }, + { + "epoch": 0.81795150199059, + "grad_norm": 0.6799656748771667, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 1130 + }, + { + "epoch": 0.8251900108577633, + "grad_norm": 0.5428406000137329, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 1140 + }, + { + "epoch": 0.8324285197249367, + "grad_norm": 0.4811290502548218, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1150 + }, + { + "epoch": 0.8396670285921101, + "grad_norm": 0.5519434809684753, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 1160 + }, + { + "epoch": 0.8469055374592834, + "grad_norm": 0.9748060703277588, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1170 + }, + { + "epoch": 0.8541440463264568, + "grad_norm": 0.712609589099884, + "learning_rate": 0.0002, + "loss": 1.4309, + "step": 1180 + }, + { + "epoch": 0.8613825551936302, + "grad_norm": 0.6866157054901123, + "learning_rate": 0.0002, + "loss": 1.434, + "step": 1190 + }, + { + "epoch": 0.8686210640608035, + "grad_norm": 0.5068854093551636, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1200 + }, + { + "epoch": 0.8758595729279768, + "grad_norm": 0.6333245038986206, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1210 + }, + { + "epoch": 0.8830980817951501, + "grad_norm": 0.6424421072006226, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 1220 + }, + { + "epoch": 0.8903365906623235, + "grad_norm": 0.4771921932697296, + "learning_rate": 0.0002, + "loss": 1.4186, + "step": 1230 + }, + { + "epoch": 0.8975750995294969, + "grad_norm": 0.5191764235496521, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1240 + }, + { + "epoch": 0.9048136083966702, + "grad_norm": 0.756222128868103, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 1250 + }, + { + "epoch": 0.9120521172638436, + "grad_norm": 0.623823881149292, + "learning_rate": 0.0002, + "loss": 1.4396, + "step": 1260 + }, + { + "epoch": 0.919290626131017, + "grad_norm": 0.8166571259498596, + "learning_rate": 0.0002, + "loss": 1.3097, + "step": 1270 + }, + { + "epoch": 0.9265291349981903, + "grad_norm": 0.6059346795082092, + "learning_rate": 0.0002, + "loss": 1.4625, + "step": 1280 + }, + { + "epoch": 0.9337676438653637, + "grad_norm": 0.5842690467834473, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 1290 + }, + { + "epoch": 0.9410061527325371, + "grad_norm": 0.7649800777435303, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1300 + }, + { + "epoch": 0.9482446615997104, + "grad_norm": 0.6420919895172119, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 1310 + }, + { + "epoch": 0.9554831704668838, + "grad_norm": 0.7011452913284302, + "learning_rate": 0.0002, + "loss": 1.453, + "step": 1320 + }, + { + "epoch": 0.9627216793340572, + "grad_norm": 0.5783746242523193, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1330 + }, + { + "epoch": 0.9699601882012305, + "grad_norm": 0.5973192453384399, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1340 + }, + { + "epoch": 0.9771986970684039, + "grad_norm": 0.6181833744049072, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 1350 + }, + { + "epoch": 0.9844372059355773, + "grad_norm": 0.5563396215438843, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 1360 + }, + { + "epoch": 0.9916757148027506, + "grad_norm": 0.45723360776901245, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1370 + }, + { + "epoch": 0.998914223669924, + "grad_norm": 0.5947498679161072, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 1380 + }, + { + "epoch": 0.9996380745566413, + "eval_loss": 1.480796456336975, + "eval_runtime": 27.3103, + "eval_samples_per_second": 15.965, + "eval_steps_per_second": 2.014, + "step": 1381 + }, + { + "epoch": 1.0061527325370974, + "grad_norm": 0.5599952936172485, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 1390 + }, + { + "epoch": 1.0133912414042707, + "grad_norm": 0.5932008028030396, + "learning_rate": 0.0002, + "loss": 1.4991, + "step": 1400 + }, + { + "epoch": 1.020629750271444, + "grad_norm": 0.6194121837615967, + "learning_rate": 0.0002, + "loss": 1.4506, + "step": 1410 + }, + { + "epoch": 1.0278682591386175, + "grad_norm": 0.6995621919631958, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 1420 + }, + { + "epoch": 1.0351067680057908, + "grad_norm": 0.7905810475349426, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1430 + }, + { + "epoch": 1.0423452768729642, + "grad_norm": 0.7221615314483643, + "learning_rate": 0.0002, + "loss": 1.4414, + "step": 1440 + }, + { + "epoch": 1.0495837857401376, + "grad_norm": 0.6170642375946045, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 1450 + }, + { + "epoch": 1.056822294607311, + "grad_norm": 0.5844094753265381, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 1460 + }, + { + "epoch": 1.0640608034744843, + "grad_norm": 0.7731822729110718, + "learning_rate": 0.0002, + "loss": 1.4871, + "step": 1470 + }, + { + "epoch": 1.0712993123416577, + "grad_norm": 0.4554748237133026, + "learning_rate": 0.0002, + "loss": 1.4286, + "step": 1480 + }, + { + "epoch": 1.078537821208831, + "grad_norm": 0.6923259496688843, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 1490 + }, + { + "epoch": 1.0857763300760044, + "grad_norm": 0.6008219122886658, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 1500 + }, + { + "epoch": 1.0930148389431777, + "grad_norm": 0.6450045704841614, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 1510 + }, + { + "epoch": 1.1002533478103511, + "grad_norm": 0.7833753824234009, + "learning_rate": 0.0002, + "loss": 1.3295, + "step": 1520 + }, + { + "epoch": 1.1074918566775245, + "grad_norm": 0.5076758861541748, + "learning_rate": 0.0002, + "loss": 1.3424, + "step": 1530 + }, + { + "epoch": 1.1147303655446978, + "grad_norm": 0.5661332011222839, + "learning_rate": 0.0002, + "loss": 1.4043, + "step": 1540 + }, + { + "epoch": 1.1219688744118712, + "grad_norm": 0.6526919603347778, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 1550 + }, + { + "epoch": 1.1292073832790446, + "grad_norm": 0.5613082647323608, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1560 + }, + { + "epoch": 1.136445892146218, + "grad_norm": 0.6113885641098022, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 1570 + }, + { + "epoch": 1.1436844010133913, + "grad_norm": 0.6732510328292847, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 1580 + }, + { + "epoch": 1.1509229098805647, + "grad_norm": 0.6146392226219177, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 1590 + }, + { + "epoch": 1.158161418747738, + "grad_norm": 0.6766974329948425, + "learning_rate": 0.0002, + "loss": 1.411, + "step": 1600 + }, + { + "epoch": 1.1653999276149114, + "grad_norm": 0.7621957659721375, + "learning_rate": 0.0002, + "loss": 1.2401, + "step": 1610 + }, + { + "epoch": 1.1726384364820848, + "grad_norm": 0.6959581971168518, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 1620 + }, + { + "epoch": 1.1798769453492581, + "grad_norm": 0.6691278219223022, + "learning_rate": 0.0002, + "loss": 1.382, + "step": 1630 + }, + { + "epoch": 1.1871154542164315, + "grad_norm": 0.4927774965763092, + "learning_rate": 0.0002, + "loss": 1.4147, + "step": 1640 + }, + { + "epoch": 1.1943539630836049, + "grad_norm": 0.7724234461784363, + "learning_rate": 0.0002, + "loss": 1.449, + "step": 1650 + }, + { + "epoch": 1.2015924719507782, + "grad_norm": 0.6817787885665894, + "learning_rate": 0.0002, + "loss": 1.4778, + "step": 1660 + }, + { + "epoch": 1.2088309808179516, + "grad_norm": 0.6500699520111084, + "learning_rate": 0.0002, + "loss": 1.3776, + "step": 1670 + }, + { + "epoch": 1.216069489685125, + "grad_norm": 0.5703568458557129, + "learning_rate": 0.0002, + "loss": 1.3875, + "step": 1680 + }, + { + "epoch": 1.2233079985522983, + "grad_norm": 0.6261579990386963, + "learning_rate": 0.0002, + "loss": 1.4735, + "step": 1690 + }, + { + "epoch": 1.2305465074194717, + "grad_norm": 0.651713490486145, + "learning_rate": 0.0002, + "loss": 1.3898, + "step": 1700 + }, + { + "epoch": 1.237785016286645, + "grad_norm": 0.684399425983429, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 1710 + }, + { + "epoch": 1.2450235251538184, + "grad_norm": 0.6996857523918152, + "learning_rate": 0.0002, + "loss": 1.5027, + "step": 1720 + }, + { + "epoch": 1.2522620340209918, + "grad_norm": 0.7102537751197815, + "learning_rate": 0.0002, + "loss": 1.3326, + "step": 1730 + }, + { + "epoch": 1.2595005428881652, + "grad_norm": 0.45809897780418396, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 1740 + }, + { + "epoch": 1.2667390517553385, + "grad_norm": 0.6377046704292297, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 1750 + }, + { + "epoch": 1.2739775606225119, + "grad_norm": 0.6965704560279846, + "learning_rate": 0.0002, + "loss": 1.3479, + "step": 1760 + }, + { + "epoch": 1.2812160694896852, + "grad_norm": 0.5688214302062988, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 1770 + }, + { + "epoch": 1.2884545783568586, + "grad_norm": 0.6384190320968628, + "learning_rate": 0.0002, + "loss": 1.3967, + "step": 1780 + }, + { + "epoch": 1.295693087224032, + "grad_norm": 0.5629363656044006, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1790 + }, + { + "epoch": 1.3029315960912053, + "grad_norm": 0.6148255467414856, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 1800 + }, + { + "epoch": 1.3101701049583787, + "grad_norm": 0.655580997467041, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 1810 + }, + { + "epoch": 1.3174086138255519, + "grad_norm": 0.5642657279968262, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 1820 + }, + { + "epoch": 1.3246471226927252, + "grad_norm": 0.59607994556427, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 1830 + }, + { + "epoch": 1.3318856315598986, + "grad_norm": 0.5564199090003967, + "learning_rate": 0.0002, + "loss": 1.3274, + "step": 1840 + }, + { + "epoch": 1.339124140427072, + "grad_norm": 0.6949955821037292, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1850 + }, + { + "epoch": 1.3463626492942453, + "grad_norm": 0.7036856412887573, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 1860 + }, + { + "epoch": 1.3536011581614187, + "grad_norm": 0.722062885761261, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 1870 + }, + { + "epoch": 1.360839667028592, + "grad_norm": 0.6098677515983582, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 1880 + }, + { + "epoch": 1.3680781758957654, + "grad_norm": 0.5376402735710144, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 1890 + }, + { + "epoch": 1.3753166847629388, + "grad_norm": 0.6974610090255737, + "learning_rate": 0.0002, + "loss": 1.5071, + "step": 1900 + }, + { + "epoch": 1.3825551936301121, + "grad_norm": 0.6520763635635376, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 1910 + }, + { + "epoch": 1.3897937024972855, + "grad_norm": 0.6604374647140503, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 1920 + }, + { + "epoch": 1.3970322113644589, + "grad_norm": 0.7364398241043091, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1930 + }, + { + "epoch": 1.4042707202316322, + "grad_norm": 0.6849475502967834, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 1940 + }, + { + "epoch": 1.4115092290988056, + "grad_norm": 0.6562670469284058, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 1950 + }, + { + "epoch": 1.418747737965979, + "grad_norm": 0.5695616006851196, + "learning_rate": 0.0002, + "loss": 1.4725, + "step": 1960 + }, + { + "epoch": 1.4259862468331523, + "grad_norm": 0.5244464874267578, + "learning_rate": 0.0002, + "loss": 1.3088, + "step": 1970 + }, + { + "epoch": 1.4332247557003257, + "grad_norm": 0.6347293257713318, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 1980 + }, + { + "epoch": 1.440463264567499, + "grad_norm": 0.5528361201286316, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 1990 + }, + { + "epoch": 1.4477017734346724, + "grad_norm": 0.6987585425376892, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2000 + }, + { + "epoch": 1.4549402823018458, + "grad_norm": 0.6568987369537354, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 2010 + }, + { + "epoch": 1.4621787911690192, + "grad_norm": 0.7665994763374329, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 2020 + }, + { + "epoch": 1.4694173000361925, + "grad_norm": 0.5127707123756409, + "learning_rate": 0.0002, + "loss": 1.244, + "step": 2030 + }, + { + "epoch": 1.476655808903366, + "grad_norm": 0.5406824946403503, + "learning_rate": 0.0002, + "loss": 1.3699, + "step": 2040 + }, + { + "epoch": 1.4838943177705393, + "grad_norm": 0.5990166664123535, + "learning_rate": 0.0002, + "loss": 1.3353, + "step": 2050 + }, + { + "epoch": 1.4911328266377126, + "grad_norm": 0.6186193823814392, + "learning_rate": 0.0002, + "loss": 1.2454, + "step": 2060 + }, + { + "epoch": 1.498371335504886, + "grad_norm": 0.6154307126998901, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2070 + }, + { + "epoch": 1.5056098443720594, + "grad_norm": 0.5606056451797485, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2080 + }, + { + "epoch": 1.5128483532392327, + "grad_norm": 0.5006417036056519, + "learning_rate": 0.0002, + "loss": 1.2405, + "step": 2090 + }, + { + "epoch": 1.520086862106406, + "grad_norm": 0.5968486070632935, + "learning_rate": 0.0002, + "loss": 1.4258, + "step": 2100 + }, + { + "epoch": 1.5273253709735795, + "grad_norm": 0.5835496187210083, + "learning_rate": 0.0002, + "loss": 1.2752, + "step": 2110 + }, + { + "epoch": 1.5345638798407528, + "grad_norm": 0.6753535270690918, + "learning_rate": 0.0002, + "loss": 1.5443, + "step": 2120 + }, + { + "epoch": 1.5418023887079262, + "grad_norm": 0.7299720644950867, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 2130 + }, + { + "epoch": 1.5490408975750996, + "grad_norm": 0.5105988383293152, + "learning_rate": 0.0002, + "loss": 1.2364, + "step": 2140 + }, + { + "epoch": 1.556279406442273, + "grad_norm": 0.5675431489944458, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2150 + }, + { + "epoch": 1.5635179153094463, + "grad_norm": 0.6246723532676697, + "learning_rate": 0.0002, + "loss": 1.4563, + "step": 2160 + }, + { + "epoch": 1.5707564241766196, + "grad_norm": 0.7291720509529114, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 2170 + }, + { + "epoch": 1.577994933043793, + "grad_norm": 0.678114116191864, + "learning_rate": 0.0002, + "loss": 1.5432, + "step": 2180 + }, + { + "epoch": 1.5852334419109664, + "grad_norm": 0.5136260986328125, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 2190 + }, + { + "epoch": 1.5924719507781397, + "grad_norm": 0.6359935998916626, + "learning_rate": 0.0002, + "loss": 1.3271, + "step": 2200 + }, + { + "epoch": 1.599710459645313, + "grad_norm": 0.7650278806686401, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 2210 + }, + { + "epoch": 1.6069489685124865, + "grad_norm": 0.7256110906600952, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 2220 + }, + { + "epoch": 1.6141874773796598, + "grad_norm": 0.688689649105072, + "learning_rate": 0.0002, + "loss": 1.4387, + "step": 2230 + }, + { + "epoch": 1.6214259862468332, + "grad_norm": 0.6045311093330383, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 2240 + }, + { + "epoch": 1.6286644951140063, + "grad_norm": 0.7064604163169861, + "learning_rate": 0.0002, + "loss": 1.4097, + "step": 2250 + }, + { + "epoch": 1.6359030039811797, + "grad_norm": 0.5309562087059021, + "learning_rate": 0.0002, + "loss": 1.3477, + "step": 2260 + }, + { + "epoch": 1.643141512848353, + "grad_norm": 0.5687053203582764, + "learning_rate": 0.0002, + "loss": 1.4022, + "step": 2270 + }, + { + "epoch": 1.6503800217155264, + "grad_norm": 0.535872757434845, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 2280 + }, + { + "epoch": 1.6576185305826998, + "grad_norm": 0.5502381920814514, + "learning_rate": 0.0002, + "loss": 1.3844, + "step": 2290 + }, + { + "epoch": 1.6648570394498732, + "grad_norm": 0.6158602237701416, + "learning_rate": 0.0002, + "loss": 1.3764, + "step": 2300 + }, + { + "epoch": 1.6720955483170465, + "grad_norm": 0.5804675817489624, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 2310 + }, + { + "epoch": 1.67933405718422, + "grad_norm": 0.600742757320404, + "learning_rate": 0.0002, + "loss": 1.2532, + "step": 2320 + }, + { + "epoch": 1.6865725660513933, + "grad_norm": 0.7101941108703613, + "learning_rate": 0.0002, + "loss": 1.477, + "step": 2330 + }, + { + "epoch": 1.6938110749185666, + "grad_norm": 0.7507809996604919, + "learning_rate": 0.0002, + "loss": 1.4849, + "step": 2340 + }, + { + "epoch": 1.70104958378574, + "grad_norm": 0.768502414226532, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 2350 + }, + { + "epoch": 1.7082880926529134, + "grad_norm": 0.4801851212978363, + "learning_rate": 0.0002, + "loss": 1.3332, + "step": 2360 + }, + { + "epoch": 1.7155266015200867, + "grad_norm": 0.5322122573852539, + "learning_rate": 0.0002, + "loss": 1.4158, + "step": 2370 + }, + { + "epoch": 1.72276511038726, + "grad_norm": 0.587661862373352, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2380 + }, + { + "epoch": 1.7300036192544335, + "grad_norm": 0.6073525547981262, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 2390 + }, + { + "epoch": 1.7372421281216068, + "grad_norm": 0.6950460076332092, + "learning_rate": 0.0002, + "loss": 1.2754, + "step": 2400 + }, + { + "epoch": 1.7444806369887802, + "grad_norm": 0.5981102585792542, + "learning_rate": 0.0002, + "loss": 1.3858, + "step": 2410 + }, + { + "epoch": 1.7517191458559536, + "grad_norm": 0.544570803642273, + "learning_rate": 0.0002, + "loss": 1.4075, + "step": 2420 + }, + { + "epoch": 1.758957654723127, + "grad_norm": 0.5304399728775024, + "learning_rate": 0.0002, + "loss": 1.3861, + "step": 2430 + }, + { + "epoch": 1.7661961635903003, + "grad_norm": 0.7921594977378845, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 2440 + }, + { + "epoch": 1.7734346724574737, + "grad_norm": 0.6084808707237244, + "learning_rate": 0.0002, + "loss": 1.3053, + "step": 2450 + }, + { + "epoch": 1.780673181324647, + "grad_norm": 0.8844701051712036, + "learning_rate": 0.0002, + "loss": 1.3781, + "step": 2460 + }, + { + "epoch": 1.7879116901918204, + "grad_norm": 0.5729258060455322, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 2470 + }, + { + "epoch": 1.7951501990589938, + "grad_norm": 0.6303611993789673, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 2480 + }, + { + "epoch": 1.8023887079261671, + "grad_norm": 0.5627942085266113, + "learning_rate": 0.0002, + "loss": 1.3926, + "step": 2490 + }, + { + "epoch": 1.8096272167933405, + "grad_norm": 0.6724274158477783, + "learning_rate": 0.0002, + "loss": 1.3816, + "step": 2500 + }, + { + "epoch": 1.8168657256605139, + "grad_norm": 0.5030826330184937, + "learning_rate": 0.0002, + "loss": 1.2951, + "step": 2510 + }, + { + "epoch": 1.8241042345276872, + "grad_norm": 0.5504099130630493, + "learning_rate": 0.0002, + "loss": 1.2839, + "step": 2520 + }, + { + "epoch": 1.8313427433948606, + "grad_norm": 0.6338945627212524, + "learning_rate": 0.0002, + "loss": 1.4264, + "step": 2530 + }, + { + "epoch": 1.838581252262034, + "grad_norm": 0.5902037620544434, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 2540 + }, + { + "epoch": 1.8458197611292073, + "grad_norm": 0.48814457654953003, + "learning_rate": 0.0002, + "loss": 1.2961, + "step": 2550 + }, + { + "epoch": 1.8530582699963807, + "grad_norm": 0.6216312646865845, + "learning_rate": 0.0002, + "loss": 1.466, + "step": 2560 + }, + { + "epoch": 1.860296778863554, + "grad_norm": 0.635603666305542, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 2570 + }, + { + "epoch": 1.8675352877307274, + "grad_norm": 0.6938216090202332, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2580 + }, + { + "epoch": 1.8747737965979008, + "grad_norm": 0.599557638168335, + "learning_rate": 0.0002, + "loss": 1.5011, + "step": 2590 + }, + { + "epoch": 1.8820123054650741, + "grad_norm": 0.564424455165863, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 2600 + }, + { + "epoch": 1.8892508143322475, + "grad_norm": 0.5430700182914734, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 2610 + }, + { + "epoch": 1.8964893231994209, + "grad_norm": 0.6150169372558594, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 2620 + }, + { + "epoch": 1.9037278320665942, + "grad_norm": 0.48159119486808777, + "learning_rate": 0.0002, + "loss": 1.2474, + "step": 2630 + }, + { + "epoch": 1.9109663409337676, + "grad_norm": 0.5608997941017151, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 2640 + }, + { + "epoch": 1.918204849800941, + "grad_norm": 0.6454501748085022, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 2650 + }, + { + "epoch": 1.9254433586681143, + "grad_norm": 0.5458073616027832, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 2660 + }, + { + "epoch": 1.9326818675352877, + "grad_norm": 0.5328490734100342, + "learning_rate": 0.0002, + "loss": 1.3208, + "step": 2670 + }, + { + "epoch": 1.939920376402461, + "grad_norm": 0.6444696187973022, + "learning_rate": 0.0002, + "loss": 1.4971, + "step": 2680 + }, + { + "epoch": 1.9471588852696344, + "grad_norm": 0.7126023769378662, + "learning_rate": 0.0002, + "loss": 1.5387, + "step": 2690 + }, + { + "epoch": 1.9543973941368078, + "grad_norm": 0.5164045095443726, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 2700 + }, + { + "epoch": 1.9616359030039812, + "grad_norm": 0.5347061157226562, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 2710 + }, + { + "epoch": 1.9688744118711545, + "grad_norm": 0.5297950506210327, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 2720 + }, + { + "epoch": 1.976112920738328, + "grad_norm": 0.6537790298461914, + "learning_rate": 0.0002, + "loss": 1.3566, + "step": 2730 + }, + { + "epoch": 1.9833514296055013, + "grad_norm": 0.5536222457885742, + "learning_rate": 0.0002, + "loss": 1.332, + "step": 2740 + }, + { + "epoch": 1.9905899384726746, + "grad_norm": 0.4856105446815491, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 2750 + }, + { + "epoch": 1.997828447339848, + "grad_norm": 0.6642730832099915, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 2760 + }, + { + "epoch": 2.0, + "eval_loss": 1.4366681575775146, + "eval_runtime": 27.3729, + "eval_samples_per_second": 15.928, + "eval_steps_per_second": 2.009, + "step": 2763 + }, + { + "epoch": 2.0050669562070214, + "grad_norm": 0.740253210067749, + "learning_rate": 0.0002, + "loss": 1.4322, + "step": 2770 + }, + { + "epoch": 2.0123054650741947, + "grad_norm": 0.5826276540756226, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 2780 + }, + { + "epoch": 2.019543973941368, + "grad_norm": 0.607356071472168, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 2790 + }, + { + "epoch": 2.0267824828085415, + "grad_norm": 0.5918063521385193, + "learning_rate": 0.0002, + "loss": 1.2601, + "step": 2800 + }, + { + "epoch": 2.034020991675715, + "grad_norm": 0.5610089898109436, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 2810 + }, + { + "epoch": 2.041259500542888, + "grad_norm": 0.5869926810264587, + "learning_rate": 0.0002, + "loss": 1.2092, + "step": 2820 + }, + { + "epoch": 2.0484980094100615, + "grad_norm": 0.5753467679023743, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 2830 + }, + { + "epoch": 2.055736518277235, + "grad_norm": 0.7096508145332336, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 2840 + }, + { + "epoch": 2.0629750271444083, + "grad_norm": 0.7653635144233704, + "learning_rate": 0.0002, + "loss": 1.1766, + "step": 2850 + }, + { + "epoch": 2.0702135360115816, + "grad_norm": 0.6202841997146606, + "learning_rate": 0.0002, + "loss": 1.2331, + "step": 2860 + }, + { + "epoch": 2.077452044878755, + "grad_norm": 0.6810227632522583, + "learning_rate": 0.0002, + "loss": 1.3298, + "step": 2870 + }, + { + "epoch": 2.0846905537459284, + "grad_norm": 0.7481493353843689, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 2880 + }, + { + "epoch": 2.0919290626131017, + "grad_norm": 0.7089637517929077, + "learning_rate": 0.0002, + "loss": 1.2484, + "step": 2890 + }, + { + "epoch": 2.099167571480275, + "grad_norm": 0.7472923398017883, + "learning_rate": 0.0002, + "loss": 1.3095, + "step": 2900 + }, + { + "epoch": 2.1064060803474485, + "grad_norm": 0.8135465979576111, + "learning_rate": 0.0002, + "loss": 1.304, + "step": 2910 + }, + { + "epoch": 2.113644589214622, + "grad_norm": 0.6097133159637451, + "learning_rate": 0.0002, + "loss": 1.273, + "step": 2920 + }, + { + "epoch": 2.120883098081795, + "grad_norm": 0.5970117449760437, + "learning_rate": 0.0002, + "loss": 1.3384, + "step": 2930 + }, + { + "epoch": 2.1281216069489686, + "grad_norm": 0.6169309616088867, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 2940 + }, + { + "epoch": 2.135360115816142, + "grad_norm": 0.9428738355636597, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 2950 + }, + { + "epoch": 2.1425986246833153, + "grad_norm": 0.5671679973602295, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 2960 + }, + { + "epoch": 2.1498371335504887, + "grad_norm": 0.7007262110710144, + "learning_rate": 0.0002, + "loss": 1.1375, + "step": 2970 + }, + { + "epoch": 2.157075642417662, + "grad_norm": 0.6294044256210327, + "learning_rate": 0.0002, + "loss": 1.2015, + "step": 2980 + }, + { + "epoch": 2.1643141512848354, + "grad_norm": 0.6105241775512695, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 2990 + }, + { + "epoch": 2.1715526601520088, + "grad_norm": 0.557124137878418, + "learning_rate": 0.0002, + "loss": 1.2065, + "step": 3000 + }, + { + "epoch": 2.178791169019182, + "grad_norm": 0.6250392198562622, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 3010 + }, + { + "epoch": 2.1860296778863555, + "grad_norm": 0.645218551158905, + "learning_rate": 0.0002, + "loss": 1.385, + "step": 3020 + }, + { + "epoch": 2.193268186753529, + "grad_norm": 0.9033605456352234, + "learning_rate": 0.0002, + "loss": 1.3928, + "step": 3030 + }, + { + "epoch": 2.2005066956207022, + "grad_norm": 0.5325747132301331, + "learning_rate": 0.0002, + "loss": 1.2458, + "step": 3040 + }, + { + "epoch": 2.2077452044878756, + "grad_norm": 0.6334700584411621, + "learning_rate": 0.0002, + "loss": 1.261, + "step": 3050 + }, + { + "epoch": 2.214983713355049, + "grad_norm": 0.5206325054168701, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 3060 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5987200140953064, + "learning_rate": 0.0002, + "loss": 1.3103, + "step": 3070 + }, + { + "epoch": 2.2294607310893957, + "grad_norm": 0.5893264412879944, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 3080 + }, + { + "epoch": 2.236699239956569, + "grad_norm": 0.6869237422943115, + "learning_rate": 0.0002, + "loss": 1.235, + "step": 3090 + }, + { + "epoch": 2.2439377488237424, + "grad_norm": 0.5040048360824585, + "learning_rate": 0.0002, + "loss": 1.3285, + "step": 3100 + }, + { + "epoch": 2.251176257690916, + "grad_norm": 0.6660613417625427, + "learning_rate": 0.0002, + "loss": 1.3316, + "step": 3110 + }, + { + "epoch": 2.258414766558089, + "grad_norm": 0.5890918970108032, + "learning_rate": 0.0002, + "loss": 1.3108, + "step": 3120 + }, + { + "epoch": 2.2656532754252625, + "grad_norm": 0.6458896994590759, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 3130 + }, + { + "epoch": 2.272891784292436, + "grad_norm": 0.6832690834999084, + "learning_rate": 0.0002, + "loss": 1.4151, + "step": 3140 + }, + { + "epoch": 2.2801302931596092, + "grad_norm": 0.833908200263977, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 3150 + }, + { + "epoch": 2.2873688020267826, + "grad_norm": 0.4596034586429596, + "learning_rate": 0.0002, + "loss": 1.2931, + "step": 3160 + }, + { + "epoch": 2.294607310893956, + "grad_norm": 0.9130966067314148, + "learning_rate": 0.0002, + "loss": 1.449, + "step": 3170 + }, + { + "epoch": 2.3018458197611293, + "grad_norm": 0.7143292427062988, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 3180 + }, + { + "epoch": 2.3090843286283027, + "grad_norm": 0.5388900637626648, + "learning_rate": 0.0002, + "loss": 1.2692, + "step": 3190 + }, + { + "epoch": 2.316322837495476, + "grad_norm": 0.5607513189315796, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 3200 + }, + { + "epoch": 2.3235613463626494, + "grad_norm": 0.6795142292976379, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 3210 + }, + { + "epoch": 2.330799855229823, + "grad_norm": 0.6561070680618286, + "learning_rate": 0.0002, + "loss": 1.3042, + "step": 3220 + }, + { + "epoch": 2.338038364096996, + "grad_norm": 0.8858118057250977, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 3230 + }, + { + "epoch": 2.3452768729641695, + "grad_norm": 0.6604151725769043, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3240 + }, + { + "epoch": 2.352515381831343, + "grad_norm": 0.6755785346031189, + "learning_rate": 0.0002, + "loss": 1.4004, + "step": 3250 + }, + { + "epoch": 2.3597538906985163, + "grad_norm": 0.6981677412986755, + "learning_rate": 0.0002, + "loss": 1.2503, + "step": 3260 + }, + { + "epoch": 2.3669923995656896, + "grad_norm": 0.6338568329811096, + "learning_rate": 0.0002, + "loss": 1.3078, + "step": 3270 + }, + { + "epoch": 2.374230908432863, + "grad_norm": 0.5754265785217285, + "learning_rate": 0.0002, + "loss": 1.285, + "step": 3280 + }, + { + "epoch": 2.3814694173000364, + "grad_norm": 0.7533153295516968, + "learning_rate": 0.0002, + "loss": 1.2924, + "step": 3290 + }, + { + "epoch": 2.3887079261672097, + "grad_norm": 0.675065279006958, + "learning_rate": 0.0002, + "loss": 1.3711, + "step": 3300 + }, + { + "epoch": 2.395946435034383, + "grad_norm": 0.5686452984809875, + "learning_rate": 0.0002, + "loss": 1.3548, + "step": 3310 + }, + { + "epoch": 2.4031849439015565, + "grad_norm": 0.8129481673240662, + "learning_rate": 0.0002, + "loss": 1.1998, + "step": 3320 + }, + { + "epoch": 2.41042345276873, + "grad_norm": 0.6615934371948242, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 3330 + }, + { + "epoch": 2.417661961635903, + "grad_norm": 0.6678834557533264, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 3340 + }, + { + "epoch": 2.4249004705030766, + "grad_norm": 0.5581308007240295, + "learning_rate": 0.0002, + "loss": 1.2381, + "step": 3350 + }, + { + "epoch": 2.43213897937025, + "grad_norm": 0.6098920106887817, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 3360 + }, + { + "epoch": 2.4393774882374233, + "grad_norm": 0.8101736903190613, + "learning_rate": 0.0002, + "loss": 1.3692, + "step": 3370 + }, + { + "epoch": 2.4466159971045967, + "grad_norm": 0.6621488928794861, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 3380 + }, + { + "epoch": 2.45385450597177, + "grad_norm": 0.8693289160728455, + "learning_rate": 0.0002, + "loss": 1.4579, + "step": 3390 + }, + { + "epoch": 2.4610930148389434, + "grad_norm": 0.6724580526351929, + "learning_rate": 0.0002, + "loss": 1.3644, + "step": 3400 + }, + { + "epoch": 2.4683315237061167, + "grad_norm": 0.6776891946792603, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 3410 + }, + { + "epoch": 2.47557003257329, + "grad_norm": 0.7214453816413879, + "learning_rate": 0.0002, + "loss": 1.2937, + "step": 3420 + }, + { + "epoch": 2.4828085414404635, + "grad_norm": 0.8390451073646545, + "learning_rate": 0.0002, + "loss": 1.4051, + "step": 3430 + }, + { + "epoch": 2.490047050307637, + "grad_norm": 0.7130982279777527, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 3440 + }, + { + "epoch": 2.49728555917481, + "grad_norm": 0.8873937129974365, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 3450 + }, + { + "epoch": 2.5045240680419836, + "grad_norm": 0.725185751914978, + "learning_rate": 0.0002, + "loss": 1.1429, + "step": 3460 + }, + { + "epoch": 2.511762576909157, + "grad_norm": 0.6120352149009705, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 3470 + }, + { + "epoch": 2.5190010857763303, + "grad_norm": 0.7713613510131836, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 3480 + }, + { + "epoch": 2.5262395946435037, + "grad_norm": 0.895309567451477, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 3490 + }, + { + "epoch": 2.533478103510677, + "grad_norm": 0.9631021022796631, + "learning_rate": 0.0002, + "loss": 1.3043, + "step": 3500 + }, + { + "epoch": 2.5407166123778504, + "grad_norm": 0.7475683093070984, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 3510 + }, + { + "epoch": 2.5479551212450238, + "grad_norm": 0.7271341681480408, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 3520 + }, + { + "epoch": 2.555193630112197, + "grad_norm": 0.6979510188102722, + "learning_rate": 0.0002, + "loss": 1.304, + "step": 3530 + }, + { + "epoch": 2.5624321389793705, + "grad_norm": 0.6504196524620056, + "learning_rate": 0.0002, + "loss": 1.2353, + "step": 3540 + }, + { + "epoch": 2.569670647846544, + "grad_norm": 0.7226675748825073, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 3550 + }, + { + "epoch": 2.5769091567137172, + "grad_norm": 0.6143222451210022, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 3560 + }, + { + "epoch": 2.5841476655808906, + "grad_norm": 0.7245154976844788, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 3570 + }, + { + "epoch": 2.591386174448064, + "grad_norm": 0.943540632724762, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 3580 + }, + { + "epoch": 2.5986246833152373, + "grad_norm": 0.7707241773605347, + "learning_rate": 0.0002, + "loss": 1.3034, + "step": 3590 + }, + { + "epoch": 2.6058631921824107, + "grad_norm": 0.6705001592636108, + "learning_rate": 0.0002, + "loss": 1.3063, + "step": 3600 + }, + { + "epoch": 2.613101701049584, + "grad_norm": 0.6360933780670166, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 3610 + }, + { + "epoch": 2.6203402099167574, + "grad_norm": 0.5846424698829651, + "learning_rate": 0.0002, + "loss": 1.1844, + "step": 3620 + }, + { + "epoch": 2.6275787187839303, + "grad_norm": 0.5958625674247742, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3630 + }, + { + "epoch": 2.6348172276511037, + "grad_norm": 0.6819243431091309, + "learning_rate": 0.0002, + "loss": 1.3599, + "step": 3640 + }, + { + "epoch": 2.642055736518277, + "grad_norm": 0.7033445835113525, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 3650 + }, + { + "epoch": 2.6492942453854504, + "grad_norm": 0.6134849786758423, + "learning_rate": 0.0002, + "loss": 1.3392, + "step": 3660 + }, + { + "epoch": 2.656532754252624, + "grad_norm": 0.658009946346283, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 3670 + }, + { + "epoch": 2.663771263119797, + "grad_norm": 0.6280999779701233, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 3680 + }, + { + "epoch": 2.6710097719869705, + "grad_norm": 0.5536085963249207, + "learning_rate": 0.0002, + "loss": 1.2995, + "step": 3690 + }, + { + "epoch": 2.678248280854144, + "grad_norm": 0.8603981733322144, + "learning_rate": 0.0002, + "loss": 1.2044, + "step": 3700 + }, + { + "epoch": 2.6854867897213173, + "grad_norm": 0.5509994626045227, + "learning_rate": 0.0002, + "loss": 1.3879, + "step": 3710 + }, + { + "epoch": 2.6927252985884906, + "grad_norm": 0.9093621969223022, + "learning_rate": 0.0002, + "loss": 1.3253, + "step": 3720 + }, + { + "epoch": 2.699963807455664, + "grad_norm": 0.7525952458381653, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 3730 + }, + { + "epoch": 2.7072023163228374, + "grad_norm": 0.6737023591995239, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 3740 + }, + { + "epoch": 2.7144408251900107, + "grad_norm": 0.8656924962997437, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 3750 + }, + { + "epoch": 2.721679334057184, + "grad_norm": 0.7494133114814758, + "learning_rate": 0.0002, + "loss": 1.2342, + "step": 3760 + }, + { + "epoch": 2.7289178429243575, + "grad_norm": 0.5725520849227905, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 3770 + }, + { + "epoch": 2.736156351791531, + "grad_norm": 0.836412787437439, + "learning_rate": 0.0002, + "loss": 1.28, + "step": 3780 + }, + { + "epoch": 2.743394860658704, + "grad_norm": 0.6893242597579956, + "learning_rate": 0.0002, + "loss": 1.3784, + "step": 3790 + }, + { + "epoch": 2.7506333695258776, + "grad_norm": 0.6696223020553589, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 3800 + }, + { + "epoch": 2.757871878393051, + "grad_norm": 0.6483015418052673, + "learning_rate": 0.0002, + "loss": 1.2449, + "step": 3810 + }, + { + "epoch": 2.7651103872602243, + "grad_norm": 0.8084456920623779, + "learning_rate": 0.0002, + "loss": 1.3282, + "step": 3820 + }, + { + "epoch": 2.7723488961273977, + "grad_norm": 0.6601949334144592, + "learning_rate": 0.0002, + "loss": 1.3694, + "step": 3830 + }, + { + "epoch": 2.779587404994571, + "grad_norm": 0.6905533671379089, + "learning_rate": 0.0002, + "loss": 1.3568, + "step": 3840 + }, + { + "epoch": 2.7868259138617444, + "grad_norm": 0.619318425655365, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 3850 + }, + { + "epoch": 2.7940644227289178, + "grad_norm": 0.5994023084640503, + "learning_rate": 0.0002, + "loss": 1.2551, + "step": 3860 + }, + { + "epoch": 2.801302931596091, + "grad_norm": 0.5627168416976929, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 3870 + }, + { + "epoch": 2.8085414404632645, + "grad_norm": 0.6001605987548828, + "learning_rate": 0.0002, + "loss": 1.3921, + "step": 3880 + }, + { + "epoch": 2.815779949330438, + "grad_norm": 0.6022412776947021, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 3890 + }, + { + "epoch": 2.823018458197611, + "grad_norm": 0.6832426190376282, + "learning_rate": 0.0002, + "loss": 1.2765, + "step": 3900 + }, + { + "epoch": 2.8302569670647846, + "grad_norm": 0.5936811566352844, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 3910 + }, + { + "epoch": 2.837495475931958, + "grad_norm": 0.6960572600364685, + "learning_rate": 0.0002, + "loss": 1.1707, + "step": 3920 + }, + { + "epoch": 2.8447339847991313, + "grad_norm": 0.5913406610488892, + "learning_rate": 0.0002, + "loss": 1.4063, + "step": 3930 + }, + { + "epoch": 2.8519724936663047, + "grad_norm": 0.678154706954956, + "learning_rate": 0.0002, + "loss": 1.3245, + "step": 3940 + }, + { + "epoch": 2.859211002533478, + "grad_norm": 0.7898936867713928, + "learning_rate": 0.0002, + "loss": 1.366, + "step": 3950 + }, + { + "epoch": 2.8664495114006514, + "grad_norm": 0.9234195351600647, + "learning_rate": 0.0002, + "loss": 1.3948, + "step": 3960 + }, + { + "epoch": 2.8736880202678248, + "grad_norm": 0.5960825085639954, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 3970 + }, + { + "epoch": 2.880926529134998, + "grad_norm": 0.677118182182312, + "learning_rate": 0.0002, + "loss": 1.3127, + "step": 3980 + }, + { + "epoch": 2.8881650380021715, + "grad_norm": 0.6505142450332642, + "learning_rate": 0.0002, + "loss": 1.2652, + "step": 3990 + }, + { + "epoch": 2.895403546869345, + "grad_norm": 0.550826907157898, + "learning_rate": 0.0002, + "loss": 1.2078, + "step": 4000 + }, + { + "epoch": 2.9026420557365182, + "grad_norm": 0.6209215521812439, + "learning_rate": 0.0002, + "loss": 1.1811, + "step": 4010 + }, + { + "epoch": 2.9098805646036916, + "grad_norm": 0.6549018025398254, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 4020 + }, + { + "epoch": 2.917119073470865, + "grad_norm": 0.570682168006897, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 4030 + }, + { + "epoch": 2.9243575823380383, + "grad_norm": 1.1807632446289062, + "learning_rate": 0.0002, + "loss": 1.0832, + "step": 4040 + }, + { + "epoch": 2.9315960912052117, + "grad_norm": 0.7058857679367065, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 4050 + }, + { + "epoch": 2.938834600072385, + "grad_norm": 0.5542812943458557, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 4060 + }, + { + "epoch": 2.9460731089395584, + "grad_norm": 0.63167804479599, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 4070 + }, + { + "epoch": 2.953311617806732, + "grad_norm": 0.5702962279319763, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 4080 + }, + { + "epoch": 2.960550126673905, + "grad_norm": 0.620944082736969, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 4090 + }, + { + "epoch": 2.9677886355410785, + "grad_norm": 0.5866289734840393, + "learning_rate": 0.0002, + "loss": 1.2864, + "step": 4100 + }, + { + "epoch": 2.975027144408252, + "grad_norm": 0.560170590877533, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 4110 + }, + { + "epoch": 2.9822656532754253, + "grad_norm": 0.675082802772522, + "learning_rate": 0.0002, + "loss": 1.2071, + "step": 4120 + }, + { + "epoch": 2.9895041621425986, + "grad_norm": 0.62708580493927, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 4130 + }, + { + "epoch": 2.996742671009772, + "grad_norm": 0.7893929481506348, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 4140 + }, + { + "epoch": 2.9996380745566413, + "eval_loss": 1.4217946529388428, + "eval_runtime": 27.1596, + "eval_samples_per_second": 16.053, + "eval_steps_per_second": 2.025, + "step": 4144 + }, + { + "epoch": 3.0039811798769454, + "grad_norm": 0.7043836116790771, + "learning_rate": 0.0002, + "loss": 1.2152, + "step": 4150 + }, + { + "epoch": 3.0112196887441187, + "grad_norm": 0.6806283593177795, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 4160 + }, + { + "epoch": 3.018458197611292, + "grad_norm": 0.7684550285339355, + "learning_rate": 0.0002, + "loss": 1.292, + "step": 4170 + }, + { + "epoch": 3.0256967064784654, + "grad_norm": 0.7895237803459167, + "learning_rate": 0.0002, + "loss": 1.3467, + "step": 4180 + }, + { + "epoch": 3.032935215345639, + "grad_norm": 0.7464531064033508, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 4190 + }, + { + "epoch": 3.040173724212812, + "grad_norm": 0.9358500838279724, + "learning_rate": 0.0002, + "loss": 1.1614, + "step": 4200 + }, + { + "epoch": 3.0474122330799855, + "grad_norm": 1.1066628694534302, + "learning_rate": 0.0002, + "loss": 1.1834, + "step": 4210 + }, + { + "epoch": 3.054650741947159, + "grad_norm": 0.6663267612457275, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 4220 + }, + { + "epoch": 3.0618892508143323, + "grad_norm": 0.6669464707374573, + "learning_rate": 0.0002, + "loss": 1.1707, + "step": 4230 + }, + { + "epoch": 3.0691277596815056, + "grad_norm": 0.7052164077758789, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 4240 + }, + { + "epoch": 3.076366268548679, + "grad_norm": 0.6118432879447937, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 4250 + }, + { + "epoch": 3.0836047774158524, + "grad_norm": 0.6915903687477112, + "learning_rate": 0.0002, + "loss": 1.1526, + "step": 4260 + }, + { + "epoch": 3.0908432862830257, + "grad_norm": 0.7441644668579102, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 4270 + }, + { + "epoch": 3.098081795150199, + "grad_norm": 0.823850691318512, + "learning_rate": 0.0002, + "loss": 1.1672, + "step": 4280 + }, + { + "epoch": 3.1053203040173725, + "grad_norm": 0.9677883386611938, + "learning_rate": 0.0002, + "loss": 1.2655, + "step": 4290 + }, + { + "epoch": 3.112558812884546, + "grad_norm": 0.7002579569816589, + "learning_rate": 0.0002, + "loss": 1.1794, + "step": 4300 + }, + { + "epoch": 3.119797321751719, + "grad_norm": 0.778789758682251, + "learning_rate": 0.0002, + "loss": 1.135, + "step": 4310 + }, + { + "epoch": 3.1270358306188926, + "grad_norm": 0.7236007452011108, + "learning_rate": 0.0002, + "loss": 1.0818, + "step": 4320 + }, + { + "epoch": 3.134274339486066, + "grad_norm": 0.8809133768081665, + "learning_rate": 0.0002, + "loss": 1.1803, + "step": 4330 + }, + { + "epoch": 3.1415128483532393, + "grad_norm": 0.7924913167953491, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 4340 + }, + { + "epoch": 3.1487513572204127, + "grad_norm": 0.7437422275543213, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 4350 + }, + { + "epoch": 3.155989866087586, + "grad_norm": 0.6428450345993042, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 4360 + }, + { + "epoch": 3.1632283749547594, + "grad_norm": 0.7922873497009277, + "learning_rate": 0.0002, + "loss": 1.3032, + "step": 4370 + }, + { + "epoch": 3.1704668838219328, + "grad_norm": 0.5252506732940674, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 4380 + }, + { + "epoch": 3.177705392689106, + "grad_norm": 0.8570457696914673, + "learning_rate": 0.0002, + "loss": 1.1297, + "step": 4390 + }, + { + "epoch": 3.1849439015562795, + "grad_norm": 0.7218987345695496, + "learning_rate": 0.0002, + "loss": 1.0994, + "step": 4400 + }, + { + "epoch": 3.192182410423453, + "grad_norm": 0.6921393275260925, + "learning_rate": 0.0002, + "loss": 1.2891, + "step": 4410 + }, + { + "epoch": 3.199420919290626, + "grad_norm": 0.7386137843132019, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 4420 + }, + { + "epoch": 3.2066594281577996, + "grad_norm": 0.6227759122848511, + "learning_rate": 0.0002, + "loss": 1.1654, + "step": 4430 + }, + { + "epoch": 3.213897937024973, + "grad_norm": 0.7180278897285461, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 4440 + }, + { + "epoch": 3.2211364458921463, + "grad_norm": 0.745830774307251, + "learning_rate": 0.0002, + "loss": 1.1757, + "step": 4450 + }, + { + "epoch": 3.2283749547593197, + "grad_norm": 0.6766072511672974, + "learning_rate": 0.0002, + "loss": 1.234, + "step": 4460 + }, + { + "epoch": 3.235613463626493, + "grad_norm": 0.8325067162513733, + "learning_rate": 0.0002, + "loss": 1.1999, + "step": 4470 + }, + { + "epoch": 3.2428519724936664, + "grad_norm": 0.7148305177688599, + "learning_rate": 0.0002, + "loss": 1.1606, + "step": 4480 + }, + { + "epoch": 3.25009048136084, + "grad_norm": 0.7752676010131836, + "learning_rate": 0.0002, + "loss": 1.1383, + "step": 4490 + }, + { + "epoch": 3.257328990228013, + "grad_norm": 0.6776860952377319, + "learning_rate": 0.0002, + "loss": 1.3006, + "step": 4500 + }, + { + "epoch": 3.2645674990951865, + "grad_norm": 0.704359769821167, + "learning_rate": 0.0002, + "loss": 1.0796, + "step": 4510 + }, + { + "epoch": 3.27180600796236, + "grad_norm": 0.6880282163619995, + "learning_rate": 0.0002, + "loss": 1.2496, + "step": 4520 + }, + { + "epoch": 3.2790445168295332, + "grad_norm": 0.8179270029067993, + "learning_rate": 0.0002, + "loss": 1.0947, + "step": 4530 + }, + { + "epoch": 3.2862830256967066, + "grad_norm": 0.6718448996543884, + "learning_rate": 0.0002, + "loss": 1.1909, + "step": 4540 + }, + { + "epoch": 3.29352153456388, + "grad_norm": 0.8300657868385315, + "learning_rate": 0.0002, + "loss": 1.2708, + "step": 4550 + }, + { + "epoch": 3.3007600434310533, + "grad_norm": 0.6433690786361694, + "learning_rate": 0.0002, + "loss": 1.2594, + "step": 4560 + }, + { + "epoch": 3.3079985522982267, + "grad_norm": 0.690262496471405, + "learning_rate": 0.0002, + "loss": 1.2479, + "step": 4570 + }, + { + "epoch": 3.3152370611654, + "grad_norm": 0.7022852301597595, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 4580 + }, + { + "epoch": 3.3224755700325734, + "grad_norm": 0.6438387632369995, + "learning_rate": 0.0002, + "loss": 1.0844, + "step": 4590 + }, + { + "epoch": 3.329714078899747, + "grad_norm": 0.6866899132728577, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 4600 + }, + { + "epoch": 3.33695258776692, + "grad_norm": 0.8233968019485474, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 4610 + }, + { + "epoch": 3.3441910966340935, + "grad_norm": 0.7251574993133545, + "learning_rate": 0.0002, + "loss": 1.1855, + "step": 4620 + }, + { + "epoch": 3.351429605501267, + "grad_norm": 0.7855110168457031, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 4630 + }, + { + "epoch": 3.3586681143684403, + "grad_norm": 0.8487356305122375, + "learning_rate": 0.0002, + "loss": 1.2922, + "step": 4640 + }, + { + "epoch": 3.3659066232356136, + "grad_norm": 0.6429011225700378, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 4650 + }, + { + "epoch": 3.373145132102787, + "grad_norm": 0.7095270156860352, + "learning_rate": 0.0002, + "loss": 1.129, + "step": 4660 + }, + { + "epoch": 3.3803836409699604, + "grad_norm": 0.6792303323745728, + "learning_rate": 0.0002, + "loss": 1.262, + "step": 4670 + }, + { + "epoch": 3.3876221498371337, + "grad_norm": 0.6784825921058655, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 4680 + }, + { + "epoch": 3.394860658704307, + "grad_norm": 0.6362888216972351, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 4690 + }, + { + "epoch": 3.4020991675714805, + "grad_norm": 0.7794778943061829, + "learning_rate": 0.0002, + "loss": 1.2165, + "step": 4700 + }, + { + "epoch": 3.409337676438654, + "grad_norm": 0.7287485003471375, + "learning_rate": 0.0002, + "loss": 1.0644, + "step": 4710 + }, + { + "epoch": 3.416576185305827, + "grad_norm": 0.6481451392173767, + "learning_rate": 0.0002, + "loss": 1.2925, + "step": 4720 + }, + { + "epoch": 3.4238146941730006, + "grad_norm": 0.9200371503829956, + "learning_rate": 0.0002, + "loss": 1.2121, + "step": 4730 + }, + { + "epoch": 3.431053203040174, + "grad_norm": 1.074180245399475, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 4740 + }, + { + "epoch": 3.438291711907347, + "grad_norm": 0.6722986698150635, + "learning_rate": 0.0002, + "loss": 1.0421, + "step": 4750 + }, + { + "epoch": 3.44553022077452, + "grad_norm": 0.7945933938026428, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 4760 + }, + { + "epoch": 3.4527687296416936, + "grad_norm": 0.7624640464782715, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 4770 + }, + { + "epoch": 3.460007238508867, + "grad_norm": 0.7763656377792358, + "learning_rate": 0.0002, + "loss": 1.2428, + "step": 4780 + }, + { + "epoch": 3.4672457473760403, + "grad_norm": 0.7736947536468506, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 4790 + }, + { + "epoch": 3.4744842562432137, + "grad_norm": 0.8450354933738708, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 4800 + }, + { + "epoch": 3.481722765110387, + "grad_norm": 0.6480133533477783, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 4810 + }, + { + "epoch": 3.4889612739775604, + "grad_norm": 0.8437445759773254, + "learning_rate": 0.0002, + "loss": 1.1882, + "step": 4820 + }, + { + "epoch": 3.4961997828447338, + "grad_norm": 0.7781730890274048, + "learning_rate": 0.0002, + "loss": 1.1519, + "step": 4830 + }, + { + "epoch": 3.503438291711907, + "grad_norm": 0.8523228168487549, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 4840 + }, + { + "epoch": 3.5106768005790805, + "grad_norm": 0.6236732006072998, + "learning_rate": 0.0002, + "loss": 1.1672, + "step": 4850 + }, + { + "epoch": 3.517915309446254, + "grad_norm": 0.7500787377357483, + "learning_rate": 0.0002, + "loss": 1.1926, + "step": 4860 + }, + { + "epoch": 3.5251538183134272, + "grad_norm": 0.7665374875068665, + "learning_rate": 0.0002, + "loss": 1.1998, + "step": 4870 + }, + { + "epoch": 3.5323923271806006, + "grad_norm": 0.787857711315155, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 4880 + }, + { + "epoch": 3.539630836047774, + "grad_norm": 0.970595121383667, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 4890 + }, + { + "epoch": 3.5468693449149473, + "grad_norm": 0.6409347057342529, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 4900 + }, + { + "epoch": 3.5541078537821207, + "grad_norm": 0.888551652431488, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 4910 + }, + { + "epoch": 3.561346362649294, + "grad_norm": 1.0808377265930176, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 4920 + }, + { + "epoch": 3.5685848715164674, + "grad_norm": 0.7501053214073181, + "learning_rate": 0.0002, + "loss": 1.2564, + "step": 4930 + }, + { + "epoch": 3.575823380383641, + "grad_norm": 0.7375240325927734, + "learning_rate": 0.0002, + "loss": 1.2351, + "step": 4940 + }, + { + "epoch": 3.583061889250814, + "grad_norm": 0.7075039744377136, + "learning_rate": 0.0002, + "loss": 1.3568, + "step": 4950 + }, + { + "epoch": 3.5903003981179875, + "grad_norm": 0.939337432384491, + "learning_rate": 0.0002, + "loss": 1.3355, + "step": 4960 + }, + { + "epoch": 3.597538906985161, + "grad_norm": 0.6717396974563599, + "learning_rate": 0.0002, + "loss": 1.1722, + "step": 4970 + }, + { + "epoch": 3.6047774158523342, + "grad_norm": 0.7141643762588501, + "learning_rate": 0.0002, + "loss": 1.1186, + "step": 4980 + }, + { + "epoch": 3.6120159247195076, + "grad_norm": 0.7109216451644897, + "learning_rate": 0.0002, + "loss": 1.1011, + "step": 4990 + }, + { + "epoch": 3.619254433586681, + "grad_norm": 0.7020776867866516, + "learning_rate": 0.0002, + "loss": 1.2178, + "step": 5000 + }, + { + "epoch": 3.6264929424538543, + "grad_norm": 0.7158873677253723, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 5010 + }, + { + "epoch": 3.6337314513210277, + "grad_norm": 0.7062035202980042, + "learning_rate": 0.0002, + "loss": 1.2624, + "step": 5020 + }, + { + "epoch": 3.640969960188201, + "grad_norm": 0.7081155776977539, + "learning_rate": 0.0002, + "loss": 1.0224, + "step": 5030 + }, + { + "epoch": 3.6482084690553744, + "grad_norm": 1.2210607528686523, + "learning_rate": 0.0002, + "loss": 1.2195, + "step": 5040 + }, + { + "epoch": 3.655446977922548, + "grad_norm": 0.6650236248970032, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 5050 + }, + { + "epoch": 3.662685486789721, + "grad_norm": 0.6884829998016357, + "learning_rate": 0.0002, + "loss": 1.1072, + "step": 5060 + }, + { + "epoch": 3.6699239956568945, + "grad_norm": 0.7317819595336914, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 5070 + }, + { + "epoch": 3.677162504524068, + "grad_norm": 0.7406691908836365, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 5080 + }, + { + "epoch": 3.6844010133912413, + "grad_norm": 0.9009454250335693, + "learning_rate": 0.0002, + "loss": 1.2949, + "step": 5090 + }, + { + "epoch": 3.6916395222584146, + "grad_norm": 0.8189385533332825, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 5100 + }, + { + "epoch": 3.698878031125588, + "grad_norm": 1.0793628692626953, + "learning_rate": 0.0002, + "loss": 1.3408, + "step": 5110 + }, + { + "epoch": 3.7061165399927614, + "grad_norm": 0.8593027591705322, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 5120 + }, + { + "epoch": 3.7133550488599347, + "grad_norm": 0.8481812477111816, + "learning_rate": 0.0002, + "loss": 1.2141, + "step": 5130 + }, + { + "epoch": 3.720593557727108, + "grad_norm": 0.6527451276779175, + "learning_rate": 0.0002, + "loss": 1.125, + "step": 5140 + }, + { + "epoch": 3.7278320665942815, + "grad_norm": 0.9220114350318909, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 5150 + }, + { + "epoch": 3.735070575461455, + "grad_norm": 1.0842019319534302, + "learning_rate": 0.0002, + "loss": 1.2267, + "step": 5160 + }, + { + "epoch": 3.742309084328628, + "grad_norm": 0.965453565120697, + "learning_rate": 0.0002, + "loss": 1.3083, + "step": 5170 + }, + { + "epoch": 3.7495475931958016, + "grad_norm": 0.9903319478034973, + "learning_rate": 0.0002, + "loss": 1.1772, + "step": 5180 + }, + { + "epoch": 3.756786102062975, + "grad_norm": 0.7434818148612976, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 5190 + }, + { + "epoch": 3.7640246109301483, + "grad_norm": 0.6717280745506287, + "learning_rate": 0.0002, + "loss": 1.2631, + "step": 5200 + }, + { + "epoch": 3.7712631197973217, + "grad_norm": 0.7754665613174438, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 5210 + }, + { + "epoch": 3.778501628664495, + "grad_norm": 1.028374433517456, + "learning_rate": 0.0002, + "loss": 1.305, + "step": 5220 + }, + { + "epoch": 3.7857401375316684, + "grad_norm": 0.6026996374130249, + "learning_rate": 0.0002, + "loss": 1.1866, + "step": 5230 + }, + { + "epoch": 3.7929786463988417, + "grad_norm": 0.6978490948677063, + "learning_rate": 0.0002, + "loss": 1.1901, + "step": 5240 + }, + { + "epoch": 3.800217155266015, + "grad_norm": 0.7303446531295776, + "learning_rate": 0.0002, + "loss": 1.2576, + "step": 5250 + }, + { + "epoch": 3.8074556641331885, + "grad_norm": 1.0734210014343262, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 5260 + }, + { + "epoch": 3.814694173000362, + "grad_norm": 0.6383201479911804, + "learning_rate": 0.0002, + "loss": 1.1137, + "step": 5270 + }, + { + "epoch": 3.821932681867535, + "grad_norm": 0.7742630243301392, + "learning_rate": 0.0002, + "loss": 1.0904, + "step": 5280 + }, + { + "epoch": 3.8291711907347086, + "grad_norm": 0.8477074503898621, + "learning_rate": 0.0002, + "loss": 1.2232, + "step": 5290 + }, + { + "epoch": 3.836409699601882, + "grad_norm": 0.6675317883491516, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 5300 + }, + { + "epoch": 3.8436482084690553, + "grad_norm": 0.7515445351600647, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 5310 + }, + { + "epoch": 3.8508867173362287, + "grad_norm": 1.1441220045089722, + "learning_rate": 0.0002, + "loss": 1.2569, + "step": 5320 + }, + { + "epoch": 3.858125226203402, + "grad_norm": 0.7968795895576477, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 5330 + }, + { + "epoch": 3.8653637350705754, + "grad_norm": 0.7842824459075928, + "learning_rate": 0.0002, + "loss": 1.232, + "step": 5340 + }, + { + "epoch": 3.8726022439377488, + "grad_norm": 0.8272225260734558, + "learning_rate": 0.0002, + "loss": 1.1847, + "step": 5350 + }, + { + "epoch": 3.879840752804922, + "grad_norm": 0.8413397669792175, + "learning_rate": 0.0002, + "loss": 1.1381, + "step": 5360 + }, + { + "epoch": 3.8870792616720955, + "grad_norm": 1.141764760017395, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 5370 + }, + { + "epoch": 3.894317770539269, + "grad_norm": 0.9826975464820862, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 5380 + }, + { + "epoch": 3.9015562794064422, + "grad_norm": 0.8598255515098572, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 5390 + }, + { + "epoch": 3.9087947882736156, + "grad_norm": 0.6271058320999146, + "learning_rate": 0.0002, + "loss": 1.1247, + "step": 5400 + }, + { + "epoch": 3.916033297140789, + "grad_norm": 0.6379870772361755, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 5410 + }, + { + "epoch": 3.9232718060079623, + "grad_norm": 1.0313376188278198, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 5420 + }, + { + "epoch": 3.9305103148751357, + "grad_norm": 0.8220619559288025, + "learning_rate": 0.0002, + "loss": 1.1872, + "step": 5430 + }, + { + "epoch": 3.937748823742309, + "grad_norm": 0.7576116919517517, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 5440 + }, + { + "epoch": 3.9449873326094824, + "grad_norm": 1.226235032081604, + "learning_rate": 0.0002, + "loss": 1.1969, + "step": 5450 + }, + { + "epoch": 3.952225841476656, + "grad_norm": 0.7979229688644409, + "learning_rate": 0.0002, + "loss": 1.2945, + "step": 5460 + }, + { + "epoch": 3.959464350343829, + "grad_norm": 0.9911929965019226, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 5470 + }, + { + "epoch": 3.9667028592110025, + "grad_norm": 0.643738865852356, + "learning_rate": 0.0002, + "loss": 1.0924, + "step": 5480 + }, + { + "epoch": 3.973941368078176, + "grad_norm": 0.682305634021759, + "learning_rate": 0.0002, + "loss": 1.0607, + "step": 5490 + }, + { + "epoch": 3.9811798769453492, + "grad_norm": 1.18373441696167, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 5500 + }, + { + "epoch": 3.9884183858125226, + "grad_norm": 0.7190203070640564, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 5510 + }, + { + "epoch": 3.995656894679696, + "grad_norm": 0.7516948580741882, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 5520 + }, + { + "epoch": 4.0, + "eval_loss": 1.4252897500991821, + "eval_runtime": 27.235, + "eval_samples_per_second": 16.009, + "eval_steps_per_second": 2.019, + "step": 5526 + }, + { + "epoch": 4.002895403546869, + "grad_norm": 0.6353074312210083, + "learning_rate": 0.0002, + "loss": 1.0088, + "step": 5530 + }, + { + "epoch": 4.010133912414043, + "grad_norm": 0.7424906492233276, + "learning_rate": 0.0002, + "loss": 1.0326, + "step": 5540 + }, + { + "epoch": 4.017372421281216, + "grad_norm": 0.8856638073921204, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 5550 + }, + { + "epoch": 4.024610930148389, + "grad_norm": 0.9627974033355713, + "learning_rate": 0.0002, + "loss": 1.0905, + "step": 5560 + }, + { + "epoch": 4.031849439015563, + "grad_norm": 0.9048978686332703, + "learning_rate": 0.0002, + "loss": 1.0965, + "step": 5570 + }, + { + "epoch": 4.039087947882736, + "grad_norm": 0.921119213104248, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 5580 + }, + { + "epoch": 4.0463264567499095, + "grad_norm": 0.8654361963272095, + "learning_rate": 0.0002, + "loss": 1.1235, + "step": 5590 + }, + { + "epoch": 4.053564965617083, + "grad_norm": 0.7947945594787598, + "learning_rate": 0.0002, + "loss": 1.0794, + "step": 5600 + }, + { + "epoch": 4.060803474484256, + "grad_norm": 0.8307326436042786, + "learning_rate": 0.0002, + "loss": 1.0674, + "step": 5610 + }, + { + "epoch": 4.06804198335143, + "grad_norm": 0.793273389339447, + "learning_rate": 0.0002, + "loss": 1.0076, + "step": 5620 + }, + { + "epoch": 4.075280492218603, + "grad_norm": 0.8748673796653748, + "learning_rate": 0.0002, + "loss": 1.0651, + "step": 5630 + }, + { + "epoch": 4.082519001085776, + "grad_norm": 0.7926856279373169, + "learning_rate": 0.0002, + "loss": 1.111, + "step": 5640 + }, + { + "epoch": 4.08975750995295, + "grad_norm": 0.922645092010498, + "learning_rate": 0.0002, + "loss": 1.044, + "step": 5650 + }, + { + "epoch": 4.096996018820123, + "grad_norm": 0.9539641737937927, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 5660 + }, + { + "epoch": 4.1042345276872965, + "grad_norm": 0.8674443364143372, + "learning_rate": 0.0002, + "loss": 1.0788, + "step": 5670 + }, + { + "epoch": 4.11147303655447, + "grad_norm": 0.7097609043121338, + "learning_rate": 0.0002, + "loss": 0.9867, + "step": 5680 + }, + { + "epoch": 4.118711545421643, + "grad_norm": 0.8875522613525391, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 5690 + }, + { + "epoch": 4.125950054288817, + "grad_norm": 0.8583634495735168, + "learning_rate": 0.0002, + "loss": 1.1217, + "step": 5700 + }, + { + "epoch": 4.13318856315599, + "grad_norm": 0.6736377477645874, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 5710 + }, + { + "epoch": 4.140427072023163, + "grad_norm": 0.9349062442779541, + "learning_rate": 0.0002, + "loss": 1.1199, + "step": 5720 + }, + { + "epoch": 4.147665580890337, + "grad_norm": 1.0610365867614746, + "learning_rate": 0.0002, + "loss": 1.0508, + "step": 5730 + }, + { + "epoch": 4.15490408975751, + "grad_norm": 1.5838189125061035, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 5740 + }, + { + "epoch": 4.162142598624683, + "grad_norm": 0.747522234916687, + "learning_rate": 0.0002, + "loss": 1.0222, + "step": 5750 + }, + { + "epoch": 4.169381107491857, + "grad_norm": 1.3247915506362915, + "learning_rate": 0.0002, + "loss": 1.1328, + "step": 5760 + }, + { + "epoch": 4.17661961635903, + "grad_norm": 0.8750247955322266, + "learning_rate": 0.0002, + "loss": 1.1655, + "step": 5770 + }, + { + "epoch": 4.1838581252262035, + "grad_norm": 0.7914144992828369, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 5780 + }, + { + "epoch": 4.191096634093377, + "grad_norm": 0.9493299126625061, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 5790 + }, + { + "epoch": 4.19833514296055, + "grad_norm": 0.7802295088768005, + "learning_rate": 0.0002, + "loss": 1.1515, + "step": 5800 + }, + { + "epoch": 4.205573651827724, + "grad_norm": 0.6987314820289612, + "learning_rate": 0.0002, + "loss": 1.0704, + "step": 5810 + }, + { + "epoch": 4.212812160694897, + "grad_norm": 0.9220341444015503, + "learning_rate": 0.0002, + "loss": 1.1699, + "step": 5820 + }, + { + "epoch": 4.22005066956207, + "grad_norm": 0.8932939767837524, + "learning_rate": 0.0002, + "loss": 1.1394, + "step": 5830 + }, + { + "epoch": 4.227289178429244, + "grad_norm": 0.920002818107605, + "learning_rate": 0.0002, + "loss": 1.0048, + "step": 5840 + }, + { + "epoch": 4.234527687296417, + "grad_norm": 0.6662752032279968, + "learning_rate": 0.0002, + "loss": 0.964, + "step": 5850 + }, + { + "epoch": 4.24176619616359, + "grad_norm": 0.8679718971252441, + "learning_rate": 0.0002, + "loss": 0.986, + "step": 5860 + }, + { + "epoch": 4.249004705030764, + "grad_norm": 0.7020887732505798, + "learning_rate": 0.0002, + "loss": 0.8991, + "step": 5870 + }, + { + "epoch": 4.256243213897937, + "grad_norm": 0.869611382484436, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 5880 + }, + { + "epoch": 4.2634817227651105, + "grad_norm": 0.7796585559844971, + "learning_rate": 0.0002, + "loss": 1.1026, + "step": 5890 + }, + { + "epoch": 4.270720231632284, + "grad_norm": 0.8978819251060486, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 5900 + }, + { + "epoch": 4.277958740499457, + "grad_norm": 1.0837205648422241, + "learning_rate": 0.0002, + "loss": 1.1325, + "step": 5910 + }, + { + "epoch": 4.285197249366631, + "grad_norm": 0.7584353089332581, + "learning_rate": 0.0002, + "loss": 1.1279, + "step": 5920 + }, + { + "epoch": 4.292435758233804, + "grad_norm": 0.7313185334205627, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 5930 + }, + { + "epoch": 4.299674267100977, + "grad_norm": 0.8004671335220337, + "learning_rate": 0.0002, + "loss": 1.1101, + "step": 5940 + }, + { + "epoch": 4.306912775968151, + "grad_norm": 2.154958724975586, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 5950 + }, + { + "epoch": 4.314151284835324, + "grad_norm": 0.9163479804992676, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 5960 + }, + { + "epoch": 4.321389793702497, + "grad_norm": 0.9151589274406433, + "learning_rate": 0.0002, + "loss": 0.9941, + "step": 5970 + }, + { + "epoch": 4.328628302569671, + "grad_norm": 0.8624112010002136, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 5980 + }, + { + "epoch": 4.335866811436844, + "grad_norm": 0.9357741475105286, + "learning_rate": 0.0002, + "loss": 1.1625, + "step": 5990 + }, + { + "epoch": 4.3431053203040175, + "grad_norm": 1.3482335805892944, + "learning_rate": 0.0002, + "loss": 1.0712, + "step": 6000 + }, + { + "epoch": 4.350343829171191, + "grad_norm": 0.7156149744987488, + "learning_rate": 0.0002, + "loss": 1.1224, + "step": 6010 + }, + { + "epoch": 4.357582338038364, + "grad_norm": 0.8480049967765808, + "learning_rate": 0.0002, + "loss": 1.0753, + "step": 6020 + }, + { + "epoch": 4.364820846905538, + "grad_norm": 0.8262244462966919, + "learning_rate": 0.0002, + "loss": 1.051, + "step": 6030 + }, + { + "epoch": 4.372059355772711, + "grad_norm": 0.7733905911445618, + "learning_rate": 0.0002, + "loss": 0.9966, + "step": 6040 + }, + { + "epoch": 4.379297864639884, + "grad_norm": 0.8553919792175293, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 6050 + }, + { + "epoch": 4.386536373507058, + "grad_norm": 0.8666832447052002, + "learning_rate": 0.0002, + "loss": 1.1777, + "step": 6060 + }, + { + "epoch": 4.393774882374231, + "grad_norm": 0.9168295860290527, + "learning_rate": 0.0002, + "loss": 1.1934, + "step": 6070 + }, + { + "epoch": 4.4010133912414044, + "grad_norm": 0.7315238118171692, + "learning_rate": 0.0002, + "loss": 1.0988, + "step": 6080 + }, + { + "epoch": 4.408251900108578, + "grad_norm": 1.020263433456421, + "learning_rate": 0.0002, + "loss": 1.1599, + "step": 6090 + }, + { + "epoch": 4.415490408975751, + "grad_norm": 0.9978243708610535, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 6100 + }, + { + "epoch": 4.4227289178429245, + "grad_norm": 0.995453953742981, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 6110 + }, + { + "epoch": 4.429967426710098, + "grad_norm": 0.9360884428024292, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 6120 + }, + { + "epoch": 4.437205935577271, + "grad_norm": 0.8099448084831238, + "learning_rate": 0.0002, + "loss": 0.9506, + "step": 6130 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.8173841238021851, + "learning_rate": 0.0002, + "loss": 1.0887, + "step": 6140 + }, + { + "epoch": 4.451682953311618, + "grad_norm": 0.7972666025161743, + "learning_rate": 0.0002, + "loss": 1.1219, + "step": 6150 + }, + { + "epoch": 4.458921462178791, + "grad_norm": 0.7685779333114624, + "learning_rate": 0.0002, + "loss": 1.0226, + "step": 6160 + }, + { + "epoch": 4.466159971045965, + "grad_norm": 0.7872623801231384, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 6170 + }, + { + "epoch": 4.473398479913138, + "grad_norm": 0.7677070498466492, + "learning_rate": 0.0002, + "loss": 0.9911, + "step": 6180 + }, + { + "epoch": 4.4806369887803115, + "grad_norm": 0.7878316044807434, + "learning_rate": 0.0002, + "loss": 1.0919, + "step": 6190 + }, + { + "epoch": 4.487875497647485, + "grad_norm": 0.8178079724311829, + "learning_rate": 0.0002, + "loss": 1.018, + "step": 6200 + }, + { + "epoch": 4.495114006514658, + "grad_norm": 1.2820082902908325, + "learning_rate": 0.0002, + "loss": 1.0517, + "step": 6210 + }, + { + "epoch": 4.502352515381832, + "grad_norm": 0.9380832314491272, + "learning_rate": 0.0002, + "loss": 1.3101, + "step": 6220 + }, + { + "epoch": 4.509591024249005, + "grad_norm": 0.7810422778129578, + "learning_rate": 0.0002, + "loss": 0.9818, + "step": 6230 + }, + { + "epoch": 4.516829533116178, + "grad_norm": 1.1022917032241821, + "learning_rate": 0.0002, + "loss": 1.1677, + "step": 6240 + }, + { + "epoch": 4.524068041983352, + "grad_norm": 1.4275553226470947, + "learning_rate": 0.0002, + "loss": 1.1579, + "step": 6250 + }, + { + "epoch": 4.531306550850525, + "grad_norm": 0.7597777247428894, + "learning_rate": 0.0002, + "loss": 1.3237, + "step": 6260 + }, + { + "epoch": 4.538545059717698, + "grad_norm": 1.10992431640625, + "learning_rate": 0.0002, + "loss": 1.1529, + "step": 6270 + }, + { + "epoch": 4.545783568584872, + "grad_norm": 0.8981178998947144, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 6280 + }, + { + "epoch": 4.553022077452045, + "grad_norm": 0.7863979339599609, + "learning_rate": 0.0002, + "loss": 1.086, + "step": 6290 + }, + { + "epoch": 4.5602605863192185, + "grad_norm": 0.9071474671363831, + "learning_rate": 0.0002, + "loss": 1.2008, + "step": 6300 + }, + { + "epoch": 4.567499095186392, + "grad_norm": 0.7429424524307251, + "learning_rate": 0.0002, + "loss": 1.0916, + "step": 6310 + }, + { + "epoch": 4.574737604053565, + "grad_norm": 1.0767850875854492, + "learning_rate": 0.0002, + "loss": 1.095, + "step": 6320 + }, + { + "epoch": 4.581976112920739, + "grad_norm": 0.7885915637016296, + "learning_rate": 0.0002, + "loss": 1.1023, + "step": 6330 + }, + { + "epoch": 4.589214621787912, + "grad_norm": 0.8350457549095154, + "learning_rate": 0.0002, + "loss": 1.1131, + "step": 6340 + }, + { + "epoch": 4.596453130655085, + "grad_norm": 0.7853530645370483, + "learning_rate": 0.0002, + "loss": 1.0743, + "step": 6350 + }, + { + "epoch": 4.603691639522259, + "grad_norm": 1.1220661401748657, + "learning_rate": 0.0002, + "loss": 1.1912, + "step": 6360 + }, + { + "epoch": 4.610930148389432, + "grad_norm": 0.7959423065185547, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 6370 + }, + { + "epoch": 4.618168657256605, + "grad_norm": 0.7782652378082275, + "learning_rate": 0.0002, + "loss": 1.1542, + "step": 6380 + }, + { + "epoch": 4.625407166123779, + "grad_norm": 0.7882203459739685, + "learning_rate": 0.0002, + "loss": 1.0753, + "step": 6390 + }, + { + "epoch": 4.632645674990952, + "grad_norm": 0.8841899037361145, + "learning_rate": 0.0002, + "loss": 1.0676, + "step": 6400 + }, + { + "epoch": 4.6398841838581255, + "grad_norm": 0.7936127781867981, + "learning_rate": 0.0002, + "loss": 1.0815, + "step": 6410 + }, + { + "epoch": 4.647122692725299, + "grad_norm": 0.9213966131210327, + "learning_rate": 0.0002, + "loss": 1.0198, + "step": 6420 + }, + { + "epoch": 4.654361201592472, + "grad_norm": 0.9246473908424377, + "learning_rate": 0.0002, + "loss": 0.9872, + "step": 6430 + }, + { + "epoch": 4.661599710459646, + "grad_norm": 0.766572892665863, + "learning_rate": 0.0002, + "loss": 1.1309, + "step": 6440 + }, + { + "epoch": 4.668838219326819, + "grad_norm": 0.8596171736717224, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 6450 + }, + { + "epoch": 4.676076728193992, + "grad_norm": 0.8482751846313477, + "learning_rate": 0.0002, + "loss": 1.1869, + "step": 6460 + }, + { + "epoch": 4.683315237061166, + "grad_norm": 1.0826905965805054, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 6470 + }, + { + "epoch": 4.690553745928339, + "grad_norm": 1.1048457622528076, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 6480 + }, + { + "epoch": 4.697792254795512, + "grad_norm": 0.9429134726524353, + "learning_rate": 0.0002, + "loss": 1.0514, + "step": 6490 + }, + { + "epoch": 4.705030763662686, + "grad_norm": 0.8587502837181091, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 6500 + }, + { + "epoch": 4.712269272529859, + "grad_norm": 1.0387083292007446, + "learning_rate": 0.0002, + "loss": 1.0969, + "step": 6510 + }, + { + "epoch": 4.7195077813970325, + "grad_norm": 0.7471951842308044, + "learning_rate": 0.0002, + "loss": 1.0493, + "step": 6520 + }, + { + "epoch": 4.726746290264206, + "grad_norm": 0.8800424933433533, + "learning_rate": 0.0002, + "loss": 1.2632, + "step": 6530 + }, + { + "epoch": 4.733984799131379, + "grad_norm": 0.8136811852455139, + "learning_rate": 0.0002, + "loss": 1.2126, + "step": 6540 + }, + { + "epoch": 4.741223307998553, + "grad_norm": 0.9910339713096619, + "learning_rate": 0.0002, + "loss": 1.195, + "step": 6550 + }, + { + "epoch": 4.748461816865726, + "grad_norm": 1.0679163932800293, + "learning_rate": 0.0002, + "loss": 1.1201, + "step": 6560 + }, + { + "epoch": 4.755700325732899, + "grad_norm": 0.8468248248100281, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 6570 + }, + { + "epoch": 4.762938834600073, + "grad_norm": 0.8771235942840576, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 6580 + }, + { + "epoch": 4.770177343467246, + "grad_norm": 0.7024846076965332, + "learning_rate": 0.0002, + "loss": 1.077, + "step": 6590 + }, + { + "epoch": 4.7774158523344195, + "grad_norm": 0.7836683392524719, + "learning_rate": 0.0002, + "loss": 1.0876, + "step": 6600 + }, + { + "epoch": 4.784654361201593, + "grad_norm": 0.7717288136482239, + "learning_rate": 0.0002, + "loss": 1.1006, + "step": 6610 + }, + { + "epoch": 4.791892870068766, + "grad_norm": 0.884183943271637, + "learning_rate": 0.0002, + "loss": 1.0376, + "step": 6620 + }, + { + "epoch": 4.7991313789359396, + "grad_norm": 1.383867621421814, + "learning_rate": 0.0002, + "loss": 1.1757, + "step": 6630 + }, + { + "epoch": 4.806369887803113, + "grad_norm": 0.9741523861885071, + "learning_rate": 0.0002, + "loss": 1.0861, + "step": 6640 + }, + { + "epoch": 4.813608396670286, + "grad_norm": 0.9723693132400513, + "learning_rate": 0.0002, + "loss": 1.0884, + "step": 6650 + }, + { + "epoch": 4.82084690553746, + "grad_norm": 1.8324809074401855, + "learning_rate": 0.0002, + "loss": 1.2203, + "step": 6660 + }, + { + "epoch": 4.828085414404633, + "grad_norm": 0.904909074306488, + "learning_rate": 0.0002, + "loss": 1.0292, + "step": 6670 + }, + { + "epoch": 4.835323923271806, + "grad_norm": 0.7355411648750305, + "learning_rate": 0.0002, + "loss": 1.0349, + "step": 6680 + }, + { + "epoch": 4.84256243213898, + "grad_norm": 0.8934960961341858, + "learning_rate": 0.0002, + "loss": 1.0793, + "step": 6690 + }, + { + "epoch": 4.849800941006153, + "grad_norm": 1.4596954584121704, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 6700 + }, + { + "epoch": 4.8570394498733265, + "grad_norm": 0.8310341238975525, + "learning_rate": 0.0002, + "loss": 1.1065, + "step": 6710 + }, + { + "epoch": 4.8642779587405, + "grad_norm": 0.9709894061088562, + "learning_rate": 0.0002, + "loss": 1.1089, + "step": 6720 + }, + { + "epoch": 4.871516467607673, + "grad_norm": 0.852142333984375, + "learning_rate": 0.0002, + "loss": 1.0069, + "step": 6730 + }, + { + "epoch": 4.878754976474847, + "grad_norm": 1.0643625259399414, + "learning_rate": 0.0002, + "loss": 1.0507, + "step": 6740 + }, + { + "epoch": 4.88599348534202, + "grad_norm": 0.9419508576393127, + "learning_rate": 0.0002, + "loss": 1.056, + "step": 6750 + }, + { + "epoch": 4.893231994209193, + "grad_norm": 1.1818498373031616, + "learning_rate": 0.0002, + "loss": 1.1995, + "step": 6760 + }, + { + "epoch": 4.900470503076367, + "grad_norm": 0.9369569420814514, + "learning_rate": 0.0002, + "loss": 1.0925, + "step": 6770 + }, + { + "epoch": 4.90770901194354, + "grad_norm": 0.7012579441070557, + "learning_rate": 0.0002, + "loss": 1.1648, + "step": 6780 + }, + { + "epoch": 4.914947520810713, + "grad_norm": 0.9109319448471069, + "learning_rate": 0.0002, + "loss": 1.0926, + "step": 6790 + }, + { + "epoch": 4.922186029677887, + "grad_norm": 0.8077534437179565, + "learning_rate": 0.0002, + "loss": 1.0358, + "step": 6800 + }, + { + "epoch": 4.92942453854506, + "grad_norm": 0.7571148872375488, + "learning_rate": 0.0002, + "loss": 1.2549, + "step": 6810 + }, + { + "epoch": 4.9366630474122335, + "grad_norm": 0.7325633764266968, + "learning_rate": 0.0002, + "loss": 0.9638, + "step": 6820 + }, + { + "epoch": 4.943901556279407, + "grad_norm": 0.8465084433555603, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 6830 + }, + { + "epoch": 4.95114006514658, + "grad_norm": 0.8753737807273865, + "learning_rate": 0.0002, + "loss": 1.153, + "step": 6840 + }, + { + "epoch": 4.958378574013754, + "grad_norm": 0.9421748518943787, + "learning_rate": 0.0002, + "loss": 1.0247, + "step": 6850 + }, + { + "epoch": 4.965617082880927, + "grad_norm": 0.8245896697044373, + "learning_rate": 0.0002, + "loss": 1.1483, + "step": 6860 + }, + { + "epoch": 4.9728555917481, + "grad_norm": 0.8823089599609375, + "learning_rate": 0.0002, + "loss": 0.9905, + "step": 6870 + }, + { + "epoch": 4.980094100615274, + "grad_norm": 0.8406389355659485, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 6880 + }, + { + "epoch": 4.987332609482447, + "grad_norm": 0.9732868075370789, + "learning_rate": 0.0002, + "loss": 1.0944, + "step": 6890 + }, + { + "epoch": 4.99457111834962, + "grad_norm": 2.125141143798828, + "learning_rate": 0.0002, + "loss": 1.1776, + "step": 6900 + }, + { + "epoch": 4.999638074556641, + "eval_loss": 1.445176601409912, + "eval_runtime": 27.2351, + "eval_samples_per_second": 16.009, + "eval_steps_per_second": 2.019, + "step": 6907 + } + ], + "logging_steps": 10, + "max_steps": 11048, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.442343968473088e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4d7b7431bbbe8c9bf29b925bca391a558af5ff8c --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-6907/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad613885e4f267fc04125f1a836d42cfa796bbe12e536f9ee60c955de02cdb5a +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e99bbcd43df1c19d98706c7e3be95c93844c5349 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cb32d048077f7d36cfe26b386f4c5c7c8584ba1d --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4e29c954dbe5c796580cbf124ef520cc237dd5b537fb2b50a51fcecc829d770 +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ecc859b0d2dee022050e34694df22248804ffd1 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:408bb73e1e7320415d913dbf2d032b37fee48d77d984e0a4b275d0aa8a937b1b +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..91dd0cfc4765c5fba307f74c5667ab094ea9162c --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:477d843dcd5d8c3d2382e25e5ea3a1d9e518dc0db552a7775a088d3410a82d91 +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2113435227a989284d1334d01f058630811bf130 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6faa4422eb7d977fd89c4cc4434e03971817a911178543ca18703285022fa252 +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..33541a1db43cf7eae2af7f68b07ea57eb9740b96 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/trainer_state.json @@ -0,0 +1,5877 @@ +{ + "best_metric": 1.4217946529388428, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 8289, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007238508867173362, + "grad_norm": 1.2523442506790161, + "learning_rate": 0.0002, + "loss": 4.7061, + "step": 10 + }, + { + "epoch": 0.014477017734346724, + "grad_norm": 1.8887330293655396, + "learning_rate": 0.0002, + "loss": 3.3493, + "step": 20 + }, + { + "epoch": 0.021715526601520086, + "grad_norm": 0.9668035507202148, + "learning_rate": 0.0002, + "loss": 2.7585, + "step": 30 + }, + { + "epoch": 0.028954035468693448, + "grad_norm": 2.9167306423187256, + "learning_rate": 0.0002, + "loss": 2.3699, + "step": 40 + }, + { + "epoch": 0.036192544335866814, + "grad_norm": 2.649867296218872, + "learning_rate": 0.0002, + "loss": 2.2679, + "step": 50 + }, + { + "epoch": 0.04343105320304017, + "grad_norm": 1.5120655298233032, + "learning_rate": 0.0002, + "loss": 2.2202, + "step": 60 + }, + { + "epoch": 0.05066956207021354, + "grad_norm": 0.7879868149757385, + "learning_rate": 0.0002, + "loss": 2.2026, + "step": 70 + }, + { + "epoch": 0.057908070937386896, + "grad_norm": 0.7616953253746033, + "learning_rate": 0.0002, + "loss": 1.9447, + "step": 80 + }, + { + "epoch": 0.06514657980456026, + "grad_norm": 1.8809149265289307, + "learning_rate": 0.0002, + "loss": 2.0112, + "step": 90 + }, + { + "epoch": 0.07238508867173363, + "grad_norm": 0.9294016361236572, + "learning_rate": 0.0002, + "loss": 1.8337, + "step": 100 + }, + { + "epoch": 0.07962359753890698, + "grad_norm": 0.7145281434059143, + "learning_rate": 0.0002, + "loss": 1.8419, + "step": 110 + }, + { + "epoch": 0.08686210640608034, + "grad_norm": 0.7564446330070496, + "learning_rate": 0.0002, + "loss": 2.0036, + "step": 120 + }, + { + "epoch": 0.09410061527325371, + "grad_norm": 1.1681925058364868, + "learning_rate": 0.0002, + "loss": 1.9306, + "step": 130 + }, + { + "epoch": 0.10133912414042708, + "grad_norm": 0.6708641648292542, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 140 + }, + { + "epoch": 0.10857763300760044, + "grad_norm": 0.7625647783279419, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 150 + }, + { + "epoch": 0.11581614187477379, + "grad_norm": 0.8463464975357056, + "learning_rate": 0.0002, + "loss": 1.6687, + "step": 160 + }, + { + "epoch": 0.12305465074194716, + "grad_norm": 0.7502335906028748, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 170 + }, + { + "epoch": 0.13029315960912052, + "grad_norm": 0.6929958462715149, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 180 + }, + { + "epoch": 0.1375316684762939, + "grad_norm": 0.6798707842826843, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 190 + }, + { + "epoch": 0.14477017734346725, + "grad_norm": 0.7566508650779724, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 200 + }, + { + "epoch": 0.15200868621064062, + "grad_norm": 0.7196869850158691, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 210 + }, + { + "epoch": 0.15924719507781396, + "grad_norm": 0.8401045799255371, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 220 + }, + { + "epoch": 0.16648570394498732, + "grad_norm": 0.8503773212432861, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 230 + }, + { + "epoch": 0.1737242128121607, + "grad_norm": 0.7183733582496643, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 240 + }, + { + "epoch": 0.18096272167933405, + "grad_norm": 0.7082605957984924, + "learning_rate": 0.0002, + "loss": 1.6693, + "step": 250 + }, + { + "epoch": 0.18820123054650742, + "grad_norm": 0.9386326670646667, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 260 + }, + { + "epoch": 0.19543973941368079, + "grad_norm": 0.7332451939582825, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 270 + }, + { + "epoch": 0.20267824828085415, + "grad_norm": 0.7092869877815247, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 280 + }, + { + "epoch": 0.20991675714802752, + "grad_norm": 0.7256413698196411, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 290 + }, + { + "epoch": 0.21715526601520088, + "grad_norm": 0.6398681402206421, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 300 + }, + { + "epoch": 0.22439377488237422, + "grad_norm": 0.6273287534713745, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 310 + }, + { + "epoch": 0.23163228374954759, + "grad_norm": 0.511648416519165, + "learning_rate": 0.0002, + "loss": 1.5115, + "step": 320 + }, + { + "epoch": 0.23887079261672095, + "grad_norm": 0.8677352070808411, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 330 + }, + { + "epoch": 0.24610930148389432, + "grad_norm": 0.6270743012428284, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 340 + }, + { + "epoch": 0.2533478103510677, + "grad_norm": 0.7980281114578247, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 350 + }, + { + "epoch": 0.26058631921824105, + "grad_norm": 0.632486879825592, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 360 + }, + { + "epoch": 0.2678248280854144, + "grad_norm": 0.6527034640312195, + "learning_rate": 0.0002, + "loss": 1.5175, + "step": 370 + }, + { + "epoch": 0.2750633369525878, + "grad_norm": 0.7672118544578552, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 380 + }, + { + "epoch": 0.28230184581976114, + "grad_norm": 0.6035117506980896, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 390 + }, + { + "epoch": 0.2895403546869345, + "grad_norm": 0.5955103039741516, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 400 + }, + { + "epoch": 0.2967788635541079, + "grad_norm": 0.6015191674232483, + "learning_rate": 0.0002, + "loss": 1.558, + "step": 410 + }, + { + "epoch": 0.30401737242128124, + "grad_norm": 0.6380982398986816, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 420 + }, + { + "epoch": 0.3112558812884546, + "grad_norm": 0.6707863211631775, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 430 + }, + { + "epoch": 0.3184943901556279, + "grad_norm": 0.7010176777839661, + "learning_rate": 0.0002, + "loss": 1.4426, + "step": 440 + }, + { + "epoch": 0.3257328990228013, + "grad_norm": 0.8263739943504333, + "learning_rate": 0.0002, + "loss": 1.5572, + "step": 450 + }, + { + "epoch": 0.33297140788997465, + "grad_norm": 0.7253276109695435, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 460 + }, + { + "epoch": 0.340209916757148, + "grad_norm": 0.5238934755325317, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 470 + }, + { + "epoch": 0.3474484256243214, + "grad_norm": 0.7869495749473572, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 480 + }, + { + "epoch": 0.35468693449149474, + "grad_norm": 0.7485215663909912, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 490 + }, + { + "epoch": 0.3619254433586681, + "grad_norm": 0.5413193106651306, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 500 + }, + { + "epoch": 0.3691639522258415, + "grad_norm": 0.7615048885345459, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 510 + }, + { + "epoch": 0.37640246109301484, + "grad_norm": 0.7685340046882629, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 520 + }, + { + "epoch": 0.3836409699601882, + "grad_norm": 0.6379081010818481, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 530 + }, + { + "epoch": 0.39087947882736157, + "grad_norm": 0.7946939468383789, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 540 + }, + { + "epoch": 0.39811798769453494, + "grad_norm": 0.6287278532981873, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 550 + }, + { + "epoch": 0.4053564965617083, + "grad_norm": 0.6811642646789551, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 560 + }, + { + "epoch": 0.41259500542888167, + "grad_norm": 0.671073317527771, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 570 + }, + { + "epoch": 0.41983351429605503, + "grad_norm": 0.6313900351524353, + "learning_rate": 0.0002, + "loss": 1.6753, + "step": 580 + }, + { + "epoch": 0.4270720231632284, + "grad_norm": 0.5291772484779358, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 590 + }, + { + "epoch": 0.43431053203040176, + "grad_norm": 0.62503582239151, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 600 + }, + { + "epoch": 0.4415490408975751, + "grad_norm": 0.5777305364608765, + "learning_rate": 0.0002, + "loss": 1.6276, + "step": 610 + }, + { + "epoch": 0.44878754976474844, + "grad_norm": 0.7013497352600098, + "learning_rate": 0.0002, + "loss": 1.4758, + "step": 620 + }, + { + "epoch": 0.4560260586319218, + "grad_norm": 0.8044822216033936, + "learning_rate": 0.0002, + "loss": 1.4029, + "step": 630 + }, + { + "epoch": 0.46326456749909517, + "grad_norm": 0.672531247138977, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 640 + }, + { + "epoch": 0.47050307636626854, + "grad_norm": 0.6233910322189331, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 650 + }, + { + "epoch": 0.4777415852334419, + "grad_norm": 0.651524543762207, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 660 + }, + { + "epoch": 0.48498009410061527, + "grad_norm": 0.7213939428329468, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 670 + }, + { + "epoch": 0.49221860296778863, + "grad_norm": 0.6541454792022705, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 680 + }, + { + "epoch": 0.499457111834962, + "grad_norm": 0.6568936109542847, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 690 + }, + { + "epoch": 0.5066956207021354, + "grad_norm": 0.7176415324211121, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 700 + }, + { + "epoch": 0.5139341295693087, + "grad_norm": 0.6553855538368225, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 710 + }, + { + "epoch": 0.5211726384364821, + "grad_norm": 0.5654335618019104, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 720 + }, + { + "epoch": 0.5284111473036555, + "grad_norm": 0.5671001672744751, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 730 + }, + { + "epoch": 0.5356496561708288, + "grad_norm": 0.7914412021636963, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 740 + }, + { + "epoch": 0.5428881650380022, + "grad_norm": 0.6172138452529907, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 750 + }, + { + "epoch": 0.5501266739051756, + "grad_norm": 0.6132623553276062, + "learning_rate": 0.0002, + "loss": 1.4018, + "step": 760 + }, + { + "epoch": 0.5573651827723489, + "grad_norm": 0.654000461101532, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 770 + }, + { + "epoch": 0.5646036916395223, + "grad_norm": 0.5691370964050293, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 780 + }, + { + "epoch": 0.5718422005066957, + "grad_norm": 0.7922580242156982, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 790 + }, + { + "epoch": 0.579080709373869, + "grad_norm": 0.6831880211830139, + "learning_rate": 0.0002, + "loss": 1.4521, + "step": 800 + }, + { + "epoch": 0.5863192182410424, + "grad_norm": 0.6740124821662903, + "learning_rate": 0.0002, + "loss": 1.4734, + "step": 810 + }, + { + "epoch": 0.5935577271082157, + "grad_norm": 1.380016803741455, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 820 + }, + { + "epoch": 0.6007962359753891, + "grad_norm": 0.6552878022193909, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 830 + }, + { + "epoch": 0.6080347448425625, + "grad_norm": 0.6649535298347473, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 840 + }, + { + "epoch": 0.6152732537097358, + "grad_norm": 0.561738133430481, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 850 + }, + { + "epoch": 0.6225117625769092, + "grad_norm": 0.6133047938346863, + "learning_rate": 0.0002, + "loss": 1.5364, + "step": 860 + }, + { + "epoch": 0.6297502714440825, + "grad_norm": 0.559843122959137, + "learning_rate": 0.0002, + "loss": 1.3489, + "step": 870 + }, + { + "epoch": 0.6369887803112558, + "grad_norm": 0.6117811799049377, + "learning_rate": 0.0002, + "loss": 1.4878, + "step": 880 + }, + { + "epoch": 0.6442272891784292, + "grad_norm": 0.6209776401519775, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 890 + }, + { + "epoch": 0.6514657980456026, + "grad_norm": 0.6234082579612732, + "learning_rate": 0.0002, + "loss": 1.6747, + "step": 900 + }, + { + "epoch": 0.6587043069127759, + "grad_norm": 0.7623258233070374, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 910 + }, + { + "epoch": 0.6659428157799493, + "grad_norm": 0.6148061752319336, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 920 + }, + { + "epoch": 0.6731813246471227, + "grad_norm": 0.6682973504066467, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 930 + }, + { + "epoch": 0.680419833514296, + "grad_norm": 0.5513041615486145, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 940 + }, + { + "epoch": 0.6876583423814694, + "grad_norm": 0.5197525024414062, + "learning_rate": 0.0002, + "loss": 1.3991, + "step": 950 + }, + { + "epoch": 0.6948968512486428, + "grad_norm": 0.6490758061408997, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 960 + }, + { + "epoch": 0.7021353601158161, + "grad_norm": 0.6450682878494263, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 970 + }, + { + "epoch": 0.7093738689829895, + "grad_norm": 0.6203766465187073, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 980 + }, + { + "epoch": 0.7166123778501629, + "grad_norm": 0.6023609638214111, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 990 + }, + { + "epoch": 0.7238508867173362, + "grad_norm": 0.5765255093574524, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1000 + }, + { + "epoch": 0.7310893955845096, + "grad_norm": 0.6650075316429138, + "learning_rate": 0.0002, + "loss": 1.483, + "step": 1010 + }, + { + "epoch": 0.738327904451683, + "grad_norm": 0.5610854029655457, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 1020 + }, + { + "epoch": 0.7455664133188563, + "grad_norm": 0.7072813510894775, + "learning_rate": 0.0002, + "loss": 1.5248, + "step": 1030 + }, + { + "epoch": 0.7528049221860297, + "grad_norm": 0.6815407872200012, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 1040 + }, + { + "epoch": 0.760043431053203, + "grad_norm": 0.7932390570640564, + "learning_rate": 0.0002, + "loss": 1.4577, + "step": 1050 + }, + { + "epoch": 0.7672819399203764, + "grad_norm": 0.5798183083534241, + "learning_rate": 0.0002, + "loss": 1.4515, + "step": 1060 + }, + { + "epoch": 0.7745204487875498, + "grad_norm": 0.7898504137992859, + "learning_rate": 0.0002, + "loss": 1.5053, + "step": 1070 + }, + { + "epoch": 0.7817589576547231, + "grad_norm": 0.4983280301094055, + "learning_rate": 0.0002, + "loss": 1.4776, + "step": 1080 + }, + { + "epoch": 0.7889974665218965, + "grad_norm": 0.691403329372406, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 1090 + }, + { + "epoch": 0.7962359753890699, + "grad_norm": 0.5394481420516968, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 1100 + }, + { + "epoch": 0.8034744842562432, + "grad_norm": 0.5136822462081909, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 1110 + }, + { + "epoch": 0.8107129931234166, + "grad_norm": 0.6828126907348633, + "learning_rate": 0.0002, + "loss": 1.4902, + "step": 1120 + }, + { + "epoch": 0.81795150199059, + "grad_norm": 0.6799656748771667, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 1130 + }, + { + "epoch": 0.8251900108577633, + "grad_norm": 0.5428406000137329, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 1140 + }, + { + "epoch": 0.8324285197249367, + "grad_norm": 0.4811290502548218, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1150 + }, + { + "epoch": 0.8396670285921101, + "grad_norm": 0.5519434809684753, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 1160 + }, + { + "epoch": 0.8469055374592834, + "grad_norm": 0.9748060703277588, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1170 + }, + { + "epoch": 0.8541440463264568, + "grad_norm": 0.712609589099884, + "learning_rate": 0.0002, + "loss": 1.4309, + "step": 1180 + }, + { + "epoch": 0.8613825551936302, + "grad_norm": 0.6866157054901123, + "learning_rate": 0.0002, + "loss": 1.434, + "step": 1190 + }, + { + "epoch": 0.8686210640608035, + "grad_norm": 0.5068854093551636, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1200 + }, + { + "epoch": 0.8758595729279768, + "grad_norm": 0.6333245038986206, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1210 + }, + { + "epoch": 0.8830980817951501, + "grad_norm": 0.6424421072006226, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 1220 + }, + { + "epoch": 0.8903365906623235, + "grad_norm": 0.4771921932697296, + "learning_rate": 0.0002, + "loss": 1.4186, + "step": 1230 + }, + { + "epoch": 0.8975750995294969, + "grad_norm": 0.5191764235496521, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1240 + }, + { + "epoch": 0.9048136083966702, + "grad_norm": 0.756222128868103, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 1250 + }, + { + "epoch": 0.9120521172638436, + "grad_norm": 0.623823881149292, + "learning_rate": 0.0002, + "loss": 1.4396, + "step": 1260 + }, + { + "epoch": 0.919290626131017, + "grad_norm": 0.8166571259498596, + "learning_rate": 0.0002, + "loss": 1.3097, + "step": 1270 + }, + { + "epoch": 0.9265291349981903, + "grad_norm": 0.6059346795082092, + "learning_rate": 0.0002, + "loss": 1.4625, + "step": 1280 + }, + { + "epoch": 0.9337676438653637, + "grad_norm": 0.5842690467834473, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 1290 + }, + { + "epoch": 0.9410061527325371, + "grad_norm": 0.7649800777435303, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1300 + }, + { + "epoch": 0.9482446615997104, + "grad_norm": 0.6420919895172119, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 1310 + }, + { + "epoch": 0.9554831704668838, + "grad_norm": 0.7011452913284302, + "learning_rate": 0.0002, + "loss": 1.453, + "step": 1320 + }, + { + "epoch": 0.9627216793340572, + "grad_norm": 0.5783746242523193, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1330 + }, + { + "epoch": 0.9699601882012305, + "grad_norm": 0.5973192453384399, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1340 + }, + { + "epoch": 0.9771986970684039, + "grad_norm": 0.6181833744049072, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 1350 + }, + { + "epoch": 0.9844372059355773, + "grad_norm": 0.5563396215438843, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 1360 + }, + { + "epoch": 0.9916757148027506, + "grad_norm": 0.45723360776901245, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1370 + }, + { + "epoch": 0.998914223669924, + "grad_norm": 0.5947498679161072, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 1380 + }, + { + "epoch": 0.9996380745566413, + "eval_loss": 1.480796456336975, + "eval_runtime": 27.3103, + "eval_samples_per_second": 15.965, + "eval_steps_per_second": 2.014, + "step": 1381 + }, + { + "epoch": 1.0061527325370974, + "grad_norm": 0.5599952936172485, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 1390 + }, + { + "epoch": 1.0133912414042707, + "grad_norm": 0.5932008028030396, + "learning_rate": 0.0002, + "loss": 1.4991, + "step": 1400 + }, + { + "epoch": 1.020629750271444, + "grad_norm": 0.6194121837615967, + "learning_rate": 0.0002, + "loss": 1.4506, + "step": 1410 + }, + { + "epoch": 1.0278682591386175, + "grad_norm": 0.6995621919631958, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 1420 + }, + { + "epoch": 1.0351067680057908, + "grad_norm": 0.7905810475349426, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1430 + }, + { + "epoch": 1.0423452768729642, + "grad_norm": 0.7221615314483643, + "learning_rate": 0.0002, + "loss": 1.4414, + "step": 1440 + }, + { + "epoch": 1.0495837857401376, + "grad_norm": 0.6170642375946045, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 1450 + }, + { + "epoch": 1.056822294607311, + "grad_norm": 0.5844094753265381, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 1460 + }, + { + "epoch": 1.0640608034744843, + "grad_norm": 0.7731822729110718, + "learning_rate": 0.0002, + "loss": 1.4871, + "step": 1470 + }, + { + "epoch": 1.0712993123416577, + "grad_norm": 0.4554748237133026, + "learning_rate": 0.0002, + "loss": 1.4286, + "step": 1480 + }, + { + "epoch": 1.078537821208831, + "grad_norm": 0.6923259496688843, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 1490 + }, + { + "epoch": 1.0857763300760044, + "grad_norm": 0.6008219122886658, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 1500 + }, + { + "epoch": 1.0930148389431777, + "grad_norm": 0.6450045704841614, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 1510 + }, + { + "epoch": 1.1002533478103511, + "grad_norm": 0.7833753824234009, + "learning_rate": 0.0002, + "loss": 1.3295, + "step": 1520 + }, + { + "epoch": 1.1074918566775245, + "grad_norm": 0.5076758861541748, + "learning_rate": 0.0002, + "loss": 1.3424, + "step": 1530 + }, + { + "epoch": 1.1147303655446978, + "grad_norm": 0.5661332011222839, + "learning_rate": 0.0002, + "loss": 1.4043, + "step": 1540 + }, + { + "epoch": 1.1219688744118712, + "grad_norm": 0.6526919603347778, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 1550 + }, + { + "epoch": 1.1292073832790446, + "grad_norm": 0.5613082647323608, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1560 + }, + { + "epoch": 1.136445892146218, + "grad_norm": 0.6113885641098022, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 1570 + }, + { + "epoch": 1.1436844010133913, + "grad_norm": 0.6732510328292847, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 1580 + }, + { + "epoch": 1.1509229098805647, + "grad_norm": 0.6146392226219177, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 1590 + }, + { + "epoch": 1.158161418747738, + "grad_norm": 0.6766974329948425, + "learning_rate": 0.0002, + "loss": 1.411, + "step": 1600 + }, + { + "epoch": 1.1653999276149114, + "grad_norm": 0.7621957659721375, + "learning_rate": 0.0002, + "loss": 1.2401, + "step": 1610 + }, + { + "epoch": 1.1726384364820848, + "grad_norm": 0.6959581971168518, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 1620 + }, + { + "epoch": 1.1798769453492581, + "grad_norm": 0.6691278219223022, + "learning_rate": 0.0002, + "loss": 1.382, + "step": 1630 + }, + { + "epoch": 1.1871154542164315, + "grad_norm": 0.4927774965763092, + "learning_rate": 0.0002, + "loss": 1.4147, + "step": 1640 + }, + { + "epoch": 1.1943539630836049, + "grad_norm": 0.7724234461784363, + "learning_rate": 0.0002, + "loss": 1.449, + "step": 1650 + }, + { + "epoch": 1.2015924719507782, + "grad_norm": 0.6817787885665894, + "learning_rate": 0.0002, + "loss": 1.4778, + "step": 1660 + }, + { + "epoch": 1.2088309808179516, + "grad_norm": 0.6500699520111084, + "learning_rate": 0.0002, + "loss": 1.3776, + "step": 1670 + }, + { + "epoch": 1.216069489685125, + "grad_norm": 0.5703568458557129, + "learning_rate": 0.0002, + "loss": 1.3875, + "step": 1680 + }, + { + "epoch": 1.2233079985522983, + "grad_norm": 0.6261579990386963, + "learning_rate": 0.0002, + "loss": 1.4735, + "step": 1690 + }, + { + "epoch": 1.2305465074194717, + "grad_norm": 0.651713490486145, + "learning_rate": 0.0002, + "loss": 1.3898, + "step": 1700 + }, + { + "epoch": 1.237785016286645, + "grad_norm": 0.684399425983429, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 1710 + }, + { + "epoch": 1.2450235251538184, + "grad_norm": 0.6996857523918152, + "learning_rate": 0.0002, + "loss": 1.5027, + "step": 1720 + }, + { + "epoch": 1.2522620340209918, + "grad_norm": 0.7102537751197815, + "learning_rate": 0.0002, + "loss": 1.3326, + "step": 1730 + }, + { + "epoch": 1.2595005428881652, + "grad_norm": 0.45809897780418396, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 1740 + }, + { + "epoch": 1.2667390517553385, + "grad_norm": 0.6377046704292297, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 1750 + }, + { + "epoch": 1.2739775606225119, + "grad_norm": 0.6965704560279846, + "learning_rate": 0.0002, + "loss": 1.3479, + "step": 1760 + }, + { + "epoch": 1.2812160694896852, + "grad_norm": 0.5688214302062988, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 1770 + }, + { + "epoch": 1.2884545783568586, + "grad_norm": 0.6384190320968628, + "learning_rate": 0.0002, + "loss": 1.3967, + "step": 1780 + }, + { + "epoch": 1.295693087224032, + "grad_norm": 0.5629363656044006, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1790 + }, + { + "epoch": 1.3029315960912053, + "grad_norm": 0.6148255467414856, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 1800 + }, + { + "epoch": 1.3101701049583787, + "grad_norm": 0.655580997467041, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 1810 + }, + { + "epoch": 1.3174086138255519, + "grad_norm": 0.5642657279968262, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 1820 + }, + { + "epoch": 1.3246471226927252, + "grad_norm": 0.59607994556427, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 1830 + }, + { + "epoch": 1.3318856315598986, + "grad_norm": 0.5564199090003967, + "learning_rate": 0.0002, + "loss": 1.3274, + "step": 1840 + }, + { + "epoch": 1.339124140427072, + "grad_norm": 0.6949955821037292, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1850 + }, + { + "epoch": 1.3463626492942453, + "grad_norm": 0.7036856412887573, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 1860 + }, + { + "epoch": 1.3536011581614187, + "grad_norm": 0.722062885761261, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 1870 + }, + { + "epoch": 1.360839667028592, + "grad_norm": 0.6098677515983582, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 1880 + }, + { + "epoch": 1.3680781758957654, + "grad_norm": 0.5376402735710144, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 1890 + }, + { + "epoch": 1.3753166847629388, + "grad_norm": 0.6974610090255737, + "learning_rate": 0.0002, + "loss": 1.5071, + "step": 1900 + }, + { + "epoch": 1.3825551936301121, + "grad_norm": 0.6520763635635376, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 1910 + }, + { + "epoch": 1.3897937024972855, + "grad_norm": 0.6604374647140503, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 1920 + }, + { + "epoch": 1.3970322113644589, + "grad_norm": 0.7364398241043091, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1930 + }, + { + "epoch": 1.4042707202316322, + "grad_norm": 0.6849475502967834, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 1940 + }, + { + "epoch": 1.4115092290988056, + "grad_norm": 0.6562670469284058, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 1950 + }, + { + "epoch": 1.418747737965979, + "grad_norm": 0.5695616006851196, + "learning_rate": 0.0002, + "loss": 1.4725, + "step": 1960 + }, + { + "epoch": 1.4259862468331523, + "grad_norm": 0.5244464874267578, + "learning_rate": 0.0002, + "loss": 1.3088, + "step": 1970 + }, + { + "epoch": 1.4332247557003257, + "grad_norm": 0.6347293257713318, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 1980 + }, + { + "epoch": 1.440463264567499, + "grad_norm": 0.5528361201286316, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 1990 + }, + { + "epoch": 1.4477017734346724, + "grad_norm": 0.6987585425376892, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2000 + }, + { + "epoch": 1.4549402823018458, + "grad_norm": 0.6568987369537354, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 2010 + }, + { + "epoch": 1.4621787911690192, + "grad_norm": 0.7665994763374329, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 2020 + }, + { + "epoch": 1.4694173000361925, + "grad_norm": 0.5127707123756409, + "learning_rate": 0.0002, + "loss": 1.244, + "step": 2030 + }, + { + "epoch": 1.476655808903366, + "grad_norm": 0.5406824946403503, + "learning_rate": 0.0002, + "loss": 1.3699, + "step": 2040 + }, + { + "epoch": 1.4838943177705393, + "grad_norm": 0.5990166664123535, + "learning_rate": 0.0002, + "loss": 1.3353, + "step": 2050 + }, + { + "epoch": 1.4911328266377126, + "grad_norm": 0.6186193823814392, + "learning_rate": 0.0002, + "loss": 1.2454, + "step": 2060 + }, + { + "epoch": 1.498371335504886, + "grad_norm": 0.6154307126998901, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2070 + }, + { + "epoch": 1.5056098443720594, + "grad_norm": 0.5606056451797485, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2080 + }, + { + "epoch": 1.5128483532392327, + "grad_norm": 0.5006417036056519, + "learning_rate": 0.0002, + "loss": 1.2405, + "step": 2090 + }, + { + "epoch": 1.520086862106406, + "grad_norm": 0.5968486070632935, + "learning_rate": 0.0002, + "loss": 1.4258, + "step": 2100 + }, + { + "epoch": 1.5273253709735795, + "grad_norm": 0.5835496187210083, + "learning_rate": 0.0002, + "loss": 1.2752, + "step": 2110 + }, + { + "epoch": 1.5345638798407528, + "grad_norm": 0.6753535270690918, + "learning_rate": 0.0002, + "loss": 1.5443, + "step": 2120 + }, + { + "epoch": 1.5418023887079262, + "grad_norm": 0.7299720644950867, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 2130 + }, + { + "epoch": 1.5490408975750996, + "grad_norm": 0.5105988383293152, + "learning_rate": 0.0002, + "loss": 1.2364, + "step": 2140 + }, + { + "epoch": 1.556279406442273, + "grad_norm": 0.5675431489944458, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2150 + }, + { + "epoch": 1.5635179153094463, + "grad_norm": 0.6246723532676697, + "learning_rate": 0.0002, + "loss": 1.4563, + "step": 2160 + }, + { + "epoch": 1.5707564241766196, + "grad_norm": 0.7291720509529114, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 2170 + }, + { + "epoch": 1.577994933043793, + "grad_norm": 0.678114116191864, + "learning_rate": 0.0002, + "loss": 1.5432, + "step": 2180 + }, + { + "epoch": 1.5852334419109664, + "grad_norm": 0.5136260986328125, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 2190 + }, + { + "epoch": 1.5924719507781397, + "grad_norm": 0.6359935998916626, + "learning_rate": 0.0002, + "loss": 1.3271, + "step": 2200 + }, + { + "epoch": 1.599710459645313, + "grad_norm": 0.7650278806686401, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 2210 + }, + { + "epoch": 1.6069489685124865, + "grad_norm": 0.7256110906600952, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 2220 + }, + { + "epoch": 1.6141874773796598, + "grad_norm": 0.688689649105072, + "learning_rate": 0.0002, + "loss": 1.4387, + "step": 2230 + }, + { + "epoch": 1.6214259862468332, + "grad_norm": 0.6045311093330383, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 2240 + }, + { + "epoch": 1.6286644951140063, + "grad_norm": 0.7064604163169861, + "learning_rate": 0.0002, + "loss": 1.4097, + "step": 2250 + }, + { + "epoch": 1.6359030039811797, + "grad_norm": 0.5309562087059021, + "learning_rate": 0.0002, + "loss": 1.3477, + "step": 2260 + }, + { + "epoch": 1.643141512848353, + "grad_norm": 0.5687053203582764, + "learning_rate": 0.0002, + "loss": 1.4022, + "step": 2270 + }, + { + "epoch": 1.6503800217155264, + "grad_norm": 0.535872757434845, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 2280 + }, + { + "epoch": 1.6576185305826998, + "grad_norm": 0.5502381920814514, + "learning_rate": 0.0002, + "loss": 1.3844, + "step": 2290 + }, + { + "epoch": 1.6648570394498732, + "grad_norm": 0.6158602237701416, + "learning_rate": 0.0002, + "loss": 1.3764, + "step": 2300 + }, + { + "epoch": 1.6720955483170465, + "grad_norm": 0.5804675817489624, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 2310 + }, + { + "epoch": 1.67933405718422, + "grad_norm": 0.600742757320404, + "learning_rate": 0.0002, + "loss": 1.2532, + "step": 2320 + }, + { + "epoch": 1.6865725660513933, + "grad_norm": 0.7101941108703613, + "learning_rate": 0.0002, + "loss": 1.477, + "step": 2330 + }, + { + "epoch": 1.6938110749185666, + "grad_norm": 0.7507809996604919, + "learning_rate": 0.0002, + "loss": 1.4849, + "step": 2340 + }, + { + "epoch": 1.70104958378574, + "grad_norm": 0.768502414226532, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 2350 + }, + { + "epoch": 1.7082880926529134, + "grad_norm": 0.4801851212978363, + "learning_rate": 0.0002, + "loss": 1.3332, + "step": 2360 + }, + { + "epoch": 1.7155266015200867, + "grad_norm": 0.5322122573852539, + "learning_rate": 0.0002, + "loss": 1.4158, + "step": 2370 + }, + { + "epoch": 1.72276511038726, + "grad_norm": 0.587661862373352, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2380 + }, + { + "epoch": 1.7300036192544335, + "grad_norm": 0.6073525547981262, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 2390 + }, + { + "epoch": 1.7372421281216068, + "grad_norm": 0.6950460076332092, + "learning_rate": 0.0002, + "loss": 1.2754, + "step": 2400 + }, + { + "epoch": 1.7444806369887802, + "grad_norm": 0.5981102585792542, + "learning_rate": 0.0002, + "loss": 1.3858, + "step": 2410 + }, + { + "epoch": 1.7517191458559536, + "grad_norm": 0.544570803642273, + "learning_rate": 0.0002, + "loss": 1.4075, + "step": 2420 + }, + { + "epoch": 1.758957654723127, + "grad_norm": 0.5304399728775024, + "learning_rate": 0.0002, + "loss": 1.3861, + "step": 2430 + }, + { + "epoch": 1.7661961635903003, + "grad_norm": 0.7921594977378845, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 2440 + }, + { + "epoch": 1.7734346724574737, + "grad_norm": 0.6084808707237244, + "learning_rate": 0.0002, + "loss": 1.3053, + "step": 2450 + }, + { + "epoch": 1.780673181324647, + "grad_norm": 0.8844701051712036, + "learning_rate": 0.0002, + "loss": 1.3781, + "step": 2460 + }, + { + "epoch": 1.7879116901918204, + "grad_norm": 0.5729258060455322, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 2470 + }, + { + "epoch": 1.7951501990589938, + "grad_norm": 0.6303611993789673, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 2480 + }, + { + "epoch": 1.8023887079261671, + "grad_norm": 0.5627942085266113, + "learning_rate": 0.0002, + "loss": 1.3926, + "step": 2490 + }, + { + "epoch": 1.8096272167933405, + "grad_norm": 0.6724274158477783, + "learning_rate": 0.0002, + "loss": 1.3816, + "step": 2500 + }, + { + "epoch": 1.8168657256605139, + "grad_norm": 0.5030826330184937, + "learning_rate": 0.0002, + "loss": 1.2951, + "step": 2510 + }, + { + "epoch": 1.8241042345276872, + "grad_norm": 0.5504099130630493, + "learning_rate": 0.0002, + "loss": 1.2839, + "step": 2520 + }, + { + "epoch": 1.8313427433948606, + "grad_norm": 0.6338945627212524, + "learning_rate": 0.0002, + "loss": 1.4264, + "step": 2530 + }, + { + "epoch": 1.838581252262034, + "grad_norm": 0.5902037620544434, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 2540 + }, + { + "epoch": 1.8458197611292073, + "grad_norm": 0.48814457654953003, + "learning_rate": 0.0002, + "loss": 1.2961, + "step": 2550 + }, + { + "epoch": 1.8530582699963807, + "grad_norm": 0.6216312646865845, + "learning_rate": 0.0002, + "loss": 1.466, + "step": 2560 + }, + { + "epoch": 1.860296778863554, + "grad_norm": 0.635603666305542, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 2570 + }, + { + "epoch": 1.8675352877307274, + "grad_norm": 0.6938216090202332, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2580 + }, + { + "epoch": 1.8747737965979008, + "grad_norm": 0.599557638168335, + "learning_rate": 0.0002, + "loss": 1.5011, + "step": 2590 + }, + { + "epoch": 1.8820123054650741, + "grad_norm": 0.564424455165863, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 2600 + }, + { + "epoch": 1.8892508143322475, + "grad_norm": 0.5430700182914734, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 2610 + }, + { + "epoch": 1.8964893231994209, + "grad_norm": 0.6150169372558594, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 2620 + }, + { + "epoch": 1.9037278320665942, + "grad_norm": 0.48159119486808777, + "learning_rate": 0.0002, + "loss": 1.2474, + "step": 2630 + }, + { + "epoch": 1.9109663409337676, + "grad_norm": 0.5608997941017151, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 2640 + }, + { + "epoch": 1.918204849800941, + "grad_norm": 0.6454501748085022, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 2650 + }, + { + "epoch": 1.9254433586681143, + "grad_norm": 0.5458073616027832, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 2660 + }, + { + "epoch": 1.9326818675352877, + "grad_norm": 0.5328490734100342, + "learning_rate": 0.0002, + "loss": 1.3208, + "step": 2670 + }, + { + "epoch": 1.939920376402461, + "grad_norm": 0.6444696187973022, + "learning_rate": 0.0002, + "loss": 1.4971, + "step": 2680 + }, + { + "epoch": 1.9471588852696344, + "grad_norm": 0.7126023769378662, + "learning_rate": 0.0002, + "loss": 1.5387, + "step": 2690 + }, + { + "epoch": 1.9543973941368078, + "grad_norm": 0.5164045095443726, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 2700 + }, + { + "epoch": 1.9616359030039812, + "grad_norm": 0.5347061157226562, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 2710 + }, + { + "epoch": 1.9688744118711545, + "grad_norm": 0.5297950506210327, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 2720 + }, + { + "epoch": 1.976112920738328, + "grad_norm": 0.6537790298461914, + "learning_rate": 0.0002, + "loss": 1.3566, + "step": 2730 + }, + { + "epoch": 1.9833514296055013, + "grad_norm": 0.5536222457885742, + "learning_rate": 0.0002, + "loss": 1.332, + "step": 2740 + }, + { + "epoch": 1.9905899384726746, + "grad_norm": 0.4856105446815491, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 2750 + }, + { + "epoch": 1.997828447339848, + "grad_norm": 0.6642730832099915, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 2760 + }, + { + "epoch": 2.0, + "eval_loss": 1.4366681575775146, + "eval_runtime": 27.3729, + "eval_samples_per_second": 15.928, + "eval_steps_per_second": 2.009, + "step": 2763 + }, + { + "epoch": 2.0050669562070214, + "grad_norm": 0.740253210067749, + "learning_rate": 0.0002, + "loss": 1.4322, + "step": 2770 + }, + { + "epoch": 2.0123054650741947, + "grad_norm": 0.5826276540756226, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 2780 + }, + { + "epoch": 2.019543973941368, + "grad_norm": 0.607356071472168, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 2790 + }, + { + "epoch": 2.0267824828085415, + "grad_norm": 0.5918063521385193, + "learning_rate": 0.0002, + "loss": 1.2601, + "step": 2800 + }, + { + "epoch": 2.034020991675715, + "grad_norm": 0.5610089898109436, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 2810 + }, + { + "epoch": 2.041259500542888, + "grad_norm": 0.5869926810264587, + "learning_rate": 0.0002, + "loss": 1.2092, + "step": 2820 + }, + { + "epoch": 2.0484980094100615, + "grad_norm": 0.5753467679023743, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 2830 + }, + { + "epoch": 2.055736518277235, + "grad_norm": 0.7096508145332336, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 2840 + }, + { + "epoch": 2.0629750271444083, + "grad_norm": 0.7653635144233704, + "learning_rate": 0.0002, + "loss": 1.1766, + "step": 2850 + }, + { + "epoch": 2.0702135360115816, + "grad_norm": 0.6202841997146606, + "learning_rate": 0.0002, + "loss": 1.2331, + "step": 2860 + }, + { + "epoch": 2.077452044878755, + "grad_norm": 0.6810227632522583, + "learning_rate": 0.0002, + "loss": 1.3298, + "step": 2870 + }, + { + "epoch": 2.0846905537459284, + "grad_norm": 0.7481493353843689, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 2880 + }, + { + "epoch": 2.0919290626131017, + "grad_norm": 0.7089637517929077, + "learning_rate": 0.0002, + "loss": 1.2484, + "step": 2890 + }, + { + "epoch": 2.099167571480275, + "grad_norm": 0.7472923398017883, + "learning_rate": 0.0002, + "loss": 1.3095, + "step": 2900 + }, + { + "epoch": 2.1064060803474485, + "grad_norm": 0.8135465979576111, + "learning_rate": 0.0002, + "loss": 1.304, + "step": 2910 + }, + { + "epoch": 2.113644589214622, + "grad_norm": 0.6097133159637451, + "learning_rate": 0.0002, + "loss": 1.273, + "step": 2920 + }, + { + "epoch": 2.120883098081795, + "grad_norm": 0.5970117449760437, + "learning_rate": 0.0002, + "loss": 1.3384, + "step": 2930 + }, + { + "epoch": 2.1281216069489686, + "grad_norm": 0.6169309616088867, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 2940 + }, + { + "epoch": 2.135360115816142, + "grad_norm": 0.9428738355636597, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 2950 + }, + { + "epoch": 2.1425986246833153, + "grad_norm": 0.5671679973602295, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 2960 + }, + { + "epoch": 2.1498371335504887, + "grad_norm": 0.7007262110710144, + "learning_rate": 0.0002, + "loss": 1.1375, + "step": 2970 + }, + { + "epoch": 2.157075642417662, + "grad_norm": 0.6294044256210327, + "learning_rate": 0.0002, + "loss": 1.2015, + "step": 2980 + }, + { + "epoch": 2.1643141512848354, + "grad_norm": 0.6105241775512695, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 2990 + }, + { + "epoch": 2.1715526601520088, + "grad_norm": 0.557124137878418, + "learning_rate": 0.0002, + "loss": 1.2065, + "step": 3000 + }, + { + "epoch": 2.178791169019182, + "grad_norm": 0.6250392198562622, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 3010 + }, + { + "epoch": 2.1860296778863555, + "grad_norm": 0.645218551158905, + "learning_rate": 0.0002, + "loss": 1.385, + "step": 3020 + }, + { + "epoch": 2.193268186753529, + "grad_norm": 0.9033605456352234, + "learning_rate": 0.0002, + "loss": 1.3928, + "step": 3030 + }, + { + "epoch": 2.2005066956207022, + "grad_norm": 0.5325747132301331, + "learning_rate": 0.0002, + "loss": 1.2458, + "step": 3040 + }, + { + "epoch": 2.2077452044878756, + "grad_norm": 0.6334700584411621, + "learning_rate": 0.0002, + "loss": 1.261, + "step": 3050 + }, + { + "epoch": 2.214983713355049, + "grad_norm": 0.5206325054168701, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 3060 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5987200140953064, + "learning_rate": 0.0002, + "loss": 1.3103, + "step": 3070 + }, + { + "epoch": 2.2294607310893957, + "grad_norm": 0.5893264412879944, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 3080 + }, + { + "epoch": 2.236699239956569, + "grad_norm": 0.6869237422943115, + "learning_rate": 0.0002, + "loss": 1.235, + "step": 3090 + }, + { + "epoch": 2.2439377488237424, + "grad_norm": 0.5040048360824585, + "learning_rate": 0.0002, + "loss": 1.3285, + "step": 3100 + }, + { + "epoch": 2.251176257690916, + "grad_norm": 0.6660613417625427, + "learning_rate": 0.0002, + "loss": 1.3316, + "step": 3110 + }, + { + "epoch": 2.258414766558089, + "grad_norm": 0.5890918970108032, + "learning_rate": 0.0002, + "loss": 1.3108, + "step": 3120 + }, + { + "epoch": 2.2656532754252625, + "grad_norm": 0.6458896994590759, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 3130 + }, + { + "epoch": 2.272891784292436, + "grad_norm": 0.6832690834999084, + "learning_rate": 0.0002, + "loss": 1.4151, + "step": 3140 + }, + { + "epoch": 2.2801302931596092, + "grad_norm": 0.833908200263977, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 3150 + }, + { + "epoch": 2.2873688020267826, + "grad_norm": 0.4596034586429596, + "learning_rate": 0.0002, + "loss": 1.2931, + "step": 3160 + }, + { + "epoch": 2.294607310893956, + "grad_norm": 0.9130966067314148, + "learning_rate": 0.0002, + "loss": 1.449, + "step": 3170 + }, + { + "epoch": 2.3018458197611293, + "grad_norm": 0.7143292427062988, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 3180 + }, + { + "epoch": 2.3090843286283027, + "grad_norm": 0.5388900637626648, + "learning_rate": 0.0002, + "loss": 1.2692, + "step": 3190 + }, + { + "epoch": 2.316322837495476, + "grad_norm": 0.5607513189315796, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 3200 + }, + { + "epoch": 2.3235613463626494, + "grad_norm": 0.6795142292976379, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 3210 + }, + { + "epoch": 2.330799855229823, + "grad_norm": 0.6561070680618286, + "learning_rate": 0.0002, + "loss": 1.3042, + "step": 3220 + }, + { + "epoch": 2.338038364096996, + "grad_norm": 0.8858118057250977, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 3230 + }, + { + "epoch": 2.3452768729641695, + "grad_norm": 0.6604151725769043, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3240 + }, + { + "epoch": 2.352515381831343, + "grad_norm": 0.6755785346031189, + "learning_rate": 0.0002, + "loss": 1.4004, + "step": 3250 + }, + { + "epoch": 2.3597538906985163, + "grad_norm": 0.6981677412986755, + "learning_rate": 0.0002, + "loss": 1.2503, + "step": 3260 + }, + { + "epoch": 2.3669923995656896, + "grad_norm": 0.6338568329811096, + "learning_rate": 0.0002, + "loss": 1.3078, + "step": 3270 + }, + { + "epoch": 2.374230908432863, + "grad_norm": 0.5754265785217285, + "learning_rate": 0.0002, + "loss": 1.285, + "step": 3280 + }, + { + "epoch": 2.3814694173000364, + "grad_norm": 0.7533153295516968, + "learning_rate": 0.0002, + "loss": 1.2924, + "step": 3290 + }, + { + "epoch": 2.3887079261672097, + "grad_norm": 0.675065279006958, + "learning_rate": 0.0002, + "loss": 1.3711, + "step": 3300 + }, + { + "epoch": 2.395946435034383, + "grad_norm": 0.5686452984809875, + "learning_rate": 0.0002, + "loss": 1.3548, + "step": 3310 + }, + { + "epoch": 2.4031849439015565, + "grad_norm": 0.8129481673240662, + "learning_rate": 0.0002, + "loss": 1.1998, + "step": 3320 + }, + { + "epoch": 2.41042345276873, + "grad_norm": 0.6615934371948242, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 3330 + }, + { + "epoch": 2.417661961635903, + "grad_norm": 0.6678834557533264, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 3340 + }, + { + "epoch": 2.4249004705030766, + "grad_norm": 0.5581308007240295, + "learning_rate": 0.0002, + "loss": 1.2381, + "step": 3350 + }, + { + "epoch": 2.43213897937025, + "grad_norm": 0.6098920106887817, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 3360 + }, + { + "epoch": 2.4393774882374233, + "grad_norm": 0.8101736903190613, + "learning_rate": 0.0002, + "loss": 1.3692, + "step": 3370 + }, + { + "epoch": 2.4466159971045967, + "grad_norm": 0.6621488928794861, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 3380 + }, + { + "epoch": 2.45385450597177, + "grad_norm": 0.8693289160728455, + "learning_rate": 0.0002, + "loss": 1.4579, + "step": 3390 + }, + { + "epoch": 2.4610930148389434, + "grad_norm": 0.6724580526351929, + "learning_rate": 0.0002, + "loss": 1.3644, + "step": 3400 + }, + { + "epoch": 2.4683315237061167, + "grad_norm": 0.6776891946792603, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 3410 + }, + { + "epoch": 2.47557003257329, + "grad_norm": 0.7214453816413879, + "learning_rate": 0.0002, + "loss": 1.2937, + "step": 3420 + }, + { + "epoch": 2.4828085414404635, + "grad_norm": 0.8390451073646545, + "learning_rate": 0.0002, + "loss": 1.4051, + "step": 3430 + }, + { + "epoch": 2.490047050307637, + "grad_norm": 0.7130982279777527, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 3440 + }, + { + "epoch": 2.49728555917481, + "grad_norm": 0.8873937129974365, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 3450 + }, + { + "epoch": 2.5045240680419836, + "grad_norm": 0.725185751914978, + "learning_rate": 0.0002, + "loss": 1.1429, + "step": 3460 + }, + { + "epoch": 2.511762576909157, + "grad_norm": 0.6120352149009705, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 3470 + }, + { + "epoch": 2.5190010857763303, + "grad_norm": 0.7713613510131836, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 3480 + }, + { + "epoch": 2.5262395946435037, + "grad_norm": 0.895309567451477, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 3490 + }, + { + "epoch": 2.533478103510677, + "grad_norm": 0.9631021022796631, + "learning_rate": 0.0002, + "loss": 1.3043, + "step": 3500 + }, + { + "epoch": 2.5407166123778504, + "grad_norm": 0.7475683093070984, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 3510 + }, + { + "epoch": 2.5479551212450238, + "grad_norm": 0.7271341681480408, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 3520 + }, + { + "epoch": 2.555193630112197, + "grad_norm": 0.6979510188102722, + "learning_rate": 0.0002, + "loss": 1.304, + "step": 3530 + }, + { + "epoch": 2.5624321389793705, + "grad_norm": 0.6504196524620056, + "learning_rate": 0.0002, + "loss": 1.2353, + "step": 3540 + }, + { + "epoch": 2.569670647846544, + "grad_norm": 0.7226675748825073, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 3550 + }, + { + "epoch": 2.5769091567137172, + "grad_norm": 0.6143222451210022, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 3560 + }, + { + "epoch": 2.5841476655808906, + "grad_norm": 0.7245154976844788, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 3570 + }, + { + "epoch": 2.591386174448064, + "grad_norm": 0.943540632724762, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 3580 + }, + { + "epoch": 2.5986246833152373, + "grad_norm": 0.7707241773605347, + "learning_rate": 0.0002, + "loss": 1.3034, + "step": 3590 + }, + { + "epoch": 2.6058631921824107, + "grad_norm": 0.6705001592636108, + "learning_rate": 0.0002, + "loss": 1.3063, + "step": 3600 + }, + { + "epoch": 2.613101701049584, + "grad_norm": 0.6360933780670166, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 3610 + }, + { + "epoch": 2.6203402099167574, + "grad_norm": 0.5846424698829651, + "learning_rate": 0.0002, + "loss": 1.1844, + "step": 3620 + }, + { + "epoch": 2.6275787187839303, + "grad_norm": 0.5958625674247742, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3630 + }, + { + "epoch": 2.6348172276511037, + "grad_norm": 0.6819243431091309, + "learning_rate": 0.0002, + "loss": 1.3599, + "step": 3640 + }, + { + "epoch": 2.642055736518277, + "grad_norm": 0.7033445835113525, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 3650 + }, + { + "epoch": 2.6492942453854504, + "grad_norm": 0.6134849786758423, + "learning_rate": 0.0002, + "loss": 1.3392, + "step": 3660 + }, + { + "epoch": 2.656532754252624, + "grad_norm": 0.658009946346283, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 3670 + }, + { + "epoch": 2.663771263119797, + "grad_norm": 0.6280999779701233, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 3680 + }, + { + "epoch": 2.6710097719869705, + "grad_norm": 0.5536085963249207, + "learning_rate": 0.0002, + "loss": 1.2995, + "step": 3690 + }, + { + "epoch": 2.678248280854144, + "grad_norm": 0.8603981733322144, + "learning_rate": 0.0002, + "loss": 1.2044, + "step": 3700 + }, + { + "epoch": 2.6854867897213173, + "grad_norm": 0.5509994626045227, + "learning_rate": 0.0002, + "loss": 1.3879, + "step": 3710 + }, + { + "epoch": 2.6927252985884906, + "grad_norm": 0.9093621969223022, + "learning_rate": 0.0002, + "loss": 1.3253, + "step": 3720 + }, + { + "epoch": 2.699963807455664, + "grad_norm": 0.7525952458381653, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 3730 + }, + { + "epoch": 2.7072023163228374, + "grad_norm": 0.6737023591995239, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 3740 + }, + { + "epoch": 2.7144408251900107, + "grad_norm": 0.8656924962997437, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 3750 + }, + { + "epoch": 2.721679334057184, + "grad_norm": 0.7494133114814758, + "learning_rate": 0.0002, + "loss": 1.2342, + "step": 3760 + }, + { + "epoch": 2.7289178429243575, + "grad_norm": 0.5725520849227905, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 3770 + }, + { + "epoch": 2.736156351791531, + "grad_norm": 0.836412787437439, + "learning_rate": 0.0002, + "loss": 1.28, + "step": 3780 + }, + { + "epoch": 2.743394860658704, + "grad_norm": 0.6893242597579956, + "learning_rate": 0.0002, + "loss": 1.3784, + "step": 3790 + }, + { + "epoch": 2.7506333695258776, + "grad_norm": 0.6696223020553589, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 3800 + }, + { + "epoch": 2.757871878393051, + "grad_norm": 0.6483015418052673, + "learning_rate": 0.0002, + "loss": 1.2449, + "step": 3810 + }, + { + "epoch": 2.7651103872602243, + "grad_norm": 0.8084456920623779, + "learning_rate": 0.0002, + "loss": 1.3282, + "step": 3820 + }, + { + "epoch": 2.7723488961273977, + "grad_norm": 0.6601949334144592, + "learning_rate": 0.0002, + "loss": 1.3694, + "step": 3830 + }, + { + "epoch": 2.779587404994571, + "grad_norm": 0.6905533671379089, + "learning_rate": 0.0002, + "loss": 1.3568, + "step": 3840 + }, + { + "epoch": 2.7868259138617444, + "grad_norm": 0.619318425655365, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 3850 + }, + { + "epoch": 2.7940644227289178, + "grad_norm": 0.5994023084640503, + "learning_rate": 0.0002, + "loss": 1.2551, + "step": 3860 + }, + { + "epoch": 2.801302931596091, + "grad_norm": 0.5627168416976929, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 3870 + }, + { + "epoch": 2.8085414404632645, + "grad_norm": 0.6001605987548828, + "learning_rate": 0.0002, + "loss": 1.3921, + "step": 3880 + }, + { + "epoch": 2.815779949330438, + "grad_norm": 0.6022412776947021, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 3890 + }, + { + "epoch": 2.823018458197611, + "grad_norm": 0.6832426190376282, + "learning_rate": 0.0002, + "loss": 1.2765, + "step": 3900 + }, + { + "epoch": 2.8302569670647846, + "grad_norm": 0.5936811566352844, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 3910 + }, + { + "epoch": 2.837495475931958, + "grad_norm": 0.6960572600364685, + "learning_rate": 0.0002, + "loss": 1.1707, + "step": 3920 + }, + { + "epoch": 2.8447339847991313, + "grad_norm": 0.5913406610488892, + "learning_rate": 0.0002, + "loss": 1.4063, + "step": 3930 + }, + { + "epoch": 2.8519724936663047, + "grad_norm": 0.678154706954956, + "learning_rate": 0.0002, + "loss": 1.3245, + "step": 3940 + }, + { + "epoch": 2.859211002533478, + "grad_norm": 0.7898936867713928, + "learning_rate": 0.0002, + "loss": 1.366, + "step": 3950 + }, + { + "epoch": 2.8664495114006514, + "grad_norm": 0.9234195351600647, + "learning_rate": 0.0002, + "loss": 1.3948, + "step": 3960 + }, + { + "epoch": 2.8736880202678248, + "grad_norm": 0.5960825085639954, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 3970 + }, + { + "epoch": 2.880926529134998, + "grad_norm": 0.677118182182312, + "learning_rate": 0.0002, + "loss": 1.3127, + "step": 3980 + }, + { + "epoch": 2.8881650380021715, + "grad_norm": 0.6505142450332642, + "learning_rate": 0.0002, + "loss": 1.2652, + "step": 3990 + }, + { + "epoch": 2.895403546869345, + "grad_norm": 0.550826907157898, + "learning_rate": 0.0002, + "loss": 1.2078, + "step": 4000 + }, + { + "epoch": 2.9026420557365182, + "grad_norm": 0.6209215521812439, + "learning_rate": 0.0002, + "loss": 1.1811, + "step": 4010 + }, + { + "epoch": 2.9098805646036916, + "grad_norm": 0.6549018025398254, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 4020 + }, + { + "epoch": 2.917119073470865, + "grad_norm": 0.570682168006897, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 4030 + }, + { + "epoch": 2.9243575823380383, + "grad_norm": 1.1807632446289062, + "learning_rate": 0.0002, + "loss": 1.0832, + "step": 4040 + }, + { + "epoch": 2.9315960912052117, + "grad_norm": 0.7058857679367065, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 4050 + }, + { + "epoch": 2.938834600072385, + "grad_norm": 0.5542812943458557, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 4060 + }, + { + "epoch": 2.9460731089395584, + "grad_norm": 0.63167804479599, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 4070 + }, + { + "epoch": 2.953311617806732, + "grad_norm": 0.5702962279319763, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 4080 + }, + { + "epoch": 2.960550126673905, + "grad_norm": 0.620944082736969, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 4090 + }, + { + "epoch": 2.9677886355410785, + "grad_norm": 0.5866289734840393, + "learning_rate": 0.0002, + "loss": 1.2864, + "step": 4100 + }, + { + "epoch": 2.975027144408252, + "grad_norm": 0.560170590877533, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 4110 + }, + { + "epoch": 2.9822656532754253, + "grad_norm": 0.675082802772522, + "learning_rate": 0.0002, + "loss": 1.2071, + "step": 4120 + }, + { + "epoch": 2.9895041621425986, + "grad_norm": 0.62708580493927, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 4130 + }, + { + "epoch": 2.996742671009772, + "grad_norm": 0.7893929481506348, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 4140 + }, + { + "epoch": 2.9996380745566413, + "eval_loss": 1.4217946529388428, + "eval_runtime": 27.1596, + "eval_samples_per_second": 16.053, + "eval_steps_per_second": 2.025, + "step": 4144 + }, + { + "epoch": 3.0039811798769454, + "grad_norm": 0.7043836116790771, + "learning_rate": 0.0002, + "loss": 1.2152, + "step": 4150 + }, + { + "epoch": 3.0112196887441187, + "grad_norm": 0.6806283593177795, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 4160 + }, + { + "epoch": 3.018458197611292, + "grad_norm": 0.7684550285339355, + "learning_rate": 0.0002, + "loss": 1.292, + "step": 4170 + }, + { + "epoch": 3.0256967064784654, + "grad_norm": 0.7895237803459167, + "learning_rate": 0.0002, + "loss": 1.3467, + "step": 4180 + }, + { + "epoch": 3.032935215345639, + "grad_norm": 0.7464531064033508, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 4190 + }, + { + "epoch": 3.040173724212812, + "grad_norm": 0.9358500838279724, + "learning_rate": 0.0002, + "loss": 1.1614, + "step": 4200 + }, + { + "epoch": 3.0474122330799855, + "grad_norm": 1.1066628694534302, + "learning_rate": 0.0002, + "loss": 1.1834, + "step": 4210 + }, + { + "epoch": 3.054650741947159, + "grad_norm": 0.6663267612457275, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 4220 + }, + { + "epoch": 3.0618892508143323, + "grad_norm": 0.6669464707374573, + "learning_rate": 0.0002, + "loss": 1.1707, + "step": 4230 + }, + { + "epoch": 3.0691277596815056, + "grad_norm": 0.7052164077758789, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 4240 + }, + { + "epoch": 3.076366268548679, + "grad_norm": 0.6118432879447937, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 4250 + }, + { + "epoch": 3.0836047774158524, + "grad_norm": 0.6915903687477112, + "learning_rate": 0.0002, + "loss": 1.1526, + "step": 4260 + }, + { + "epoch": 3.0908432862830257, + "grad_norm": 0.7441644668579102, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 4270 + }, + { + "epoch": 3.098081795150199, + "grad_norm": 0.823850691318512, + "learning_rate": 0.0002, + "loss": 1.1672, + "step": 4280 + }, + { + "epoch": 3.1053203040173725, + "grad_norm": 0.9677883386611938, + "learning_rate": 0.0002, + "loss": 1.2655, + "step": 4290 + }, + { + "epoch": 3.112558812884546, + "grad_norm": 0.7002579569816589, + "learning_rate": 0.0002, + "loss": 1.1794, + "step": 4300 + }, + { + "epoch": 3.119797321751719, + "grad_norm": 0.778789758682251, + "learning_rate": 0.0002, + "loss": 1.135, + "step": 4310 + }, + { + "epoch": 3.1270358306188926, + "grad_norm": 0.7236007452011108, + "learning_rate": 0.0002, + "loss": 1.0818, + "step": 4320 + }, + { + "epoch": 3.134274339486066, + "grad_norm": 0.8809133768081665, + "learning_rate": 0.0002, + "loss": 1.1803, + "step": 4330 + }, + { + "epoch": 3.1415128483532393, + "grad_norm": 0.7924913167953491, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 4340 + }, + { + "epoch": 3.1487513572204127, + "grad_norm": 0.7437422275543213, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 4350 + }, + { + "epoch": 3.155989866087586, + "grad_norm": 0.6428450345993042, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 4360 + }, + { + "epoch": 3.1632283749547594, + "grad_norm": 0.7922873497009277, + "learning_rate": 0.0002, + "loss": 1.3032, + "step": 4370 + }, + { + "epoch": 3.1704668838219328, + "grad_norm": 0.5252506732940674, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 4380 + }, + { + "epoch": 3.177705392689106, + "grad_norm": 0.8570457696914673, + "learning_rate": 0.0002, + "loss": 1.1297, + "step": 4390 + }, + { + "epoch": 3.1849439015562795, + "grad_norm": 0.7218987345695496, + "learning_rate": 0.0002, + "loss": 1.0994, + "step": 4400 + }, + { + "epoch": 3.192182410423453, + "grad_norm": 0.6921393275260925, + "learning_rate": 0.0002, + "loss": 1.2891, + "step": 4410 + }, + { + "epoch": 3.199420919290626, + "grad_norm": 0.7386137843132019, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 4420 + }, + { + "epoch": 3.2066594281577996, + "grad_norm": 0.6227759122848511, + "learning_rate": 0.0002, + "loss": 1.1654, + "step": 4430 + }, + { + "epoch": 3.213897937024973, + "grad_norm": 0.7180278897285461, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 4440 + }, + { + "epoch": 3.2211364458921463, + "grad_norm": 0.745830774307251, + "learning_rate": 0.0002, + "loss": 1.1757, + "step": 4450 + }, + { + "epoch": 3.2283749547593197, + "grad_norm": 0.6766072511672974, + "learning_rate": 0.0002, + "loss": 1.234, + "step": 4460 + }, + { + "epoch": 3.235613463626493, + "grad_norm": 0.8325067162513733, + "learning_rate": 0.0002, + "loss": 1.1999, + "step": 4470 + }, + { + "epoch": 3.2428519724936664, + "grad_norm": 0.7148305177688599, + "learning_rate": 0.0002, + "loss": 1.1606, + "step": 4480 + }, + { + "epoch": 3.25009048136084, + "grad_norm": 0.7752676010131836, + "learning_rate": 0.0002, + "loss": 1.1383, + "step": 4490 + }, + { + "epoch": 3.257328990228013, + "grad_norm": 0.6776860952377319, + "learning_rate": 0.0002, + "loss": 1.3006, + "step": 4500 + }, + { + "epoch": 3.2645674990951865, + "grad_norm": 0.704359769821167, + "learning_rate": 0.0002, + "loss": 1.0796, + "step": 4510 + }, + { + "epoch": 3.27180600796236, + "grad_norm": 0.6880282163619995, + "learning_rate": 0.0002, + "loss": 1.2496, + "step": 4520 + }, + { + "epoch": 3.2790445168295332, + "grad_norm": 0.8179270029067993, + "learning_rate": 0.0002, + "loss": 1.0947, + "step": 4530 + }, + { + "epoch": 3.2862830256967066, + "grad_norm": 0.6718448996543884, + "learning_rate": 0.0002, + "loss": 1.1909, + "step": 4540 + }, + { + "epoch": 3.29352153456388, + "grad_norm": 0.8300657868385315, + "learning_rate": 0.0002, + "loss": 1.2708, + "step": 4550 + }, + { + "epoch": 3.3007600434310533, + "grad_norm": 0.6433690786361694, + "learning_rate": 0.0002, + "loss": 1.2594, + "step": 4560 + }, + { + "epoch": 3.3079985522982267, + "grad_norm": 0.690262496471405, + "learning_rate": 0.0002, + "loss": 1.2479, + "step": 4570 + }, + { + "epoch": 3.3152370611654, + "grad_norm": 0.7022852301597595, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 4580 + }, + { + "epoch": 3.3224755700325734, + "grad_norm": 0.6438387632369995, + "learning_rate": 0.0002, + "loss": 1.0844, + "step": 4590 + }, + { + "epoch": 3.329714078899747, + "grad_norm": 0.6866899132728577, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 4600 + }, + { + "epoch": 3.33695258776692, + "grad_norm": 0.8233968019485474, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 4610 + }, + { + "epoch": 3.3441910966340935, + "grad_norm": 0.7251574993133545, + "learning_rate": 0.0002, + "loss": 1.1855, + "step": 4620 + }, + { + "epoch": 3.351429605501267, + "grad_norm": 0.7855110168457031, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 4630 + }, + { + "epoch": 3.3586681143684403, + "grad_norm": 0.8487356305122375, + "learning_rate": 0.0002, + "loss": 1.2922, + "step": 4640 + }, + { + "epoch": 3.3659066232356136, + "grad_norm": 0.6429011225700378, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 4650 + }, + { + "epoch": 3.373145132102787, + "grad_norm": 0.7095270156860352, + "learning_rate": 0.0002, + "loss": 1.129, + "step": 4660 + }, + { + "epoch": 3.3803836409699604, + "grad_norm": 0.6792303323745728, + "learning_rate": 0.0002, + "loss": 1.262, + "step": 4670 + }, + { + "epoch": 3.3876221498371337, + "grad_norm": 0.6784825921058655, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 4680 + }, + { + "epoch": 3.394860658704307, + "grad_norm": 0.6362888216972351, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 4690 + }, + { + "epoch": 3.4020991675714805, + "grad_norm": 0.7794778943061829, + "learning_rate": 0.0002, + "loss": 1.2165, + "step": 4700 + }, + { + "epoch": 3.409337676438654, + "grad_norm": 0.7287485003471375, + "learning_rate": 0.0002, + "loss": 1.0644, + "step": 4710 + }, + { + "epoch": 3.416576185305827, + "grad_norm": 0.6481451392173767, + "learning_rate": 0.0002, + "loss": 1.2925, + "step": 4720 + }, + { + "epoch": 3.4238146941730006, + "grad_norm": 0.9200371503829956, + "learning_rate": 0.0002, + "loss": 1.2121, + "step": 4730 + }, + { + "epoch": 3.431053203040174, + "grad_norm": 1.074180245399475, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 4740 + }, + { + "epoch": 3.438291711907347, + "grad_norm": 0.6722986698150635, + "learning_rate": 0.0002, + "loss": 1.0421, + "step": 4750 + }, + { + "epoch": 3.44553022077452, + "grad_norm": 0.7945933938026428, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 4760 + }, + { + "epoch": 3.4527687296416936, + "grad_norm": 0.7624640464782715, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 4770 + }, + { + "epoch": 3.460007238508867, + "grad_norm": 0.7763656377792358, + "learning_rate": 0.0002, + "loss": 1.2428, + "step": 4780 + }, + { + "epoch": 3.4672457473760403, + "grad_norm": 0.7736947536468506, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 4790 + }, + { + "epoch": 3.4744842562432137, + "grad_norm": 0.8450354933738708, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 4800 + }, + { + "epoch": 3.481722765110387, + "grad_norm": 0.6480133533477783, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 4810 + }, + { + "epoch": 3.4889612739775604, + "grad_norm": 0.8437445759773254, + "learning_rate": 0.0002, + "loss": 1.1882, + "step": 4820 + }, + { + "epoch": 3.4961997828447338, + "grad_norm": 0.7781730890274048, + "learning_rate": 0.0002, + "loss": 1.1519, + "step": 4830 + }, + { + "epoch": 3.503438291711907, + "grad_norm": 0.8523228168487549, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 4840 + }, + { + "epoch": 3.5106768005790805, + "grad_norm": 0.6236732006072998, + "learning_rate": 0.0002, + "loss": 1.1672, + "step": 4850 + }, + { + "epoch": 3.517915309446254, + "grad_norm": 0.7500787377357483, + "learning_rate": 0.0002, + "loss": 1.1926, + "step": 4860 + }, + { + "epoch": 3.5251538183134272, + "grad_norm": 0.7665374875068665, + "learning_rate": 0.0002, + "loss": 1.1998, + "step": 4870 + }, + { + "epoch": 3.5323923271806006, + "grad_norm": 0.787857711315155, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 4880 + }, + { + "epoch": 3.539630836047774, + "grad_norm": 0.970595121383667, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 4890 + }, + { + "epoch": 3.5468693449149473, + "grad_norm": 0.6409347057342529, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 4900 + }, + { + "epoch": 3.5541078537821207, + "grad_norm": 0.888551652431488, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 4910 + }, + { + "epoch": 3.561346362649294, + "grad_norm": 1.0808377265930176, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 4920 + }, + { + "epoch": 3.5685848715164674, + "grad_norm": 0.7501053214073181, + "learning_rate": 0.0002, + "loss": 1.2564, + "step": 4930 + }, + { + "epoch": 3.575823380383641, + "grad_norm": 0.7375240325927734, + "learning_rate": 0.0002, + "loss": 1.2351, + "step": 4940 + }, + { + "epoch": 3.583061889250814, + "grad_norm": 0.7075039744377136, + "learning_rate": 0.0002, + "loss": 1.3568, + "step": 4950 + }, + { + "epoch": 3.5903003981179875, + "grad_norm": 0.939337432384491, + "learning_rate": 0.0002, + "loss": 1.3355, + "step": 4960 + }, + { + "epoch": 3.597538906985161, + "grad_norm": 0.6717396974563599, + "learning_rate": 0.0002, + "loss": 1.1722, + "step": 4970 + }, + { + "epoch": 3.6047774158523342, + "grad_norm": 0.7141643762588501, + "learning_rate": 0.0002, + "loss": 1.1186, + "step": 4980 + }, + { + "epoch": 3.6120159247195076, + "grad_norm": 0.7109216451644897, + "learning_rate": 0.0002, + "loss": 1.1011, + "step": 4990 + }, + { + "epoch": 3.619254433586681, + "grad_norm": 0.7020776867866516, + "learning_rate": 0.0002, + "loss": 1.2178, + "step": 5000 + }, + { + "epoch": 3.6264929424538543, + "grad_norm": 0.7158873677253723, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 5010 + }, + { + "epoch": 3.6337314513210277, + "grad_norm": 0.7062035202980042, + "learning_rate": 0.0002, + "loss": 1.2624, + "step": 5020 + }, + { + "epoch": 3.640969960188201, + "grad_norm": 0.7081155776977539, + "learning_rate": 0.0002, + "loss": 1.0224, + "step": 5030 + }, + { + "epoch": 3.6482084690553744, + "grad_norm": 1.2210607528686523, + "learning_rate": 0.0002, + "loss": 1.2195, + "step": 5040 + }, + { + "epoch": 3.655446977922548, + "grad_norm": 0.6650236248970032, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 5050 + }, + { + "epoch": 3.662685486789721, + "grad_norm": 0.6884829998016357, + "learning_rate": 0.0002, + "loss": 1.1072, + "step": 5060 + }, + { + "epoch": 3.6699239956568945, + "grad_norm": 0.7317819595336914, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 5070 + }, + { + "epoch": 3.677162504524068, + "grad_norm": 0.7406691908836365, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 5080 + }, + { + "epoch": 3.6844010133912413, + "grad_norm": 0.9009454250335693, + "learning_rate": 0.0002, + "loss": 1.2949, + "step": 5090 + }, + { + "epoch": 3.6916395222584146, + "grad_norm": 0.8189385533332825, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 5100 + }, + { + "epoch": 3.698878031125588, + "grad_norm": 1.0793628692626953, + "learning_rate": 0.0002, + "loss": 1.3408, + "step": 5110 + }, + { + "epoch": 3.7061165399927614, + "grad_norm": 0.8593027591705322, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 5120 + }, + { + "epoch": 3.7133550488599347, + "grad_norm": 0.8481812477111816, + "learning_rate": 0.0002, + "loss": 1.2141, + "step": 5130 + }, + { + "epoch": 3.720593557727108, + "grad_norm": 0.6527451276779175, + "learning_rate": 0.0002, + "loss": 1.125, + "step": 5140 + }, + { + "epoch": 3.7278320665942815, + "grad_norm": 0.9220114350318909, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 5150 + }, + { + "epoch": 3.735070575461455, + "grad_norm": 1.0842019319534302, + "learning_rate": 0.0002, + "loss": 1.2267, + "step": 5160 + }, + { + "epoch": 3.742309084328628, + "grad_norm": 0.965453565120697, + "learning_rate": 0.0002, + "loss": 1.3083, + "step": 5170 + }, + { + "epoch": 3.7495475931958016, + "grad_norm": 0.9903319478034973, + "learning_rate": 0.0002, + "loss": 1.1772, + "step": 5180 + }, + { + "epoch": 3.756786102062975, + "grad_norm": 0.7434818148612976, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 5190 + }, + { + "epoch": 3.7640246109301483, + "grad_norm": 0.6717280745506287, + "learning_rate": 0.0002, + "loss": 1.2631, + "step": 5200 + }, + { + "epoch": 3.7712631197973217, + "grad_norm": 0.7754665613174438, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 5210 + }, + { + "epoch": 3.778501628664495, + "grad_norm": 1.028374433517456, + "learning_rate": 0.0002, + "loss": 1.305, + "step": 5220 + }, + { + "epoch": 3.7857401375316684, + "grad_norm": 0.6026996374130249, + "learning_rate": 0.0002, + "loss": 1.1866, + "step": 5230 + }, + { + "epoch": 3.7929786463988417, + "grad_norm": 0.6978490948677063, + "learning_rate": 0.0002, + "loss": 1.1901, + "step": 5240 + }, + { + "epoch": 3.800217155266015, + "grad_norm": 0.7303446531295776, + "learning_rate": 0.0002, + "loss": 1.2576, + "step": 5250 + }, + { + "epoch": 3.8074556641331885, + "grad_norm": 1.0734210014343262, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 5260 + }, + { + "epoch": 3.814694173000362, + "grad_norm": 0.6383201479911804, + "learning_rate": 0.0002, + "loss": 1.1137, + "step": 5270 + }, + { + "epoch": 3.821932681867535, + "grad_norm": 0.7742630243301392, + "learning_rate": 0.0002, + "loss": 1.0904, + "step": 5280 + }, + { + "epoch": 3.8291711907347086, + "grad_norm": 0.8477074503898621, + "learning_rate": 0.0002, + "loss": 1.2232, + "step": 5290 + }, + { + "epoch": 3.836409699601882, + "grad_norm": 0.6675317883491516, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 5300 + }, + { + "epoch": 3.8436482084690553, + "grad_norm": 0.7515445351600647, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 5310 + }, + { + "epoch": 3.8508867173362287, + "grad_norm": 1.1441220045089722, + "learning_rate": 0.0002, + "loss": 1.2569, + "step": 5320 + }, + { + "epoch": 3.858125226203402, + "grad_norm": 0.7968795895576477, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 5330 + }, + { + "epoch": 3.8653637350705754, + "grad_norm": 0.7842824459075928, + "learning_rate": 0.0002, + "loss": 1.232, + "step": 5340 + }, + { + "epoch": 3.8726022439377488, + "grad_norm": 0.8272225260734558, + "learning_rate": 0.0002, + "loss": 1.1847, + "step": 5350 + }, + { + "epoch": 3.879840752804922, + "grad_norm": 0.8413397669792175, + "learning_rate": 0.0002, + "loss": 1.1381, + "step": 5360 + }, + { + "epoch": 3.8870792616720955, + "grad_norm": 1.141764760017395, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 5370 + }, + { + "epoch": 3.894317770539269, + "grad_norm": 0.9826975464820862, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 5380 + }, + { + "epoch": 3.9015562794064422, + "grad_norm": 0.8598255515098572, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 5390 + }, + { + "epoch": 3.9087947882736156, + "grad_norm": 0.6271058320999146, + "learning_rate": 0.0002, + "loss": 1.1247, + "step": 5400 + }, + { + "epoch": 3.916033297140789, + "grad_norm": 0.6379870772361755, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 5410 + }, + { + "epoch": 3.9232718060079623, + "grad_norm": 1.0313376188278198, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 5420 + }, + { + "epoch": 3.9305103148751357, + "grad_norm": 0.8220619559288025, + "learning_rate": 0.0002, + "loss": 1.1872, + "step": 5430 + }, + { + "epoch": 3.937748823742309, + "grad_norm": 0.7576116919517517, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 5440 + }, + { + "epoch": 3.9449873326094824, + "grad_norm": 1.226235032081604, + "learning_rate": 0.0002, + "loss": 1.1969, + "step": 5450 + }, + { + "epoch": 3.952225841476656, + "grad_norm": 0.7979229688644409, + "learning_rate": 0.0002, + "loss": 1.2945, + "step": 5460 + }, + { + "epoch": 3.959464350343829, + "grad_norm": 0.9911929965019226, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 5470 + }, + { + "epoch": 3.9667028592110025, + "grad_norm": 0.643738865852356, + "learning_rate": 0.0002, + "loss": 1.0924, + "step": 5480 + }, + { + "epoch": 3.973941368078176, + "grad_norm": 0.682305634021759, + "learning_rate": 0.0002, + "loss": 1.0607, + "step": 5490 + }, + { + "epoch": 3.9811798769453492, + "grad_norm": 1.18373441696167, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 5500 + }, + { + "epoch": 3.9884183858125226, + "grad_norm": 0.7190203070640564, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 5510 + }, + { + "epoch": 3.995656894679696, + "grad_norm": 0.7516948580741882, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 5520 + }, + { + "epoch": 4.0, + "eval_loss": 1.4252897500991821, + "eval_runtime": 27.235, + "eval_samples_per_second": 16.009, + "eval_steps_per_second": 2.019, + "step": 5526 + }, + { + "epoch": 4.002895403546869, + "grad_norm": 0.6353074312210083, + "learning_rate": 0.0002, + "loss": 1.0088, + "step": 5530 + }, + { + "epoch": 4.010133912414043, + "grad_norm": 0.7424906492233276, + "learning_rate": 0.0002, + "loss": 1.0326, + "step": 5540 + }, + { + "epoch": 4.017372421281216, + "grad_norm": 0.8856638073921204, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 5550 + }, + { + "epoch": 4.024610930148389, + "grad_norm": 0.9627974033355713, + "learning_rate": 0.0002, + "loss": 1.0905, + "step": 5560 + }, + { + "epoch": 4.031849439015563, + "grad_norm": 0.9048978686332703, + "learning_rate": 0.0002, + "loss": 1.0965, + "step": 5570 + }, + { + "epoch": 4.039087947882736, + "grad_norm": 0.921119213104248, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 5580 + }, + { + "epoch": 4.0463264567499095, + "grad_norm": 0.8654361963272095, + "learning_rate": 0.0002, + "loss": 1.1235, + "step": 5590 + }, + { + "epoch": 4.053564965617083, + "grad_norm": 0.7947945594787598, + "learning_rate": 0.0002, + "loss": 1.0794, + "step": 5600 + }, + { + "epoch": 4.060803474484256, + "grad_norm": 0.8307326436042786, + "learning_rate": 0.0002, + "loss": 1.0674, + "step": 5610 + }, + { + "epoch": 4.06804198335143, + "grad_norm": 0.793273389339447, + "learning_rate": 0.0002, + "loss": 1.0076, + "step": 5620 + }, + { + "epoch": 4.075280492218603, + "grad_norm": 0.8748673796653748, + "learning_rate": 0.0002, + "loss": 1.0651, + "step": 5630 + }, + { + "epoch": 4.082519001085776, + "grad_norm": 0.7926856279373169, + "learning_rate": 0.0002, + "loss": 1.111, + "step": 5640 + }, + { + "epoch": 4.08975750995295, + "grad_norm": 0.922645092010498, + "learning_rate": 0.0002, + "loss": 1.044, + "step": 5650 + }, + { + "epoch": 4.096996018820123, + "grad_norm": 0.9539641737937927, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 5660 + }, + { + "epoch": 4.1042345276872965, + "grad_norm": 0.8674443364143372, + "learning_rate": 0.0002, + "loss": 1.0788, + "step": 5670 + }, + { + "epoch": 4.11147303655447, + "grad_norm": 0.7097609043121338, + "learning_rate": 0.0002, + "loss": 0.9867, + "step": 5680 + }, + { + "epoch": 4.118711545421643, + "grad_norm": 0.8875522613525391, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 5690 + }, + { + "epoch": 4.125950054288817, + "grad_norm": 0.8583634495735168, + "learning_rate": 0.0002, + "loss": 1.1217, + "step": 5700 + }, + { + "epoch": 4.13318856315599, + "grad_norm": 0.6736377477645874, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 5710 + }, + { + "epoch": 4.140427072023163, + "grad_norm": 0.9349062442779541, + "learning_rate": 0.0002, + "loss": 1.1199, + "step": 5720 + }, + { + "epoch": 4.147665580890337, + "grad_norm": 1.0610365867614746, + "learning_rate": 0.0002, + "loss": 1.0508, + "step": 5730 + }, + { + "epoch": 4.15490408975751, + "grad_norm": 1.5838189125061035, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 5740 + }, + { + "epoch": 4.162142598624683, + "grad_norm": 0.747522234916687, + "learning_rate": 0.0002, + "loss": 1.0222, + "step": 5750 + }, + { + "epoch": 4.169381107491857, + "grad_norm": 1.3247915506362915, + "learning_rate": 0.0002, + "loss": 1.1328, + "step": 5760 + }, + { + "epoch": 4.17661961635903, + "grad_norm": 0.8750247955322266, + "learning_rate": 0.0002, + "loss": 1.1655, + "step": 5770 + }, + { + "epoch": 4.1838581252262035, + "grad_norm": 0.7914144992828369, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 5780 + }, + { + "epoch": 4.191096634093377, + "grad_norm": 0.9493299126625061, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 5790 + }, + { + "epoch": 4.19833514296055, + "grad_norm": 0.7802295088768005, + "learning_rate": 0.0002, + "loss": 1.1515, + "step": 5800 + }, + { + "epoch": 4.205573651827724, + "grad_norm": 0.6987314820289612, + "learning_rate": 0.0002, + "loss": 1.0704, + "step": 5810 + }, + { + "epoch": 4.212812160694897, + "grad_norm": 0.9220341444015503, + "learning_rate": 0.0002, + "loss": 1.1699, + "step": 5820 + }, + { + "epoch": 4.22005066956207, + "grad_norm": 0.8932939767837524, + "learning_rate": 0.0002, + "loss": 1.1394, + "step": 5830 + }, + { + "epoch": 4.227289178429244, + "grad_norm": 0.920002818107605, + "learning_rate": 0.0002, + "loss": 1.0048, + "step": 5840 + }, + { + "epoch": 4.234527687296417, + "grad_norm": 0.6662752032279968, + "learning_rate": 0.0002, + "loss": 0.964, + "step": 5850 + }, + { + "epoch": 4.24176619616359, + "grad_norm": 0.8679718971252441, + "learning_rate": 0.0002, + "loss": 0.986, + "step": 5860 + }, + { + "epoch": 4.249004705030764, + "grad_norm": 0.7020887732505798, + "learning_rate": 0.0002, + "loss": 0.8991, + "step": 5870 + }, + { + "epoch": 4.256243213897937, + "grad_norm": 0.869611382484436, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 5880 + }, + { + "epoch": 4.2634817227651105, + "grad_norm": 0.7796585559844971, + "learning_rate": 0.0002, + "loss": 1.1026, + "step": 5890 + }, + { + "epoch": 4.270720231632284, + "grad_norm": 0.8978819251060486, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 5900 + }, + { + "epoch": 4.277958740499457, + "grad_norm": 1.0837205648422241, + "learning_rate": 0.0002, + "loss": 1.1325, + "step": 5910 + }, + { + "epoch": 4.285197249366631, + "grad_norm": 0.7584353089332581, + "learning_rate": 0.0002, + "loss": 1.1279, + "step": 5920 + }, + { + "epoch": 4.292435758233804, + "grad_norm": 0.7313185334205627, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 5930 + }, + { + "epoch": 4.299674267100977, + "grad_norm": 0.8004671335220337, + "learning_rate": 0.0002, + "loss": 1.1101, + "step": 5940 + }, + { + "epoch": 4.306912775968151, + "grad_norm": 2.154958724975586, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 5950 + }, + { + "epoch": 4.314151284835324, + "grad_norm": 0.9163479804992676, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 5960 + }, + { + "epoch": 4.321389793702497, + "grad_norm": 0.9151589274406433, + "learning_rate": 0.0002, + "loss": 0.9941, + "step": 5970 + }, + { + "epoch": 4.328628302569671, + "grad_norm": 0.8624112010002136, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 5980 + }, + { + "epoch": 4.335866811436844, + "grad_norm": 0.9357741475105286, + "learning_rate": 0.0002, + "loss": 1.1625, + "step": 5990 + }, + { + "epoch": 4.3431053203040175, + "grad_norm": 1.3482335805892944, + "learning_rate": 0.0002, + "loss": 1.0712, + "step": 6000 + }, + { + "epoch": 4.350343829171191, + "grad_norm": 0.7156149744987488, + "learning_rate": 0.0002, + "loss": 1.1224, + "step": 6010 + }, + { + "epoch": 4.357582338038364, + "grad_norm": 0.8480049967765808, + "learning_rate": 0.0002, + "loss": 1.0753, + "step": 6020 + }, + { + "epoch": 4.364820846905538, + "grad_norm": 0.8262244462966919, + "learning_rate": 0.0002, + "loss": 1.051, + "step": 6030 + }, + { + "epoch": 4.372059355772711, + "grad_norm": 0.7733905911445618, + "learning_rate": 0.0002, + "loss": 0.9966, + "step": 6040 + }, + { + "epoch": 4.379297864639884, + "grad_norm": 0.8553919792175293, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 6050 + }, + { + "epoch": 4.386536373507058, + "grad_norm": 0.8666832447052002, + "learning_rate": 0.0002, + "loss": 1.1777, + "step": 6060 + }, + { + "epoch": 4.393774882374231, + "grad_norm": 0.9168295860290527, + "learning_rate": 0.0002, + "loss": 1.1934, + "step": 6070 + }, + { + "epoch": 4.4010133912414044, + "grad_norm": 0.7315238118171692, + "learning_rate": 0.0002, + "loss": 1.0988, + "step": 6080 + }, + { + "epoch": 4.408251900108578, + "grad_norm": 1.020263433456421, + "learning_rate": 0.0002, + "loss": 1.1599, + "step": 6090 + }, + { + "epoch": 4.415490408975751, + "grad_norm": 0.9978243708610535, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 6100 + }, + { + "epoch": 4.4227289178429245, + "grad_norm": 0.995453953742981, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 6110 + }, + { + "epoch": 4.429967426710098, + "grad_norm": 0.9360884428024292, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 6120 + }, + { + "epoch": 4.437205935577271, + "grad_norm": 0.8099448084831238, + "learning_rate": 0.0002, + "loss": 0.9506, + "step": 6130 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.8173841238021851, + "learning_rate": 0.0002, + "loss": 1.0887, + "step": 6140 + }, + { + "epoch": 4.451682953311618, + "grad_norm": 0.7972666025161743, + "learning_rate": 0.0002, + "loss": 1.1219, + "step": 6150 + }, + { + "epoch": 4.458921462178791, + "grad_norm": 0.7685779333114624, + "learning_rate": 0.0002, + "loss": 1.0226, + "step": 6160 + }, + { + "epoch": 4.466159971045965, + "grad_norm": 0.7872623801231384, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 6170 + }, + { + "epoch": 4.473398479913138, + "grad_norm": 0.7677070498466492, + "learning_rate": 0.0002, + "loss": 0.9911, + "step": 6180 + }, + { + "epoch": 4.4806369887803115, + "grad_norm": 0.7878316044807434, + "learning_rate": 0.0002, + "loss": 1.0919, + "step": 6190 + }, + { + "epoch": 4.487875497647485, + "grad_norm": 0.8178079724311829, + "learning_rate": 0.0002, + "loss": 1.018, + "step": 6200 + }, + { + "epoch": 4.495114006514658, + "grad_norm": 1.2820082902908325, + "learning_rate": 0.0002, + "loss": 1.0517, + "step": 6210 + }, + { + "epoch": 4.502352515381832, + "grad_norm": 0.9380832314491272, + "learning_rate": 0.0002, + "loss": 1.3101, + "step": 6220 + }, + { + "epoch": 4.509591024249005, + "grad_norm": 0.7810422778129578, + "learning_rate": 0.0002, + "loss": 0.9818, + "step": 6230 + }, + { + "epoch": 4.516829533116178, + "grad_norm": 1.1022917032241821, + "learning_rate": 0.0002, + "loss": 1.1677, + "step": 6240 + }, + { + "epoch": 4.524068041983352, + "grad_norm": 1.4275553226470947, + "learning_rate": 0.0002, + "loss": 1.1579, + "step": 6250 + }, + { + "epoch": 4.531306550850525, + "grad_norm": 0.7597777247428894, + "learning_rate": 0.0002, + "loss": 1.3237, + "step": 6260 + }, + { + "epoch": 4.538545059717698, + "grad_norm": 1.10992431640625, + "learning_rate": 0.0002, + "loss": 1.1529, + "step": 6270 + }, + { + "epoch": 4.545783568584872, + "grad_norm": 0.8981178998947144, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 6280 + }, + { + "epoch": 4.553022077452045, + "grad_norm": 0.7863979339599609, + "learning_rate": 0.0002, + "loss": 1.086, + "step": 6290 + }, + { + "epoch": 4.5602605863192185, + "grad_norm": 0.9071474671363831, + "learning_rate": 0.0002, + "loss": 1.2008, + "step": 6300 + }, + { + "epoch": 4.567499095186392, + "grad_norm": 0.7429424524307251, + "learning_rate": 0.0002, + "loss": 1.0916, + "step": 6310 + }, + { + "epoch": 4.574737604053565, + "grad_norm": 1.0767850875854492, + "learning_rate": 0.0002, + "loss": 1.095, + "step": 6320 + }, + { + "epoch": 4.581976112920739, + "grad_norm": 0.7885915637016296, + "learning_rate": 0.0002, + "loss": 1.1023, + "step": 6330 + }, + { + "epoch": 4.589214621787912, + "grad_norm": 0.8350457549095154, + "learning_rate": 0.0002, + "loss": 1.1131, + "step": 6340 + }, + { + "epoch": 4.596453130655085, + "grad_norm": 0.7853530645370483, + "learning_rate": 0.0002, + "loss": 1.0743, + "step": 6350 + }, + { + "epoch": 4.603691639522259, + "grad_norm": 1.1220661401748657, + "learning_rate": 0.0002, + "loss": 1.1912, + "step": 6360 + }, + { + "epoch": 4.610930148389432, + "grad_norm": 0.7959423065185547, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 6370 + }, + { + "epoch": 4.618168657256605, + "grad_norm": 0.7782652378082275, + "learning_rate": 0.0002, + "loss": 1.1542, + "step": 6380 + }, + { + "epoch": 4.625407166123779, + "grad_norm": 0.7882203459739685, + "learning_rate": 0.0002, + "loss": 1.0753, + "step": 6390 + }, + { + "epoch": 4.632645674990952, + "grad_norm": 0.8841899037361145, + "learning_rate": 0.0002, + "loss": 1.0676, + "step": 6400 + }, + { + "epoch": 4.6398841838581255, + "grad_norm": 0.7936127781867981, + "learning_rate": 0.0002, + "loss": 1.0815, + "step": 6410 + }, + { + "epoch": 4.647122692725299, + "grad_norm": 0.9213966131210327, + "learning_rate": 0.0002, + "loss": 1.0198, + "step": 6420 + }, + { + "epoch": 4.654361201592472, + "grad_norm": 0.9246473908424377, + "learning_rate": 0.0002, + "loss": 0.9872, + "step": 6430 + }, + { + "epoch": 4.661599710459646, + "grad_norm": 0.766572892665863, + "learning_rate": 0.0002, + "loss": 1.1309, + "step": 6440 + }, + { + "epoch": 4.668838219326819, + "grad_norm": 0.8596171736717224, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 6450 + }, + { + "epoch": 4.676076728193992, + "grad_norm": 0.8482751846313477, + "learning_rate": 0.0002, + "loss": 1.1869, + "step": 6460 + }, + { + "epoch": 4.683315237061166, + "grad_norm": 1.0826905965805054, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 6470 + }, + { + "epoch": 4.690553745928339, + "grad_norm": 1.1048457622528076, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 6480 + }, + { + "epoch": 4.697792254795512, + "grad_norm": 0.9429134726524353, + "learning_rate": 0.0002, + "loss": 1.0514, + "step": 6490 + }, + { + "epoch": 4.705030763662686, + "grad_norm": 0.8587502837181091, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 6500 + }, + { + "epoch": 4.712269272529859, + "grad_norm": 1.0387083292007446, + "learning_rate": 0.0002, + "loss": 1.0969, + "step": 6510 + }, + { + "epoch": 4.7195077813970325, + "grad_norm": 0.7471951842308044, + "learning_rate": 0.0002, + "loss": 1.0493, + "step": 6520 + }, + { + "epoch": 4.726746290264206, + "grad_norm": 0.8800424933433533, + "learning_rate": 0.0002, + "loss": 1.2632, + "step": 6530 + }, + { + "epoch": 4.733984799131379, + "grad_norm": 0.8136811852455139, + "learning_rate": 0.0002, + "loss": 1.2126, + "step": 6540 + }, + { + "epoch": 4.741223307998553, + "grad_norm": 0.9910339713096619, + "learning_rate": 0.0002, + "loss": 1.195, + "step": 6550 + }, + { + "epoch": 4.748461816865726, + "grad_norm": 1.0679163932800293, + "learning_rate": 0.0002, + "loss": 1.1201, + "step": 6560 + }, + { + "epoch": 4.755700325732899, + "grad_norm": 0.8468248248100281, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 6570 + }, + { + "epoch": 4.762938834600073, + "grad_norm": 0.8771235942840576, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 6580 + }, + { + "epoch": 4.770177343467246, + "grad_norm": 0.7024846076965332, + "learning_rate": 0.0002, + "loss": 1.077, + "step": 6590 + }, + { + "epoch": 4.7774158523344195, + "grad_norm": 0.7836683392524719, + "learning_rate": 0.0002, + "loss": 1.0876, + "step": 6600 + }, + { + "epoch": 4.784654361201593, + "grad_norm": 0.7717288136482239, + "learning_rate": 0.0002, + "loss": 1.1006, + "step": 6610 + }, + { + "epoch": 4.791892870068766, + "grad_norm": 0.884183943271637, + "learning_rate": 0.0002, + "loss": 1.0376, + "step": 6620 + }, + { + "epoch": 4.7991313789359396, + "grad_norm": 1.383867621421814, + "learning_rate": 0.0002, + "loss": 1.1757, + "step": 6630 + }, + { + "epoch": 4.806369887803113, + "grad_norm": 0.9741523861885071, + "learning_rate": 0.0002, + "loss": 1.0861, + "step": 6640 + }, + { + "epoch": 4.813608396670286, + "grad_norm": 0.9723693132400513, + "learning_rate": 0.0002, + "loss": 1.0884, + "step": 6650 + }, + { + "epoch": 4.82084690553746, + "grad_norm": 1.8324809074401855, + "learning_rate": 0.0002, + "loss": 1.2203, + "step": 6660 + }, + { + "epoch": 4.828085414404633, + "grad_norm": 0.904909074306488, + "learning_rate": 0.0002, + "loss": 1.0292, + "step": 6670 + }, + { + "epoch": 4.835323923271806, + "grad_norm": 0.7355411648750305, + "learning_rate": 0.0002, + "loss": 1.0349, + "step": 6680 + }, + { + "epoch": 4.84256243213898, + "grad_norm": 0.8934960961341858, + "learning_rate": 0.0002, + "loss": 1.0793, + "step": 6690 + }, + { + "epoch": 4.849800941006153, + "grad_norm": 1.4596954584121704, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 6700 + }, + { + "epoch": 4.8570394498733265, + "grad_norm": 0.8310341238975525, + "learning_rate": 0.0002, + "loss": 1.1065, + "step": 6710 + }, + { + "epoch": 4.8642779587405, + "grad_norm": 0.9709894061088562, + "learning_rate": 0.0002, + "loss": 1.1089, + "step": 6720 + }, + { + "epoch": 4.871516467607673, + "grad_norm": 0.852142333984375, + "learning_rate": 0.0002, + "loss": 1.0069, + "step": 6730 + }, + { + "epoch": 4.878754976474847, + "grad_norm": 1.0643625259399414, + "learning_rate": 0.0002, + "loss": 1.0507, + "step": 6740 + }, + { + "epoch": 4.88599348534202, + "grad_norm": 0.9419508576393127, + "learning_rate": 0.0002, + "loss": 1.056, + "step": 6750 + }, + { + "epoch": 4.893231994209193, + "grad_norm": 1.1818498373031616, + "learning_rate": 0.0002, + "loss": 1.1995, + "step": 6760 + }, + { + "epoch": 4.900470503076367, + "grad_norm": 0.9369569420814514, + "learning_rate": 0.0002, + "loss": 1.0925, + "step": 6770 + }, + { + "epoch": 4.90770901194354, + "grad_norm": 0.7012579441070557, + "learning_rate": 0.0002, + "loss": 1.1648, + "step": 6780 + }, + { + "epoch": 4.914947520810713, + "grad_norm": 0.9109319448471069, + "learning_rate": 0.0002, + "loss": 1.0926, + "step": 6790 + }, + { + "epoch": 4.922186029677887, + "grad_norm": 0.8077534437179565, + "learning_rate": 0.0002, + "loss": 1.0358, + "step": 6800 + }, + { + "epoch": 4.92942453854506, + "grad_norm": 0.7571148872375488, + "learning_rate": 0.0002, + "loss": 1.2549, + "step": 6810 + }, + { + "epoch": 4.9366630474122335, + "grad_norm": 0.7325633764266968, + "learning_rate": 0.0002, + "loss": 0.9638, + "step": 6820 + }, + { + "epoch": 4.943901556279407, + "grad_norm": 0.8465084433555603, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 6830 + }, + { + "epoch": 4.95114006514658, + "grad_norm": 0.8753737807273865, + "learning_rate": 0.0002, + "loss": 1.153, + "step": 6840 + }, + { + "epoch": 4.958378574013754, + "grad_norm": 0.9421748518943787, + "learning_rate": 0.0002, + "loss": 1.0247, + "step": 6850 + }, + { + "epoch": 4.965617082880927, + "grad_norm": 0.8245896697044373, + "learning_rate": 0.0002, + "loss": 1.1483, + "step": 6860 + }, + { + "epoch": 4.9728555917481, + "grad_norm": 0.8823089599609375, + "learning_rate": 0.0002, + "loss": 0.9905, + "step": 6870 + }, + { + "epoch": 4.980094100615274, + "grad_norm": 0.8406389355659485, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 6880 + }, + { + "epoch": 4.987332609482447, + "grad_norm": 0.9732868075370789, + "learning_rate": 0.0002, + "loss": 1.0944, + "step": 6890 + }, + { + "epoch": 4.99457111834962, + "grad_norm": 2.125141143798828, + "learning_rate": 0.0002, + "loss": 1.1776, + "step": 6900 + }, + { + "epoch": 4.999638074556641, + "eval_loss": 1.445176601409912, + "eval_runtime": 27.2351, + "eval_samples_per_second": 16.009, + "eval_steps_per_second": 2.019, + "step": 6907 + }, + { + "epoch": 5.001809627216793, + "grad_norm": 0.9465792775154114, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 6910 + }, + { + "epoch": 5.009048136083966, + "grad_norm": 1.2834891080856323, + "learning_rate": 0.0002, + "loss": 0.982, + "step": 6920 + }, + { + "epoch": 5.01628664495114, + "grad_norm": 1.0297378301620483, + "learning_rate": 0.0002, + "loss": 0.9803, + "step": 6930 + }, + { + "epoch": 5.023525153818313, + "grad_norm": 1.1705161333084106, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 6940 + }, + { + "epoch": 5.030763662685486, + "grad_norm": 0.8293961882591248, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 6950 + }, + { + "epoch": 5.03800217155266, + "grad_norm": 1.0422210693359375, + "learning_rate": 0.0002, + "loss": 0.9203, + "step": 6960 + }, + { + "epoch": 5.045240680419833, + "grad_norm": 1.116104245185852, + "learning_rate": 0.0002, + "loss": 1.0553, + "step": 6970 + }, + { + "epoch": 5.0524791892870065, + "grad_norm": 1.5118416547775269, + "learning_rate": 0.0002, + "loss": 0.9011, + "step": 6980 + }, + { + "epoch": 5.05971769815418, + "grad_norm": 0.8383979797363281, + "learning_rate": 0.0002, + "loss": 0.9969, + "step": 6990 + }, + { + "epoch": 5.066956207021353, + "grad_norm": 1.3378649950027466, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 7000 + }, + { + "epoch": 5.0741947158885266, + "grad_norm": 1.1840510368347168, + "learning_rate": 0.0002, + "loss": 1.0212, + "step": 7010 + }, + { + "epoch": 5.0814332247557, + "grad_norm": 1.2354751825332642, + "learning_rate": 0.0002, + "loss": 0.9939, + "step": 7020 + }, + { + "epoch": 5.088671733622873, + "grad_norm": 1.3830451965332031, + "learning_rate": 0.0002, + "loss": 0.9831, + "step": 7030 + }, + { + "epoch": 5.095910242490047, + "grad_norm": 0.8101674318313599, + "learning_rate": 0.0002, + "loss": 1.1827, + "step": 7040 + }, + { + "epoch": 5.10314875135722, + "grad_norm": 0.897982656955719, + "learning_rate": 0.0002, + "loss": 0.9255, + "step": 7050 + }, + { + "epoch": 5.110387260224393, + "grad_norm": 1.2049678564071655, + "learning_rate": 0.0002, + "loss": 0.8784, + "step": 7060 + }, + { + "epoch": 5.117625769091567, + "grad_norm": 1.5912116765975952, + "learning_rate": 0.0002, + "loss": 1.0182, + "step": 7070 + }, + { + "epoch": 5.12486427795874, + "grad_norm": 0.9261530041694641, + "learning_rate": 0.0002, + "loss": 1.0909, + "step": 7080 + }, + { + "epoch": 5.1321027868259135, + "grad_norm": 1.1454812288284302, + "learning_rate": 0.0002, + "loss": 0.9603, + "step": 7090 + }, + { + "epoch": 5.139341295693087, + "grad_norm": 1.0049978494644165, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 7100 + }, + { + "epoch": 5.14657980456026, + "grad_norm": 1.4513251781463623, + "learning_rate": 0.0002, + "loss": 0.9463, + "step": 7110 + }, + { + "epoch": 5.153818313427434, + "grad_norm": 0.9800849556922913, + "learning_rate": 0.0002, + "loss": 0.8995, + "step": 7120 + }, + { + "epoch": 5.161056822294607, + "grad_norm": 0.9698708653450012, + "learning_rate": 0.0002, + "loss": 0.9835, + "step": 7130 + }, + { + "epoch": 5.16829533116178, + "grad_norm": 1.1126646995544434, + "learning_rate": 0.0002, + "loss": 0.9672, + "step": 7140 + }, + { + "epoch": 5.175533840028954, + "grad_norm": 0.9248330593109131, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 7150 + }, + { + "epoch": 5.182772348896127, + "grad_norm": 0.7967255711555481, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 7160 + }, + { + "epoch": 5.1900108577633, + "grad_norm": 0.9933333992958069, + "learning_rate": 0.0002, + "loss": 1.0078, + "step": 7170 + }, + { + "epoch": 5.197249366630474, + "grad_norm": 1.0080649852752686, + "learning_rate": 0.0002, + "loss": 1.0276, + "step": 7180 + }, + { + "epoch": 5.204487875497647, + "grad_norm": 1.3954921960830688, + "learning_rate": 0.0002, + "loss": 1.0201, + "step": 7190 + }, + { + "epoch": 5.2117263843648205, + "grad_norm": 1.2386271953582764, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 7200 + }, + { + "epoch": 5.218964893231994, + "grad_norm": 1.2379488945007324, + "learning_rate": 0.0002, + "loss": 0.8863, + "step": 7210 + }, + { + "epoch": 5.226203402099167, + "grad_norm": 0.9882503747940063, + "learning_rate": 0.0002, + "loss": 1.0518, + "step": 7220 + }, + { + "epoch": 5.233441910966341, + "grad_norm": 1.1728729009628296, + "learning_rate": 0.0002, + "loss": 0.9834, + "step": 7230 + }, + { + "epoch": 5.240680419833514, + "grad_norm": 0.9849673509597778, + "learning_rate": 0.0002, + "loss": 0.9269, + "step": 7240 + }, + { + "epoch": 5.247918928700687, + "grad_norm": 1.177639365196228, + "learning_rate": 0.0002, + "loss": 0.9935, + "step": 7250 + }, + { + "epoch": 5.255157437567861, + "grad_norm": 1.2395055294036865, + "learning_rate": 0.0002, + "loss": 1.0639, + "step": 7260 + }, + { + "epoch": 5.262395946435034, + "grad_norm": 1.3999171257019043, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 7270 + }, + { + "epoch": 5.269634455302207, + "grad_norm": 0.7698732018470764, + "learning_rate": 0.0002, + "loss": 0.9745, + "step": 7280 + }, + { + "epoch": 5.276872964169381, + "grad_norm": 0.9167453646659851, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 7290 + }, + { + "epoch": 5.284111473036554, + "grad_norm": 1.113830804824829, + "learning_rate": 0.0002, + "loss": 0.9858, + "step": 7300 + }, + { + "epoch": 5.2913499819037275, + "grad_norm": 0.9644396901130676, + "learning_rate": 0.0002, + "loss": 0.9577, + "step": 7310 + }, + { + "epoch": 5.298588490770901, + "grad_norm": 1.462435007095337, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 7320 + }, + { + "epoch": 5.305826999638074, + "grad_norm": 0.9406287670135498, + "learning_rate": 0.0002, + "loss": 0.871, + "step": 7330 + }, + { + "epoch": 5.313065508505248, + "grad_norm": 0.9698247909545898, + "learning_rate": 0.0002, + "loss": 1.0022, + "step": 7340 + }, + { + "epoch": 5.320304017372421, + "grad_norm": 1.12003755569458, + "learning_rate": 0.0002, + "loss": 0.915, + "step": 7350 + }, + { + "epoch": 5.327542526239594, + "grad_norm": 1.598681926727295, + "learning_rate": 0.0002, + "loss": 0.9838, + "step": 7360 + }, + { + "epoch": 5.334781035106768, + "grad_norm": 1.0450010299682617, + "learning_rate": 0.0002, + "loss": 1.0, + "step": 7370 + }, + { + "epoch": 5.342019543973941, + "grad_norm": 0.8680008053779602, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 7380 + }, + { + "epoch": 5.349258052841114, + "grad_norm": 1.0115476846694946, + "learning_rate": 0.0002, + "loss": 0.9851, + "step": 7390 + }, + { + "epoch": 5.356496561708288, + "grad_norm": 0.9589748382568359, + "learning_rate": 0.0002, + "loss": 1.0702, + "step": 7400 + }, + { + "epoch": 5.363735070575461, + "grad_norm": 0.6729998588562012, + "learning_rate": 0.0002, + "loss": 0.9366, + "step": 7410 + }, + { + "epoch": 5.3709735794426345, + "grad_norm": 0.9246699213981628, + "learning_rate": 0.0002, + "loss": 1.0126, + "step": 7420 + }, + { + "epoch": 5.378212088309808, + "grad_norm": 1.1266791820526123, + "learning_rate": 0.0002, + "loss": 0.9815, + "step": 7430 + }, + { + "epoch": 5.385450597176981, + "grad_norm": 1.8056942224502563, + "learning_rate": 0.0002, + "loss": 1.1166, + "step": 7440 + }, + { + "epoch": 5.392689106044155, + "grad_norm": 0.9802932739257812, + "learning_rate": 0.0002, + "loss": 0.9604, + "step": 7450 + }, + { + "epoch": 5.399927614911328, + "grad_norm": 1.0504707098007202, + "learning_rate": 0.0002, + "loss": 0.9656, + "step": 7460 + }, + { + "epoch": 5.407166123778501, + "grad_norm": 1.1915022134780884, + "learning_rate": 0.0002, + "loss": 1.0132, + "step": 7470 + }, + { + "epoch": 5.414404632645675, + "grad_norm": 1.1856611967086792, + "learning_rate": 0.0002, + "loss": 1.0041, + "step": 7480 + }, + { + "epoch": 5.421643141512848, + "grad_norm": 1.292152762413025, + "learning_rate": 0.0002, + "loss": 0.9747, + "step": 7490 + }, + { + "epoch": 5.4288816503800215, + "grad_norm": 1.2675740718841553, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 7500 + }, + { + "epoch": 5.436120159247195, + "grad_norm": 1.4034695625305176, + "learning_rate": 0.0002, + "loss": 1.0271, + "step": 7510 + }, + { + "epoch": 5.443358668114368, + "grad_norm": 0.984588623046875, + "learning_rate": 0.0002, + "loss": 1.0318, + "step": 7520 + }, + { + "epoch": 5.450597176981542, + "grad_norm": 0.8419108390808105, + "learning_rate": 0.0002, + "loss": 1.0726, + "step": 7530 + }, + { + "epoch": 5.457835685848715, + "grad_norm": 1.0270143747329712, + "learning_rate": 0.0002, + "loss": 1.0499, + "step": 7540 + }, + { + "epoch": 5.465074194715888, + "grad_norm": 2.2158689498901367, + "learning_rate": 0.0002, + "loss": 0.9804, + "step": 7550 + }, + { + "epoch": 5.472312703583062, + "grad_norm": 1.0740524530410767, + "learning_rate": 0.0002, + "loss": 0.9856, + "step": 7560 + }, + { + "epoch": 5.479551212450235, + "grad_norm": 1.3804482221603394, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 7570 + }, + { + "epoch": 5.486789721317408, + "grad_norm": 0.9428979754447937, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 7580 + }, + { + "epoch": 5.494028230184582, + "grad_norm": 0.9548295736312866, + "learning_rate": 0.0002, + "loss": 1.0906, + "step": 7590 + }, + { + "epoch": 5.501266739051755, + "grad_norm": 1.0691065788269043, + "learning_rate": 0.0002, + "loss": 0.8853, + "step": 7600 + }, + { + "epoch": 5.5085052479189285, + "grad_norm": 1.0987380743026733, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 7610 + }, + { + "epoch": 5.515743756786102, + "grad_norm": 0.9483979344367981, + "learning_rate": 0.0002, + "loss": 1.0162, + "step": 7620 + }, + { + "epoch": 5.522982265653275, + "grad_norm": 1.16624915599823, + "learning_rate": 0.0002, + "loss": 1.105, + "step": 7630 + }, + { + "epoch": 5.530220774520449, + "grad_norm": 0.8563777208328247, + "learning_rate": 0.0002, + "loss": 0.8695, + "step": 7640 + }, + { + "epoch": 5.537459283387622, + "grad_norm": 1.268186092376709, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 7650 + }, + { + "epoch": 5.544697792254795, + "grad_norm": 1.0752092599868774, + "learning_rate": 0.0002, + "loss": 1.1152, + "step": 7660 + }, + { + "epoch": 5.551936301121969, + "grad_norm": 1.210389256477356, + "learning_rate": 0.0002, + "loss": 0.9344, + "step": 7670 + }, + { + "epoch": 5.559174809989142, + "grad_norm": 1.669063925743103, + "learning_rate": 0.0002, + "loss": 1.0349, + "step": 7680 + }, + { + "epoch": 5.566413318856315, + "grad_norm": 1.038020133972168, + "learning_rate": 0.0002, + "loss": 0.9833, + "step": 7690 + }, + { + "epoch": 5.573651827723489, + "grad_norm": 1.316673994064331, + "learning_rate": 0.0002, + "loss": 0.8907, + "step": 7700 + }, + { + "epoch": 5.580890336590662, + "grad_norm": 1.029935359954834, + "learning_rate": 0.0002, + "loss": 0.9614, + "step": 7710 + }, + { + "epoch": 5.5881288454578355, + "grad_norm": 0.9401940703392029, + "learning_rate": 0.0002, + "loss": 1.0409, + "step": 7720 + }, + { + "epoch": 5.595367354325009, + "grad_norm": 2.4811816215515137, + "learning_rate": 0.0002, + "loss": 0.9272, + "step": 7730 + }, + { + "epoch": 5.602605863192182, + "grad_norm": 1.0329105854034424, + "learning_rate": 0.0002, + "loss": 0.992, + "step": 7740 + }, + { + "epoch": 5.609844372059356, + "grad_norm": 1.479629635810852, + "learning_rate": 0.0002, + "loss": 0.9493, + "step": 7750 + }, + { + "epoch": 5.617082880926529, + "grad_norm": 1.9232319593429565, + "learning_rate": 0.0002, + "loss": 1.0727, + "step": 7760 + }, + { + "epoch": 5.624321389793702, + "grad_norm": 1.0055509805679321, + "learning_rate": 0.0002, + "loss": 1.0741, + "step": 7770 + }, + { + "epoch": 5.631559898660876, + "grad_norm": 1.0037437677383423, + "learning_rate": 0.0002, + "loss": 1.0731, + "step": 7780 + }, + { + "epoch": 5.638798407528049, + "grad_norm": 1.4245030879974365, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 7790 + }, + { + "epoch": 5.646036916395222, + "grad_norm": 1.080687403678894, + "learning_rate": 0.0002, + "loss": 0.9711, + "step": 7800 + }, + { + "epoch": 5.653275425262396, + "grad_norm": 1.354953408241272, + "learning_rate": 0.0002, + "loss": 1.0276, + "step": 7810 + }, + { + "epoch": 5.660513934129569, + "grad_norm": 0.8966761231422424, + "learning_rate": 0.0002, + "loss": 1.0534, + "step": 7820 + }, + { + "epoch": 5.6677524429967425, + "grad_norm": 1.0675480365753174, + "learning_rate": 0.0002, + "loss": 1.0662, + "step": 7830 + }, + { + "epoch": 5.674990951863916, + "grad_norm": 1.2104216814041138, + "learning_rate": 0.0002, + "loss": 1.1077, + "step": 7840 + }, + { + "epoch": 5.682229460731089, + "grad_norm": 1.105790376663208, + "learning_rate": 0.0002, + "loss": 0.9627, + "step": 7850 + }, + { + "epoch": 5.689467969598263, + "grad_norm": 1.0915391445159912, + "learning_rate": 0.0002, + "loss": 1.0483, + "step": 7860 + }, + { + "epoch": 5.696706478465436, + "grad_norm": 0.8957812786102295, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 7870 + }, + { + "epoch": 5.703944987332609, + "grad_norm": 1.9189311265945435, + "learning_rate": 0.0002, + "loss": 0.9785, + "step": 7880 + }, + { + "epoch": 5.711183496199783, + "grad_norm": 1.0867321491241455, + "learning_rate": 0.0002, + "loss": 1.0076, + "step": 7890 + }, + { + "epoch": 5.718422005066956, + "grad_norm": 1.0233147144317627, + "learning_rate": 0.0002, + "loss": 1.0236, + "step": 7900 + }, + { + "epoch": 5.7256605139341294, + "grad_norm": 1.16460382938385, + "learning_rate": 0.0002, + "loss": 0.9872, + "step": 7910 + }, + { + "epoch": 5.732899022801303, + "grad_norm": 1.1098358631134033, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 7920 + }, + { + "epoch": 5.740137531668476, + "grad_norm": 0.8555701375007629, + "learning_rate": 0.0002, + "loss": 0.9937, + "step": 7930 + }, + { + "epoch": 5.7473760405356495, + "grad_norm": 0.9885705709457397, + "learning_rate": 0.0002, + "loss": 1.0081, + "step": 7940 + }, + { + "epoch": 5.754614549402823, + "grad_norm": 0.9184203147888184, + "learning_rate": 0.0002, + "loss": 0.9909, + "step": 7950 + }, + { + "epoch": 5.761853058269996, + "grad_norm": 0.9653698205947876, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 7960 + }, + { + "epoch": 5.76909156713717, + "grad_norm": 1.0014251470565796, + "learning_rate": 0.0002, + "loss": 0.9317, + "step": 7970 + }, + { + "epoch": 5.776330076004343, + "grad_norm": 1.004701018333435, + "learning_rate": 0.0002, + "loss": 1.0271, + "step": 7980 + }, + { + "epoch": 5.783568584871516, + "grad_norm": 0.950577974319458, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 7990 + }, + { + "epoch": 5.79080709373869, + "grad_norm": 1.2986834049224854, + "learning_rate": 0.0002, + "loss": 0.9725, + "step": 8000 + }, + { + "epoch": 5.798045602605863, + "grad_norm": 1.3353424072265625, + "learning_rate": 0.0002, + "loss": 1.039, + "step": 8010 + }, + { + "epoch": 5.8052841114730365, + "grad_norm": 0.7650562524795532, + "learning_rate": 0.0002, + "loss": 1.0626, + "step": 8020 + }, + { + "epoch": 5.81252262034021, + "grad_norm": 1.0156235694885254, + "learning_rate": 0.0002, + "loss": 1.0802, + "step": 8030 + }, + { + "epoch": 5.819761129207383, + "grad_norm": 1.3092900514602661, + "learning_rate": 0.0002, + "loss": 1.0185, + "step": 8040 + }, + { + "epoch": 5.826999638074557, + "grad_norm": 1.184428095817566, + "learning_rate": 0.0002, + "loss": 0.9905, + "step": 8050 + }, + { + "epoch": 5.83423814694173, + "grad_norm": 0.979401707649231, + "learning_rate": 0.0002, + "loss": 1.0548, + "step": 8060 + }, + { + "epoch": 5.841476655808903, + "grad_norm": 1.3557400703430176, + "learning_rate": 0.0002, + "loss": 0.9721, + "step": 8070 + }, + { + "epoch": 5.848715164676077, + "grad_norm": 0.8429333567619324, + "learning_rate": 0.0002, + "loss": 1.0235, + "step": 8080 + }, + { + "epoch": 5.85595367354325, + "grad_norm": 1.3167692422866821, + "learning_rate": 0.0002, + "loss": 0.952, + "step": 8090 + }, + { + "epoch": 5.863192182410423, + "grad_norm": 0.9750998020172119, + "learning_rate": 0.0002, + "loss": 0.9609, + "step": 8100 + }, + { + "epoch": 5.870430691277597, + "grad_norm": 1.1869813203811646, + "learning_rate": 0.0002, + "loss": 1.0789, + "step": 8110 + }, + { + "epoch": 5.87766920014477, + "grad_norm": 1.508615255355835, + "learning_rate": 0.0002, + "loss": 1.0331, + "step": 8120 + }, + { + "epoch": 5.8849077090119435, + "grad_norm": 0.9439908266067505, + "learning_rate": 0.0002, + "loss": 1.0171, + "step": 8130 + }, + { + "epoch": 5.892146217879117, + "grad_norm": 0.910508930683136, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 8140 + }, + { + "epoch": 5.89938472674629, + "grad_norm": 1.111501932144165, + "learning_rate": 0.0002, + "loss": 1.0032, + "step": 8150 + }, + { + "epoch": 5.906623235613464, + "grad_norm": 0.726554274559021, + "learning_rate": 0.0002, + "loss": 1.0266, + "step": 8160 + }, + { + "epoch": 5.913861744480637, + "grad_norm": 1.1084556579589844, + "learning_rate": 0.0002, + "loss": 1.0681, + "step": 8170 + }, + { + "epoch": 5.92110025334781, + "grad_norm": 0.9695167541503906, + "learning_rate": 0.0002, + "loss": 0.969, + "step": 8180 + }, + { + "epoch": 5.928338762214984, + "grad_norm": 1.1169592142105103, + "learning_rate": 0.0002, + "loss": 0.9858, + "step": 8190 + }, + { + "epoch": 5.935577271082157, + "grad_norm": 1.5116780996322632, + "learning_rate": 0.0002, + "loss": 1.0924, + "step": 8200 + }, + { + "epoch": 5.94281577994933, + "grad_norm": 1.0073388814926147, + "learning_rate": 0.0002, + "loss": 0.878, + "step": 8210 + }, + { + "epoch": 5.950054288816504, + "grad_norm": 0.9323263168334961, + "learning_rate": 0.0002, + "loss": 1.0462, + "step": 8220 + }, + { + "epoch": 5.957292797683677, + "grad_norm": 0.9422887563705444, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 8230 + }, + { + "epoch": 5.9645313065508505, + "grad_norm": 0.9691047668457031, + "learning_rate": 0.0002, + "loss": 0.953, + "step": 8240 + }, + { + "epoch": 5.971769815418024, + "grad_norm": 0.9650622606277466, + "learning_rate": 0.0002, + "loss": 0.9842, + "step": 8250 + }, + { + "epoch": 5.979008324285197, + "grad_norm": 1.077958345413208, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 8260 + }, + { + "epoch": 5.986246833152371, + "grad_norm": 0.8946306109428406, + "learning_rate": 0.0002, + "loss": 0.9162, + "step": 8270 + }, + { + "epoch": 5.993485342019544, + "grad_norm": 1.34098219871521, + "learning_rate": 0.0002, + "loss": 1.0439, + "step": 8280 + }, + { + "epoch": 6.0, + "eval_loss": 1.4714229106903076, + "eval_runtime": 26.301, + "eval_samples_per_second": 16.577, + "eval_steps_per_second": 2.091, + "step": 8289 + } + ], + "logging_steps": 10, + "max_steps": 11048, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0130812762167706e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4d7b7431bbbe8c9bf29b925bca391a558af5ff8c --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad613885e4f267fc04125f1a836d42cfa796bbe12e536f9ee60c955de02cdb5a +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/README.md b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/adapter_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e99bbcd43df1c19d98706c7e3be95c93844c5349 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/adapter_model.safetensors b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5927413cd6210d26978b09f0d6db928597d370cb --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7d325e62cb89159e996e41924b13e86460baf25ad8e6335045fc107f85fb39b +size 29500848 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/optimizer.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..966f2f94ca868ec2c430c250f47949da6960a17a --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03eb9b89691ccce742b81163a460675912dbecdb45aac3bcb0b7f01e7487bc8e +size 15064314 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/rng_state.pth b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..dd4137551d3fee184cc9b694c303a190f5778edd --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86980a9d744b0b83618f85562e4bb02bf4e224ee1e6dd81b109fdc19b8619627 +size 14244 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/scheduler.pt b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c74709cc1d4b3eefc8991b311b4299b91d52f6b --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eef3501f6ad2d312c19551b58b1762e6f84df60507cfb6d4e18ac4e3ca5b0721 +size 1064 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/trainer_state.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..28a8418758ac0c73331f559afa1f8718048c16fe --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/trainer_state.json @@ -0,0 +1,6858 @@ +{ + "best_metric": 1.4217946529388428, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144", + "epoch": 6.999638074556641, + "eval_steps": 10, + "global_step": 9670, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007238508867173362, + "grad_norm": 1.2523442506790161, + "learning_rate": 0.0002, + "loss": 4.7061, + "step": 10 + }, + { + "epoch": 0.014477017734346724, + "grad_norm": 1.8887330293655396, + "learning_rate": 0.0002, + "loss": 3.3493, + "step": 20 + }, + { + "epoch": 0.021715526601520086, + "grad_norm": 0.9668035507202148, + "learning_rate": 0.0002, + "loss": 2.7585, + "step": 30 + }, + { + "epoch": 0.028954035468693448, + "grad_norm": 2.9167306423187256, + "learning_rate": 0.0002, + "loss": 2.3699, + "step": 40 + }, + { + "epoch": 0.036192544335866814, + "grad_norm": 2.649867296218872, + "learning_rate": 0.0002, + "loss": 2.2679, + "step": 50 + }, + { + "epoch": 0.04343105320304017, + "grad_norm": 1.5120655298233032, + "learning_rate": 0.0002, + "loss": 2.2202, + "step": 60 + }, + { + "epoch": 0.05066956207021354, + "grad_norm": 0.7879868149757385, + "learning_rate": 0.0002, + "loss": 2.2026, + "step": 70 + }, + { + "epoch": 0.057908070937386896, + "grad_norm": 0.7616953253746033, + "learning_rate": 0.0002, + "loss": 1.9447, + "step": 80 + }, + { + "epoch": 0.06514657980456026, + "grad_norm": 1.8809149265289307, + "learning_rate": 0.0002, + "loss": 2.0112, + "step": 90 + }, + { + "epoch": 0.07238508867173363, + "grad_norm": 0.9294016361236572, + "learning_rate": 0.0002, + "loss": 1.8337, + "step": 100 + }, + { + "epoch": 0.07962359753890698, + "grad_norm": 0.7145281434059143, + "learning_rate": 0.0002, + "loss": 1.8419, + "step": 110 + }, + { + "epoch": 0.08686210640608034, + "grad_norm": 0.7564446330070496, + "learning_rate": 0.0002, + "loss": 2.0036, + "step": 120 + }, + { + "epoch": 0.09410061527325371, + "grad_norm": 1.1681925058364868, + "learning_rate": 0.0002, + "loss": 1.9306, + "step": 130 + }, + { + "epoch": 0.10133912414042708, + "grad_norm": 0.6708641648292542, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 140 + }, + { + "epoch": 0.10857763300760044, + "grad_norm": 0.7625647783279419, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 150 + }, + { + "epoch": 0.11581614187477379, + "grad_norm": 0.8463464975357056, + "learning_rate": 0.0002, + "loss": 1.6687, + "step": 160 + }, + { + "epoch": 0.12305465074194716, + "grad_norm": 0.7502335906028748, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 170 + }, + { + "epoch": 0.13029315960912052, + "grad_norm": 0.6929958462715149, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 180 + }, + { + "epoch": 0.1375316684762939, + "grad_norm": 0.6798707842826843, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 190 + }, + { + "epoch": 0.14477017734346725, + "grad_norm": 0.7566508650779724, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 200 + }, + { + "epoch": 0.15200868621064062, + "grad_norm": 0.7196869850158691, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 210 + }, + { + "epoch": 0.15924719507781396, + "grad_norm": 0.8401045799255371, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 220 + }, + { + "epoch": 0.16648570394498732, + "grad_norm": 0.8503773212432861, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 230 + }, + { + "epoch": 0.1737242128121607, + "grad_norm": 0.7183733582496643, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 240 + }, + { + "epoch": 0.18096272167933405, + "grad_norm": 0.7082605957984924, + "learning_rate": 0.0002, + "loss": 1.6693, + "step": 250 + }, + { + "epoch": 0.18820123054650742, + "grad_norm": 0.9386326670646667, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 260 + }, + { + "epoch": 0.19543973941368079, + "grad_norm": 0.7332451939582825, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 270 + }, + { + "epoch": 0.20267824828085415, + "grad_norm": 0.7092869877815247, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 280 + }, + { + "epoch": 0.20991675714802752, + "grad_norm": 0.7256413698196411, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 290 + }, + { + "epoch": 0.21715526601520088, + "grad_norm": 0.6398681402206421, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 300 + }, + { + "epoch": 0.22439377488237422, + "grad_norm": 0.6273287534713745, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 310 + }, + { + "epoch": 0.23163228374954759, + "grad_norm": 0.511648416519165, + "learning_rate": 0.0002, + "loss": 1.5115, + "step": 320 + }, + { + "epoch": 0.23887079261672095, + "grad_norm": 0.8677352070808411, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 330 + }, + { + "epoch": 0.24610930148389432, + "grad_norm": 0.6270743012428284, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 340 + }, + { + "epoch": 0.2533478103510677, + "grad_norm": 0.7980281114578247, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 350 + }, + { + "epoch": 0.26058631921824105, + "grad_norm": 0.632486879825592, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 360 + }, + { + "epoch": 0.2678248280854144, + "grad_norm": 0.6527034640312195, + "learning_rate": 0.0002, + "loss": 1.5175, + "step": 370 + }, + { + "epoch": 0.2750633369525878, + "grad_norm": 0.7672118544578552, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 380 + }, + { + "epoch": 0.28230184581976114, + "grad_norm": 0.6035117506980896, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 390 + }, + { + "epoch": 0.2895403546869345, + "grad_norm": 0.5955103039741516, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 400 + }, + { + "epoch": 0.2967788635541079, + "grad_norm": 0.6015191674232483, + "learning_rate": 0.0002, + "loss": 1.558, + "step": 410 + }, + { + "epoch": 0.30401737242128124, + "grad_norm": 0.6380982398986816, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 420 + }, + { + "epoch": 0.3112558812884546, + "grad_norm": 0.6707863211631775, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 430 + }, + { + "epoch": 0.3184943901556279, + "grad_norm": 0.7010176777839661, + "learning_rate": 0.0002, + "loss": 1.4426, + "step": 440 + }, + { + "epoch": 0.3257328990228013, + "grad_norm": 0.8263739943504333, + "learning_rate": 0.0002, + "loss": 1.5572, + "step": 450 + }, + { + "epoch": 0.33297140788997465, + "grad_norm": 0.7253276109695435, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 460 + }, + { + "epoch": 0.340209916757148, + "grad_norm": 0.5238934755325317, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 470 + }, + { + "epoch": 0.3474484256243214, + "grad_norm": 0.7869495749473572, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 480 + }, + { + "epoch": 0.35468693449149474, + "grad_norm": 0.7485215663909912, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 490 + }, + { + "epoch": 0.3619254433586681, + "grad_norm": 0.5413193106651306, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 500 + }, + { + "epoch": 0.3691639522258415, + "grad_norm": 0.7615048885345459, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 510 + }, + { + "epoch": 0.37640246109301484, + "grad_norm": 0.7685340046882629, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 520 + }, + { + "epoch": 0.3836409699601882, + "grad_norm": 0.6379081010818481, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 530 + }, + { + "epoch": 0.39087947882736157, + "grad_norm": 0.7946939468383789, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 540 + }, + { + "epoch": 0.39811798769453494, + "grad_norm": 0.6287278532981873, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 550 + }, + { + "epoch": 0.4053564965617083, + "grad_norm": 0.6811642646789551, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 560 + }, + { + "epoch": 0.41259500542888167, + "grad_norm": 0.671073317527771, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 570 + }, + { + "epoch": 0.41983351429605503, + "grad_norm": 0.6313900351524353, + "learning_rate": 0.0002, + "loss": 1.6753, + "step": 580 + }, + { + "epoch": 0.4270720231632284, + "grad_norm": 0.5291772484779358, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 590 + }, + { + "epoch": 0.43431053203040176, + "grad_norm": 0.62503582239151, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 600 + }, + { + "epoch": 0.4415490408975751, + "grad_norm": 0.5777305364608765, + "learning_rate": 0.0002, + "loss": 1.6276, + "step": 610 + }, + { + "epoch": 0.44878754976474844, + "grad_norm": 0.7013497352600098, + "learning_rate": 0.0002, + "loss": 1.4758, + "step": 620 + }, + { + "epoch": 0.4560260586319218, + "grad_norm": 0.8044822216033936, + "learning_rate": 0.0002, + "loss": 1.4029, + "step": 630 + }, + { + "epoch": 0.46326456749909517, + "grad_norm": 0.672531247138977, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 640 + }, + { + "epoch": 0.47050307636626854, + "grad_norm": 0.6233910322189331, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 650 + }, + { + "epoch": 0.4777415852334419, + "grad_norm": 0.651524543762207, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 660 + }, + { + "epoch": 0.48498009410061527, + "grad_norm": 0.7213939428329468, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 670 + }, + { + "epoch": 0.49221860296778863, + "grad_norm": 0.6541454792022705, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 680 + }, + { + "epoch": 0.499457111834962, + "grad_norm": 0.6568936109542847, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 690 + }, + { + "epoch": 0.5066956207021354, + "grad_norm": 0.7176415324211121, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 700 + }, + { + "epoch": 0.5139341295693087, + "grad_norm": 0.6553855538368225, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 710 + }, + { + "epoch": 0.5211726384364821, + "grad_norm": 0.5654335618019104, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 720 + }, + { + "epoch": 0.5284111473036555, + "grad_norm": 0.5671001672744751, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 730 + }, + { + "epoch": 0.5356496561708288, + "grad_norm": 0.7914412021636963, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 740 + }, + { + "epoch": 0.5428881650380022, + "grad_norm": 0.6172138452529907, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 750 + }, + { + "epoch": 0.5501266739051756, + "grad_norm": 0.6132623553276062, + "learning_rate": 0.0002, + "loss": 1.4018, + "step": 760 + }, + { + "epoch": 0.5573651827723489, + "grad_norm": 0.654000461101532, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 770 + }, + { + "epoch": 0.5646036916395223, + "grad_norm": 0.5691370964050293, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 780 + }, + { + "epoch": 0.5718422005066957, + "grad_norm": 0.7922580242156982, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 790 + }, + { + "epoch": 0.579080709373869, + "grad_norm": 0.6831880211830139, + "learning_rate": 0.0002, + "loss": 1.4521, + "step": 800 + }, + { + "epoch": 0.5863192182410424, + "grad_norm": 0.6740124821662903, + "learning_rate": 0.0002, + "loss": 1.4734, + "step": 810 + }, + { + "epoch": 0.5935577271082157, + "grad_norm": 1.380016803741455, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 820 + }, + { + "epoch": 0.6007962359753891, + "grad_norm": 0.6552878022193909, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 830 + }, + { + "epoch": 0.6080347448425625, + "grad_norm": 0.6649535298347473, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 840 + }, + { + "epoch": 0.6152732537097358, + "grad_norm": 0.561738133430481, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 850 + }, + { + "epoch": 0.6225117625769092, + "grad_norm": 0.6133047938346863, + "learning_rate": 0.0002, + "loss": 1.5364, + "step": 860 + }, + { + "epoch": 0.6297502714440825, + "grad_norm": 0.559843122959137, + "learning_rate": 0.0002, + "loss": 1.3489, + "step": 870 + }, + { + "epoch": 0.6369887803112558, + "grad_norm": 0.6117811799049377, + "learning_rate": 0.0002, + "loss": 1.4878, + "step": 880 + }, + { + "epoch": 0.6442272891784292, + "grad_norm": 0.6209776401519775, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 890 + }, + { + "epoch": 0.6514657980456026, + "grad_norm": 0.6234082579612732, + "learning_rate": 0.0002, + "loss": 1.6747, + "step": 900 + }, + { + "epoch": 0.6587043069127759, + "grad_norm": 0.7623258233070374, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 910 + }, + { + "epoch": 0.6659428157799493, + "grad_norm": 0.6148061752319336, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 920 + }, + { + "epoch": 0.6731813246471227, + "grad_norm": 0.6682973504066467, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 930 + }, + { + "epoch": 0.680419833514296, + "grad_norm": 0.5513041615486145, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 940 + }, + { + "epoch": 0.6876583423814694, + "grad_norm": 0.5197525024414062, + "learning_rate": 0.0002, + "loss": 1.3991, + "step": 950 + }, + { + "epoch": 0.6948968512486428, + "grad_norm": 0.6490758061408997, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 960 + }, + { + "epoch": 0.7021353601158161, + "grad_norm": 0.6450682878494263, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 970 + }, + { + "epoch": 0.7093738689829895, + "grad_norm": 0.6203766465187073, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 980 + }, + { + "epoch": 0.7166123778501629, + "grad_norm": 0.6023609638214111, + "learning_rate": 0.0002, + "loss": 1.4575, + "step": 990 + }, + { + "epoch": 0.7238508867173362, + "grad_norm": 0.5765255093574524, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 1000 + }, + { + "epoch": 0.7310893955845096, + "grad_norm": 0.6650075316429138, + "learning_rate": 0.0002, + "loss": 1.483, + "step": 1010 + }, + { + "epoch": 0.738327904451683, + "grad_norm": 0.5610854029655457, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 1020 + }, + { + "epoch": 0.7455664133188563, + "grad_norm": 0.7072813510894775, + "learning_rate": 0.0002, + "loss": 1.5248, + "step": 1030 + }, + { + "epoch": 0.7528049221860297, + "grad_norm": 0.6815407872200012, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 1040 + }, + { + "epoch": 0.760043431053203, + "grad_norm": 0.7932390570640564, + "learning_rate": 0.0002, + "loss": 1.4577, + "step": 1050 + }, + { + "epoch": 0.7672819399203764, + "grad_norm": 0.5798183083534241, + "learning_rate": 0.0002, + "loss": 1.4515, + "step": 1060 + }, + { + "epoch": 0.7745204487875498, + "grad_norm": 0.7898504137992859, + "learning_rate": 0.0002, + "loss": 1.5053, + "step": 1070 + }, + { + "epoch": 0.7817589576547231, + "grad_norm": 0.4983280301094055, + "learning_rate": 0.0002, + "loss": 1.4776, + "step": 1080 + }, + { + "epoch": 0.7889974665218965, + "grad_norm": 0.691403329372406, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 1090 + }, + { + "epoch": 0.7962359753890699, + "grad_norm": 0.5394481420516968, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 1100 + }, + { + "epoch": 0.8034744842562432, + "grad_norm": 0.5136822462081909, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 1110 + }, + { + "epoch": 0.8107129931234166, + "grad_norm": 0.6828126907348633, + "learning_rate": 0.0002, + "loss": 1.4902, + "step": 1120 + }, + { + "epoch": 0.81795150199059, + "grad_norm": 0.6799656748771667, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 1130 + }, + { + "epoch": 0.8251900108577633, + "grad_norm": 0.5428406000137329, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 1140 + }, + { + "epoch": 0.8324285197249367, + "grad_norm": 0.4811290502548218, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1150 + }, + { + "epoch": 0.8396670285921101, + "grad_norm": 0.5519434809684753, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 1160 + }, + { + "epoch": 0.8469055374592834, + "grad_norm": 0.9748060703277588, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 1170 + }, + { + "epoch": 0.8541440463264568, + "grad_norm": 0.712609589099884, + "learning_rate": 0.0002, + "loss": 1.4309, + "step": 1180 + }, + { + "epoch": 0.8613825551936302, + "grad_norm": 0.6866157054901123, + "learning_rate": 0.0002, + "loss": 1.434, + "step": 1190 + }, + { + "epoch": 0.8686210640608035, + "grad_norm": 0.5068854093551636, + "learning_rate": 0.0002, + "loss": 1.3704, + "step": 1200 + }, + { + "epoch": 0.8758595729279768, + "grad_norm": 0.6333245038986206, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1210 + }, + { + "epoch": 0.8830980817951501, + "grad_norm": 0.6424421072006226, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 1220 + }, + { + "epoch": 0.8903365906623235, + "grad_norm": 0.4771921932697296, + "learning_rate": 0.0002, + "loss": 1.4186, + "step": 1230 + }, + { + "epoch": 0.8975750995294969, + "grad_norm": 0.5191764235496521, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1240 + }, + { + "epoch": 0.9048136083966702, + "grad_norm": 0.756222128868103, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 1250 + }, + { + "epoch": 0.9120521172638436, + "grad_norm": 0.623823881149292, + "learning_rate": 0.0002, + "loss": 1.4396, + "step": 1260 + }, + { + "epoch": 0.919290626131017, + "grad_norm": 0.8166571259498596, + "learning_rate": 0.0002, + "loss": 1.3097, + "step": 1270 + }, + { + "epoch": 0.9265291349981903, + "grad_norm": 0.6059346795082092, + "learning_rate": 0.0002, + "loss": 1.4625, + "step": 1280 + }, + { + "epoch": 0.9337676438653637, + "grad_norm": 0.5842690467834473, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 1290 + }, + { + "epoch": 0.9410061527325371, + "grad_norm": 0.7649800777435303, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1300 + }, + { + "epoch": 0.9482446615997104, + "grad_norm": 0.6420919895172119, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 1310 + }, + { + "epoch": 0.9554831704668838, + "grad_norm": 0.7011452913284302, + "learning_rate": 0.0002, + "loss": 1.453, + "step": 1320 + }, + { + "epoch": 0.9627216793340572, + "grad_norm": 0.5783746242523193, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1330 + }, + { + "epoch": 0.9699601882012305, + "grad_norm": 0.5973192453384399, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1340 + }, + { + "epoch": 0.9771986970684039, + "grad_norm": 0.6181833744049072, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 1350 + }, + { + "epoch": 0.9844372059355773, + "grad_norm": 0.5563396215438843, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 1360 + }, + { + "epoch": 0.9916757148027506, + "grad_norm": 0.45723360776901245, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 1370 + }, + { + "epoch": 0.998914223669924, + "grad_norm": 0.5947498679161072, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 1380 + }, + { + "epoch": 0.9996380745566413, + "eval_loss": 1.480796456336975, + "eval_runtime": 27.3103, + "eval_samples_per_second": 15.965, + "eval_steps_per_second": 2.014, + "step": 1381 + }, + { + "epoch": 1.0061527325370974, + "grad_norm": 0.5599952936172485, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 1390 + }, + { + "epoch": 1.0133912414042707, + "grad_norm": 0.5932008028030396, + "learning_rate": 0.0002, + "loss": 1.4991, + "step": 1400 + }, + { + "epoch": 1.020629750271444, + "grad_norm": 0.6194121837615967, + "learning_rate": 0.0002, + "loss": 1.4506, + "step": 1410 + }, + { + "epoch": 1.0278682591386175, + "grad_norm": 0.6995621919631958, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 1420 + }, + { + "epoch": 1.0351067680057908, + "grad_norm": 0.7905810475349426, + "learning_rate": 0.0002, + "loss": 1.4153, + "step": 1430 + }, + { + "epoch": 1.0423452768729642, + "grad_norm": 0.7221615314483643, + "learning_rate": 0.0002, + "loss": 1.4414, + "step": 1440 + }, + { + "epoch": 1.0495837857401376, + "grad_norm": 0.6170642375946045, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 1450 + }, + { + "epoch": 1.056822294607311, + "grad_norm": 0.5844094753265381, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 1460 + }, + { + "epoch": 1.0640608034744843, + "grad_norm": 0.7731822729110718, + "learning_rate": 0.0002, + "loss": 1.4871, + "step": 1470 + }, + { + "epoch": 1.0712993123416577, + "grad_norm": 0.4554748237133026, + "learning_rate": 0.0002, + "loss": 1.4286, + "step": 1480 + }, + { + "epoch": 1.078537821208831, + "grad_norm": 0.6923259496688843, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 1490 + }, + { + "epoch": 1.0857763300760044, + "grad_norm": 0.6008219122886658, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 1500 + }, + { + "epoch": 1.0930148389431777, + "grad_norm": 0.6450045704841614, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 1510 + }, + { + "epoch": 1.1002533478103511, + "grad_norm": 0.7833753824234009, + "learning_rate": 0.0002, + "loss": 1.3295, + "step": 1520 + }, + { + "epoch": 1.1074918566775245, + "grad_norm": 0.5076758861541748, + "learning_rate": 0.0002, + "loss": 1.3424, + "step": 1530 + }, + { + "epoch": 1.1147303655446978, + "grad_norm": 0.5661332011222839, + "learning_rate": 0.0002, + "loss": 1.4043, + "step": 1540 + }, + { + "epoch": 1.1219688744118712, + "grad_norm": 0.6526919603347778, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 1550 + }, + { + "epoch": 1.1292073832790446, + "grad_norm": 0.5613082647323608, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1560 + }, + { + "epoch": 1.136445892146218, + "grad_norm": 0.6113885641098022, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 1570 + }, + { + "epoch": 1.1436844010133913, + "grad_norm": 0.6732510328292847, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 1580 + }, + { + "epoch": 1.1509229098805647, + "grad_norm": 0.6146392226219177, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 1590 + }, + { + "epoch": 1.158161418747738, + "grad_norm": 0.6766974329948425, + "learning_rate": 0.0002, + "loss": 1.411, + "step": 1600 + }, + { + "epoch": 1.1653999276149114, + "grad_norm": 0.7621957659721375, + "learning_rate": 0.0002, + "loss": 1.2401, + "step": 1610 + }, + { + "epoch": 1.1726384364820848, + "grad_norm": 0.6959581971168518, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 1620 + }, + { + "epoch": 1.1798769453492581, + "grad_norm": 0.6691278219223022, + "learning_rate": 0.0002, + "loss": 1.382, + "step": 1630 + }, + { + "epoch": 1.1871154542164315, + "grad_norm": 0.4927774965763092, + "learning_rate": 0.0002, + "loss": 1.4147, + "step": 1640 + }, + { + "epoch": 1.1943539630836049, + "grad_norm": 0.7724234461784363, + "learning_rate": 0.0002, + "loss": 1.449, + "step": 1650 + }, + { + "epoch": 1.2015924719507782, + "grad_norm": 0.6817787885665894, + "learning_rate": 0.0002, + "loss": 1.4778, + "step": 1660 + }, + { + "epoch": 1.2088309808179516, + "grad_norm": 0.6500699520111084, + "learning_rate": 0.0002, + "loss": 1.3776, + "step": 1670 + }, + { + "epoch": 1.216069489685125, + "grad_norm": 0.5703568458557129, + "learning_rate": 0.0002, + "loss": 1.3875, + "step": 1680 + }, + { + "epoch": 1.2233079985522983, + "grad_norm": 0.6261579990386963, + "learning_rate": 0.0002, + "loss": 1.4735, + "step": 1690 + }, + { + "epoch": 1.2305465074194717, + "grad_norm": 0.651713490486145, + "learning_rate": 0.0002, + "loss": 1.3898, + "step": 1700 + }, + { + "epoch": 1.237785016286645, + "grad_norm": 0.684399425983429, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 1710 + }, + { + "epoch": 1.2450235251538184, + "grad_norm": 0.6996857523918152, + "learning_rate": 0.0002, + "loss": 1.5027, + "step": 1720 + }, + { + "epoch": 1.2522620340209918, + "grad_norm": 0.7102537751197815, + "learning_rate": 0.0002, + "loss": 1.3326, + "step": 1730 + }, + { + "epoch": 1.2595005428881652, + "grad_norm": 0.45809897780418396, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 1740 + }, + { + "epoch": 1.2667390517553385, + "grad_norm": 0.6377046704292297, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 1750 + }, + { + "epoch": 1.2739775606225119, + "grad_norm": 0.6965704560279846, + "learning_rate": 0.0002, + "loss": 1.3479, + "step": 1760 + }, + { + "epoch": 1.2812160694896852, + "grad_norm": 0.5688214302062988, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 1770 + }, + { + "epoch": 1.2884545783568586, + "grad_norm": 0.6384190320968628, + "learning_rate": 0.0002, + "loss": 1.3967, + "step": 1780 + }, + { + "epoch": 1.295693087224032, + "grad_norm": 0.5629363656044006, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1790 + }, + { + "epoch": 1.3029315960912053, + "grad_norm": 0.6148255467414856, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 1800 + }, + { + "epoch": 1.3101701049583787, + "grad_norm": 0.655580997467041, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 1810 + }, + { + "epoch": 1.3174086138255519, + "grad_norm": 0.5642657279968262, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 1820 + }, + { + "epoch": 1.3246471226927252, + "grad_norm": 0.59607994556427, + "learning_rate": 0.0002, + "loss": 1.3246, + "step": 1830 + }, + { + "epoch": 1.3318856315598986, + "grad_norm": 0.5564199090003967, + "learning_rate": 0.0002, + "loss": 1.3274, + "step": 1840 + }, + { + "epoch": 1.339124140427072, + "grad_norm": 0.6949955821037292, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1850 + }, + { + "epoch": 1.3463626492942453, + "grad_norm": 0.7036856412887573, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 1860 + }, + { + "epoch": 1.3536011581614187, + "grad_norm": 0.722062885761261, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 1870 + }, + { + "epoch": 1.360839667028592, + "grad_norm": 0.6098677515983582, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 1880 + }, + { + "epoch": 1.3680781758957654, + "grad_norm": 0.5376402735710144, + "learning_rate": 0.0002, + "loss": 1.6217, + "step": 1890 + }, + { + "epoch": 1.3753166847629388, + "grad_norm": 0.6974610090255737, + "learning_rate": 0.0002, + "loss": 1.5071, + "step": 1900 + }, + { + "epoch": 1.3825551936301121, + "grad_norm": 0.6520763635635376, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 1910 + }, + { + "epoch": 1.3897937024972855, + "grad_norm": 0.6604374647140503, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 1920 + }, + { + "epoch": 1.3970322113644589, + "grad_norm": 0.7364398241043091, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 1930 + }, + { + "epoch": 1.4042707202316322, + "grad_norm": 0.6849475502967834, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 1940 + }, + { + "epoch": 1.4115092290988056, + "grad_norm": 0.6562670469284058, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 1950 + }, + { + "epoch": 1.418747737965979, + "grad_norm": 0.5695616006851196, + "learning_rate": 0.0002, + "loss": 1.4725, + "step": 1960 + }, + { + "epoch": 1.4259862468331523, + "grad_norm": 0.5244464874267578, + "learning_rate": 0.0002, + "loss": 1.3088, + "step": 1970 + }, + { + "epoch": 1.4332247557003257, + "grad_norm": 0.6347293257713318, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 1980 + }, + { + "epoch": 1.440463264567499, + "grad_norm": 0.5528361201286316, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 1990 + }, + { + "epoch": 1.4477017734346724, + "grad_norm": 0.6987585425376892, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 2000 + }, + { + "epoch": 1.4549402823018458, + "grad_norm": 0.6568987369537354, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 2010 + }, + { + "epoch": 1.4621787911690192, + "grad_norm": 0.7665994763374329, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 2020 + }, + { + "epoch": 1.4694173000361925, + "grad_norm": 0.5127707123756409, + "learning_rate": 0.0002, + "loss": 1.244, + "step": 2030 + }, + { + "epoch": 1.476655808903366, + "grad_norm": 0.5406824946403503, + "learning_rate": 0.0002, + "loss": 1.3699, + "step": 2040 + }, + { + "epoch": 1.4838943177705393, + "grad_norm": 0.5990166664123535, + "learning_rate": 0.0002, + "loss": 1.3353, + "step": 2050 + }, + { + "epoch": 1.4911328266377126, + "grad_norm": 0.6186193823814392, + "learning_rate": 0.0002, + "loss": 1.2454, + "step": 2060 + }, + { + "epoch": 1.498371335504886, + "grad_norm": 0.6154307126998901, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2070 + }, + { + "epoch": 1.5056098443720594, + "grad_norm": 0.5606056451797485, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2080 + }, + { + "epoch": 1.5128483532392327, + "grad_norm": 0.5006417036056519, + "learning_rate": 0.0002, + "loss": 1.2405, + "step": 2090 + }, + { + "epoch": 1.520086862106406, + "grad_norm": 0.5968486070632935, + "learning_rate": 0.0002, + "loss": 1.4258, + "step": 2100 + }, + { + "epoch": 1.5273253709735795, + "grad_norm": 0.5835496187210083, + "learning_rate": 0.0002, + "loss": 1.2752, + "step": 2110 + }, + { + "epoch": 1.5345638798407528, + "grad_norm": 0.6753535270690918, + "learning_rate": 0.0002, + "loss": 1.5443, + "step": 2120 + }, + { + "epoch": 1.5418023887079262, + "grad_norm": 0.7299720644950867, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 2130 + }, + { + "epoch": 1.5490408975750996, + "grad_norm": 0.5105988383293152, + "learning_rate": 0.0002, + "loss": 1.2364, + "step": 2140 + }, + { + "epoch": 1.556279406442273, + "grad_norm": 0.5675431489944458, + "learning_rate": 0.0002, + "loss": 1.4528, + "step": 2150 + }, + { + "epoch": 1.5635179153094463, + "grad_norm": 0.6246723532676697, + "learning_rate": 0.0002, + "loss": 1.4563, + "step": 2160 + }, + { + "epoch": 1.5707564241766196, + "grad_norm": 0.7291720509529114, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 2170 + }, + { + "epoch": 1.577994933043793, + "grad_norm": 0.678114116191864, + "learning_rate": 0.0002, + "loss": 1.5432, + "step": 2180 + }, + { + "epoch": 1.5852334419109664, + "grad_norm": 0.5136260986328125, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 2190 + }, + { + "epoch": 1.5924719507781397, + "grad_norm": 0.6359935998916626, + "learning_rate": 0.0002, + "loss": 1.3271, + "step": 2200 + }, + { + "epoch": 1.599710459645313, + "grad_norm": 0.7650278806686401, + "learning_rate": 0.0002, + "loss": 1.4038, + "step": 2210 + }, + { + "epoch": 1.6069489685124865, + "grad_norm": 0.7256110906600952, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 2220 + }, + { + "epoch": 1.6141874773796598, + "grad_norm": 0.688689649105072, + "learning_rate": 0.0002, + "loss": 1.4387, + "step": 2230 + }, + { + "epoch": 1.6214259862468332, + "grad_norm": 0.6045311093330383, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 2240 + }, + { + "epoch": 1.6286644951140063, + "grad_norm": 0.7064604163169861, + "learning_rate": 0.0002, + "loss": 1.4097, + "step": 2250 + }, + { + "epoch": 1.6359030039811797, + "grad_norm": 0.5309562087059021, + "learning_rate": 0.0002, + "loss": 1.3477, + "step": 2260 + }, + { + "epoch": 1.643141512848353, + "grad_norm": 0.5687053203582764, + "learning_rate": 0.0002, + "loss": 1.4022, + "step": 2270 + }, + { + "epoch": 1.6503800217155264, + "grad_norm": 0.535872757434845, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 2280 + }, + { + "epoch": 1.6576185305826998, + "grad_norm": 0.5502381920814514, + "learning_rate": 0.0002, + "loss": 1.3844, + "step": 2290 + }, + { + "epoch": 1.6648570394498732, + "grad_norm": 0.6158602237701416, + "learning_rate": 0.0002, + "loss": 1.3764, + "step": 2300 + }, + { + "epoch": 1.6720955483170465, + "grad_norm": 0.5804675817489624, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 2310 + }, + { + "epoch": 1.67933405718422, + "grad_norm": 0.600742757320404, + "learning_rate": 0.0002, + "loss": 1.2532, + "step": 2320 + }, + { + "epoch": 1.6865725660513933, + "grad_norm": 0.7101941108703613, + "learning_rate": 0.0002, + "loss": 1.477, + "step": 2330 + }, + { + "epoch": 1.6938110749185666, + "grad_norm": 0.7507809996604919, + "learning_rate": 0.0002, + "loss": 1.4849, + "step": 2340 + }, + { + "epoch": 1.70104958378574, + "grad_norm": 0.768502414226532, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 2350 + }, + { + "epoch": 1.7082880926529134, + "grad_norm": 0.4801851212978363, + "learning_rate": 0.0002, + "loss": 1.3332, + "step": 2360 + }, + { + "epoch": 1.7155266015200867, + "grad_norm": 0.5322122573852539, + "learning_rate": 0.0002, + "loss": 1.4158, + "step": 2370 + }, + { + "epoch": 1.72276511038726, + "grad_norm": 0.587661862373352, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2380 + }, + { + "epoch": 1.7300036192544335, + "grad_norm": 0.6073525547981262, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 2390 + }, + { + "epoch": 1.7372421281216068, + "grad_norm": 0.6950460076332092, + "learning_rate": 0.0002, + "loss": 1.2754, + "step": 2400 + }, + { + "epoch": 1.7444806369887802, + "grad_norm": 0.5981102585792542, + "learning_rate": 0.0002, + "loss": 1.3858, + "step": 2410 + }, + { + "epoch": 1.7517191458559536, + "grad_norm": 0.544570803642273, + "learning_rate": 0.0002, + "loss": 1.4075, + "step": 2420 + }, + { + "epoch": 1.758957654723127, + "grad_norm": 0.5304399728775024, + "learning_rate": 0.0002, + "loss": 1.3861, + "step": 2430 + }, + { + "epoch": 1.7661961635903003, + "grad_norm": 0.7921594977378845, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 2440 + }, + { + "epoch": 1.7734346724574737, + "grad_norm": 0.6084808707237244, + "learning_rate": 0.0002, + "loss": 1.3053, + "step": 2450 + }, + { + "epoch": 1.780673181324647, + "grad_norm": 0.8844701051712036, + "learning_rate": 0.0002, + "loss": 1.3781, + "step": 2460 + }, + { + "epoch": 1.7879116901918204, + "grad_norm": 0.5729258060455322, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 2470 + }, + { + "epoch": 1.7951501990589938, + "grad_norm": 0.6303611993789673, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 2480 + }, + { + "epoch": 1.8023887079261671, + "grad_norm": 0.5627942085266113, + "learning_rate": 0.0002, + "loss": 1.3926, + "step": 2490 + }, + { + "epoch": 1.8096272167933405, + "grad_norm": 0.6724274158477783, + "learning_rate": 0.0002, + "loss": 1.3816, + "step": 2500 + }, + { + "epoch": 1.8168657256605139, + "grad_norm": 0.5030826330184937, + "learning_rate": 0.0002, + "loss": 1.2951, + "step": 2510 + }, + { + "epoch": 1.8241042345276872, + "grad_norm": 0.5504099130630493, + "learning_rate": 0.0002, + "loss": 1.2839, + "step": 2520 + }, + { + "epoch": 1.8313427433948606, + "grad_norm": 0.6338945627212524, + "learning_rate": 0.0002, + "loss": 1.4264, + "step": 2530 + }, + { + "epoch": 1.838581252262034, + "grad_norm": 0.5902037620544434, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 2540 + }, + { + "epoch": 1.8458197611292073, + "grad_norm": 0.48814457654953003, + "learning_rate": 0.0002, + "loss": 1.2961, + "step": 2550 + }, + { + "epoch": 1.8530582699963807, + "grad_norm": 0.6216312646865845, + "learning_rate": 0.0002, + "loss": 1.466, + "step": 2560 + }, + { + "epoch": 1.860296778863554, + "grad_norm": 0.635603666305542, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 2570 + }, + { + "epoch": 1.8675352877307274, + "grad_norm": 0.6938216090202332, + "learning_rate": 0.0002, + "loss": 1.372, + "step": 2580 + }, + { + "epoch": 1.8747737965979008, + "grad_norm": 0.599557638168335, + "learning_rate": 0.0002, + "loss": 1.5011, + "step": 2590 + }, + { + "epoch": 1.8820123054650741, + "grad_norm": 0.564424455165863, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 2600 + }, + { + "epoch": 1.8892508143322475, + "grad_norm": 0.5430700182914734, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 2610 + }, + { + "epoch": 1.8964893231994209, + "grad_norm": 0.6150169372558594, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 2620 + }, + { + "epoch": 1.9037278320665942, + "grad_norm": 0.48159119486808777, + "learning_rate": 0.0002, + "loss": 1.2474, + "step": 2630 + }, + { + "epoch": 1.9109663409337676, + "grad_norm": 0.5608997941017151, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 2640 + }, + { + "epoch": 1.918204849800941, + "grad_norm": 0.6454501748085022, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 2650 + }, + { + "epoch": 1.9254433586681143, + "grad_norm": 0.5458073616027832, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 2660 + }, + { + "epoch": 1.9326818675352877, + "grad_norm": 0.5328490734100342, + "learning_rate": 0.0002, + "loss": 1.3208, + "step": 2670 + }, + { + "epoch": 1.939920376402461, + "grad_norm": 0.6444696187973022, + "learning_rate": 0.0002, + "loss": 1.4971, + "step": 2680 + }, + { + "epoch": 1.9471588852696344, + "grad_norm": 0.7126023769378662, + "learning_rate": 0.0002, + "loss": 1.5387, + "step": 2690 + }, + { + "epoch": 1.9543973941368078, + "grad_norm": 0.5164045095443726, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 2700 + }, + { + "epoch": 1.9616359030039812, + "grad_norm": 0.5347061157226562, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 2710 + }, + { + "epoch": 1.9688744118711545, + "grad_norm": 0.5297950506210327, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 2720 + }, + { + "epoch": 1.976112920738328, + "grad_norm": 0.6537790298461914, + "learning_rate": 0.0002, + "loss": 1.3566, + "step": 2730 + }, + { + "epoch": 1.9833514296055013, + "grad_norm": 0.5536222457885742, + "learning_rate": 0.0002, + "loss": 1.332, + "step": 2740 + }, + { + "epoch": 1.9905899384726746, + "grad_norm": 0.4856105446815491, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 2750 + }, + { + "epoch": 1.997828447339848, + "grad_norm": 0.6642730832099915, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 2760 + }, + { + "epoch": 2.0, + "eval_loss": 1.4366681575775146, + "eval_runtime": 27.3729, + "eval_samples_per_second": 15.928, + "eval_steps_per_second": 2.009, + "step": 2763 + }, + { + "epoch": 2.0050669562070214, + "grad_norm": 0.740253210067749, + "learning_rate": 0.0002, + "loss": 1.4322, + "step": 2770 + }, + { + "epoch": 2.0123054650741947, + "grad_norm": 0.5826276540756226, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 2780 + }, + { + "epoch": 2.019543973941368, + "grad_norm": 0.607356071472168, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 2790 + }, + { + "epoch": 2.0267824828085415, + "grad_norm": 0.5918063521385193, + "learning_rate": 0.0002, + "loss": 1.2601, + "step": 2800 + }, + { + "epoch": 2.034020991675715, + "grad_norm": 0.5610089898109436, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 2810 + }, + { + "epoch": 2.041259500542888, + "grad_norm": 0.5869926810264587, + "learning_rate": 0.0002, + "loss": 1.2092, + "step": 2820 + }, + { + "epoch": 2.0484980094100615, + "grad_norm": 0.5753467679023743, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 2830 + }, + { + "epoch": 2.055736518277235, + "grad_norm": 0.7096508145332336, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 2840 + }, + { + "epoch": 2.0629750271444083, + "grad_norm": 0.7653635144233704, + "learning_rate": 0.0002, + "loss": 1.1766, + "step": 2850 + }, + { + "epoch": 2.0702135360115816, + "grad_norm": 0.6202841997146606, + "learning_rate": 0.0002, + "loss": 1.2331, + "step": 2860 + }, + { + "epoch": 2.077452044878755, + "grad_norm": 0.6810227632522583, + "learning_rate": 0.0002, + "loss": 1.3298, + "step": 2870 + }, + { + "epoch": 2.0846905537459284, + "grad_norm": 0.7481493353843689, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 2880 + }, + { + "epoch": 2.0919290626131017, + "grad_norm": 0.7089637517929077, + "learning_rate": 0.0002, + "loss": 1.2484, + "step": 2890 + }, + { + "epoch": 2.099167571480275, + "grad_norm": 0.7472923398017883, + "learning_rate": 0.0002, + "loss": 1.3095, + "step": 2900 + }, + { + "epoch": 2.1064060803474485, + "grad_norm": 0.8135465979576111, + "learning_rate": 0.0002, + "loss": 1.304, + "step": 2910 + }, + { + "epoch": 2.113644589214622, + "grad_norm": 0.6097133159637451, + "learning_rate": 0.0002, + "loss": 1.273, + "step": 2920 + }, + { + "epoch": 2.120883098081795, + "grad_norm": 0.5970117449760437, + "learning_rate": 0.0002, + "loss": 1.3384, + "step": 2930 + }, + { + "epoch": 2.1281216069489686, + "grad_norm": 0.6169309616088867, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 2940 + }, + { + "epoch": 2.135360115816142, + "grad_norm": 0.9428738355636597, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 2950 + }, + { + "epoch": 2.1425986246833153, + "grad_norm": 0.5671679973602295, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 2960 + }, + { + "epoch": 2.1498371335504887, + "grad_norm": 0.7007262110710144, + "learning_rate": 0.0002, + "loss": 1.1375, + "step": 2970 + }, + { + "epoch": 2.157075642417662, + "grad_norm": 0.6294044256210327, + "learning_rate": 0.0002, + "loss": 1.2015, + "step": 2980 + }, + { + "epoch": 2.1643141512848354, + "grad_norm": 0.6105241775512695, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 2990 + }, + { + "epoch": 2.1715526601520088, + "grad_norm": 0.557124137878418, + "learning_rate": 0.0002, + "loss": 1.2065, + "step": 3000 + }, + { + "epoch": 2.178791169019182, + "grad_norm": 0.6250392198562622, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 3010 + }, + { + "epoch": 2.1860296778863555, + "grad_norm": 0.645218551158905, + "learning_rate": 0.0002, + "loss": 1.385, + "step": 3020 + }, + { + "epoch": 2.193268186753529, + "grad_norm": 0.9033605456352234, + "learning_rate": 0.0002, + "loss": 1.3928, + "step": 3030 + }, + { + "epoch": 2.2005066956207022, + "grad_norm": 0.5325747132301331, + "learning_rate": 0.0002, + "loss": 1.2458, + "step": 3040 + }, + { + "epoch": 2.2077452044878756, + "grad_norm": 0.6334700584411621, + "learning_rate": 0.0002, + "loss": 1.261, + "step": 3050 + }, + { + "epoch": 2.214983713355049, + "grad_norm": 0.5206325054168701, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 3060 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5987200140953064, + "learning_rate": 0.0002, + "loss": 1.3103, + "step": 3070 + }, + { + "epoch": 2.2294607310893957, + "grad_norm": 0.5893264412879944, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 3080 + }, + { + "epoch": 2.236699239956569, + "grad_norm": 0.6869237422943115, + "learning_rate": 0.0002, + "loss": 1.235, + "step": 3090 + }, + { + "epoch": 2.2439377488237424, + "grad_norm": 0.5040048360824585, + "learning_rate": 0.0002, + "loss": 1.3285, + "step": 3100 + }, + { + "epoch": 2.251176257690916, + "grad_norm": 0.6660613417625427, + "learning_rate": 0.0002, + "loss": 1.3316, + "step": 3110 + }, + { + "epoch": 2.258414766558089, + "grad_norm": 0.5890918970108032, + "learning_rate": 0.0002, + "loss": 1.3108, + "step": 3120 + }, + { + "epoch": 2.2656532754252625, + "grad_norm": 0.6458896994590759, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 3130 + }, + { + "epoch": 2.272891784292436, + "grad_norm": 0.6832690834999084, + "learning_rate": 0.0002, + "loss": 1.4151, + "step": 3140 + }, + { + "epoch": 2.2801302931596092, + "grad_norm": 0.833908200263977, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 3150 + }, + { + "epoch": 2.2873688020267826, + "grad_norm": 0.4596034586429596, + "learning_rate": 0.0002, + "loss": 1.2931, + "step": 3160 + }, + { + "epoch": 2.294607310893956, + "grad_norm": 0.9130966067314148, + "learning_rate": 0.0002, + "loss": 1.449, + "step": 3170 + }, + { + "epoch": 2.3018458197611293, + "grad_norm": 0.7143292427062988, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 3180 + }, + { + "epoch": 2.3090843286283027, + "grad_norm": 0.5388900637626648, + "learning_rate": 0.0002, + "loss": 1.2692, + "step": 3190 + }, + { + "epoch": 2.316322837495476, + "grad_norm": 0.5607513189315796, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 3200 + }, + { + "epoch": 2.3235613463626494, + "grad_norm": 0.6795142292976379, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 3210 + }, + { + "epoch": 2.330799855229823, + "grad_norm": 0.6561070680618286, + "learning_rate": 0.0002, + "loss": 1.3042, + "step": 3220 + }, + { + "epoch": 2.338038364096996, + "grad_norm": 0.8858118057250977, + "learning_rate": 0.0002, + "loss": 1.4636, + "step": 3230 + }, + { + "epoch": 2.3452768729641695, + "grad_norm": 0.6604151725769043, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 3240 + }, + { + "epoch": 2.352515381831343, + "grad_norm": 0.6755785346031189, + "learning_rate": 0.0002, + "loss": 1.4004, + "step": 3250 + }, + { + "epoch": 2.3597538906985163, + "grad_norm": 0.6981677412986755, + "learning_rate": 0.0002, + "loss": 1.2503, + "step": 3260 + }, + { + "epoch": 2.3669923995656896, + "grad_norm": 0.6338568329811096, + "learning_rate": 0.0002, + "loss": 1.3078, + "step": 3270 + }, + { + "epoch": 2.374230908432863, + "grad_norm": 0.5754265785217285, + "learning_rate": 0.0002, + "loss": 1.285, + "step": 3280 + }, + { + "epoch": 2.3814694173000364, + "grad_norm": 0.7533153295516968, + "learning_rate": 0.0002, + "loss": 1.2924, + "step": 3290 + }, + { + "epoch": 2.3887079261672097, + "grad_norm": 0.675065279006958, + "learning_rate": 0.0002, + "loss": 1.3711, + "step": 3300 + }, + { + "epoch": 2.395946435034383, + "grad_norm": 0.5686452984809875, + "learning_rate": 0.0002, + "loss": 1.3548, + "step": 3310 + }, + { + "epoch": 2.4031849439015565, + "grad_norm": 0.8129481673240662, + "learning_rate": 0.0002, + "loss": 1.1998, + "step": 3320 + }, + { + "epoch": 2.41042345276873, + "grad_norm": 0.6615934371948242, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 3330 + }, + { + "epoch": 2.417661961635903, + "grad_norm": 0.6678834557533264, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 3340 + }, + { + "epoch": 2.4249004705030766, + "grad_norm": 0.5581308007240295, + "learning_rate": 0.0002, + "loss": 1.2381, + "step": 3350 + }, + { + "epoch": 2.43213897937025, + "grad_norm": 0.6098920106887817, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 3360 + }, + { + "epoch": 2.4393774882374233, + "grad_norm": 0.8101736903190613, + "learning_rate": 0.0002, + "loss": 1.3692, + "step": 3370 + }, + { + "epoch": 2.4466159971045967, + "grad_norm": 0.6621488928794861, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 3380 + }, + { + "epoch": 2.45385450597177, + "grad_norm": 0.8693289160728455, + "learning_rate": 0.0002, + "loss": 1.4579, + "step": 3390 + }, + { + "epoch": 2.4610930148389434, + "grad_norm": 0.6724580526351929, + "learning_rate": 0.0002, + "loss": 1.3644, + "step": 3400 + }, + { + "epoch": 2.4683315237061167, + "grad_norm": 0.6776891946792603, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 3410 + }, + { + "epoch": 2.47557003257329, + "grad_norm": 0.7214453816413879, + "learning_rate": 0.0002, + "loss": 1.2937, + "step": 3420 + }, + { + "epoch": 2.4828085414404635, + "grad_norm": 0.8390451073646545, + "learning_rate": 0.0002, + "loss": 1.4051, + "step": 3430 + }, + { + "epoch": 2.490047050307637, + "grad_norm": 0.7130982279777527, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 3440 + }, + { + "epoch": 2.49728555917481, + "grad_norm": 0.8873937129974365, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 3450 + }, + { + "epoch": 2.5045240680419836, + "grad_norm": 0.725185751914978, + "learning_rate": 0.0002, + "loss": 1.1429, + "step": 3460 + }, + { + "epoch": 2.511762576909157, + "grad_norm": 0.6120352149009705, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 3470 + }, + { + "epoch": 2.5190010857763303, + "grad_norm": 0.7713613510131836, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 3480 + }, + { + "epoch": 2.5262395946435037, + "grad_norm": 0.895309567451477, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 3490 + }, + { + "epoch": 2.533478103510677, + "grad_norm": 0.9631021022796631, + "learning_rate": 0.0002, + "loss": 1.3043, + "step": 3500 + }, + { + "epoch": 2.5407166123778504, + "grad_norm": 0.7475683093070984, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 3510 + }, + { + "epoch": 2.5479551212450238, + "grad_norm": 0.7271341681480408, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 3520 + }, + { + "epoch": 2.555193630112197, + "grad_norm": 0.6979510188102722, + "learning_rate": 0.0002, + "loss": 1.304, + "step": 3530 + }, + { + "epoch": 2.5624321389793705, + "grad_norm": 0.6504196524620056, + "learning_rate": 0.0002, + "loss": 1.2353, + "step": 3540 + }, + { + "epoch": 2.569670647846544, + "grad_norm": 0.7226675748825073, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 3550 + }, + { + "epoch": 2.5769091567137172, + "grad_norm": 0.6143222451210022, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 3560 + }, + { + "epoch": 2.5841476655808906, + "grad_norm": 0.7245154976844788, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 3570 + }, + { + "epoch": 2.591386174448064, + "grad_norm": 0.943540632724762, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 3580 + }, + { + "epoch": 2.5986246833152373, + "grad_norm": 0.7707241773605347, + "learning_rate": 0.0002, + "loss": 1.3034, + "step": 3590 + }, + { + "epoch": 2.6058631921824107, + "grad_norm": 0.6705001592636108, + "learning_rate": 0.0002, + "loss": 1.3063, + "step": 3600 + }, + { + "epoch": 2.613101701049584, + "grad_norm": 0.6360933780670166, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 3610 + }, + { + "epoch": 2.6203402099167574, + "grad_norm": 0.5846424698829651, + "learning_rate": 0.0002, + "loss": 1.1844, + "step": 3620 + }, + { + "epoch": 2.6275787187839303, + "grad_norm": 0.5958625674247742, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3630 + }, + { + "epoch": 2.6348172276511037, + "grad_norm": 0.6819243431091309, + "learning_rate": 0.0002, + "loss": 1.3599, + "step": 3640 + }, + { + "epoch": 2.642055736518277, + "grad_norm": 0.7033445835113525, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 3650 + }, + { + "epoch": 2.6492942453854504, + "grad_norm": 0.6134849786758423, + "learning_rate": 0.0002, + "loss": 1.3392, + "step": 3660 + }, + { + "epoch": 2.656532754252624, + "grad_norm": 0.658009946346283, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 3670 + }, + { + "epoch": 2.663771263119797, + "grad_norm": 0.6280999779701233, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 3680 + }, + { + "epoch": 2.6710097719869705, + "grad_norm": 0.5536085963249207, + "learning_rate": 0.0002, + "loss": 1.2995, + "step": 3690 + }, + { + "epoch": 2.678248280854144, + "grad_norm": 0.8603981733322144, + "learning_rate": 0.0002, + "loss": 1.2044, + "step": 3700 + }, + { + "epoch": 2.6854867897213173, + "grad_norm": 0.5509994626045227, + "learning_rate": 0.0002, + "loss": 1.3879, + "step": 3710 + }, + { + "epoch": 2.6927252985884906, + "grad_norm": 0.9093621969223022, + "learning_rate": 0.0002, + "loss": 1.3253, + "step": 3720 + }, + { + "epoch": 2.699963807455664, + "grad_norm": 0.7525952458381653, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 3730 + }, + { + "epoch": 2.7072023163228374, + "grad_norm": 0.6737023591995239, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 3740 + }, + { + "epoch": 2.7144408251900107, + "grad_norm": 0.8656924962997437, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 3750 + }, + { + "epoch": 2.721679334057184, + "grad_norm": 0.7494133114814758, + "learning_rate": 0.0002, + "loss": 1.2342, + "step": 3760 + }, + { + "epoch": 2.7289178429243575, + "grad_norm": 0.5725520849227905, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 3770 + }, + { + "epoch": 2.736156351791531, + "grad_norm": 0.836412787437439, + "learning_rate": 0.0002, + "loss": 1.28, + "step": 3780 + }, + { + "epoch": 2.743394860658704, + "grad_norm": 0.6893242597579956, + "learning_rate": 0.0002, + "loss": 1.3784, + "step": 3790 + }, + { + "epoch": 2.7506333695258776, + "grad_norm": 0.6696223020553589, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 3800 + }, + { + "epoch": 2.757871878393051, + "grad_norm": 0.6483015418052673, + "learning_rate": 0.0002, + "loss": 1.2449, + "step": 3810 + }, + { + "epoch": 2.7651103872602243, + "grad_norm": 0.8084456920623779, + "learning_rate": 0.0002, + "loss": 1.3282, + "step": 3820 + }, + { + "epoch": 2.7723488961273977, + "grad_norm": 0.6601949334144592, + "learning_rate": 0.0002, + "loss": 1.3694, + "step": 3830 + }, + { + "epoch": 2.779587404994571, + "grad_norm": 0.6905533671379089, + "learning_rate": 0.0002, + "loss": 1.3568, + "step": 3840 + }, + { + "epoch": 2.7868259138617444, + "grad_norm": 0.619318425655365, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 3850 + }, + { + "epoch": 2.7940644227289178, + "grad_norm": 0.5994023084640503, + "learning_rate": 0.0002, + "loss": 1.2551, + "step": 3860 + }, + { + "epoch": 2.801302931596091, + "grad_norm": 0.5627168416976929, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 3870 + }, + { + "epoch": 2.8085414404632645, + "grad_norm": 0.6001605987548828, + "learning_rate": 0.0002, + "loss": 1.3921, + "step": 3880 + }, + { + "epoch": 2.815779949330438, + "grad_norm": 0.6022412776947021, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 3890 + }, + { + "epoch": 2.823018458197611, + "grad_norm": 0.6832426190376282, + "learning_rate": 0.0002, + "loss": 1.2765, + "step": 3900 + }, + { + "epoch": 2.8302569670647846, + "grad_norm": 0.5936811566352844, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 3910 + }, + { + "epoch": 2.837495475931958, + "grad_norm": 0.6960572600364685, + "learning_rate": 0.0002, + "loss": 1.1707, + "step": 3920 + }, + { + "epoch": 2.8447339847991313, + "grad_norm": 0.5913406610488892, + "learning_rate": 0.0002, + "loss": 1.4063, + "step": 3930 + }, + { + "epoch": 2.8519724936663047, + "grad_norm": 0.678154706954956, + "learning_rate": 0.0002, + "loss": 1.3245, + "step": 3940 + }, + { + "epoch": 2.859211002533478, + "grad_norm": 0.7898936867713928, + "learning_rate": 0.0002, + "loss": 1.366, + "step": 3950 + }, + { + "epoch": 2.8664495114006514, + "grad_norm": 0.9234195351600647, + "learning_rate": 0.0002, + "loss": 1.3948, + "step": 3960 + }, + { + "epoch": 2.8736880202678248, + "grad_norm": 0.5960825085639954, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 3970 + }, + { + "epoch": 2.880926529134998, + "grad_norm": 0.677118182182312, + "learning_rate": 0.0002, + "loss": 1.3127, + "step": 3980 + }, + { + "epoch": 2.8881650380021715, + "grad_norm": 0.6505142450332642, + "learning_rate": 0.0002, + "loss": 1.2652, + "step": 3990 + }, + { + "epoch": 2.895403546869345, + "grad_norm": 0.550826907157898, + "learning_rate": 0.0002, + "loss": 1.2078, + "step": 4000 + }, + { + "epoch": 2.9026420557365182, + "grad_norm": 0.6209215521812439, + "learning_rate": 0.0002, + "loss": 1.1811, + "step": 4010 + }, + { + "epoch": 2.9098805646036916, + "grad_norm": 0.6549018025398254, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 4020 + }, + { + "epoch": 2.917119073470865, + "grad_norm": 0.570682168006897, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 4030 + }, + { + "epoch": 2.9243575823380383, + "grad_norm": 1.1807632446289062, + "learning_rate": 0.0002, + "loss": 1.0832, + "step": 4040 + }, + { + "epoch": 2.9315960912052117, + "grad_norm": 0.7058857679367065, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 4050 + }, + { + "epoch": 2.938834600072385, + "grad_norm": 0.5542812943458557, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 4060 + }, + { + "epoch": 2.9460731089395584, + "grad_norm": 0.63167804479599, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 4070 + }, + { + "epoch": 2.953311617806732, + "grad_norm": 0.5702962279319763, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 4080 + }, + { + "epoch": 2.960550126673905, + "grad_norm": 0.620944082736969, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 4090 + }, + { + "epoch": 2.9677886355410785, + "grad_norm": 0.5866289734840393, + "learning_rate": 0.0002, + "loss": 1.2864, + "step": 4100 + }, + { + "epoch": 2.975027144408252, + "grad_norm": 0.560170590877533, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 4110 + }, + { + "epoch": 2.9822656532754253, + "grad_norm": 0.675082802772522, + "learning_rate": 0.0002, + "loss": 1.2071, + "step": 4120 + }, + { + "epoch": 2.9895041621425986, + "grad_norm": 0.62708580493927, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 4130 + }, + { + "epoch": 2.996742671009772, + "grad_norm": 0.7893929481506348, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 4140 + }, + { + "epoch": 2.9996380745566413, + "eval_loss": 1.4217946529388428, + "eval_runtime": 27.1596, + "eval_samples_per_second": 16.053, + "eval_steps_per_second": 2.025, + "step": 4144 + }, + { + "epoch": 3.0039811798769454, + "grad_norm": 0.7043836116790771, + "learning_rate": 0.0002, + "loss": 1.2152, + "step": 4150 + }, + { + "epoch": 3.0112196887441187, + "grad_norm": 0.6806283593177795, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 4160 + }, + { + "epoch": 3.018458197611292, + "grad_norm": 0.7684550285339355, + "learning_rate": 0.0002, + "loss": 1.292, + "step": 4170 + }, + { + "epoch": 3.0256967064784654, + "grad_norm": 0.7895237803459167, + "learning_rate": 0.0002, + "loss": 1.3467, + "step": 4180 + }, + { + "epoch": 3.032935215345639, + "grad_norm": 0.7464531064033508, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 4190 + }, + { + "epoch": 3.040173724212812, + "grad_norm": 0.9358500838279724, + "learning_rate": 0.0002, + "loss": 1.1614, + "step": 4200 + }, + { + "epoch": 3.0474122330799855, + "grad_norm": 1.1066628694534302, + "learning_rate": 0.0002, + "loss": 1.1834, + "step": 4210 + }, + { + "epoch": 3.054650741947159, + "grad_norm": 0.6663267612457275, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 4220 + }, + { + "epoch": 3.0618892508143323, + "grad_norm": 0.6669464707374573, + "learning_rate": 0.0002, + "loss": 1.1707, + "step": 4230 + }, + { + "epoch": 3.0691277596815056, + "grad_norm": 0.7052164077758789, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 4240 + }, + { + "epoch": 3.076366268548679, + "grad_norm": 0.6118432879447937, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 4250 + }, + { + "epoch": 3.0836047774158524, + "grad_norm": 0.6915903687477112, + "learning_rate": 0.0002, + "loss": 1.1526, + "step": 4260 + }, + { + "epoch": 3.0908432862830257, + "grad_norm": 0.7441644668579102, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 4270 + }, + { + "epoch": 3.098081795150199, + "grad_norm": 0.823850691318512, + "learning_rate": 0.0002, + "loss": 1.1672, + "step": 4280 + }, + { + "epoch": 3.1053203040173725, + "grad_norm": 0.9677883386611938, + "learning_rate": 0.0002, + "loss": 1.2655, + "step": 4290 + }, + { + "epoch": 3.112558812884546, + "grad_norm": 0.7002579569816589, + "learning_rate": 0.0002, + "loss": 1.1794, + "step": 4300 + }, + { + "epoch": 3.119797321751719, + "grad_norm": 0.778789758682251, + "learning_rate": 0.0002, + "loss": 1.135, + "step": 4310 + }, + { + "epoch": 3.1270358306188926, + "grad_norm": 0.7236007452011108, + "learning_rate": 0.0002, + "loss": 1.0818, + "step": 4320 + }, + { + "epoch": 3.134274339486066, + "grad_norm": 0.8809133768081665, + "learning_rate": 0.0002, + "loss": 1.1803, + "step": 4330 + }, + { + "epoch": 3.1415128483532393, + "grad_norm": 0.7924913167953491, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 4340 + }, + { + "epoch": 3.1487513572204127, + "grad_norm": 0.7437422275543213, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 4350 + }, + { + "epoch": 3.155989866087586, + "grad_norm": 0.6428450345993042, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 4360 + }, + { + "epoch": 3.1632283749547594, + "grad_norm": 0.7922873497009277, + "learning_rate": 0.0002, + "loss": 1.3032, + "step": 4370 + }, + { + "epoch": 3.1704668838219328, + "grad_norm": 0.5252506732940674, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 4380 + }, + { + "epoch": 3.177705392689106, + "grad_norm": 0.8570457696914673, + "learning_rate": 0.0002, + "loss": 1.1297, + "step": 4390 + }, + { + "epoch": 3.1849439015562795, + "grad_norm": 0.7218987345695496, + "learning_rate": 0.0002, + "loss": 1.0994, + "step": 4400 + }, + { + "epoch": 3.192182410423453, + "grad_norm": 0.6921393275260925, + "learning_rate": 0.0002, + "loss": 1.2891, + "step": 4410 + }, + { + "epoch": 3.199420919290626, + "grad_norm": 0.7386137843132019, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 4420 + }, + { + "epoch": 3.2066594281577996, + "grad_norm": 0.6227759122848511, + "learning_rate": 0.0002, + "loss": 1.1654, + "step": 4430 + }, + { + "epoch": 3.213897937024973, + "grad_norm": 0.7180278897285461, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 4440 + }, + { + "epoch": 3.2211364458921463, + "grad_norm": 0.745830774307251, + "learning_rate": 0.0002, + "loss": 1.1757, + "step": 4450 + }, + { + "epoch": 3.2283749547593197, + "grad_norm": 0.6766072511672974, + "learning_rate": 0.0002, + "loss": 1.234, + "step": 4460 + }, + { + "epoch": 3.235613463626493, + "grad_norm": 0.8325067162513733, + "learning_rate": 0.0002, + "loss": 1.1999, + "step": 4470 + }, + { + "epoch": 3.2428519724936664, + "grad_norm": 0.7148305177688599, + "learning_rate": 0.0002, + "loss": 1.1606, + "step": 4480 + }, + { + "epoch": 3.25009048136084, + "grad_norm": 0.7752676010131836, + "learning_rate": 0.0002, + "loss": 1.1383, + "step": 4490 + }, + { + "epoch": 3.257328990228013, + "grad_norm": 0.6776860952377319, + "learning_rate": 0.0002, + "loss": 1.3006, + "step": 4500 + }, + { + "epoch": 3.2645674990951865, + "grad_norm": 0.704359769821167, + "learning_rate": 0.0002, + "loss": 1.0796, + "step": 4510 + }, + { + "epoch": 3.27180600796236, + "grad_norm": 0.6880282163619995, + "learning_rate": 0.0002, + "loss": 1.2496, + "step": 4520 + }, + { + "epoch": 3.2790445168295332, + "grad_norm": 0.8179270029067993, + "learning_rate": 0.0002, + "loss": 1.0947, + "step": 4530 + }, + { + "epoch": 3.2862830256967066, + "grad_norm": 0.6718448996543884, + "learning_rate": 0.0002, + "loss": 1.1909, + "step": 4540 + }, + { + "epoch": 3.29352153456388, + "grad_norm": 0.8300657868385315, + "learning_rate": 0.0002, + "loss": 1.2708, + "step": 4550 + }, + { + "epoch": 3.3007600434310533, + "grad_norm": 0.6433690786361694, + "learning_rate": 0.0002, + "loss": 1.2594, + "step": 4560 + }, + { + "epoch": 3.3079985522982267, + "grad_norm": 0.690262496471405, + "learning_rate": 0.0002, + "loss": 1.2479, + "step": 4570 + }, + { + "epoch": 3.3152370611654, + "grad_norm": 0.7022852301597595, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 4580 + }, + { + "epoch": 3.3224755700325734, + "grad_norm": 0.6438387632369995, + "learning_rate": 0.0002, + "loss": 1.0844, + "step": 4590 + }, + { + "epoch": 3.329714078899747, + "grad_norm": 0.6866899132728577, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 4600 + }, + { + "epoch": 3.33695258776692, + "grad_norm": 0.8233968019485474, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 4610 + }, + { + "epoch": 3.3441910966340935, + "grad_norm": 0.7251574993133545, + "learning_rate": 0.0002, + "loss": 1.1855, + "step": 4620 + }, + { + "epoch": 3.351429605501267, + "grad_norm": 0.7855110168457031, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 4630 + }, + { + "epoch": 3.3586681143684403, + "grad_norm": 0.8487356305122375, + "learning_rate": 0.0002, + "loss": 1.2922, + "step": 4640 + }, + { + "epoch": 3.3659066232356136, + "grad_norm": 0.6429011225700378, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 4650 + }, + { + "epoch": 3.373145132102787, + "grad_norm": 0.7095270156860352, + "learning_rate": 0.0002, + "loss": 1.129, + "step": 4660 + }, + { + "epoch": 3.3803836409699604, + "grad_norm": 0.6792303323745728, + "learning_rate": 0.0002, + "loss": 1.262, + "step": 4670 + }, + { + "epoch": 3.3876221498371337, + "grad_norm": 0.6784825921058655, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 4680 + }, + { + "epoch": 3.394860658704307, + "grad_norm": 0.6362888216972351, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 4690 + }, + { + "epoch": 3.4020991675714805, + "grad_norm": 0.7794778943061829, + "learning_rate": 0.0002, + "loss": 1.2165, + "step": 4700 + }, + { + "epoch": 3.409337676438654, + "grad_norm": 0.7287485003471375, + "learning_rate": 0.0002, + "loss": 1.0644, + "step": 4710 + }, + { + "epoch": 3.416576185305827, + "grad_norm": 0.6481451392173767, + "learning_rate": 0.0002, + "loss": 1.2925, + "step": 4720 + }, + { + "epoch": 3.4238146941730006, + "grad_norm": 0.9200371503829956, + "learning_rate": 0.0002, + "loss": 1.2121, + "step": 4730 + }, + { + "epoch": 3.431053203040174, + "grad_norm": 1.074180245399475, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 4740 + }, + { + "epoch": 3.438291711907347, + "grad_norm": 0.6722986698150635, + "learning_rate": 0.0002, + "loss": 1.0421, + "step": 4750 + }, + { + "epoch": 3.44553022077452, + "grad_norm": 0.7945933938026428, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 4760 + }, + { + "epoch": 3.4527687296416936, + "grad_norm": 0.7624640464782715, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 4770 + }, + { + "epoch": 3.460007238508867, + "grad_norm": 0.7763656377792358, + "learning_rate": 0.0002, + "loss": 1.2428, + "step": 4780 + }, + { + "epoch": 3.4672457473760403, + "grad_norm": 0.7736947536468506, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 4790 + }, + { + "epoch": 3.4744842562432137, + "grad_norm": 0.8450354933738708, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 4800 + }, + { + "epoch": 3.481722765110387, + "grad_norm": 0.6480133533477783, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 4810 + }, + { + "epoch": 3.4889612739775604, + "grad_norm": 0.8437445759773254, + "learning_rate": 0.0002, + "loss": 1.1882, + "step": 4820 + }, + { + "epoch": 3.4961997828447338, + "grad_norm": 0.7781730890274048, + "learning_rate": 0.0002, + "loss": 1.1519, + "step": 4830 + }, + { + "epoch": 3.503438291711907, + "grad_norm": 0.8523228168487549, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 4840 + }, + { + "epoch": 3.5106768005790805, + "grad_norm": 0.6236732006072998, + "learning_rate": 0.0002, + "loss": 1.1672, + "step": 4850 + }, + { + "epoch": 3.517915309446254, + "grad_norm": 0.7500787377357483, + "learning_rate": 0.0002, + "loss": 1.1926, + "step": 4860 + }, + { + "epoch": 3.5251538183134272, + "grad_norm": 0.7665374875068665, + "learning_rate": 0.0002, + "loss": 1.1998, + "step": 4870 + }, + { + "epoch": 3.5323923271806006, + "grad_norm": 0.787857711315155, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 4880 + }, + { + "epoch": 3.539630836047774, + "grad_norm": 0.970595121383667, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 4890 + }, + { + "epoch": 3.5468693449149473, + "grad_norm": 0.6409347057342529, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 4900 + }, + { + "epoch": 3.5541078537821207, + "grad_norm": 0.888551652431488, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 4910 + }, + { + "epoch": 3.561346362649294, + "grad_norm": 1.0808377265930176, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 4920 + }, + { + "epoch": 3.5685848715164674, + "grad_norm": 0.7501053214073181, + "learning_rate": 0.0002, + "loss": 1.2564, + "step": 4930 + }, + { + "epoch": 3.575823380383641, + "grad_norm": 0.7375240325927734, + "learning_rate": 0.0002, + "loss": 1.2351, + "step": 4940 + }, + { + "epoch": 3.583061889250814, + "grad_norm": 0.7075039744377136, + "learning_rate": 0.0002, + "loss": 1.3568, + "step": 4950 + }, + { + "epoch": 3.5903003981179875, + "grad_norm": 0.939337432384491, + "learning_rate": 0.0002, + "loss": 1.3355, + "step": 4960 + }, + { + "epoch": 3.597538906985161, + "grad_norm": 0.6717396974563599, + "learning_rate": 0.0002, + "loss": 1.1722, + "step": 4970 + }, + { + "epoch": 3.6047774158523342, + "grad_norm": 0.7141643762588501, + "learning_rate": 0.0002, + "loss": 1.1186, + "step": 4980 + }, + { + "epoch": 3.6120159247195076, + "grad_norm": 0.7109216451644897, + "learning_rate": 0.0002, + "loss": 1.1011, + "step": 4990 + }, + { + "epoch": 3.619254433586681, + "grad_norm": 0.7020776867866516, + "learning_rate": 0.0002, + "loss": 1.2178, + "step": 5000 + }, + { + "epoch": 3.6264929424538543, + "grad_norm": 0.7158873677253723, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 5010 + }, + { + "epoch": 3.6337314513210277, + "grad_norm": 0.7062035202980042, + "learning_rate": 0.0002, + "loss": 1.2624, + "step": 5020 + }, + { + "epoch": 3.640969960188201, + "grad_norm": 0.7081155776977539, + "learning_rate": 0.0002, + "loss": 1.0224, + "step": 5030 + }, + { + "epoch": 3.6482084690553744, + "grad_norm": 1.2210607528686523, + "learning_rate": 0.0002, + "loss": 1.2195, + "step": 5040 + }, + { + "epoch": 3.655446977922548, + "grad_norm": 0.6650236248970032, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 5050 + }, + { + "epoch": 3.662685486789721, + "grad_norm": 0.6884829998016357, + "learning_rate": 0.0002, + "loss": 1.1072, + "step": 5060 + }, + { + "epoch": 3.6699239956568945, + "grad_norm": 0.7317819595336914, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 5070 + }, + { + "epoch": 3.677162504524068, + "grad_norm": 0.7406691908836365, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 5080 + }, + { + "epoch": 3.6844010133912413, + "grad_norm": 0.9009454250335693, + "learning_rate": 0.0002, + "loss": 1.2949, + "step": 5090 + }, + { + "epoch": 3.6916395222584146, + "grad_norm": 0.8189385533332825, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 5100 + }, + { + "epoch": 3.698878031125588, + "grad_norm": 1.0793628692626953, + "learning_rate": 0.0002, + "loss": 1.3408, + "step": 5110 + }, + { + "epoch": 3.7061165399927614, + "grad_norm": 0.8593027591705322, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 5120 + }, + { + "epoch": 3.7133550488599347, + "grad_norm": 0.8481812477111816, + "learning_rate": 0.0002, + "loss": 1.2141, + "step": 5130 + }, + { + "epoch": 3.720593557727108, + "grad_norm": 0.6527451276779175, + "learning_rate": 0.0002, + "loss": 1.125, + "step": 5140 + }, + { + "epoch": 3.7278320665942815, + "grad_norm": 0.9220114350318909, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 5150 + }, + { + "epoch": 3.735070575461455, + "grad_norm": 1.0842019319534302, + "learning_rate": 0.0002, + "loss": 1.2267, + "step": 5160 + }, + { + "epoch": 3.742309084328628, + "grad_norm": 0.965453565120697, + "learning_rate": 0.0002, + "loss": 1.3083, + "step": 5170 + }, + { + "epoch": 3.7495475931958016, + "grad_norm": 0.9903319478034973, + "learning_rate": 0.0002, + "loss": 1.1772, + "step": 5180 + }, + { + "epoch": 3.756786102062975, + "grad_norm": 0.7434818148612976, + "learning_rate": 0.0002, + "loss": 1.2515, + "step": 5190 + }, + { + "epoch": 3.7640246109301483, + "grad_norm": 0.6717280745506287, + "learning_rate": 0.0002, + "loss": 1.2631, + "step": 5200 + }, + { + "epoch": 3.7712631197973217, + "grad_norm": 0.7754665613174438, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 5210 + }, + { + "epoch": 3.778501628664495, + "grad_norm": 1.028374433517456, + "learning_rate": 0.0002, + "loss": 1.305, + "step": 5220 + }, + { + "epoch": 3.7857401375316684, + "grad_norm": 0.6026996374130249, + "learning_rate": 0.0002, + "loss": 1.1866, + "step": 5230 + }, + { + "epoch": 3.7929786463988417, + "grad_norm": 0.6978490948677063, + "learning_rate": 0.0002, + "loss": 1.1901, + "step": 5240 + }, + { + "epoch": 3.800217155266015, + "grad_norm": 0.7303446531295776, + "learning_rate": 0.0002, + "loss": 1.2576, + "step": 5250 + }, + { + "epoch": 3.8074556641331885, + "grad_norm": 1.0734210014343262, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 5260 + }, + { + "epoch": 3.814694173000362, + "grad_norm": 0.6383201479911804, + "learning_rate": 0.0002, + "loss": 1.1137, + "step": 5270 + }, + { + "epoch": 3.821932681867535, + "grad_norm": 0.7742630243301392, + "learning_rate": 0.0002, + "loss": 1.0904, + "step": 5280 + }, + { + "epoch": 3.8291711907347086, + "grad_norm": 0.8477074503898621, + "learning_rate": 0.0002, + "loss": 1.2232, + "step": 5290 + }, + { + "epoch": 3.836409699601882, + "grad_norm": 0.6675317883491516, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 5300 + }, + { + "epoch": 3.8436482084690553, + "grad_norm": 0.7515445351600647, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 5310 + }, + { + "epoch": 3.8508867173362287, + "grad_norm": 1.1441220045089722, + "learning_rate": 0.0002, + "loss": 1.2569, + "step": 5320 + }, + { + "epoch": 3.858125226203402, + "grad_norm": 0.7968795895576477, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 5330 + }, + { + "epoch": 3.8653637350705754, + "grad_norm": 0.7842824459075928, + "learning_rate": 0.0002, + "loss": 1.232, + "step": 5340 + }, + { + "epoch": 3.8726022439377488, + "grad_norm": 0.8272225260734558, + "learning_rate": 0.0002, + "loss": 1.1847, + "step": 5350 + }, + { + "epoch": 3.879840752804922, + "grad_norm": 0.8413397669792175, + "learning_rate": 0.0002, + "loss": 1.1381, + "step": 5360 + }, + { + "epoch": 3.8870792616720955, + "grad_norm": 1.141764760017395, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 5370 + }, + { + "epoch": 3.894317770539269, + "grad_norm": 0.9826975464820862, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 5380 + }, + { + "epoch": 3.9015562794064422, + "grad_norm": 0.8598255515098572, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 5390 + }, + { + "epoch": 3.9087947882736156, + "grad_norm": 0.6271058320999146, + "learning_rate": 0.0002, + "loss": 1.1247, + "step": 5400 + }, + { + "epoch": 3.916033297140789, + "grad_norm": 0.6379870772361755, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 5410 + }, + { + "epoch": 3.9232718060079623, + "grad_norm": 1.0313376188278198, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 5420 + }, + { + "epoch": 3.9305103148751357, + "grad_norm": 0.8220619559288025, + "learning_rate": 0.0002, + "loss": 1.1872, + "step": 5430 + }, + { + "epoch": 3.937748823742309, + "grad_norm": 0.7576116919517517, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 5440 + }, + { + "epoch": 3.9449873326094824, + "grad_norm": 1.226235032081604, + "learning_rate": 0.0002, + "loss": 1.1969, + "step": 5450 + }, + { + "epoch": 3.952225841476656, + "grad_norm": 0.7979229688644409, + "learning_rate": 0.0002, + "loss": 1.2945, + "step": 5460 + }, + { + "epoch": 3.959464350343829, + "grad_norm": 0.9911929965019226, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 5470 + }, + { + "epoch": 3.9667028592110025, + "grad_norm": 0.643738865852356, + "learning_rate": 0.0002, + "loss": 1.0924, + "step": 5480 + }, + { + "epoch": 3.973941368078176, + "grad_norm": 0.682305634021759, + "learning_rate": 0.0002, + "loss": 1.0607, + "step": 5490 + }, + { + "epoch": 3.9811798769453492, + "grad_norm": 1.18373441696167, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 5500 + }, + { + "epoch": 3.9884183858125226, + "grad_norm": 0.7190203070640564, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 5510 + }, + { + "epoch": 3.995656894679696, + "grad_norm": 0.7516948580741882, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 5520 + }, + { + "epoch": 4.0, + "eval_loss": 1.4252897500991821, + "eval_runtime": 27.235, + "eval_samples_per_second": 16.009, + "eval_steps_per_second": 2.019, + "step": 5526 + }, + { + "epoch": 4.002895403546869, + "grad_norm": 0.6353074312210083, + "learning_rate": 0.0002, + "loss": 1.0088, + "step": 5530 + }, + { + "epoch": 4.010133912414043, + "grad_norm": 0.7424906492233276, + "learning_rate": 0.0002, + "loss": 1.0326, + "step": 5540 + }, + { + "epoch": 4.017372421281216, + "grad_norm": 0.8856638073921204, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 5550 + }, + { + "epoch": 4.024610930148389, + "grad_norm": 0.9627974033355713, + "learning_rate": 0.0002, + "loss": 1.0905, + "step": 5560 + }, + { + "epoch": 4.031849439015563, + "grad_norm": 0.9048978686332703, + "learning_rate": 0.0002, + "loss": 1.0965, + "step": 5570 + }, + { + "epoch": 4.039087947882736, + "grad_norm": 0.921119213104248, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 5580 + }, + { + "epoch": 4.0463264567499095, + "grad_norm": 0.8654361963272095, + "learning_rate": 0.0002, + "loss": 1.1235, + "step": 5590 + }, + { + "epoch": 4.053564965617083, + "grad_norm": 0.7947945594787598, + "learning_rate": 0.0002, + "loss": 1.0794, + "step": 5600 + }, + { + "epoch": 4.060803474484256, + "grad_norm": 0.8307326436042786, + "learning_rate": 0.0002, + "loss": 1.0674, + "step": 5610 + }, + { + "epoch": 4.06804198335143, + "grad_norm": 0.793273389339447, + "learning_rate": 0.0002, + "loss": 1.0076, + "step": 5620 + }, + { + "epoch": 4.075280492218603, + "grad_norm": 0.8748673796653748, + "learning_rate": 0.0002, + "loss": 1.0651, + "step": 5630 + }, + { + "epoch": 4.082519001085776, + "grad_norm": 0.7926856279373169, + "learning_rate": 0.0002, + "loss": 1.111, + "step": 5640 + }, + { + "epoch": 4.08975750995295, + "grad_norm": 0.922645092010498, + "learning_rate": 0.0002, + "loss": 1.044, + "step": 5650 + }, + { + "epoch": 4.096996018820123, + "grad_norm": 0.9539641737937927, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 5660 + }, + { + "epoch": 4.1042345276872965, + "grad_norm": 0.8674443364143372, + "learning_rate": 0.0002, + "loss": 1.0788, + "step": 5670 + }, + { + "epoch": 4.11147303655447, + "grad_norm": 0.7097609043121338, + "learning_rate": 0.0002, + "loss": 0.9867, + "step": 5680 + }, + { + "epoch": 4.118711545421643, + "grad_norm": 0.8875522613525391, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 5690 + }, + { + "epoch": 4.125950054288817, + "grad_norm": 0.8583634495735168, + "learning_rate": 0.0002, + "loss": 1.1217, + "step": 5700 + }, + { + "epoch": 4.13318856315599, + "grad_norm": 0.6736377477645874, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 5710 + }, + { + "epoch": 4.140427072023163, + "grad_norm": 0.9349062442779541, + "learning_rate": 0.0002, + "loss": 1.1199, + "step": 5720 + }, + { + "epoch": 4.147665580890337, + "grad_norm": 1.0610365867614746, + "learning_rate": 0.0002, + "loss": 1.0508, + "step": 5730 + }, + { + "epoch": 4.15490408975751, + "grad_norm": 1.5838189125061035, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 5740 + }, + { + "epoch": 4.162142598624683, + "grad_norm": 0.747522234916687, + "learning_rate": 0.0002, + "loss": 1.0222, + "step": 5750 + }, + { + "epoch": 4.169381107491857, + "grad_norm": 1.3247915506362915, + "learning_rate": 0.0002, + "loss": 1.1328, + "step": 5760 + }, + { + "epoch": 4.17661961635903, + "grad_norm": 0.8750247955322266, + "learning_rate": 0.0002, + "loss": 1.1655, + "step": 5770 + }, + { + "epoch": 4.1838581252262035, + "grad_norm": 0.7914144992828369, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 5780 + }, + { + "epoch": 4.191096634093377, + "grad_norm": 0.9493299126625061, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 5790 + }, + { + "epoch": 4.19833514296055, + "grad_norm": 0.7802295088768005, + "learning_rate": 0.0002, + "loss": 1.1515, + "step": 5800 + }, + { + "epoch": 4.205573651827724, + "grad_norm": 0.6987314820289612, + "learning_rate": 0.0002, + "loss": 1.0704, + "step": 5810 + }, + { + "epoch": 4.212812160694897, + "grad_norm": 0.9220341444015503, + "learning_rate": 0.0002, + "loss": 1.1699, + "step": 5820 + }, + { + "epoch": 4.22005066956207, + "grad_norm": 0.8932939767837524, + "learning_rate": 0.0002, + "loss": 1.1394, + "step": 5830 + }, + { + "epoch": 4.227289178429244, + "grad_norm": 0.920002818107605, + "learning_rate": 0.0002, + "loss": 1.0048, + "step": 5840 + }, + { + "epoch": 4.234527687296417, + "grad_norm": 0.6662752032279968, + "learning_rate": 0.0002, + "loss": 0.964, + "step": 5850 + }, + { + "epoch": 4.24176619616359, + "grad_norm": 0.8679718971252441, + "learning_rate": 0.0002, + "loss": 0.986, + "step": 5860 + }, + { + "epoch": 4.249004705030764, + "grad_norm": 0.7020887732505798, + "learning_rate": 0.0002, + "loss": 0.8991, + "step": 5870 + }, + { + "epoch": 4.256243213897937, + "grad_norm": 0.869611382484436, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 5880 + }, + { + "epoch": 4.2634817227651105, + "grad_norm": 0.7796585559844971, + "learning_rate": 0.0002, + "loss": 1.1026, + "step": 5890 + }, + { + "epoch": 4.270720231632284, + "grad_norm": 0.8978819251060486, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 5900 + }, + { + "epoch": 4.277958740499457, + "grad_norm": 1.0837205648422241, + "learning_rate": 0.0002, + "loss": 1.1325, + "step": 5910 + }, + { + "epoch": 4.285197249366631, + "grad_norm": 0.7584353089332581, + "learning_rate": 0.0002, + "loss": 1.1279, + "step": 5920 + }, + { + "epoch": 4.292435758233804, + "grad_norm": 0.7313185334205627, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 5930 + }, + { + "epoch": 4.299674267100977, + "grad_norm": 0.8004671335220337, + "learning_rate": 0.0002, + "loss": 1.1101, + "step": 5940 + }, + { + "epoch": 4.306912775968151, + "grad_norm": 2.154958724975586, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 5950 + }, + { + "epoch": 4.314151284835324, + "grad_norm": 0.9163479804992676, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 5960 + }, + { + "epoch": 4.321389793702497, + "grad_norm": 0.9151589274406433, + "learning_rate": 0.0002, + "loss": 0.9941, + "step": 5970 + }, + { + "epoch": 4.328628302569671, + "grad_norm": 0.8624112010002136, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 5980 + }, + { + "epoch": 4.335866811436844, + "grad_norm": 0.9357741475105286, + "learning_rate": 0.0002, + "loss": 1.1625, + "step": 5990 + }, + { + "epoch": 4.3431053203040175, + "grad_norm": 1.3482335805892944, + "learning_rate": 0.0002, + "loss": 1.0712, + "step": 6000 + }, + { + "epoch": 4.350343829171191, + "grad_norm": 0.7156149744987488, + "learning_rate": 0.0002, + "loss": 1.1224, + "step": 6010 + }, + { + "epoch": 4.357582338038364, + "grad_norm": 0.8480049967765808, + "learning_rate": 0.0002, + "loss": 1.0753, + "step": 6020 + }, + { + "epoch": 4.364820846905538, + "grad_norm": 0.8262244462966919, + "learning_rate": 0.0002, + "loss": 1.051, + "step": 6030 + }, + { + "epoch": 4.372059355772711, + "grad_norm": 0.7733905911445618, + "learning_rate": 0.0002, + "loss": 0.9966, + "step": 6040 + }, + { + "epoch": 4.379297864639884, + "grad_norm": 0.8553919792175293, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 6050 + }, + { + "epoch": 4.386536373507058, + "grad_norm": 0.8666832447052002, + "learning_rate": 0.0002, + "loss": 1.1777, + "step": 6060 + }, + { + "epoch": 4.393774882374231, + "grad_norm": 0.9168295860290527, + "learning_rate": 0.0002, + "loss": 1.1934, + "step": 6070 + }, + { + "epoch": 4.4010133912414044, + "grad_norm": 0.7315238118171692, + "learning_rate": 0.0002, + "loss": 1.0988, + "step": 6080 + }, + { + "epoch": 4.408251900108578, + "grad_norm": 1.020263433456421, + "learning_rate": 0.0002, + "loss": 1.1599, + "step": 6090 + }, + { + "epoch": 4.415490408975751, + "grad_norm": 0.9978243708610535, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 6100 + }, + { + "epoch": 4.4227289178429245, + "grad_norm": 0.995453953742981, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 6110 + }, + { + "epoch": 4.429967426710098, + "grad_norm": 0.9360884428024292, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 6120 + }, + { + "epoch": 4.437205935577271, + "grad_norm": 0.8099448084831238, + "learning_rate": 0.0002, + "loss": 0.9506, + "step": 6130 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.8173841238021851, + "learning_rate": 0.0002, + "loss": 1.0887, + "step": 6140 + }, + { + "epoch": 4.451682953311618, + "grad_norm": 0.7972666025161743, + "learning_rate": 0.0002, + "loss": 1.1219, + "step": 6150 + }, + { + "epoch": 4.458921462178791, + "grad_norm": 0.7685779333114624, + "learning_rate": 0.0002, + "loss": 1.0226, + "step": 6160 + }, + { + "epoch": 4.466159971045965, + "grad_norm": 0.7872623801231384, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 6170 + }, + { + "epoch": 4.473398479913138, + "grad_norm": 0.7677070498466492, + "learning_rate": 0.0002, + "loss": 0.9911, + "step": 6180 + }, + { + "epoch": 4.4806369887803115, + "grad_norm": 0.7878316044807434, + "learning_rate": 0.0002, + "loss": 1.0919, + "step": 6190 + }, + { + "epoch": 4.487875497647485, + "grad_norm": 0.8178079724311829, + "learning_rate": 0.0002, + "loss": 1.018, + "step": 6200 + }, + { + "epoch": 4.495114006514658, + "grad_norm": 1.2820082902908325, + "learning_rate": 0.0002, + "loss": 1.0517, + "step": 6210 + }, + { + "epoch": 4.502352515381832, + "grad_norm": 0.9380832314491272, + "learning_rate": 0.0002, + "loss": 1.3101, + "step": 6220 + }, + { + "epoch": 4.509591024249005, + "grad_norm": 0.7810422778129578, + "learning_rate": 0.0002, + "loss": 0.9818, + "step": 6230 + }, + { + "epoch": 4.516829533116178, + "grad_norm": 1.1022917032241821, + "learning_rate": 0.0002, + "loss": 1.1677, + "step": 6240 + }, + { + "epoch": 4.524068041983352, + "grad_norm": 1.4275553226470947, + "learning_rate": 0.0002, + "loss": 1.1579, + "step": 6250 + }, + { + "epoch": 4.531306550850525, + "grad_norm": 0.7597777247428894, + "learning_rate": 0.0002, + "loss": 1.3237, + "step": 6260 + }, + { + "epoch": 4.538545059717698, + "grad_norm": 1.10992431640625, + "learning_rate": 0.0002, + "loss": 1.1529, + "step": 6270 + }, + { + "epoch": 4.545783568584872, + "grad_norm": 0.8981178998947144, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 6280 + }, + { + "epoch": 4.553022077452045, + "grad_norm": 0.7863979339599609, + "learning_rate": 0.0002, + "loss": 1.086, + "step": 6290 + }, + { + "epoch": 4.5602605863192185, + "grad_norm": 0.9071474671363831, + "learning_rate": 0.0002, + "loss": 1.2008, + "step": 6300 + }, + { + "epoch": 4.567499095186392, + "grad_norm": 0.7429424524307251, + "learning_rate": 0.0002, + "loss": 1.0916, + "step": 6310 + }, + { + "epoch": 4.574737604053565, + "grad_norm": 1.0767850875854492, + "learning_rate": 0.0002, + "loss": 1.095, + "step": 6320 + }, + { + "epoch": 4.581976112920739, + "grad_norm": 0.7885915637016296, + "learning_rate": 0.0002, + "loss": 1.1023, + "step": 6330 + }, + { + "epoch": 4.589214621787912, + "grad_norm": 0.8350457549095154, + "learning_rate": 0.0002, + "loss": 1.1131, + "step": 6340 + }, + { + "epoch": 4.596453130655085, + "grad_norm": 0.7853530645370483, + "learning_rate": 0.0002, + "loss": 1.0743, + "step": 6350 + }, + { + "epoch": 4.603691639522259, + "grad_norm": 1.1220661401748657, + "learning_rate": 0.0002, + "loss": 1.1912, + "step": 6360 + }, + { + "epoch": 4.610930148389432, + "grad_norm": 0.7959423065185547, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 6370 + }, + { + "epoch": 4.618168657256605, + "grad_norm": 0.7782652378082275, + "learning_rate": 0.0002, + "loss": 1.1542, + "step": 6380 + }, + { + "epoch": 4.625407166123779, + "grad_norm": 0.7882203459739685, + "learning_rate": 0.0002, + "loss": 1.0753, + "step": 6390 + }, + { + "epoch": 4.632645674990952, + "grad_norm": 0.8841899037361145, + "learning_rate": 0.0002, + "loss": 1.0676, + "step": 6400 + }, + { + "epoch": 4.6398841838581255, + "grad_norm": 0.7936127781867981, + "learning_rate": 0.0002, + "loss": 1.0815, + "step": 6410 + }, + { + "epoch": 4.647122692725299, + "grad_norm": 0.9213966131210327, + "learning_rate": 0.0002, + "loss": 1.0198, + "step": 6420 + }, + { + "epoch": 4.654361201592472, + "grad_norm": 0.9246473908424377, + "learning_rate": 0.0002, + "loss": 0.9872, + "step": 6430 + }, + { + "epoch": 4.661599710459646, + "grad_norm": 0.766572892665863, + "learning_rate": 0.0002, + "loss": 1.1309, + "step": 6440 + }, + { + "epoch": 4.668838219326819, + "grad_norm": 0.8596171736717224, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 6450 + }, + { + "epoch": 4.676076728193992, + "grad_norm": 0.8482751846313477, + "learning_rate": 0.0002, + "loss": 1.1869, + "step": 6460 + }, + { + "epoch": 4.683315237061166, + "grad_norm": 1.0826905965805054, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 6470 + }, + { + "epoch": 4.690553745928339, + "grad_norm": 1.1048457622528076, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 6480 + }, + { + "epoch": 4.697792254795512, + "grad_norm": 0.9429134726524353, + "learning_rate": 0.0002, + "loss": 1.0514, + "step": 6490 + }, + { + "epoch": 4.705030763662686, + "grad_norm": 0.8587502837181091, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 6500 + }, + { + "epoch": 4.712269272529859, + "grad_norm": 1.0387083292007446, + "learning_rate": 0.0002, + "loss": 1.0969, + "step": 6510 + }, + { + "epoch": 4.7195077813970325, + "grad_norm": 0.7471951842308044, + "learning_rate": 0.0002, + "loss": 1.0493, + "step": 6520 + }, + { + "epoch": 4.726746290264206, + "grad_norm": 0.8800424933433533, + "learning_rate": 0.0002, + "loss": 1.2632, + "step": 6530 + }, + { + "epoch": 4.733984799131379, + "grad_norm": 0.8136811852455139, + "learning_rate": 0.0002, + "loss": 1.2126, + "step": 6540 + }, + { + "epoch": 4.741223307998553, + "grad_norm": 0.9910339713096619, + "learning_rate": 0.0002, + "loss": 1.195, + "step": 6550 + }, + { + "epoch": 4.748461816865726, + "grad_norm": 1.0679163932800293, + "learning_rate": 0.0002, + "loss": 1.1201, + "step": 6560 + }, + { + "epoch": 4.755700325732899, + "grad_norm": 0.8468248248100281, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 6570 + }, + { + "epoch": 4.762938834600073, + "grad_norm": 0.8771235942840576, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 6580 + }, + { + "epoch": 4.770177343467246, + "grad_norm": 0.7024846076965332, + "learning_rate": 0.0002, + "loss": 1.077, + "step": 6590 + }, + { + "epoch": 4.7774158523344195, + "grad_norm": 0.7836683392524719, + "learning_rate": 0.0002, + "loss": 1.0876, + "step": 6600 + }, + { + "epoch": 4.784654361201593, + "grad_norm": 0.7717288136482239, + "learning_rate": 0.0002, + "loss": 1.1006, + "step": 6610 + }, + { + "epoch": 4.791892870068766, + "grad_norm": 0.884183943271637, + "learning_rate": 0.0002, + "loss": 1.0376, + "step": 6620 + }, + { + "epoch": 4.7991313789359396, + "grad_norm": 1.383867621421814, + "learning_rate": 0.0002, + "loss": 1.1757, + "step": 6630 + }, + { + "epoch": 4.806369887803113, + "grad_norm": 0.9741523861885071, + "learning_rate": 0.0002, + "loss": 1.0861, + "step": 6640 + }, + { + "epoch": 4.813608396670286, + "grad_norm": 0.9723693132400513, + "learning_rate": 0.0002, + "loss": 1.0884, + "step": 6650 + }, + { + "epoch": 4.82084690553746, + "grad_norm": 1.8324809074401855, + "learning_rate": 0.0002, + "loss": 1.2203, + "step": 6660 + }, + { + "epoch": 4.828085414404633, + "grad_norm": 0.904909074306488, + "learning_rate": 0.0002, + "loss": 1.0292, + "step": 6670 + }, + { + "epoch": 4.835323923271806, + "grad_norm": 0.7355411648750305, + "learning_rate": 0.0002, + "loss": 1.0349, + "step": 6680 + }, + { + "epoch": 4.84256243213898, + "grad_norm": 0.8934960961341858, + "learning_rate": 0.0002, + "loss": 1.0793, + "step": 6690 + }, + { + "epoch": 4.849800941006153, + "grad_norm": 1.4596954584121704, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 6700 + }, + { + "epoch": 4.8570394498733265, + "grad_norm": 0.8310341238975525, + "learning_rate": 0.0002, + "loss": 1.1065, + "step": 6710 + }, + { + "epoch": 4.8642779587405, + "grad_norm": 0.9709894061088562, + "learning_rate": 0.0002, + "loss": 1.1089, + "step": 6720 + }, + { + "epoch": 4.871516467607673, + "grad_norm": 0.852142333984375, + "learning_rate": 0.0002, + "loss": 1.0069, + "step": 6730 + }, + { + "epoch": 4.878754976474847, + "grad_norm": 1.0643625259399414, + "learning_rate": 0.0002, + "loss": 1.0507, + "step": 6740 + }, + { + "epoch": 4.88599348534202, + "grad_norm": 0.9419508576393127, + "learning_rate": 0.0002, + "loss": 1.056, + "step": 6750 + }, + { + "epoch": 4.893231994209193, + "grad_norm": 1.1818498373031616, + "learning_rate": 0.0002, + "loss": 1.1995, + "step": 6760 + }, + { + "epoch": 4.900470503076367, + "grad_norm": 0.9369569420814514, + "learning_rate": 0.0002, + "loss": 1.0925, + "step": 6770 + }, + { + "epoch": 4.90770901194354, + "grad_norm": 0.7012579441070557, + "learning_rate": 0.0002, + "loss": 1.1648, + "step": 6780 + }, + { + "epoch": 4.914947520810713, + "grad_norm": 0.9109319448471069, + "learning_rate": 0.0002, + "loss": 1.0926, + "step": 6790 + }, + { + "epoch": 4.922186029677887, + "grad_norm": 0.8077534437179565, + "learning_rate": 0.0002, + "loss": 1.0358, + "step": 6800 + }, + { + "epoch": 4.92942453854506, + "grad_norm": 0.7571148872375488, + "learning_rate": 0.0002, + "loss": 1.2549, + "step": 6810 + }, + { + "epoch": 4.9366630474122335, + "grad_norm": 0.7325633764266968, + "learning_rate": 0.0002, + "loss": 0.9638, + "step": 6820 + }, + { + "epoch": 4.943901556279407, + "grad_norm": 0.8465084433555603, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 6830 + }, + { + "epoch": 4.95114006514658, + "grad_norm": 0.8753737807273865, + "learning_rate": 0.0002, + "loss": 1.153, + "step": 6840 + }, + { + "epoch": 4.958378574013754, + "grad_norm": 0.9421748518943787, + "learning_rate": 0.0002, + "loss": 1.0247, + "step": 6850 + }, + { + "epoch": 4.965617082880927, + "grad_norm": 0.8245896697044373, + "learning_rate": 0.0002, + "loss": 1.1483, + "step": 6860 + }, + { + "epoch": 4.9728555917481, + "grad_norm": 0.8823089599609375, + "learning_rate": 0.0002, + "loss": 0.9905, + "step": 6870 + }, + { + "epoch": 4.980094100615274, + "grad_norm": 0.8406389355659485, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 6880 + }, + { + "epoch": 4.987332609482447, + "grad_norm": 0.9732868075370789, + "learning_rate": 0.0002, + "loss": 1.0944, + "step": 6890 + }, + { + "epoch": 4.99457111834962, + "grad_norm": 2.125141143798828, + "learning_rate": 0.0002, + "loss": 1.1776, + "step": 6900 + }, + { + "epoch": 4.999638074556641, + "eval_loss": 1.445176601409912, + "eval_runtime": 27.2351, + "eval_samples_per_second": 16.009, + "eval_steps_per_second": 2.019, + "step": 6907 + }, + { + "epoch": 5.001809627216793, + "grad_norm": 0.9465792775154114, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 6910 + }, + { + "epoch": 5.009048136083966, + "grad_norm": 1.2834891080856323, + "learning_rate": 0.0002, + "loss": 0.982, + "step": 6920 + }, + { + "epoch": 5.01628664495114, + "grad_norm": 1.0297378301620483, + "learning_rate": 0.0002, + "loss": 0.9803, + "step": 6930 + }, + { + "epoch": 5.023525153818313, + "grad_norm": 1.1705161333084106, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 6940 + }, + { + "epoch": 5.030763662685486, + "grad_norm": 0.8293961882591248, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 6950 + }, + { + "epoch": 5.03800217155266, + "grad_norm": 1.0422210693359375, + "learning_rate": 0.0002, + "loss": 0.9203, + "step": 6960 + }, + { + "epoch": 5.045240680419833, + "grad_norm": 1.116104245185852, + "learning_rate": 0.0002, + "loss": 1.0553, + "step": 6970 + }, + { + "epoch": 5.0524791892870065, + "grad_norm": 1.5118416547775269, + "learning_rate": 0.0002, + "loss": 0.9011, + "step": 6980 + }, + { + "epoch": 5.05971769815418, + "grad_norm": 0.8383979797363281, + "learning_rate": 0.0002, + "loss": 0.9969, + "step": 6990 + }, + { + "epoch": 5.066956207021353, + "grad_norm": 1.3378649950027466, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 7000 + }, + { + "epoch": 5.0741947158885266, + "grad_norm": 1.1840510368347168, + "learning_rate": 0.0002, + "loss": 1.0212, + "step": 7010 + }, + { + "epoch": 5.0814332247557, + "grad_norm": 1.2354751825332642, + "learning_rate": 0.0002, + "loss": 0.9939, + "step": 7020 + }, + { + "epoch": 5.088671733622873, + "grad_norm": 1.3830451965332031, + "learning_rate": 0.0002, + "loss": 0.9831, + "step": 7030 + }, + { + "epoch": 5.095910242490047, + "grad_norm": 0.8101674318313599, + "learning_rate": 0.0002, + "loss": 1.1827, + "step": 7040 + }, + { + "epoch": 5.10314875135722, + "grad_norm": 0.897982656955719, + "learning_rate": 0.0002, + "loss": 0.9255, + "step": 7050 + }, + { + "epoch": 5.110387260224393, + "grad_norm": 1.2049678564071655, + "learning_rate": 0.0002, + "loss": 0.8784, + "step": 7060 + }, + { + "epoch": 5.117625769091567, + "grad_norm": 1.5912116765975952, + "learning_rate": 0.0002, + "loss": 1.0182, + "step": 7070 + }, + { + "epoch": 5.12486427795874, + "grad_norm": 0.9261530041694641, + "learning_rate": 0.0002, + "loss": 1.0909, + "step": 7080 + }, + { + "epoch": 5.1321027868259135, + "grad_norm": 1.1454812288284302, + "learning_rate": 0.0002, + "loss": 0.9603, + "step": 7090 + }, + { + "epoch": 5.139341295693087, + "grad_norm": 1.0049978494644165, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 7100 + }, + { + "epoch": 5.14657980456026, + "grad_norm": 1.4513251781463623, + "learning_rate": 0.0002, + "loss": 0.9463, + "step": 7110 + }, + { + "epoch": 5.153818313427434, + "grad_norm": 0.9800849556922913, + "learning_rate": 0.0002, + "loss": 0.8995, + "step": 7120 + }, + { + "epoch": 5.161056822294607, + "grad_norm": 0.9698708653450012, + "learning_rate": 0.0002, + "loss": 0.9835, + "step": 7130 + }, + { + "epoch": 5.16829533116178, + "grad_norm": 1.1126646995544434, + "learning_rate": 0.0002, + "loss": 0.9672, + "step": 7140 + }, + { + "epoch": 5.175533840028954, + "grad_norm": 0.9248330593109131, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 7150 + }, + { + "epoch": 5.182772348896127, + "grad_norm": 0.7967255711555481, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 7160 + }, + { + "epoch": 5.1900108577633, + "grad_norm": 0.9933333992958069, + "learning_rate": 0.0002, + "loss": 1.0078, + "step": 7170 + }, + { + "epoch": 5.197249366630474, + "grad_norm": 1.0080649852752686, + "learning_rate": 0.0002, + "loss": 1.0276, + "step": 7180 + }, + { + "epoch": 5.204487875497647, + "grad_norm": 1.3954921960830688, + "learning_rate": 0.0002, + "loss": 1.0201, + "step": 7190 + }, + { + "epoch": 5.2117263843648205, + "grad_norm": 1.2386271953582764, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 7200 + }, + { + "epoch": 5.218964893231994, + "grad_norm": 1.2379488945007324, + "learning_rate": 0.0002, + "loss": 0.8863, + "step": 7210 + }, + { + "epoch": 5.226203402099167, + "grad_norm": 0.9882503747940063, + "learning_rate": 0.0002, + "loss": 1.0518, + "step": 7220 + }, + { + "epoch": 5.233441910966341, + "grad_norm": 1.1728729009628296, + "learning_rate": 0.0002, + "loss": 0.9834, + "step": 7230 + }, + { + "epoch": 5.240680419833514, + "grad_norm": 0.9849673509597778, + "learning_rate": 0.0002, + "loss": 0.9269, + "step": 7240 + }, + { + "epoch": 5.247918928700687, + "grad_norm": 1.177639365196228, + "learning_rate": 0.0002, + "loss": 0.9935, + "step": 7250 + }, + { + "epoch": 5.255157437567861, + "grad_norm": 1.2395055294036865, + "learning_rate": 0.0002, + "loss": 1.0639, + "step": 7260 + }, + { + "epoch": 5.262395946435034, + "grad_norm": 1.3999171257019043, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 7270 + }, + { + "epoch": 5.269634455302207, + "grad_norm": 0.7698732018470764, + "learning_rate": 0.0002, + "loss": 0.9745, + "step": 7280 + }, + { + "epoch": 5.276872964169381, + "grad_norm": 0.9167453646659851, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 7290 + }, + { + "epoch": 5.284111473036554, + "grad_norm": 1.113830804824829, + "learning_rate": 0.0002, + "loss": 0.9858, + "step": 7300 + }, + { + "epoch": 5.2913499819037275, + "grad_norm": 0.9644396901130676, + "learning_rate": 0.0002, + "loss": 0.9577, + "step": 7310 + }, + { + "epoch": 5.298588490770901, + "grad_norm": 1.462435007095337, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 7320 + }, + { + "epoch": 5.305826999638074, + "grad_norm": 0.9406287670135498, + "learning_rate": 0.0002, + "loss": 0.871, + "step": 7330 + }, + { + "epoch": 5.313065508505248, + "grad_norm": 0.9698247909545898, + "learning_rate": 0.0002, + "loss": 1.0022, + "step": 7340 + }, + { + "epoch": 5.320304017372421, + "grad_norm": 1.12003755569458, + "learning_rate": 0.0002, + "loss": 0.915, + "step": 7350 + }, + { + "epoch": 5.327542526239594, + "grad_norm": 1.598681926727295, + "learning_rate": 0.0002, + "loss": 0.9838, + "step": 7360 + }, + { + "epoch": 5.334781035106768, + "grad_norm": 1.0450010299682617, + "learning_rate": 0.0002, + "loss": 1.0, + "step": 7370 + }, + { + "epoch": 5.342019543973941, + "grad_norm": 0.8680008053779602, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 7380 + }, + { + "epoch": 5.349258052841114, + "grad_norm": 1.0115476846694946, + "learning_rate": 0.0002, + "loss": 0.9851, + "step": 7390 + }, + { + "epoch": 5.356496561708288, + "grad_norm": 0.9589748382568359, + "learning_rate": 0.0002, + "loss": 1.0702, + "step": 7400 + }, + { + "epoch": 5.363735070575461, + "grad_norm": 0.6729998588562012, + "learning_rate": 0.0002, + "loss": 0.9366, + "step": 7410 + }, + { + "epoch": 5.3709735794426345, + "grad_norm": 0.9246699213981628, + "learning_rate": 0.0002, + "loss": 1.0126, + "step": 7420 + }, + { + "epoch": 5.378212088309808, + "grad_norm": 1.1266791820526123, + "learning_rate": 0.0002, + "loss": 0.9815, + "step": 7430 + }, + { + "epoch": 5.385450597176981, + "grad_norm": 1.8056942224502563, + "learning_rate": 0.0002, + "loss": 1.1166, + "step": 7440 + }, + { + "epoch": 5.392689106044155, + "grad_norm": 0.9802932739257812, + "learning_rate": 0.0002, + "loss": 0.9604, + "step": 7450 + }, + { + "epoch": 5.399927614911328, + "grad_norm": 1.0504707098007202, + "learning_rate": 0.0002, + "loss": 0.9656, + "step": 7460 + }, + { + "epoch": 5.407166123778501, + "grad_norm": 1.1915022134780884, + "learning_rate": 0.0002, + "loss": 1.0132, + "step": 7470 + }, + { + "epoch": 5.414404632645675, + "grad_norm": 1.1856611967086792, + "learning_rate": 0.0002, + "loss": 1.0041, + "step": 7480 + }, + { + "epoch": 5.421643141512848, + "grad_norm": 1.292152762413025, + "learning_rate": 0.0002, + "loss": 0.9747, + "step": 7490 + }, + { + "epoch": 5.4288816503800215, + "grad_norm": 1.2675740718841553, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 7500 + }, + { + "epoch": 5.436120159247195, + "grad_norm": 1.4034695625305176, + "learning_rate": 0.0002, + "loss": 1.0271, + "step": 7510 + }, + { + "epoch": 5.443358668114368, + "grad_norm": 0.984588623046875, + "learning_rate": 0.0002, + "loss": 1.0318, + "step": 7520 + }, + { + "epoch": 5.450597176981542, + "grad_norm": 0.8419108390808105, + "learning_rate": 0.0002, + "loss": 1.0726, + "step": 7530 + }, + { + "epoch": 5.457835685848715, + "grad_norm": 1.0270143747329712, + "learning_rate": 0.0002, + "loss": 1.0499, + "step": 7540 + }, + { + "epoch": 5.465074194715888, + "grad_norm": 2.2158689498901367, + "learning_rate": 0.0002, + "loss": 0.9804, + "step": 7550 + }, + { + "epoch": 5.472312703583062, + "grad_norm": 1.0740524530410767, + "learning_rate": 0.0002, + "loss": 0.9856, + "step": 7560 + }, + { + "epoch": 5.479551212450235, + "grad_norm": 1.3804482221603394, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 7570 + }, + { + "epoch": 5.486789721317408, + "grad_norm": 0.9428979754447937, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 7580 + }, + { + "epoch": 5.494028230184582, + "grad_norm": 0.9548295736312866, + "learning_rate": 0.0002, + "loss": 1.0906, + "step": 7590 + }, + { + "epoch": 5.501266739051755, + "grad_norm": 1.0691065788269043, + "learning_rate": 0.0002, + "loss": 0.8853, + "step": 7600 + }, + { + "epoch": 5.5085052479189285, + "grad_norm": 1.0987380743026733, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 7610 + }, + { + "epoch": 5.515743756786102, + "grad_norm": 0.9483979344367981, + "learning_rate": 0.0002, + "loss": 1.0162, + "step": 7620 + }, + { + "epoch": 5.522982265653275, + "grad_norm": 1.16624915599823, + "learning_rate": 0.0002, + "loss": 1.105, + "step": 7630 + }, + { + "epoch": 5.530220774520449, + "grad_norm": 0.8563777208328247, + "learning_rate": 0.0002, + "loss": 0.8695, + "step": 7640 + }, + { + "epoch": 5.537459283387622, + "grad_norm": 1.268186092376709, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 7650 + }, + { + "epoch": 5.544697792254795, + "grad_norm": 1.0752092599868774, + "learning_rate": 0.0002, + "loss": 1.1152, + "step": 7660 + }, + { + "epoch": 5.551936301121969, + "grad_norm": 1.210389256477356, + "learning_rate": 0.0002, + "loss": 0.9344, + "step": 7670 + }, + { + "epoch": 5.559174809989142, + "grad_norm": 1.669063925743103, + "learning_rate": 0.0002, + "loss": 1.0349, + "step": 7680 + }, + { + "epoch": 5.566413318856315, + "grad_norm": 1.038020133972168, + "learning_rate": 0.0002, + "loss": 0.9833, + "step": 7690 + }, + { + "epoch": 5.573651827723489, + "grad_norm": 1.316673994064331, + "learning_rate": 0.0002, + "loss": 0.8907, + "step": 7700 + }, + { + "epoch": 5.580890336590662, + "grad_norm": 1.029935359954834, + "learning_rate": 0.0002, + "loss": 0.9614, + "step": 7710 + }, + { + "epoch": 5.5881288454578355, + "grad_norm": 0.9401940703392029, + "learning_rate": 0.0002, + "loss": 1.0409, + "step": 7720 + }, + { + "epoch": 5.595367354325009, + "grad_norm": 2.4811816215515137, + "learning_rate": 0.0002, + "loss": 0.9272, + "step": 7730 + }, + { + "epoch": 5.602605863192182, + "grad_norm": 1.0329105854034424, + "learning_rate": 0.0002, + "loss": 0.992, + "step": 7740 + }, + { + "epoch": 5.609844372059356, + "grad_norm": 1.479629635810852, + "learning_rate": 0.0002, + "loss": 0.9493, + "step": 7750 + }, + { + "epoch": 5.617082880926529, + "grad_norm": 1.9232319593429565, + "learning_rate": 0.0002, + "loss": 1.0727, + "step": 7760 + }, + { + "epoch": 5.624321389793702, + "grad_norm": 1.0055509805679321, + "learning_rate": 0.0002, + "loss": 1.0741, + "step": 7770 + }, + { + "epoch": 5.631559898660876, + "grad_norm": 1.0037437677383423, + "learning_rate": 0.0002, + "loss": 1.0731, + "step": 7780 + }, + { + "epoch": 5.638798407528049, + "grad_norm": 1.4245030879974365, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 7790 + }, + { + "epoch": 5.646036916395222, + "grad_norm": 1.080687403678894, + "learning_rate": 0.0002, + "loss": 0.9711, + "step": 7800 + }, + { + "epoch": 5.653275425262396, + "grad_norm": 1.354953408241272, + "learning_rate": 0.0002, + "loss": 1.0276, + "step": 7810 + }, + { + "epoch": 5.660513934129569, + "grad_norm": 0.8966761231422424, + "learning_rate": 0.0002, + "loss": 1.0534, + "step": 7820 + }, + { + "epoch": 5.6677524429967425, + "grad_norm": 1.0675480365753174, + "learning_rate": 0.0002, + "loss": 1.0662, + "step": 7830 + }, + { + "epoch": 5.674990951863916, + "grad_norm": 1.2104216814041138, + "learning_rate": 0.0002, + "loss": 1.1077, + "step": 7840 + }, + { + "epoch": 5.682229460731089, + "grad_norm": 1.105790376663208, + "learning_rate": 0.0002, + "loss": 0.9627, + "step": 7850 + }, + { + "epoch": 5.689467969598263, + "grad_norm": 1.0915391445159912, + "learning_rate": 0.0002, + "loss": 1.0483, + "step": 7860 + }, + { + "epoch": 5.696706478465436, + "grad_norm": 0.8957812786102295, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 7870 + }, + { + "epoch": 5.703944987332609, + "grad_norm": 1.9189311265945435, + "learning_rate": 0.0002, + "loss": 0.9785, + "step": 7880 + }, + { + "epoch": 5.711183496199783, + "grad_norm": 1.0867321491241455, + "learning_rate": 0.0002, + "loss": 1.0076, + "step": 7890 + }, + { + "epoch": 5.718422005066956, + "grad_norm": 1.0233147144317627, + "learning_rate": 0.0002, + "loss": 1.0236, + "step": 7900 + }, + { + "epoch": 5.7256605139341294, + "grad_norm": 1.16460382938385, + "learning_rate": 0.0002, + "loss": 0.9872, + "step": 7910 + }, + { + "epoch": 5.732899022801303, + "grad_norm": 1.1098358631134033, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 7920 + }, + { + "epoch": 5.740137531668476, + "grad_norm": 0.8555701375007629, + "learning_rate": 0.0002, + "loss": 0.9937, + "step": 7930 + }, + { + "epoch": 5.7473760405356495, + "grad_norm": 0.9885705709457397, + "learning_rate": 0.0002, + "loss": 1.0081, + "step": 7940 + }, + { + "epoch": 5.754614549402823, + "grad_norm": 0.9184203147888184, + "learning_rate": 0.0002, + "loss": 0.9909, + "step": 7950 + }, + { + "epoch": 5.761853058269996, + "grad_norm": 0.9653698205947876, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 7960 + }, + { + "epoch": 5.76909156713717, + "grad_norm": 1.0014251470565796, + "learning_rate": 0.0002, + "loss": 0.9317, + "step": 7970 + }, + { + "epoch": 5.776330076004343, + "grad_norm": 1.004701018333435, + "learning_rate": 0.0002, + "loss": 1.0271, + "step": 7980 + }, + { + "epoch": 5.783568584871516, + "grad_norm": 0.950577974319458, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 7990 + }, + { + "epoch": 5.79080709373869, + "grad_norm": 1.2986834049224854, + "learning_rate": 0.0002, + "loss": 0.9725, + "step": 8000 + }, + { + "epoch": 5.798045602605863, + "grad_norm": 1.3353424072265625, + "learning_rate": 0.0002, + "loss": 1.039, + "step": 8010 + }, + { + "epoch": 5.8052841114730365, + "grad_norm": 0.7650562524795532, + "learning_rate": 0.0002, + "loss": 1.0626, + "step": 8020 + }, + { + "epoch": 5.81252262034021, + "grad_norm": 1.0156235694885254, + "learning_rate": 0.0002, + "loss": 1.0802, + "step": 8030 + }, + { + "epoch": 5.819761129207383, + "grad_norm": 1.3092900514602661, + "learning_rate": 0.0002, + "loss": 1.0185, + "step": 8040 + }, + { + "epoch": 5.826999638074557, + "grad_norm": 1.184428095817566, + "learning_rate": 0.0002, + "loss": 0.9905, + "step": 8050 + }, + { + "epoch": 5.83423814694173, + "grad_norm": 0.979401707649231, + "learning_rate": 0.0002, + "loss": 1.0548, + "step": 8060 + }, + { + "epoch": 5.841476655808903, + "grad_norm": 1.3557400703430176, + "learning_rate": 0.0002, + "loss": 0.9721, + "step": 8070 + }, + { + "epoch": 5.848715164676077, + "grad_norm": 0.8429333567619324, + "learning_rate": 0.0002, + "loss": 1.0235, + "step": 8080 + }, + { + "epoch": 5.85595367354325, + "grad_norm": 1.3167692422866821, + "learning_rate": 0.0002, + "loss": 0.952, + "step": 8090 + }, + { + "epoch": 5.863192182410423, + "grad_norm": 0.9750998020172119, + "learning_rate": 0.0002, + "loss": 0.9609, + "step": 8100 + }, + { + "epoch": 5.870430691277597, + "grad_norm": 1.1869813203811646, + "learning_rate": 0.0002, + "loss": 1.0789, + "step": 8110 + }, + { + "epoch": 5.87766920014477, + "grad_norm": 1.508615255355835, + "learning_rate": 0.0002, + "loss": 1.0331, + "step": 8120 + }, + { + "epoch": 5.8849077090119435, + "grad_norm": 0.9439908266067505, + "learning_rate": 0.0002, + "loss": 1.0171, + "step": 8130 + }, + { + "epoch": 5.892146217879117, + "grad_norm": 0.910508930683136, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 8140 + }, + { + "epoch": 5.89938472674629, + "grad_norm": 1.111501932144165, + "learning_rate": 0.0002, + "loss": 1.0032, + "step": 8150 + }, + { + "epoch": 5.906623235613464, + "grad_norm": 0.726554274559021, + "learning_rate": 0.0002, + "loss": 1.0266, + "step": 8160 + }, + { + "epoch": 5.913861744480637, + "grad_norm": 1.1084556579589844, + "learning_rate": 0.0002, + "loss": 1.0681, + "step": 8170 + }, + { + "epoch": 5.92110025334781, + "grad_norm": 0.9695167541503906, + "learning_rate": 0.0002, + "loss": 0.969, + "step": 8180 + }, + { + "epoch": 5.928338762214984, + "grad_norm": 1.1169592142105103, + "learning_rate": 0.0002, + "loss": 0.9858, + "step": 8190 + }, + { + "epoch": 5.935577271082157, + "grad_norm": 1.5116780996322632, + "learning_rate": 0.0002, + "loss": 1.0924, + "step": 8200 + }, + { + "epoch": 5.94281577994933, + "grad_norm": 1.0073388814926147, + "learning_rate": 0.0002, + "loss": 0.878, + "step": 8210 + }, + { + "epoch": 5.950054288816504, + "grad_norm": 0.9323263168334961, + "learning_rate": 0.0002, + "loss": 1.0462, + "step": 8220 + }, + { + "epoch": 5.957292797683677, + "grad_norm": 0.9422887563705444, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 8230 + }, + { + "epoch": 5.9645313065508505, + "grad_norm": 0.9691047668457031, + "learning_rate": 0.0002, + "loss": 0.953, + "step": 8240 + }, + { + "epoch": 5.971769815418024, + "grad_norm": 0.9650622606277466, + "learning_rate": 0.0002, + "loss": 0.9842, + "step": 8250 + }, + { + "epoch": 5.979008324285197, + "grad_norm": 1.077958345413208, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 8260 + }, + { + "epoch": 5.986246833152371, + "grad_norm": 0.8946306109428406, + "learning_rate": 0.0002, + "loss": 0.9162, + "step": 8270 + }, + { + "epoch": 5.993485342019544, + "grad_norm": 1.34098219871521, + "learning_rate": 0.0002, + "loss": 1.0439, + "step": 8280 + }, + { + "epoch": 6.0, + "eval_loss": 1.4714229106903076, + "eval_runtime": 26.301, + "eval_samples_per_second": 16.577, + "eval_steps_per_second": 2.091, + "step": 8289 + }, + { + "epoch": 6.000723850886717, + "grad_norm": 0.9737564325332642, + "learning_rate": 0.0002, + "loss": 1.1403, + "step": 8290 + }, + { + "epoch": 6.007962359753891, + "grad_norm": 1.2205945253372192, + "learning_rate": 0.0002, + "loss": 0.8875, + "step": 8300 + }, + { + "epoch": 6.015200868621064, + "grad_norm": 1.3529434204101562, + "learning_rate": 0.0002, + "loss": 0.8623, + "step": 8310 + }, + { + "epoch": 6.022439377488237, + "grad_norm": 1.2300174236297607, + "learning_rate": 0.0002, + "loss": 0.9427, + "step": 8320 + }, + { + "epoch": 6.029677886355411, + "grad_norm": 0.9248194098472595, + "learning_rate": 0.0002, + "loss": 0.9322, + "step": 8330 + }, + { + "epoch": 6.036916395222584, + "grad_norm": 1.1140035390853882, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 8340 + }, + { + "epoch": 6.0441549040897575, + "grad_norm": 1.2097352743148804, + "learning_rate": 0.0002, + "loss": 0.8255, + "step": 8350 + }, + { + "epoch": 6.051393412956931, + "grad_norm": 0.9472483396530151, + "learning_rate": 0.0002, + "loss": 0.8792, + "step": 8360 + }, + { + "epoch": 6.058631921824104, + "grad_norm": 1.0195368528366089, + "learning_rate": 0.0002, + "loss": 0.8865, + "step": 8370 + }, + { + "epoch": 6.065870430691278, + "grad_norm": 1.182735562324524, + "learning_rate": 0.0002, + "loss": 0.8858, + "step": 8380 + }, + { + "epoch": 6.073108939558451, + "grad_norm": 1.1042858362197876, + "learning_rate": 0.0002, + "loss": 0.9455, + "step": 8390 + }, + { + "epoch": 6.080347448425624, + "grad_norm": 0.8606401085853577, + "learning_rate": 0.0002, + "loss": 0.9723, + "step": 8400 + }, + { + "epoch": 6.087585957292798, + "grad_norm": 1.1015676259994507, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 8410 + }, + { + "epoch": 6.094824466159971, + "grad_norm": 1.690224289894104, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 8420 + }, + { + "epoch": 6.1020629750271445, + "grad_norm": 1.1928749084472656, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 8430 + }, + { + "epoch": 6.109301483894318, + "grad_norm": 1.0816864967346191, + "learning_rate": 0.0002, + "loss": 0.9546, + "step": 8440 + }, + { + "epoch": 6.116539992761491, + "grad_norm": 1.1638226509094238, + "learning_rate": 0.0002, + "loss": 0.8286, + "step": 8450 + }, + { + "epoch": 6.1237785016286646, + "grad_norm": 1.3782968521118164, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 8460 + }, + { + "epoch": 6.131017010495838, + "grad_norm": 1.2030094861984253, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 8470 + }, + { + "epoch": 6.138255519363011, + "grad_norm": 1.3227659463882446, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 8480 + }, + { + "epoch": 6.145494028230185, + "grad_norm": 1.104384422302246, + "learning_rate": 0.0002, + "loss": 0.9175, + "step": 8490 + }, + { + "epoch": 6.152732537097358, + "grad_norm": 1.518805980682373, + "learning_rate": 0.0002, + "loss": 0.861, + "step": 8500 + }, + { + "epoch": 6.159971045964531, + "grad_norm": 1.2029093503952026, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 8510 + }, + { + "epoch": 6.167209554831705, + "grad_norm": 1.2991217374801636, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 8520 + }, + { + "epoch": 6.174448063698878, + "grad_norm": 1.7002956867218018, + "learning_rate": 0.0002, + "loss": 0.9748, + "step": 8530 + }, + { + "epoch": 6.1816865725660515, + "grad_norm": 1.6653581857681274, + "learning_rate": 0.0002, + "loss": 0.8881, + "step": 8540 + }, + { + "epoch": 6.188925081433225, + "grad_norm": 1.0493303537368774, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 8550 + }, + { + "epoch": 6.196163590300398, + "grad_norm": 1.539345622062683, + "learning_rate": 0.0002, + "loss": 0.8726, + "step": 8560 + }, + { + "epoch": 6.203402099167572, + "grad_norm": 1.2757070064544678, + "learning_rate": 0.0002, + "loss": 0.9452, + "step": 8570 + }, + { + "epoch": 6.210640608034745, + "grad_norm": 1.2416890859603882, + "learning_rate": 0.0002, + "loss": 0.8773, + "step": 8580 + }, + { + "epoch": 6.217879116901918, + "grad_norm": 1.617621898651123, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 8590 + }, + { + "epoch": 6.225117625769092, + "grad_norm": 1.058962106704712, + "learning_rate": 0.0002, + "loss": 0.9137, + "step": 8600 + }, + { + "epoch": 6.232356134636265, + "grad_norm": 1.1489088535308838, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 8610 + }, + { + "epoch": 6.239594643503438, + "grad_norm": 0.9391577243804932, + "learning_rate": 0.0002, + "loss": 0.9476, + "step": 8620 + }, + { + "epoch": 6.246833152370612, + "grad_norm": 1.363706111907959, + "learning_rate": 0.0002, + "loss": 0.932, + "step": 8630 + }, + { + "epoch": 6.254071661237785, + "grad_norm": 0.779502809047699, + "learning_rate": 0.0002, + "loss": 0.8917, + "step": 8640 + }, + { + "epoch": 6.2613101701049585, + "grad_norm": 2.000821590423584, + "learning_rate": 0.0002, + "loss": 0.9196, + "step": 8650 + }, + { + "epoch": 6.268548678972132, + "grad_norm": 1.1521023511886597, + "learning_rate": 0.0002, + "loss": 0.9794, + "step": 8660 + }, + { + "epoch": 6.275787187839305, + "grad_norm": 1.3734570741653442, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 8670 + }, + { + "epoch": 6.283025696706479, + "grad_norm": 0.9550670385360718, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 8680 + }, + { + "epoch": 6.290264205573652, + "grad_norm": 0.8937032222747803, + "learning_rate": 0.0002, + "loss": 0.9049, + "step": 8690 + }, + { + "epoch": 6.297502714440825, + "grad_norm": 1.3352779150009155, + "learning_rate": 0.0002, + "loss": 0.8526, + "step": 8700 + }, + { + "epoch": 6.304741223307999, + "grad_norm": 1.3057222366333008, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 8710 + }, + { + "epoch": 6.311979732175172, + "grad_norm": 0.9078314304351807, + "learning_rate": 0.0002, + "loss": 0.8825, + "step": 8720 + }, + { + "epoch": 6.319218241042345, + "grad_norm": 1.6663457155227661, + "learning_rate": 0.0002, + "loss": 0.8666, + "step": 8730 + }, + { + "epoch": 6.326456749909519, + "grad_norm": 1.2043739557266235, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 8740 + }, + { + "epoch": 6.333695258776692, + "grad_norm": 0.9165967702865601, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 8750 + }, + { + "epoch": 6.3409337676438655, + "grad_norm": 1.016452670097351, + "learning_rate": 0.0002, + "loss": 0.9761, + "step": 8760 + }, + { + "epoch": 6.348172276511039, + "grad_norm": 1.2209261655807495, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 8770 + }, + { + "epoch": 6.355410785378212, + "grad_norm": 1.3380663394927979, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 8780 + }, + { + "epoch": 6.362649294245386, + "grad_norm": 2.3311562538146973, + "learning_rate": 0.0002, + "loss": 0.9553, + "step": 8790 + }, + { + "epoch": 6.369887803112559, + "grad_norm": 1.0330604314804077, + "learning_rate": 0.0002, + "loss": 0.8676, + "step": 8800 + }, + { + "epoch": 6.377126311979732, + "grad_norm": 0.9655511975288391, + "learning_rate": 0.0002, + "loss": 0.98, + "step": 8810 + }, + { + "epoch": 6.384364820846906, + "grad_norm": 1.1065765619277954, + "learning_rate": 0.0002, + "loss": 1.0324, + "step": 8820 + }, + { + "epoch": 6.391603329714079, + "grad_norm": 1.2631285190582275, + "learning_rate": 0.0002, + "loss": 1.0078, + "step": 8830 + }, + { + "epoch": 6.398841838581252, + "grad_norm": 0.92459636926651, + "learning_rate": 0.0002, + "loss": 0.8989, + "step": 8840 + }, + { + "epoch": 6.406080347448426, + "grad_norm": 0.9982633590698242, + "learning_rate": 0.0002, + "loss": 0.8536, + "step": 8850 + }, + { + "epoch": 6.413318856315599, + "grad_norm": 1.0746768712997437, + "learning_rate": 0.0002, + "loss": 0.8949, + "step": 8860 + }, + { + "epoch": 6.4205573651827725, + "grad_norm": 1.3024073839187622, + "learning_rate": 0.0002, + "loss": 0.8547, + "step": 8870 + }, + { + "epoch": 6.427795874049946, + "grad_norm": 1.2764527797698975, + "learning_rate": 0.0002, + "loss": 0.9618, + "step": 8880 + }, + { + "epoch": 6.435034382917119, + "grad_norm": 0.8318809270858765, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 8890 + }, + { + "epoch": 6.442272891784293, + "grad_norm": 1.7350783348083496, + "learning_rate": 0.0002, + "loss": 0.917, + "step": 8900 + }, + { + "epoch": 6.449511400651466, + "grad_norm": 1.3430488109588623, + "learning_rate": 0.0002, + "loss": 1.0229, + "step": 8910 + }, + { + "epoch": 6.456749909518639, + "grad_norm": 1.5907495021820068, + "learning_rate": 0.0002, + "loss": 0.9678, + "step": 8920 + }, + { + "epoch": 6.463988418385813, + "grad_norm": 1.8579202890396118, + "learning_rate": 0.0002, + "loss": 0.9639, + "step": 8930 + }, + { + "epoch": 6.471226927252986, + "grad_norm": 1.2233413457870483, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 8940 + }, + { + "epoch": 6.4784654361201595, + "grad_norm": 1.009103775024414, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 8950 + }, + { + "epoch": 6.485703944987333, + "grad_norm": 1.1265181303024292, + "learning_rate": 0.0002, + "loss": 0.8969, + "step": 8960 + }, + { + "epoch": 6.492942453854506, + "grad_norm": 1.1733338832855225, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 8970 + }, + { + "epoch": 6.50018096272168, + "grad_norm": 1.0444518327713013, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 8980 + }, + { + "epoch": 6.507419471588853, + "grad_norm": 1.2296479940414429, + "learning_rate": 0.0002, + "loss": 0.9582, + "step": 8990 + }, + { + "epoch": 6.514657980456026, + "grad_norm": 1.370417833328247, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 9000 + }, + { + "epoch": 6.5218964893232, + "grad_norm": 1.4787620306015015, + "learning_rate": 0.0002, + "loss": 0.9787, + "step": 9010 + }, + { + "epoch": 6.529134998190373, + "grad_norm": 0.8550514578819275, + "learning_rate": 0.0002, + "loss": 0.967, + "step": 9020 + }, + { + "epoch": 6.536373507057546, + "grad_norm": 1.2327991724014282, + "learning_rate": 0.0002, + "loss": 0.9755, + "step": 9030 + }, + { + "epoch": 6.54361201592472, + "grad_norm": 1.0915621519088745, + "learning_rate": 0.0002, + "loss": 0.9248, + "step": 9040 + }, + { + "epoch": 6.550850524791893, + "grad_norm": 1.7243309020996094, + "learning_rate": 0.0002, + "loss": 1.0024, + "step": 9050 + }, + { + "epoch": 6.5580890336590665, + "grad_norm": 0.954359769821167, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 9060 + }, + { + "epoch": 6.56532754252624, + "grad_norm": 1.066051959991455, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 9070 + }, + { + "epoch": 6.572566051393413, + "grad_norm": 1.200271487236023, + "learning_rate": 0.0002, + "loss": 0.944, + "step": 9080 + }, + { + "epoch": 6.579804560260587, + "grad_norm": 1.4331457614898682, + "learning_rate": 0.0002, + "loss": 0.9788, + "step": 9090 + }, + { + "epoch": 6.58704306912776, + "grad_norm": 1.0892444849014282, + "learning_rate": 0.0002, + "loss": 1.0216, + "step": 9100 + }, + { + "epoch": 6.594281577994933, + "grad_norm": 1.849726915359497, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 9110 + }, + { + "epoch": 6.601520086862107, + "grad_norm": 1.1228708028793335, + "learning_rate": 0.0002, + "loss": 0.9495, + "step": 9120 + }, + { + "epoch": 6.60875859572928, + "grad_norm": 1.0928595066070557, + "learning_rate": 0.0002, + "loss": 1.0169, + "step": 9130 + }, + { + "epoch": 6.615997104596453, + "grad_norm": 1.2138155698776245, + "learning_rate": 0.0002, + "loss": 0.9342, + "step": 9140 + }, + { + "epoch": 6.623235613463627, + "grad_norm": 1.5155235528945923, + "learning_rate": 0.0002, + "loss": 0.8715, + "step": 9150 + }, + { + "epoch": 6.6304741223308, + "grad_norm": 1.3194212913513184, + "learning_rate": 0.0002, + "loss": 0.9806, + "step": 9160 + }, + { + "epoch": 6.6377126311979735, + "grad_norm": 1.045623779296875, + "learning_rate": 0.0002, + "loss": 0.8958, + "step": 9170 + }, + { + "epoch": 6.644951140065147, + "grad_norm": 0.9647570252418518, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 9180 + }, + { + "epoch": 6.65218964893232, + "grad_norm": 1.0818220376968384, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 9190 + }, + { + "epoch": 6.659428157799494, + "grad_norm": 1.2792822122573853, + "learning_rate": 0.0002, + "loss": 0.9745, + "step": 9200 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.2764191627502441, + "learning_rate": 0.0002, + "loss": 0.8854, + "step": 9210 + }, + { + "epoch": 6.67390517553384, + "grad_norm": 1.0552066564559937, + "learning_rate": 0.0002, + "loss": 0.9709, + "step": 9220 + }, + { + "epoch": 6.681143684401014, + "grad_norm": 1.082476019859314, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 9230 + }, + { + "epoch": 6.688382193268187, + "grad_norm": 1.3313323259353638, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 9240 + }, + { + "epoch": 6.69562070213536, + "grad_norm": 1.130048394203186, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 9250 + }, + { + "epoch": 6.702859211002534, + "grad_norm": 1.1997296810150146, + "learning_rate": 0.0002, + "loss": 0.9969, + "step": 9260 + }, + { + "epoch": 6.710097719869707, + "grad_norm": 1.0591834783554077, + "learning_rate": 0.0002, + "loss": 0.8691, + "step": 9270 + }, + { + "epoch": 6.7173362287368805, + "grad_norm": 1.2722901105880737, + "learning_rate": 0.0002, + "loss": 0.9603, + "step": 9280 + }, + { + "epoch": 6.724574737604054, + "grad_norm": 1.1150950193405151, + "learning_rate": 0.0002, + "loss": 0.9227, + "step": 9290 + }, + { + "epoch": 6.731813246471227, + "grad_norm": 1.1575992107391357, + "learning_rate": 0.0002, + "loss": 0.95, + "step": 9300 + }, + { + "epoch": 6.739051755338401, + "grad_norm": 0.9371691346168518, + "learning_rate": 0.0002, + "loss": 0.9822, + "step": 9310 + }, + { + "epoch": 6.746290264205574, + "grad_norm": 1.4924226999282837, + "learning_rate": 0.0002, + "loss": 0.9773, + "step": 9320 + }, + { + "epoch": 6.753528773072747, + "grad_norm": 1.1524218320846558, + "learning_rate": 0.0002, + "loss": 0.969, + "step": 9330 + }, + { + "epoch": 6.760767281939921, + "grad_norm": 0.9500471949577332, + "learning_rate": 0.0002, + "loss": 0.9271, + "step": 9340 + }, + { + "epoch": 6.768005790807094, + "grad_norm": 1.2062290906906128, + "learning_rate": 0.0002, + "loss": 0.9029, + "step": 9350 + }, + { + "epoch": 6.7752442996742674, + "grad_norm": 1.212631106376648, + "learning_rate": 0.0002, + "loss": 0.9121, + "step": 9360 + }, + { + "epoch": 6.782482808541441, + "grad_norm": 1.9135472774505615, + "learning_rate": 0.0002, + "loss": 0.8486, + "step": 9370 + }, + { + "epoch": 6.789721317408614, + "grad_norm": 0.9682775139808655, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 9380 + }, + { + "epoch": 6.7969598262757875, + "grad_norm": 1.1405237913131714, + "learning_rate": 0.0002, + "loss": 0.8548, + "step": 9390 + }, + { + "epoch": 6.804198335142961, + "grad_norm": 1.6855751276016235, + "learning_rate": 0.0002, + "loss": 0.8922, + "step": 9400 + }, + { + "epoch": 6.811436844010134, + "grad_norm": 1.6590169668197632, + "learning_rate": 0.0002, + "loss": 0.9417, + "step": 9410 + }, + { + "epoch": 6.818675352877308, + "grad_norm": 1.8795170783996582, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 9420 + }, + { + "epoch": 6.825913861744481, + "grad_norm": 1.1087183952331543, + "learning_rate": 0.0002, + "loss": 0.9142, + "step": 9430 + }, + { + "epoch": 6.833152370611654, + "grad_norm": 1.4178446531295776, + "learning_rate": 0.0002, + "loss": 1.1427, + "step": 9440 + }, + { + "epoch": 6.840390879478828, + "grad_norm": 1.0792350769042969, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 9450 + }, + { + "epoch": 6.847629388346001, + "grad_norm": 1.2159196138381958, + "learning_rate": 0.0002, + "loss": 1.0078, + "step": 9460 + }, + { + "epoch": 6.8548678972131745, + "grad_norm": 0.9998821020126343, + "learning_rate": 0.0002, + "loss": 0.9536, + "step": 9470 + }, + { + "epoch": 6.862106406080348, + "grad_norm": 0.7940687537193298, + "learning_rate": 0.0002, + "loss": 0.9277, + "step": 9480 + }, + { + "epoch": 6.869344914947521, + "grad_norm": 0.9572826027870178, + "learning_rate": 0.0002, + "loss": 0.8612, + "step": 9490 + }, + { + "epoch": 6.876583423814694, + "grad_norm": 1.1086537837982178, + "learning_rate": 0.0002, + "loss": 0.9611, + "step": 9500 + }, + { + "epoch": 6.883821932681867, + "grad_norm": 1.1934887170791626, + "learning_rate": 0.0002, + "loss": 0.9276, + "step": 9510 + }, + { + "epoch": 6.89106044154904, + "grad_norm": 1.207324504852295, + "learning_rate": 0.0002, + "loss": 0.8416, + "step": 9520 + }, + { + "epoch": 6.898298950416214, + "grad_norm": 1.1303677558898926, + "learning_rate": 0.0002, + "loss": 0.9378, + "step": 9530 + }, + { + "epoch": 6.905537459283387, + "grad_norm": 1.4958926439285278, + "learning_rate": 0.0002, + "loss": 0.9599, + "step": 9540 + }, + { + "epoch": 6.9127759681505605, + "grad_norm": 1.2141553163528442, + "learning_rate": 0.0002, + "loss": 0.9365, + "step": 9550 + }, + { + "epoch": 6.920014477017734, + "grad_norm": 1.6544346809387207, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 9560 + }, + { + "epoch": 6.927252985884907, + "grad_norm": 1.0540320873260498, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 9570 + }, + { + "epoch": 6.934491494752081, + "grad_norm": 1.3095581531524658, + "learning_rate": 0.0002, + "loss": 0.9831, + "step": 9580 + }, + { + "epoch": 6.941730003619254, + "grad_norm": 1.4509341716766357, + "learning_rate": 0.0002, + "loss": 0.8694, + "step": 9590 + }, + { + "epoch": 6.948968512486427, + "grad_norm": 1.1091740131378174, + "learning_rate": 0.0002, + "loss": 0.983, + "step": 9600 + }, + { + "epoch": 6.956207021353601, + "grad_norm": 1.102929949760437, + "learning_rate": 0.0002, + "loss": 0.9126, + "step": 9610 + }, + { + "epoch": 6.963445530220774, + "grad_norm": 1.1377743482589722, + "learning_rate": 0.0002, + "loss": 0.9622, + "step": 9620 + }, + { + "epoch": 6.970684039087947, + "grad_norm": 1.2070361375808716, + "learning_rate": 0.0002, + "loss": 0.9045, + "step": 9630 + }, + { + "epoch": 6.977922547955121, + "grad_norm": 1.30153489112854, + "learning_rate": 0.0002, + "loss": 0.9714, + "step": 9640 + }, + { + "epoch": 6.985161056822294, + "grad_norm": 1.4641543626785278, + "learning_rate": 0.0002, + "loss": 0.9555, + "step": 9650 + }, + { + "epoch": 6.9923995656894675, + "grad_norm": 1.0497819185256958, + "learning_rate": 0.0002, + "loss": 0.9177, + "step": 9660 + }, + { + "epoch": 6.999638074556641, + "grad_norm": 1.2500354051589966, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 9670 + }, + { + "epoch": 6.999638074556641, + "eval_loss": 1.518465518951416, + "eval_runtime": 26.4525, + "eval_samples_per_second": 16.482, + "eval_steps_per_second": 2.079, + "step": 9670 + } + ], + "logging_steps": 10, + "max_steps": 11048, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1819281555862323e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4d7b7431bbbe8c9bf29b925bca391a558af5ff8c --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad613885e4f267fc04125f1a836d42cfa796bbe12e536f9ee60c955de02cdb5a +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/special_tokens_map.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/tokenizer.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/tokenizer.model b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/tokenizer_config.json b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/training_args.bin b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4d7b7431bbbe8c9bf29b925bca391a558af5ff8c --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad613885e4f267fc04125f1a836d42cfa796bbe12e536f9ee60c955de02cdb5a +size 5560 diff --git a/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/training_log.jsonl b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9adfb8801dcfa9f33ddcc9367233e4e892daecc9 --- /dev/null +++ b/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 0.9996380745566413, "step": 1381, "epoch_duration": 1362.1938304901123, "total_accumulated_duration": 1362.1938304901123, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3048.73388671875}, "peak_memory_usage": {"GPU_0": 5628.7490234375}, "avg_memory_reserved": {"GPU_0": 6182.0}, "peak_memory_reserved": {"GPU_0": 6182.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7061, "grad_norm": 1.2523442506790161, "learning_rate": 0.0002, "epoch": 0.007238508867173362, "step": 10}, {"loss": 3.3493, "grad_norm": 1.8887330293655396, "learning_rate": 0.0002, "epoch": 0.014477017734346724, "step": 20}, {"loss": 2.7585, "grad_norm": 0.9668035507202148, "learning_rate": 0.0002, "epoch": 0.021715526601520086, "step": 30}, {"loss": 2.3699, "grad_norm": 2.9167306423187256, "learning_rate": 0.0002, "epoch": 0.028954035468693448, "step": 40}, {"loss": 2.2679, "grad_norm": 2.649867296218872, "learning_rate": 0.0002, "epoch": 0.036192544335866814, "step": 50}, {"loss": 2.2202, "grad_norm": 1.5120655298233032, "learning_rate": 0.0002, "epoch": 0.04343105320304017, "step": 60}, {"loss": 2.2026, "grad_norm": 0.7879868149757385, "learning_rate": 0.0002, "epoch": 0.05066956207021354, "step": 70}, {"loss": 1.9447, "grad_norm": 0.7616953253746033, "learning_rate": 0.0002, "epoch": 0.057908070937386896, "step": 80}, {"loss": 2.0112, "grad_norm": 1.8809149265289307, "learning_rate": 0.0002, "epoch": 0.06514657980456026, "step": 90}, {"loss": 1.8337, "grad_norm": 0.9294016361236572, "learning_rate": 0.0002, "epoch": 0.07238508867173363, "step": 100}, {"loss": 1.8419, "grad_norm": 0.7145281434059143, "learning_rate": 0.0002, "epoch": 0.07962359753890698, "step": 110}, {"loss": 2.0036, "grad_norm": 0.7564446330070496, "learning_rate": 0.0002, "epoch": 0.08686210640608034, "step": 120}, {"loss": 1.9306, "grad_norm": 1.1681925058364868, "learning_rate": 0.0002, "epoch": 0.09410061527325371, "step": 130}, {"loss": 1.7875, "grad_norm": 0.6708641648292542, "learning_rate": 0.0002, "epoch": 0.10133912414042708, "step": 140}, {"loss": 1.786, "grad_norm": 0.7625647783279419, "learning_rate": 0.0002, "epoch": 0.10857763300760044, "step": 150}, {"loss": 1.6687, "grad_norm": 0.8463464975357056, "learning_rate": 0.0002, "epoch": 0.11581614187477379, "step": 160}, {"loss": 1.6214, "grad_norm": 0.7502335906028748, "learning_rate": 0.0002, "epoch": 0.12305465074194716, "step": 170}, {"loss": 1.7433, "grad_norm": 0.6929958462715149, "learning_rate": 0.0002, "epoch": 0.13029315960912052, "step": 180}, {"loss": 1.6009, "grad_norm": 0.6798707842826843, "learning_rate": 0.0002, "epoch": 0.1375316684762939, "step": 190}, {"loss": 1.6208, "grad_norm": 0.7566508650779724, "learning_rate": 0.0002, "epoch": 0.14477017734346725, "step": 200}, {"loss": 1.5823, "grad_norm": 0.7196869850158691, "learning_rate": 0.0002, "epoch": 0.15200868621064062, "step": 210}, {"loss": 1.738, "grad_norm": 0.8401045799255371, "learning_rate": 0.0002, "epoch": 0.15924719507781396, "step": 220}, {"loss": 1.7574, "grad_norm": 0.8503773212432861, "learning_rate": 0.0002, "epoch": 0.16648570394498732, "step": 230}, {"loss": 1.7861, "grad_norm": 0.7183733582496643, "learning_rate": 0.0002, "epoch": 0.1737242128121607, "step": 240}, {"loss": 1.6693, "grad_norm": 0.7082605957984924, "learning_rate": 0.0002, "epoch": 0.18096272167933405, "step": 250}, {"loss": 1.619, "grad_norm": 0.9386326670646667, "learning_rate": 0.0002, "epoch": 0.18820123054650742, "step": 260}, {"loss": 1.6511, "grad_norm": 0.7332451939582825, "learning_rate": 0.0002, "epoch": 0.19543973941368079, "step": 270}, {"loss": 1.6353, "grad_norm": 0.7092869877815247, "learning_rate": 0.0002, "epoch": 0.20267824828085415, "step": 280}, {"loss": 1.5996, "grad_norm": 0.7256413698196411, "learning_rate": 0.0002, "epoch": 0.20991675714802752, "step": 290}, {"loss": 1.6754, "grad_norm": 0.6398681402206421, "learning_rate": 0.0002, "epoch": 0.21715526601520088, "step": 300}, {"loss": 1.397, "grad_norm": 0.6273287534713745, "learning_rate": 0.0002, "epoch": 0.22439377488237422, "step": 310}, {"loss": 1.5115, "grad_norm": 0.511648416519165, "learning_rate": 0.0002, "epoch": 0.23163228374954759, "step": 320}, {"loss": 1.5424, "grad_norm": 0.8677352070808411, "learning_rate": 0.0002, "epoch": 0.23887079261672095, "step": 330}, {"loss": 1.6779, "grad_norm": 0.6270743012428284, "learning_rate": 0.0002, "epoch": 0.24610930148389432, "step": 340}, {"loss": 1.626, "grad_norm": 0.7980281114578247, "learning_rate": 0.0002, "epoch": 0.2533478103510677, "step": 350}, {"loss": 1.5238, "grad_norm": 0.632486879825592, "learning_rate": 0.0002, "epoch": 0.26058631921824105, "step": 360}, {"loss": 1.5175, "grad_norm": 0.6527034640312195, "learning_rate": 0.0002, "epoch": 0.2678248280854144, "step": 370}, {"loss": 1.627, "grad_norm": 0.7672118544578552, "learning_rate": 0.0002, "epoch": 0.2750633369525878, "step": 380}, {"loss": 1.5605, "grad_norm": 0.6035117506980896, "learning_rate": 0.0002, "epoch": 0.28230184581976114, "step": 390}, {"loss": 1.4603, "grad_norm": 0.5955103039741516, "learning_rate": 0.0002, "epoch": 0.2895403546869345, "step": 400}, {"loss": 1.558, "grad_norm": 0.6015191674232483, "learning_rate": 0.0002, "epoch": 0.2967788635541079, "step": 410}, {"loss": 1.6091, "grad_norm": 0.6380982398986816, "learning_rate": 0.0002, "epoch": 0.30401737242128124, "step": 420}, {"loss": 1.5292, "grad_norm": 0.6707863211631775, "learning_rate": 0.0002, "epoch": 0.3112558812884546, "step": 430}, {"loss": 1.4426, "grad_norm": 0.7010176777839661, "learning_rate": 0.0002, "epoch": 0.3184943901556279, "step": 440}, {"loss": 1.5572, "grad_norm": 0.8263739943504333, "learning_rate": 0.0002, "epoch": 0.3257328990228013, "step": 450}, {"loss": 1.5188, "grad_norm": 0.7253276109695435, "learning_rate": 0.0002, "epoch": 0.33297140788997465, "step": 460}, {"loss": 1.584, "grad_norm": 0.5238934755325317, "learning_rate": 0.0002, "epoch": 0.340209916757148, "step": 470}, {"loss": 1.7035, "grad_norm": 0.7869495749473572, "learning_rate": 0.0002, "epoch": 0.3474484256243214, "step": 480}, {"loss": 1.5776, "grad_norm": 0.7485215663909912, "learning_rate": 0.0002, "epoch": 0.35468693449149474, "step": 490}, {"loss": 1.6274, "grad_norm": 0.5413193106651306, "learning_rate": 0.0002, "epoch": 0.3619254433586681, "step": 500}, {"loss": 1.7323, "grad_norm": 0.7615048885345459, "learning_rate": 0.0002, "epoch": 0.3691639522258415, "step": 510}, {"loss": 1.532, "grad_norm": 0.7685340046882629, "learning_rate": 0.0002, "epoch": 0.37640246109301484, "step": 520}, {"loss": 1.6312, "grad_norm": 0.6379081010818481, "learning_rate": 0.0002, "epoch": 0.3836409699601882, "step": 530}, {"loss": 1.5645, "grad_norm": 0.7946939468383789, "learning_rate": 0.0002, "epoch": 0.39087947882736157, "step": 540}, {"loss": 1.4001, "grad_norm": 0.6287278532981873, "learning_rate": 0.0002, "epoch": 0.39811798769453494, "step": 550}, {"loss": 1.5982, "grad_norm": 0.6811642646789551, "learning_rate": 0.0002, "epoch": 0.4053564965617083, "step": 560}, {"loss": 1.4953, "grad_norm": 0.671073317527771, "learning_rate": 0.0002, "epoch": 0.41259500542888167, "step": 570}, {"loss": 1.6753, "grad_norm": 0.6313900351524353, "learning_rate": 0.0002, "epoch": 0.41983351429605503, "step": 580}, {"loss": 1.546, "grad_norm": 0.5291772484779358, "learning_rate": 0.0002, "epoch": 0.4270720231632284, "step": 590}, {"loss": 1.5441, "grad_norm": 0.62503582239151, "learning_rate": 0.0002, "epoch": 0.43431053203040176, "step": 600}, {"loss": 1.6276, "grad_norm": 0.5777305364608765, "learning_rate": 0.0002, "epoch": 0.4415490408975751, "step": 610}, {"loss": 1.4758, "grad_norm": 0.7013497352600098, "learning_rate": 0.0002, "epoch": 0.44878754976474844, "step": 620}, {"loss": 1.4029, "grad_norm": 0.8044822216033936, "learning_rate": 0.0002, "epoch": 0.4560260586319218, "step": 630}, {"loss": 1.7195, "grad_norm": 0.672531247138977, "learning_rate": 0.0002, "epoch": 0.46326456749909517, "step": 640}, {"loss": 1.614, "grad_norm": 0.6233910322189331, "learning_rate": 0.0002, "epoch": 0.47050307636626854, "step": 650}, {"loss": 1.6041, "grad_norm": 0.651524543762207, "learning_rate": 0.0002, "epoch": 0.4777415852334419, "step": 660}, {"loss": 1.5842, "grad_norm": 0.7213939428329468, "learning_rate": 0.0002, "epoch": 0.48498009410061527, "step": 670}, {"loss": 1.5453, "grad_norm": 0.6541454792022705, "learning_rate": 0.0002, "epoch": 0.49221860296778863, "step": 680}, {"loss": 1.662, "grad_norm": 0.6568936109542847, "learning_rate": 0.0002, "epoch": 0.499457111834962, "step": 690}, {"loss": 1.624, "grad_norm": 0.7176415324211121, "learning_rate": 0.0002, "epoch": 0.5066956207021354, "step": 700}, {"loss": 1.6099, "grad_norm": 0.6553855538368225, "learning_rate": 0.0002, "epoch": 0.5139341295693087, "step": 710}, {"loss": 1.5508, "grad_norm": 0.5654335618019104, "learning_rate": 0.0002, "epoch": 0.5211726384364821, "step": 720}, {"loss": 1.392, "grad_norm": 0.5671001672744751, "learning_rate": 0.0002, "epoch": 0.5284111473036555, "step": 730}, {"loss": 1.388, "grad_norm": 0.7914412021636963, "learning_rate": 0.0002, "epoch": 0.5356496561708288, "step": 740}, {"loss": 1.5931, "grad_norm": 0.6172138452529907, "learning_rate": 0.0002, "epoch": 0.5428881650380022, "step": 750}, {"loss": 1.4018, "grad_norm": 0.6132623553276062, "learning_rate": 0.0002, "epoch": 0.5501266739051756, "step": 760}, {"loss": 1.513, "grad_norm": 0.654000461101532, "learning_rate": 0.0002, "epoch": 0.5573651827723489, "step": 770}, {"loss": 1.5035, "grad_norm": 0.5691370964050293, "learning_rate": 0.0002, "epoch": 0.5646036916395223, "step": 780}, {"loss": 1.65, "grad_norm": 0.7922580242156982, "learning_rate": 0.0002, "epoch": 0.5718422005066957, "step": 790}, {"loss": 1.4521, "grad_norm": 0.6831880211830139, "learning_rate": 0.0002, "epoch": 0.579080709373869, "step": 800}, {"loss": 1.4734, "grad_norm": 0.6740124821662903, "learning_rate": 0.0002, "epoch": 0.5863192182410424, "step": 810}, {"loss": 1.6498, "grad_norm": 1.380016803741455, "learning_rate": 0.0002, "epoch": 0.5935577271082157, "step": 820}, {"loss": 1.4642, "grad_norm": 0.6552878022193909, "learning_rate": 0.0002, "epoch": 0.6007962359753891, "step": 830}, {"loss": 1.6271, "grad_norm": 0.6649535298347473, "learning_rate": 0.0002, "epoch": 0.6080347448425625, "step": 840}, {"loss": 1.5886, "grad_norm": 0.561738133430481, "learning_rate": 0.0002, "epoch": 0.6152732537097358, "step": 850}, {"loss": 1.5364, "grad_norm": 0.6133047938346863, "learning_rate": 0.0002, "epoch": 0.6225117625769092, "step": 860}, {"loss": 1.3489, "grad_norm": 0.559843122959137, "learning_rate": 0.0002, "epoch": 0.6297502714440825, "step": 870}, {"loss": 1.4878, "grad_norm": 0.6117811799049377, "learning_rate": 0.0002, "epoch": 0.6369887803112558, "step": 880}, {"loss": 1.56, "grad_norm": 0.6209776401519775, "learning_rate": 0.0002, "epoch": 0.6442272891784292, "step": 890}, {"loss": 1.6747, "grad_norm": 0.6234082579612732, "learning_rate": 0.0002, "epoch": 0.6514657980456026, "step": 900}, {"loss": 1.6963, "grad_norm": 0.7623258233070374, "learning_rate": 0.0002, "epoch": 0.6587043069127759, "step": 910}, {"loss": 1.2424, "grad_norm": 0.6148061752319336, "learning_rate": 0.0002, "epoch": 0.6659428157799493, "step": 920}, {"loss": 1.4319, "grad_norm": 0.6682973504066467, "learning_rate": 0.0002, "epoch": 0.6731813246471227, "step": 930}, {"loss": 1.5377, "grad_norm": 0.5513041615486145, "learning_rate": 0.0002, "epoch": 0.680419833514296, "step": 940}, {"loss": 1.3991, "grad_norm": 0.5197525024414062, "learning_rate": 0.0002, "epoch": 0.6876583423814694, "step": 950}, {"loss": 1.4398, "grad_norm": 0.6490758061408997, "learning_rate": 0.0002, "epoch": 0.6948968512486428, "step": 960}, {"loss": 1.5251, "grad_norm": 0.6450682878494263, "learning_rate": 0.0002, "epoch": 0.7021353601158161, "step": 970}, {"loss": 1.5417, "grad_norm": 0.6203766465187073, "learning_rate": 0.0002, "epoch": 0.7093738689829895, "step": 980}, {"loss": 1.4575, "grad_norm": 0.6023609638214111, "learning_rate": 0.0002, "epoch": 0.7166123778501629, "step": 990}, {"loss": 1.4973, "grad_norm": 0.5765255093574524, "learning_rate": 0.0002, "epoch": 0.7238508867173362, "step": 1000}, {"loss": 1.483, "grad_norm": 0.6650075316429138, "learning_rate": 0.0002, "epoch": 0.7310893955845096, "step": 1010}, {"loss": 1.5959, "grad_norm": 0.5610854029655457, "learning_rate": 0.0002, "epoch": 0.738327904451683, "step": 1020}, {"loss": 1.5248, "grad_norm": 0.7072813510894775, "learning_rate": 0.0002, "epoch": 0.7455664133188563, "step": 1030}, {"loss": 1.5776, "grad_norm": 0.6815407872200012, "learning_rate": 0.0002, "epoch": 0.7528049221860297, "step": 1040}, {"loss": 1.4577, "grad_norm": 0.7932390570640564, "learning_rate": 0.0002, "epoch": 0.760043431053203, "step": 1050}, {"loss": 1.4515, "grad_norm": 0.5798183083534241, "learning_rate": 0.0002, "epoch": 0.7672819399203764, "step": 1060}, {"loss": 1.5053, "grad_norm": 0.7898504137992859, "learning_rate": 0.0002, "epoch": 0.7745204487875498, "step": 1070}, {"loss": 1.4776, "grad_norm": 0.4983280301094055, "learning_rate": 0.0002, "epoch": 0.7817589576547231, "step": 1080}, {"loss": 1.5007, "grad_norm": 0.691403329372406, "learning_rate": 0.0002, "epoch": 0.7889974665218965, "step": 1090}, {"loss": 1.5153, "grad_norm": 0.5394481420516968, "learning_rate": 0.0002, "epoch": 0.7962359753890699, "step": 1100}, {"loss": 1.6892, "grad_norm": 0.5136822462081909, "learning_rate": 0.0002, "epoch": 0.8034744842562432, "step": 1110}, {"loss": 1.4902, "grad_norm": 0.6828126907348633, "learning_rate": 0.0002, "epoch": 0.8107129931234166, "step": 1120}, {"loss": 1.4346, "grad_norm": 0.6799656748771667, "learning_rate": 0.0002, "epoch": 0.81795150199059, "step": 1130}, {"loss": 1.2678, "grad_norm": 0.5428406000137329, "learning_rate": 0.0002, "epoch": 0.8251900108577633, "step": 1140}, {"loss": 1.4072, "grad_norm": 0.4811290502548218, "learning_rate": 0.0002, "epoch": 0.8324285197249367, "step": 1150}, {"loss": 1.4512, "grad_norm": 0.5519434809684753, "learning_rate": 0.0002, "epoch": 0.8396670285921101, "step": 1160}, {"loss": 1.4072, "grad_norm": 0.9748060703277588, "learning_rate": 0.0002, "epoch": 0.8469055374592834, "step": 1170}, {"loss": 1.4309, "grad_norm": 0.712609589099884, "learning_rate": 0.0002, "epoch": 0.8541440463264568, "step": 1180}, {"loss": 1.434, "grad_norm": 0.6866157054901123, "learning_rate": 0.0002, "epoch": 0.8613825551936302, "step": 1190}, {"loss": 1.3704, "grad_norm": 0.5068854093551636, "learning_rate": 0.0002, "epoch": 0.8686210640608035, "step": 1200}, {"loss": 1.5601, "grad_norm": 0.6333245038986206, "learning_rate": 0.0002, "epoch": 0.8758595729279768, "step": 1210}, {"loss": 1.4636, "grad_norm": 0.6424421072006226, "learning_rate": 0.0002, "epoch": 0.8830980817951501, "step": 1220}, {"loss": 1.4186, "grad_norm": 0.4771921932697296, "learning_rate": 0.0002, "epoch": 0.8903365906623235, "step": 1230}, {"loss": 1.6323, "grad_norm": 0.5191764235496521, "learning_rate": 0.0002, "epoch": 0.8975750995294969, "step": 1240}, {"loss": 1.6105, "grad_norm": 0.756222128868103, "learning_rate": 0.0002, "epoch": 0.9048136083966702, "step": 1250}, {"loss": 1.4396, "grad_norm": 0.623823881149292, "learning_rate": 0.0002, "epoch": 0.9120521172638436, "step": 1260}, {"loss": 1.3097, "grad_norm": 0.8166571259498596, "learning_rate": 0.0002, "epoch": 0.919290626131017, "step": 1270}, {"loss": 1.4625, "grad_norm": 0.6059346795082092, "learning_rate": 0.0002, "epoch": 0.9265291349981903, "step": 1280}, {"loss": 1.3555, "grad_norm": 0.5842690467834473, "learning_rate": 0.0002, "epoch": 0.9337676438653637, "step": 1290}, {"loss": 1.5859, "grad_norm": 0.7649800777435303, "learning_rate": 0.0002, "epoch": 0.9410061527325371, "step": 1300}, {"loss": 1.5915, "grad_norm": 0.6420919895172119, "learning_rate": 0.0002, "epoch": 0.9482446615997104, "step": 1310}, {"loss": 1.453, "grad_norm": 0.7011452913284302, "learning_rate": 0.0002, "epoch": 0.9554831704668838, "step": 1320}, {"loss": 1.6766, "grad_norm": 0.5783746242523193, "learning_rate": 0.0002, "epoch": 0.9627216793340572, "step": 1330}, {"loss": 1.6308, "grad_norm": 0.5973192453384399, "learning_rate": 0.0002, "epoch": 0.9699601882012305, "step": 1340}, {"loss": 1.5901, "grad_norm": 0.6181833744049072, "learning_rate": 0.0002, "epoch": 0.9771986970684039, "step": 1350}, {"loss": 1.5258, "grad_norm": 0.5563396215438843, "learning_rate": 0.0002, "epoch": 0.9844372059355773, "step": 1360}, {"loss": 1.4508, "grad_norm": 0.45723360776901245, "learning_rate": 0.0002, "epoch": 0.9916757148027506, "step": 1370}, {"loss": 1.3291, "grad_norm": 0.5947498679161072, "learning_rate": 0.0002, "epoch": 0.998914223669924, "step": 1380}]} +{"epoch": 2.0, "step": 2763, "epoch_duration": 1324.4045159816742, "total_accumulated_duration": 2686.5983464717865, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15079.2998046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-1381", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7061, "grad_norm": 1.2523442506790161, "learning_rate": 0.0002, "epoch": 0.007238508867173362, "step": 10}, {"loss": 3.3493, "grad_norm": 1.8887330293655396, "learning_rate": 0.0002, "epoch": 0.014477017734346724, "step": 20}, {"loss": 2.7585, "grad_norm": 0.9668035507202148, "learning_rate": 0.0002, "epoch": 0.021715526601520086, "step": 30}, {"loss": 2.3699, "grad_norm": 2.9167306423187256, "learning_rate": 0.0002, "epoch": 0.028954035468693448, "step": 40}, {"loss": 2.2679, "grad_norm": 2.649867296218872, "learning_rate": 0.0002, "epoch": 0.036192544335866814, "step": 50}, {"loss": 2.2202, "grad_norm": 1.5120655298233032, "learning_rate": 0.0002, "epoch": 0.04343105320304017, "step": 60}, {"loss": 2.2026, "grad_norm": 0.7879868149757385, "learning_rate": 0.0002, "epoch": 0.05066956207021354, "step": 70}, {"loss": 1.9447, "grad_norm": 0.7616953253746033, "learning_rate": 0.0002, "epoch": 0.057908070937386896, "step": 80}, {"loss": 2.0112, "grad_norm": 1.8809149265289307, "learning_rate": 0.0002, "epoch": 0.06514657980456026, "step": 90}, {"loss": 1.8337, "grad_norm": 0.9294016361236572, "learning_rate": 0.0002, "epoch": 0.07238508867173363, "step": 100}, {"loss": 1.8419, "grad_norm": 0.7145281434059143, "learning_rate": 0.0002, "epoch": 0.07962359753890698, "step": 110}, {"loss": 2.0036, "grad_norm": 0.7564446330070496, "learning_rate": 0.0002, "epoch": 0.08686210640608034, "step": 120}, {"loss": 1.9306, "grad_norm": 1.1681925058364868, "learning_rate": 0.0002, "epoch": 0.09410061527325371, "step": 130}, {"loss": 1.7875, "grad_norm": 0.6708641648292542, "learning_rate": 0.0002, "epoch": 0.10133912414042708, "step": 140}, {"loss": 1.786, "grad_norm": 0.7625647783279419, "learning_rate": 0.0002, "epoch": 0.10857763300760044, "step": 150}, {"loss": 1.6687, "grad_norm": 0.8463464975357056, "learning_rate": 0.0002, "epoch": 0.11581614187477379, "step": 160}, {"loss": 1.6214, "grad_norm": 0.7502335906028748, "learning_rate": 0.0002, "epoch": 0.12305465074194716, "step": 170}, {"loss": 1.7433, "grad_norm": 0.6929958462715149, "learning_rate": 0.0002, "epoch": 0.13029315960912052, "step": 180}, {"loss": 1.6009, "grad_norm": 0.6798707842826843, "learning_rate": 0.0002, "epoch": 0.1375316684762939, "step": 190}, {"loss": 1.6208, "grad_norm": 0.7566508650779724, "learning_rate": 0.0002, "epoch": 0.14477017734346725, "step": 200}, {"loss": 1.5823, "grad_norm": 0.7196869850158691, "learning_rate": 0.0002, "epoch": 0.15200868621064062, "step": 210}, {"loss": 1.738, "grad_norm": 0.8401045799255371, "learning_rate": 0.0002, "epoch": 0.15924719507781396, "step": 220}, {"loss": 1.7574, "grad_norm": 0.8503773212432861, "learning_rate": 0.0002, "epoch": 0.16648570394498732, "step": 230}, {"loss": 1.7861, "grad_norm": 0.7183733582496643, "learning_rate": 0.0002, "epoch": 0.1737242128121607, "step": 240}, {"loss": 1.6693, "grad_norm": 0.7082605957984924, "learning_rate": 0.0002, "epoch": 0.18096272167933405, "step": 250}, {"loss": 1.619, "grad_norm": 0.9386326670646667, "learning_rate": 0.0002, "epoch": 0.18820123054650742, "step": 260}, {"loss": 1.6511, "grad_norm": 0.7332451939582825, "learning_rate": 0.0002, "epoch": 0.19543973941368079, "step": 270}, {"loss": 1.6353, "grad_norm": 0.7092869877815247, "learning_rate": 0.0002, "epoch": 0.20267824828085415, "step": 280}, {"loss": 1.5996, "grad_norm": 0.7256413698196411, "learning_rate": 0.0002, "epoch": 0.20991675714802752, "step": 290}, {"loss": 1.6754, "grad_norm": 0.6398681402206421, "learning_rate": 0.0002, "epoch": 0.21715526601520088, "step": 300}, {"loss": 1.397, "grad_norm": 0.6273287534713745, "learning_rate": 0.0002, "epoch": 0.22439377488237422, "step": 310}, {"loss": 1.5115, "grad_norm": 0.511648416519165, "learning_rate": 0.0002, "epoch": 0.23163228374954759, "step": 320}, {"loss": 1.5424, "grad_norm": 0.8677352070808411, "learning_rate": 0.0002, "epoch": 0.23887079261672095, "step": 330}, {"loss": 1.6779, "grad_norm": 0.6270743012428284, "learning_rate": 0.0002, "epoch": 0.24610930148389432, "step": 340}, {"loss": 1.626, "grad_norm": 0.7980281114578247, "learning_rate": 0.0002, "epoch": 0.2533478103510677, "step": 350}, {"loss": 1.5238, "grad_norm": 0.632486879825592, "learning_rate": 0.0002, "epoch": 0.26058631921824105, "step": 360}, {"loss": 1.5175, "grad_norm": 0.6527034640312195, "learning_rate": 0.0002, "epoch": 0.2678248280854144, "step": 370}, {"loss": 1.627, "grad_norm": 0.7672118544578552, "learning_rate": 0.0002, "epoch": 0.2750633369525878, "step": 380}, {"loss": 1.5605, "grad_norm": 0.6035117506980896, "learning_rate": 0.0002, "epoch": 0.28230184581976114, "step": 390}, {"loss": 1.4603, "grad_norm": 0.5955103039741516, "learning_rate": 0.0002, "epoch": 0.2895403546869345, "step": 400}, {"loss": 1.558, "grad_norm": 0.6015191674232483, "learning_rate": 0.0002, "epoch": 0.2967788635541079, "step": 410}, {"loss": 1.6091, "grad_norm": 0.6380982398986816, "learning_rate": 0.0002, "epoch": 0.30401737242128124, "step": 420}, {"loss": 1.5292, "grad_norm": 0.6707863211631775, "learning_rate": 0.0002, "epoch": 0.3112558812884546, "step": 430}, {"loss": 1.4426, "grad_norm": 0.7010176777839661, "learning_rate": 0.0002, "epoch": 0.3184943901556279, "step": 440}, {"loss": 1.5572, "grad_norm": 0.8263739943504333, "learning_rate": 0.0002, "epoch": 0.3257328990228013, "step": 450}, {"loss": 1.5188, "grad_norm": 0.7253276109695435, "learning_rate": 0.0002, "epoch": 0.33297140788997465, "step": 460}, {"loss": 1.584, "grad_norm": 0.5238934755325317, "learning_rate": 0.0002, "epoch": 0.340209916757148, "step": 470}, {"loss": 1.7035, "grad_norm": 0.7869495749473572, "learning_rate": 0.0002, "epoch": 0.3474484256243214, "step": 480}, {"loss": 1.5776, "grad_norm": 0.7485215663909912, "learning_rate": 0.0002, "epoch": 0.35468693449149474, "step": 490}, {"loss": 1.6274, "grad_norm": 0.5413193106651306, "learning_rate": 0.0002, "epoch": 0.3619254433586681, "step": 500}, {"loss": 1.7323, "grad_norm": 0.7615048885345459, "learning_rate": 0.0002, "epoch": 0.3691639522258415, "step": 510}, {"loss": 1.532, "grad_norm": 0.7685340046882629, "learning_rate": 0.0002, "epoch": 0.37640246109301484, "step": 520}, {"loss": 1.6312, "grad_norm": 0.6379081010818481, "learning_rate": 0.0002, "epoch": 0.3836409699601882, "step": 530}, {"loss": 1.5645, "grad_norm": 0.7946939468383789, "learning_rate": 0.0002, "epoch": 0.39087947882736157, "step": 540}, {"loss": 1.4001, "grad_norm": 0.6287278532981873, "learning_rate": 0.0002, "epoch": 0.39811798769453494, "step": 550}, {"loss": 1.5982, "grad_norm": 0.6811642646789551, "learning_rate": 0.0002, "epoch": 0.4053564965617083, "step": 560}, {"loss": 1.4953, "grad_norm": 0.671073317527771, "learning_rate": 0.0002, "epoch": 0.41259500542888167, "step": 570}, {"loss": 1.6753, "grad_norm": 0.6313900351524353, "learning_rate": 0.0002, "epoch": 0.41983351429605503, "step": 580}, {"loss": 1.546, "grad_norm": 0.5291772484779358, "learning_rate": 0.0002, "epoch": 0.4270720231632284, "step": 590}, {"loss": 1.5441, "grad_norm": 0.62503582239151, "learning_rate": 0.0002, "epoch": 0.43431053203040176, "step": 600}, {"loss": 1.6276, "grad_norm": 0.5777305364608765, "learning_rate": 0.0002, "epoch": 0.4415490408975751, "step": 610}, {"loss": 1.4758, "grad_norm": 0.7013497352600098, "learning_rate": 0.0002, "epoch": 0.44878754976474844, "step": 620}, {"loss": 1.4029, "grad_norm": 0.8044822216033936, "learning_rate": 0.0002, "epoch": 0.4560260586319218, "step": 630}, {"loss": 1.7195, "grad_norm": 0.672531247138977, "learning_rate": 0.0002, "epoch": 0.46326456749909517, "step": 640}, {"loss": 1.614, "grad_norm": 0.6233910322189331, "learning_rate": 0.0002, "epoch": 0.47050307636626854, "step": 650}, {"loss": 1.6041, "grad_norm": 0.651524543762207, "learning_rate": 0.0002, "epoch": 0.4777415852334419, "step": 660}, {"loss": 1.5842, "grad_norm": 0.7213939428329468, "learning_rate": 0.0002, "epoch": 0.48498009410061527, "step": 670}, {"loss": 1.5453, "grad_norm": 0.6541454792022705, "learning_rate": 0.0002, "epoch": 0.49221860296778863, "step": 680}, {"loss": 1.662, "grad_norm": 0.6568936109542847, "learning_rate": 0.0002, "epoch": 0.499457111834962, "step": 690}, {"loss": 1.624, "grad_norm": 0.7176415324211121, "learning_rate": 0.0002, "epoch": 0.5066956207021354, "step": 700}, {"loss": 1.6099, "grad_norm": 0.6553855538368225, "learning_rate": 0.0002, "epoch": 0.5139341295693087, "step": 710}, {"loss": 1.5508, "grad_norm": 0.5654335618019104, "learning_rate": 0.0002, "epoch": 0.5211726384364821, "step": 720}, {"loss": 1.392, "grad_norm": 0.5671001672744751, "learning_rate": 0.0002, "epoch": 0.5284111473036555, "step": 730}, {"loss": 1.388, "grad_norm": 0.7914412021636963, "learning_rate": 0.0002, "epoch": 0.5356496561708288, "step": 740}, {"loss": 1.5931, "grad_norm": 0.6172138452529907, "learning_rate": 0.0002, "epoch": 0.5428881650380022, "step": 750}, {"loss": 1.4018, "grad_norm": 0.6132623553276062, "learning_rate": 0.0002, "epoch": 0.5501266739051756, "step": 760}, {"loss": 1.513, "grad_norm": 0.654000461101532, "learning_rate": 0.0002, "epoch": 0.5573651827723489, "step": 770}, {"loss": 1.5035, "grad_norm": 0.5691370964050293, "learning_rate": 0.0002, "epoch": 0.5646036916395223, "step": 780}, {"loss": 1.65, "grad_norm": 0.7922580242156982, "learning_rate": 0.0002, "epoch": 0.5718422005066957, "step": 790}, {"loss": 1.4521, "grad_norm": 0.6831880211830139, "learning_rate": 0.0002, "epoch": 0.579080709373869, "step": 800}, {"loss": 1.4734, "grad_norm": 0.6740124821662903, "learning_rate": 0.0002, "epoch": 0.5863192182410424, "step": 810}, {"loss": 1.6498, "grad_norm": 1.380016803741455, "learning_rate": 0.0002, "epoch": 0.5935577271082157, "step": 820}, {"loss": 1.4642, "grad_norm": 0.6552878022193909, "learning_rate": 0.0002, "epoch": 0.6007962359753891, "step": 830}, {"loss": 1.6271, "grad_norm": 0.6649535298347473, "learning_rate": 0.0002, "epoch": 0.6080347448425625, "step": 840}, {"loss": 1.5886, "grad_norm": 0.561738133430481, "learning_rate": 0.0002, "epoch": 0.6152732537097358, "step": 850}, {"loss": 1.5364, "grad_norm": 0.6133047938346863, "learning_rate": 0.0002, "epoch": 0.6225117625769092, "step": 860}, {"loss": 1.3489, "grad_norm": 0.559843122959137, "learning_rate": 0.0002, "epoch": 0.6297502714440825, "step": 870}, {"loss": 1.4878, "grad_norm": 0.6117811799049377, "learning_rate": 0.0002, "epoch": 0.6369887803112558, "step": 880}, {"loss": 1.56, "grad_norm": 0.6209776401519775, "learning_rate": 0.0002, "epoch": 0.6442272891784292, "step": 890}, {"loss": 1.6747, "grad_norm": 0.6234082579612732, "learning_rate": 0.0002, "epoch": 0.6514657980456026, "step": 900}, {"loss": 1.6963, "grad_norm": 0.7623258233070374, "learning_rate": 0.0002, "epoch": 0.6587043069127759, "step": 910}, {"loss": 1.2424, "grad_norm": 0.6148061752319336, "learning_rate": 0.0002, "epoch": 0.6659428157799493, "step": 920}, {"loss": 1.4319, "grad_norm": 0.6682973504066467, "learning_rate": 0.0002, "epoch": 0.6731813246471227, "step": 930}, {"loss": 1.5377, "grad_norm": 0.5513041615486145, "learning_rate": 0.0002, "epoch": 0.680419833514296, "step": 940}, {"loss": 1.3991, "grad_norm": 0.5197525024414062, "learning_rate": 0.0002, "epoch": 0.6876583423814694, "step": 950}, {"loss": 1.4398, "grad_norm": 0.6490758061408997, "learning_rate": 0.0002, "epoch": 0.6948968512486428, "step": 960}, {"loss": 1.5251, "grad_norm": 0.6450682878494263, "learning_rate": 0.0002, "epoch": 0.7021353601158161, "step": 970}, {"loss": 1.5417, "grad_norm": 0.6203766465187073, "learning_rate": 0.0002, "epoch": 0.7093738689829895, "step": 980}, {"loss": 1.4575, "grad_norm": 0.6023609638214111, "learning_rate": 0.0002, "epoch": 0.7166123778501629, "step": 990}, {"loss": 1.4973, "grad_norm": 0.5765255093574524, "learning_rate": 0.0002, "epoch": 0.7238508867173362, "step": 1000}, {"loss": 1.483, "grad_norm": 0.6650075316429138, "learning_rate": 0.0002, "epoch": 0.7310893955845096, "step": 1010}, {"loss": 1.5959, "grad_norm": 0.5610854029655457, "learning_rate": 0.0002, "epoch": 0.738327904451683, "step": 1020}, {"loss": 1.5248, "grad_norm": 0.7072813510894775, "learning_rate": 0.0002, "epoch": 0.7455664133188563, "step": 1030}, {"loss": 1.5776, "grad_norm": 0.6815407872200012, "learning_rate": 0.0002, "epoch": 0.7528049221860297, "step": 1040}, {"loss": 1.4577, "grad_norm": 0.7932390570640564, "learning_rate": 0.0002, "epoch": 0.760043431053203, "step": 1050}, {"loss": 1.4515, "grad_norm": 0.5798183083534241, "learning_rate": 0.0002, "epoch": 0.7672819399203764, "step": 1060}, {"loss": 1.5053, "grad_norm": 0.7898504137992859, "learning_rate": 0.0002, "epoch": 0.7745204487875498, "step": 1070}, {"loss": 1.4776, "grad_norm": 0.4983280301094055, "learning_rate": 0.0002, "epoch": 0.7817589576547231, "step": 1080}, {"loss": 1.5007, "grad_norm": 0.691403329372406, "learning_rate": 0.0002, "epoch": 0.7889974665218965, "step": 1090}, {"loss": 1.5153, "grad_norm": 0.5394481420516968, "learning_rate": 0.0002, "epoch": 0.7962359753890699, "step": 1100}, {"loss": 1.6892, "grad_norm": 0.5136822462081909, "learning_rate": 0.0002, "epoch": 0.8034744842562432, "step": 1110}, {"loss": 1.4902, "grad_norm": 0.6828126907348633, "learning_rate": 0.0002, "epoch": 0.8107129931234166, "step": 1120}, {"loss": 1.4346, "grad_norm": 0.6799656748771667, "learning_rate": 0.0002, "epoch": 0.81795150199059, "step": 1130}, {"loss": 1.2678, "grad_norm": 0.5428406000137329, "learning_rate": 0.0002, "epoch": 0.8251900108577633, "step": 1140}, {"loss": 1.4072, "grad_norm": 0.4811290502548218, "learning_rate": 0.0002, "epoch": 0.8324285197249367, "step": 1150}, {"loss": 1.4512, "grad_norm": 0.5519434809684753, "learning_rate": 0.0002, "epoch": 0.8396670285921101, "step": 1160}, {"loss": 1.4072, "grad_norm": 0.9748060703277588, "learning_rate": 0.0002, "epoch": 0.8469055374592834, "step": 1170}, {"loss": 1.4309, "grad_norm": 0.712609589099884, "learning_rate": 0.0002, "epoch": 0.8541440463264568, "step": 1180}, {"loss": 1.434, "grad_norm": 0.6866157054901123, "learning_rate": 0.0002, "epoch": 0.8613825551936302, "step": 1190}, {"loss": 1.3704, "grad_norm": 0.5068854093551636, "learning_rate": 0.0002, "epoch": 0.8686210640608035, "step": 1200}, {"loss": 1.5601, "grad_norm": 0.6333245038986206, "learning_rate": 0.0002, "epoch": 0.8758595729279768, "step": 1210}, {"loss": 1.4636, "grad_norm": 0.6424421072006226, "learning_rate": 0.0002, "epoch": 0.8830980817951501, "step": 1220}, {"loss": 1.4186, "grad_norm": 0.4771921932697296, "learning_rate": 0.0002, "epoch": 0.8903365906623235, "step": 1230}, {"loss": 1.6323, "grad_norm": 0.5191764235496521, "learning_rate": 0.0002, "epoch": 0.8975750995294969, "step": 1240}, {"loss": 1.6105, "grad_norm": 0.756222128868103, "learning_rate": 0.0002, "epoch": 0.9048136083966702, "step": 1250}, {"loss": 1.4396, "grad_norm": 0.623823881149292, "learning_rate": 0.0002, "epoch": 0.9120521172638436, "step": 1260}, {"loss": 1.3097, "grad_norm": 0.8166571259498596, "learning_rate": 0.0002, "epoch": 0.919290626131017, "step": 1270}, {"loss": 1.4625, "grad_norm": 0.6059346795082092, "learning_rate": 0.0002, "epoch": 0.9265291349981903, "step": 1280}, {"loss": 1.3555, "grad_norm": 0.5842690467834473, "learning_rate": 0.0002, "epoch": 0.9337676438653637, "step": 1290}, {"loss": 1.5859, "grad_norm": 0.7649800777435303, "learning_rate": 0.0002, "epoch": 0.9410061527325371, "step": 1300}, {"loss": 1.5915, "grad_norm": 0.6420919895172119, "learning_rate": 0.0002, "epoch": 0.9482446615997104, "step": 1310}, {"loss": 1.453, "grad_norm": 0.7011452913284302, "learning_rate": 0.0002, "epoch": 0.9554831704668838, "step": 1320}, {"loss": 1.6766, "grad_norm": 0.5783746242523193, "learning_rate": 0.0002, "epoch": 0.9627216793340572, "step": 1330}, {"loss": 1.6308, "grad_norm": 0.5973192453384399, "learning_rate": 0.0002, "epoch": 0.9699601882012305, "step": 1340}, {"loss": 1.5901, "grad_norm": 0.6181833744049072, "learning_rate": 0.0002, "epoch": 0.9771986970684039, "step": 1350}, {"loss": 1.5258, "grad_norm": 0.5563396215438843, "learning_rate": 0.0002, "epoch": 0.9844372059355773, "step": 1360}, {"loss": 1.4508, "grad_norm": 0.45723360776901245, "learning_rate": 0.0002, "epoch": 0.9916757148027506, "step": 1370}, {"loss": 1.3291, "grad_norm": 0.5947498679161072, "learning_rate": 0.0002, "epoch": 0.998914223669924, "step": 1380}, {"eval_loss": 1.480796456336975, "eval_runtime": 27.3103, "eval_samples_per_second": 15.965, "eval_steps_per_second": 2.014, "epoch": 0.9996380745566413, "step": 1381}, {"loss": 1.3057, "grad_norm": 0.5599952936172485, "learning_rate": 0.0002, "epoch": 1.0061527325370974, "step": 1390}, {"loss": 1.4991, "grad_norm": 0.5932008028030396, "learning_rate": 0.0002, "epoch": 1.0133912414042707, "step": 1400}, {"loss": 1.4506, "grad_norm": 0.6194121837615967, "learning_rate": 0.0002, "epoch": 1.020629750271444, "step": 1410}, {"loss": 1.5966, "grad_norm": 0.6995621919631958, "learning_rate": 0.0002, "epoch": 1.0278682591386175, "step": 1420}, {"loss": 1.4153, "grad_norm": 0.7905810475349426, "learning_rate": 0.0002, "epoch": 1.0351067680057908, "step": 1430}, {"loss": 1.4414, "grad_norm": 0.7221615314483643, "learning_rate": 0.0002, "epoch": 1.0423452768729642, "step": 1440}, {"loss": 1.3859, "grad_norm": 0.6170642375946045, "learning_rate": 0.0002, "epoch": 1.0495837857401376, "step": 1450}, {"loss": 1.3806, "grad_norm": 0.5844094753265381, "learning_rate": 0.0002, "epoch": 1.056822294607311, "step": 1460}, {"loss": 1.4871, "grad_norm": 0.7731822729110718, "learning_rate": 0.0002, "epoch": 1.0640608034744843, "step": 1470}, {"loss": 1.4286, "grad_norm": 0.4554748237133026, "learning_rate": 0.0002, "epoch": 1.0712993123416577, "step": 1480}, {"loss": 1.3977, "grad_norm": 0.6923259496688843, "learning_rate": 0.0002, "epoch": 1.078537821208831, "step": 1490}, {"loss": 1.3936, "grad_norm": 0.6008219122886658, "learning_rate": 0.0002, "epoch": 1.0857763300760044, "step": 1500}, {"loss": 1.4821, "grad_norm": 0.6450045704841614, "learning_rate": 0.0002, "epoch": 1.0930148389431777, "step": 1510}, {"loss": 1.3295, "grad_norm": 0.7833753824234009, "learning_rate": 0.0002, "epoch": 1.1002533478103511, "step": 1520}, {"loss": 1.3424, "grad_norm": 0.5076758861541748, "learning_rate": 0.0002, "epoch": 1.1074918566775245, "step": 1530}, {"loss": 1.4043, "grad_norm": 0.5661332011222839, "learning_rate": 0.0002, "epoch": 1.1147303655446978, "step": 1540}, {"loss": 1.4963, "grad_norm": 0.6526919603347778, "learning_rate": 0.0002, "epoch": 1.1219688744118712, "step": 1550}, {"loss": 1.3671, "grad_norm": 0.5613082647323608, "learning_rate": 0.0002, "epoch": 1.1292073832790446, "step": 1560}, {"loss": 1.4458, "grad_norm": 0.6113885641098022, "learning_rate": 0.0002, "epoch": 1.136445892146218, "step": 1570}, {"loss": 1.3552, "grad_norm": 0.6732510328292847, "learning_rate": 0.0002, "epoch": 1.1436844010133913, "step": 1580}, {"loss": 1.3114, "grad_norm": 0.6146392226219177, "learning_rate": 0.0002, "epoch": 1.1509229098805647, "step": 1590}, {"loss": 1.411, "grad_norm": 0.6766974329948425, "learning_rate": 0.0002, "epoch": 1.158161418747738, "step": 1600}, {"loss": 1.2401, "grad_norm": 0.7621957659721375, "learning_rate": 0.0002, "epoch": 1.1653999276149114, "step": 1610}, {"loss": 1.3758, "grad_norm": 0.6959581971168518, "learning_rate": 0.0002, "epoch": 1.1726384364820848, "step": 1620}, {"loss": 1.382, "grad_norm": 0.6691278219223022, "learning_rate": 0.0002, "epoch": 1.1798769453492581, "step": 1630}, {"loss": 1.4147, "grad_norm": 0.4927774965763092, "learning_rate": 0.0002, "epoch": 1.1871154542164315, "step": 1640}, {"loss": 1.449, "grad_norm": 0.7724234461784363, "learning_rate": 0.0002, "epoch": 1.1943539630836049, "step": 1650}, {"loss": 1.4778, "grad_norm": 0.6817787885665894, "learning_rate": 0.0002, "epoch": 1.2015924719507782, "step": 1660}, {"loss": 1.3776, "grad_norm": 0.6500699520111084, "learning_rate": 0.0002, "epoch": 1.2088309808179516, "step": 1670}, {"loss": 1.3875, "grad_norm": 0.5703568458557129, "learning_rate": 0.0002, "epoch": 1.216069489685125, "step": 1680}, {"loss": 1.4735, "grad_norm": 0.6261579990386963, "learning_rate": 0.0002, "epoch": 1.2233079985522983, "step": 1690}, {"loss": 1.3898, "grad_norm": 0.651713490486145, "learning_rate": 0.0002, "epoch": 1.2305465074194717, "step": 1700}, {"loss": 1.4002, "grad_norm": 0.684399425983429, "learning_rate": 0.0002, "epoch": 1.237785016286645, "step": 1710}, {"loss": 1.5027, "grad_norm": 0.6996857523918152, "learning_rate": 0.0002, "epoch": 1.2450235251538184, "step": 1720}, {"loss": 1.3326, "grad_norm": 0.7102537751197815, "learning_rate": 0.0002, "epoch": 1.2522620340209918, "step": 1730}, {"loss": 1.3675, "grad_norm": 0.45809897780418396, "learning_rate": 0.0002, "epoch": 1.2595005428881652, "step": 1740}, {"loss": 1.4175, "grad_norm": 0.6377046704292297, "learning_rate": 0.0002, "epoch": 1.2667390517553385, "step": 1750}, {"loss": 1.3479, "grad_norm": 0.6965704560279846, "learning_rate": 0.0002, "epoch": 1.2739775606225119, "step": 1760}, {"loss": 1.5647, "grad_norm": 0.5688214302062988, "learning_rate": 0.0002, "epoch": 1.2812160694896852, "step": 1770}, {"loss": 1.3967, "grad_norm": 0.6384190320968628, "learning_rate": 0.0002, "epoch": 1.2884545783568586, "step": 1780}, {"loss": 1.3671, "grad_norm": 0.5629363656044006, "learning_rate": 0.0002, "epoch": 1.295693087224032, "step": 1790}, {"loss": 1.2292, "grad_norm": 0.6148255467414856, "learning_rate": 0.0002, "epoch": 1.3029315960912053, "step": 1800}, {"loss": 1.5806, "grad_norm": 0.655580997467041, "learning_rate": 0.0002, "epoch": 1.3101701049583787, "step": 1810}, {"loss": 1.2398, "grad_norm": 0.5642657279968262, "learning_rate": 0.0002, "epoch": 1.3174086138255519, "step": 1820}, {"loss": 1.3246, "grad_norm": 0.59607994556427, "learning_rate": 0.0002, "epoch": 1.3246471226927252, "step": 1830}, {"loss": 1.3274, "grad_norm": 0.5564199090003967, "learning_rate": 0.0002, "epoch": 1.3318856315598986, "step": 1840}, {"loss": 1.5834, "grad_norm": 0.6949955821037292, "learning_rate": 0.0002, "epoch": 1.339124140427072, "step": 1850}, {"loss": 1.4722, "grad_norm": 0.7036856412887573, "learning_rate": 0.0002, "epoch": 1.3463626492942453, "step": 1860}, {"loss": 1.333, "grad_norm": 0.722062885761261, "learning_rate": 0.0002, "epoch": 1.3536011581614187, "step": 1870}, {"loss": 1.4044, "grad_norm": 0.6098677515983582, "learning_rate": 0.0002, "epoch": 1.360839667028592, "step": 1880}, {"loss": 1.6217, "grad_norm": 0.5376402735710144, "learning_rate": 0.0002, "epoch": 1.3680781758957654, "step": 1890}, {"loss": 1.5071, "grad_norm": 0.6974610090255737, "learning_rate": 0.0002, "epoch": 1.3753166847629388, "step": 1900}, {"loss": 1.5854, "grad_norm": 0.6520763635635376, "learning_rate": 0.0002, "epoch": 1.3825551936301121, "step": 1910}, {"loss": 1.4271, "grad_norm": 0.6604374647140503, "learning_rate": 0.0002, "epoch": 1.3897937024972855, "step": 1920}, {"loss": 1.419, "grad_norm": 0.7364398241043091, "learning_rate": 0.0002, "epoch": 1.3970322113644589, "step": 1930}, {"loss": 1.4585, "grad_norm": 0.6849475502967834, "learning_rate": 0.0002, "epoch": 1.4042707202316322, "step": 1940}, {"loss": 1.5577, "grad_norm": 0.6562670469284058, "learning_rate": 0.0002, "epoch": 1.4115092290988056, "step": 1950}, {"loss": 1.4725, "grad_norm": 0.5695616006851196, "learning_rate": 0.0002, "epoch": 1.418747737965979, "step": 1960}, {"loss": 1.3088, "grad_norm": 0.5244464874267578, "learning_rate": 0.0002, "epoch": 1.4259862468331523, "step": 1970}, {"loss": 1.5069, "grad_norm": 0.6347293257713318, "learning_rate": 0.0002, "epoch": 1.4332247557003257, "step": 1980}, {"loss": 1.3502, "grad_norm": 0.5528361201286316, "learning_rate": 0.0002, "epoch": 1.440463264567499, "step": 1990}, {"loss": 1.3978, "grad_norm": 0.6987585425376892, "learning_rate": 0.0002, "epoch": 1.4477017734346724, "step": 2000}, {"loss": 1.4262, "grad_norm": 0.6568987369537354, "learning_rate": 0.0002, "epoch": 1.4549402823018458, "step": 2010}, {"loss": 1.4175, "grad_norm": 0.7665994763374329, "learning_rate": 0.0002, "epoch": 1.4621787911690192, "step": 2020}, {"loss": 1.244, "grad_norm": 0.5127707123756409, "learning_rate": 0.0002, "epoch": 1.4694173000361925, "step": 2030}, {"loss": 1.3699, "grad_norm": 0.5406824946403503, "learning_rate": 0.0002, "epoch": 1.476655808903366, "step": 2040}, {"loss": 1.3353, "grad_norm": 0.5990166664123535, "learning_rate": 0.0002, "epoch": 1.4838943177705393, "step": 2050}, {"loss": 1.2454, "grad_norm": 0.6186193823814392, "learning_rate": 0.0002, "epoch": 1.4911328266377126, "step": 2060}, {"loss": 1.428, "grad_norm": 0.6154307126998901, "learning_rate": 0.0002, "epoch": 1.498371335504886, "step": 2070}, {"loss": 1.4528, "grad_norm": 0.5606056451797485, "learning_rate": 0.0002, "epoch": 1.5056098443720594, "step": 2080}, {"loss": 1.2405, "grad_norm": 0.5006417036056519, "learning_rate": 0.0002, "epoch": 1.5128483532392327, "step": 2090}, {"loss": 1.4258, "grad_norm": 0.5968486070632935, "learning_rate": 0.0002, "epoch": 1.520086862106406, "step": 2100}, {"loss": 1.2752, "grad_norm": 0.5835496187210083, "learning_rate": 0.0002, "epoch": 1.5273253709735795, "step": 2110}, {"loss": 1.5443, "grad_norm": 0.6753535270690918, "learning_rate": 0.0002, "epoch": 1.5345638798407528, "step": 2120}, {"loss": 1.2139, "grad_norm": 0.7299720644950867, "learning_rate": 0.0002, "epoch": 1.5418023887079262, "step": 2130}, {"loss": 1.2364, "grad_norm": 0.5105988383293152, "learning_rate": 0.0002, "epoch": 1.5490408975750996, "step": 2140}, {"loss": 1.4528, "grad_norm": 0.5675431489944458, "learning_rate": 0.0002, "epoch": 1.556279406442273, "step": 2150}, {"loss": 1.4563, "grad_norm": 0.6246723532676697, "learning_rate": 0.0002, "epoch": 1.5635179153094463, "step": 2160}, {"loss": 1.5255, "grad_norm": 0.7291720509529114, "learning_rate": 0.0002, "epoch": 1.5707564241766196, "step": 2170}, {"loss": 1.5432, "grad_norm": 0.678114116191864, "learning_rate": 0.0002, "epoch": 1.577994933043793, "step": 2180}, {"loss": 1.5212, "grad_norm": 0.5136260986328125, "learning_rate": 0.0002, "epoch": 1.5852334419109664, "step": 2190}, {"loss": 1.3271, "grad_norm": 0.6359935998916626, "learning_rate": 0.0002, "epoch": 1.5924719507781397, "step": 2200}, {"loss": 1.4038, "grad_norm": 0.7650278806686401, "learning_rate": 0.0002, "epoch": 1.599710459645313, "step": 2210}, {"loss": 1.5478, "grad_norm": 0.7256110906600952, "learning_rate": 0.0002, "epoch": 1.6069489685124865, "step": 2220}, {"loss": 1.4387, "grad_norm": 0.688689649105072, "learning_rate": 0.0002, "epoch": 1.6141874773796598, "step": 2230}, {"loss": 1.4096, "grad_norm": 0.6045311093330383, "learning_rate": 0.0002, "epoch": 1.6214259862468332, "step": 2240}, {"loss": 1.4097, "grad_norm": 0.7064604163169861, "learning_rate": 0.0002, "epoch": 1.6286644951140063, "step": 2250}, {"loss": 1.3477, "grad_norm": 0.5309562087059021, "learning_rate": 0.0002, "epoch": 1.6359030039811797, "step": 2260}, {"loss": 1.4022, "grad_norm": 0.5687053203582764, "learning_rate": 0.0002, "epoch": 1.643141512848353, "step": 2270}, {"loss": 1.2977, "grad_norm": 0.535872757434845, "learning_rate": 0.0002, "epoch": 1.6503800217155264, "step": 2280}, {"loss": 1.3844, "grad_norm": 0.5502381920814514, "learning_rate": 0.0002, "epoch": 1.6576185305826998, "step": 2290}, {"loss": 1.3764, "grad_norm": 0.6158602237701416, "learning_rate": 0.0002, "epoch": 1.6648570394498732, "step": 2300}, {"loss": 1.3515, "grad_norm": 0.5804675817489624, "learning_rate": 0.0002, "epoch": 1.6720955483170465, "step": 2310}, {"loss": 1.2532, "grad_norm": 0.600742757320404, "learning_rate": 0.0002, "epoch": 1.67933405718422, "step": 2320}, {"loss": 1.477, "grad_norm": 0.7101941108703613, "learning_rate": 0.0002, "epoch": 1.6865725660513933, "step": 2330}, {"loss": 1.4849, "grad_norm": 0.7507809996604919, "learning_rate": 0.0002, "epoch": 1.6938110749185666, "step": 2340}, {"loss": 1.2703, "grad_norm": 0.768502414226532, "learning_rate": 0.0002, "epoch": 1.70104958378574, "step": 2350}, {"loss": 1.3332, "grad_norm": 0.4801851212978363, "learning_rate": 0.0002, "epoch": 1.7082880926529134, "step": 2360}, {"loss": 1.4158, "grad_norm": 0.5322122573852539, "learning_rate": 0.0002, "epoch": 1.7155266015200867, "step": 2370}, {"loss": 1.4136, "grad_norm": 0.587661862373352, "learning_rate": 0.0002, "epoch": 1.72276511038726, "step": 2380}, {"loss": 1.3771, "grad_norm": 0.6073525547981262, "learning_rate": 0.0002, "epoch": 1.7300036192544335, "step": 2390}, {"loss": 1.2754, "grad_norm": 0.6950460076332092, "learning_rate": 0.0002, "epoch": 1.7372421281216068, "step": 2400}, {"loss": 1.3858, "grad_norm": 0.5981102585792542, "learning_rate": 0.0002, "epoch": 1.7444806369887802, "step": 2410}, {"loss": 1.4075, "grad_norm": 0.544570803642273, "learning_rate": 0.0002, "epoch": 1.7517191458559536, "step": 2420}, {"loss": 1.3861, "grad_norm": 0.5304399728775024, "learning_rate": 0.0002, "epoch": 1.758957654723127, "step": 2430}, {"loss": 1.4244, "grad_norm": 0.7921594977378845, "learning_rate": 0.0002, "epoch": 1.7661961635903003, "step": 2440}, {"loss": 1.3053, "grad_norm": 0.6084808707237244, "learning_rate": 0.0002, "epoch": 1.7734346724574737, "step": 2450}, {"loss": 1.3781, "grad_norm": 0.8844701051712036, "learning_rate": 0.0002, "epoch": 1.780673181324647, "step": 2460}, {"loss": 1.3227, "grad_norm": 0.5729258060455322, "learning_rate": 0.0002, "epoch": 1.7879116901918204, "step": 2470}, {"loss": 1.3422, "grad_norm": 0.6303611993789673, "learning_rate": 0.0002, "epoch": 1.7951501990589938, "step": 2480}, {"loss": 1.3926, "grad_norm": 0.5627942085266113, "learning_rate": 0.0002, "epoch": 1.8023887079261671, "step": 2490}, {"loss": 1.3816, "grad_norm": 0.6724274158477783, "learning_rate": 0.0002, "epoch": 1.8096272167933405, "step": 2500}, {"loss": 1.2951, "grad_norm": 0.5030826330184937, "learning_rate": 0.0002, "epoch": 1.8168657256605139, "step": 2510}, {"loss": 1.2839, "grad_norm": 0.5504099130630493, "learning_rate": 0.0002, "epoch": 1.8241042345276872, "step": 2520}, {"loss": 1.4264, "grad_norm": 0.6338945627212524, "learning_rate": 0.0002, "epoch": 1.8313427433948606, "step": 2530}, {"loss": 1.563, "grad_norm": 0.5902037620544434, "learning_rate": 0.0002, "epoch": 1.838581252262034, "step": 2540}, {"loss": 1.2961, "grad_norm": 0.48814457654953003, "learning_rate": 0.0002, "epoch": 1.8458197611292073, "step": 2550}, {"loss": 1.466, "grad_norm": 0.6216312646865845, "learning_rate": 0.0002, "epoch": 1.8530582699963807, "step": 2560}, {"loss": 1.5123, "grad_norm": 0.635603666305542, "learning_rate": 0.0002, "epoch": 1.860296778863554, "step": 2570}, {"loss": 1.372, "grad_norm": 0.6938216090202332, "learning_rate": 0.0002, "epoch": 1.8675352877307274, "step": 2580}, {"loss": 1.5011, "grad_norm": 0.599557638168335, "learning_rate": 0.0002, "epoch": 1.8747737965979008, "step": 2590}, {"loss": 1.2714, "grad_norm": 0.564424455165863, "learning_rate": 0.0002, "epoch": 1.8820123054650741, "step": 2600}, {"loss": 1.3403, "grad_norm": 0.5430700182914734, "learning_rate": 0.0002, "epoch": 1.8892508143322475, "step": 2610}, {"loss": 1.4347, "grad_norm": 0.6150169372558594, "learning_rate": 0.0002, "epoch": 1.8964893231994209, "step": 2620}, {"loss": 1.2474, "grad_norm": 0.48159119486808777, "learning_rate": 0.0002, "epoch": 1.9037278320665942, "step": 2630}, {"loss": 1.3716, "grad_norm": 0.5608997941017151, "learning_rate": 0.0002, "epoch": 1.9109663409337676, "step": 2640}, {"loss": 1.5787, "grad_norm": 0.6454501748085022, "learning_rate": 0.0002, "epoch": 1.918204849800941, "step": 2650}, {"loss": 1.3238, "grad_norm": 0.5458073616027832, "learning_rate": 0.0002, "epoch": 1.9254433586681143, "step": 2660}, {"loss": 1.3208, "grad_norm": 0.5328490734100342, "learning_rate": 0.0002, "epoch": 1.9326818675352877, "step": 2670}, {"loss": 1.4971, "grad_norm": 0.6444696187973022, "learning_rate": 0.0002, "epoch": 1.939920376402461, "step": 2680}, {"loss": 1.5387, "grad_norm": 0.7126023769378662, "learning_rate": 0.0002, "epoch": 1.9471588852696344, "step": 2690}, {"loss": 1.3637, "grad_norm": 0.5164045095443726, "learning_rate": 0.0002, "epoch": 1.9543973941368078, "step": 2700}, {"loss": 1.5303, "grad_norm": 0.5347061157226562, "learning_rate": 0.0002, "epoch": 1.9616359030039812, "step": 2710}, {"loss": 1.2815, "grad_norm": 0.5297950506210327, "learning_rate": 0.0002, "epoch": 1.9688744118711545, "step": 2720}, {"loss": 1.3566, "grad_norm": 0.6537790298461914, "learning_rate": 0.0002, "epoch": 1.976112920738328, "step": 2730}, {"loss": 1.332, "grad_norm": 0.5536222457885742, "learning_rate": 0.0002, "epoch": 1.9833514296055013, "step": 2740}, {"loss": 1.3333, "grad_norm": 0.4856105446815491, "learning_rate": 0.0002, "epoch": 1.9905899384726746, "step": 2750}, {"loss": 1.3521, "grad_norm": 0.6642730832099915, "learning_rate": 0.0002, "epoch": 1.997828447339848, "step": 2760}]} +{"epoch": 2.9996380745566413, "step": 4144, "epoch_duration": 1325.2042412757874, "total_accumulated_duration": 4011.802587747574, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3048.73388671875}, "peak_memory_usage": {"GPU_0": 15079.2998046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-2763", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7061, "grad_norm": 1.2523442506790161, "learning_rate": 0.0002, "epoch": 0.007238508867173362, "step": 10}, {"loss": 3.3493, "grad_norm": 1.8887330293655396, "learning_rate": 0.0002, "epoch": 0.014477017734346724, "step": 20}, {"loss": 2.7585, "grad_norm": 0.9668035507202148, "learning_rate": 0.0002, "epoch": 0.021715526601520086, "step": 30}, {"loss": 2.3699, "grad_norm": 2.9167306423187256, "learning_rate": 0.0002, "epoch": 0.028954035468693448, "step": 40}, {"loss": 2.2679, "grad_norm": 2.649867296218872, "learning_rate": 0.0002, "epoch": 0.036192544335866814, "step": 50}, {"loss": 2.2202, "grad_norm": 1.5120655298233032, "learning_rate": 0.0002, "epoch": 0.04343105320304017, "step": 60}, {"loss": 2.2026, "grad_norm": 0.7879868149757385, "learning_rate": 0.0002, "epoch": 0.05066956207021354, "step": 70}, {"loss": 1.9447, "grad_norm": 0.7616953253746033, "learning_rate": 0.0002, "epoch": 0.057908070937386896, "step": 80}, {"loss": 2.0112, "grad_norm": 1.8809149265289307, "learning_rate": 0.0002, "epoch": 0.06514657980456026, "step": 90}, {"loss": 1.8337, "grad_norm": 0.9294016361236572, "learning_rate": 0.0002, "epoch": 0.07238508867173363, "step": 100}, {"loss": 1.8419, "grad_norm": 0.7145281434059143, "learning_rate": 0.0002, "epoch": 0.07962359753890698, "step": 110}, {"loss": 2.0036, "grad_norm": 0.7564446330070496, "learning_rate": 0.0002, "epoch": 0.08686210640608034, "step": 120}, {"loss": 1.9306, "grad_norm": 1.1681925058364868, "learning_rate": 0.0002, "epoch": 0.09410061527325371, "step": 130}, {"loss": 1.7875, "grad_norm": 0.6708641648292542, "learning_rate": 0.0002, "epoch": 0.10133912414042708, "step": 140}, {"loss": 1.786, "grad_norm": 0.7625647783279419, "learning_rate": 0.0002, "epoch": 0.10857763300760044, "step": 150}, {"loss": 1.6687, "grad_norm": 0.8463464975357056, "learning_rate": 0.0002, "epoch": 0.11581614187477379, "step": 160}, {"loss": 1.6214, "grad_norm": 0.7502335906028748, "learning_rate": 0.0002, "epoch": 0.12305465074194716, "step": 170}, {"loss": 1.7433, "grad_norm": 0.6929958462715149, "learning_rate": 0.0002, "epoch": 0.13029315960912052, "step": 180}, {"loss": 1.6009, "grad_norm": 0.6798707842826843, "learning_rate": 0.0002, "epoch": 0.1375316684762939, "step": 190}, {"loss": 1.6208, "grad_norm": 0.7566508650779724, "learning_rate": 0.0002, "epoch": 0.14477017734346725, "step": 200}, {"loss": 1.5823, "grad_norm": 0.7196869850158691, "learning_rate": 0.0002, "epoch": 0.15200868621064062, "step": 210}, {"loss": 1.738, "grad_norm": 0.8401045799255371, "learning_rate": 0.0002, "epoch": 0.15924719507781396, "step": 220}, {"loss": 1.7574, "grad_norm": 0.8503773212432861, "learning_rate": 0.0002, "epoch": 0.16648570394498732, "step": 230}, {"loss": 1.7861, "grad_norm": 0.7183733582496643, "learning_rate": 0.0002, "epoch": 0.1737242128121607, "step": 240}, {"loss": 1.6693, "grad_norm": 0.7082605957984924, "learning_rate": 0.0002, "epoch": 0.18096272167933405, "step": 250}, {"loss": 1.619, "grad_norm": 0.9386326670646667, "learning_rate": 0.0002, "epoch": 0.18820123054650742, "step": 260}, {"loss": 1.6511, "grad_norm": 0.7332451939582825, "learning_rate": 0.0002, "epoch": 0.19543973941368079, "step": 270}, {"loss": 1.6353, "grad_norm": 0.7092869877815247, "learning_rate": 0.0002, "epoch": 0.20267824828085415, "step": 280}, {"loss": 1.5996, "grad_norm": 0.7256413698196411, "learning_rate": 0.0002, "epoch": 0.20991675714802752, "step": 290}, {"loss": 1.6754, "grad_norm": 0.6398681402206421, "learning_rate": 0.0002, "epoch": 0.21715526601520088, "step": 300}, {"loss": 1.397, "grad_norm": 0.6273287534713745, "learning_rate": 0.0002, "epoch": 0.22439377488237422, "step": 310}, {"loss": 1.5115, "grad_norm": 0.511648416519165, "learning_rate": 0.0002, "epoch": 0.23163228374954759, "step": 320}, {"loss": 1.5424, "grad_norm": 0.8677352070808411, "learning_rate": 0.0002, "epoch": 0.23887079261672095, "step": 330}, {"loss": 1.6779, "grad_norm": 0.6270743012428284, "learning_rate": 0.0002, "epoch": 0.24610930148389432, "step": 340}, {"loss": 1.626, "grad_norm": 0.7980281114578247, "learning_rate": 0.0002, "epoch": 0.2533478103510677, "step": 350}, {"loss": 1.5238, "grad_norm": 0.632486879825592, "learning_rate": 0.0002, "epoch": 0.26058631921824105, "step": 360}, {"loss": 1.5175, "grad_norm": 0.6527034640312195, "learning_rate": 0.0002, "epoch": 0.2678248280854144, "step": 370}, {"loss": 1.627, "grad_norm": 0.7672118544578552, "learning_rate": 0.0002, "epoch": 0.2750633369525878, "step": 380}, {"loss": 1.5605, "grad_norm": 0.6035117506980896, "learning_rate": 0.0002, "epoch": 0.28230184581976114, "step": 390}, {"loss": 1.4603, "grad_norm": 0.5955103039741516, "learning_rate": 0.0002, "epoch": 0.2895403546869345, "step": 400}, {"loss": 1.558, "grad_norm": 0.6015191674232483, "learning_rate": 0.0002, "epoch": 0.2967788635541079, "step": 410}, {"loss": 1.6091, "grad_norm": 0.6380982398986816, "learning_rate": 0.0002, "epoch": 0.30401737242128124, "step": 420}, {"loss": 1.5292, "grad_norm": 0.6707863211631775, "learning_rate": 0.0002, "epoch": 0.3112558812884546, "step": 430}, {"loss": 1.4426, "grad_norm": 0.7010176777839661, "learning_rate": 0.0002, "epoch": 0.3184943901556279, "step": 440}, {"loss": 1.5572, "grad_norm": 0.8263739943504333, "learning_rate": 0.0002, "epoch": 0.3257328990228013, "step": 450}, {"loss": 1.5188, "grad_norm": 0.7253276109695435, "learning_rate": 0.0002, "epoch": 0.33297140788997465, "step": 460}, {"loss": 1.584, "grad_norm": 0.5238934755325317, "learning_rate": 0.0002, "epoch": 0.340209916757148, "step": 470}, {"loss": 1.7035, "grad_norm": 0.7869495749473572, "learning_rate": 0.0002, "epoch": 0.3474484256243214, "step": 480}, {"loss": 1.5776, "grad_norm": 0.7485215663909912, "learning_rate": 0.0002, "epoch": 0.35468693449149474, "step": 490}, {"loss": 1.6274, "grad_norm": 0.5413193106651306, "learning_rate": 0.0002, "epoch": 0.3619254433586681, "step": 500}, {"loss": 1.7323, "grad_norm": 0.7615048885345459, "learning_rate": 0.0002, "epoch": 0.3691639522258415, "step": 510}, {"loss": 1.532, "grad_norm": 0.7685340046882629, "learning_rate": 0.0002, "epoch": 0.37640246109301484, "step": 520}, {"loss": 1.6312, "grad_norm": 0.6379081010818481, "learning_rate": 0.0002, "epoch": 0.3836409699601882, "step": 530}, {"loss": 1.5645, "grad_norm": 0.7946939468383789, "learning_rate": 0.0002, "epoch": 0.39087947882736157, "step": 540}, {"loss": 1.4001, "grad_norm": 0.6287278532981873, "learning_rate": 0.0002, "epoch": 0.39811798769453494, "step": 550}, {"loss": 1.5982, "grad_norm": 0.6811642646789551, "learning_rate": 0.0002, "epoch": 0.4053564965617083, "step": 560}, {"loss": 1.4953, "grad_norm": 0.671073317527771, "learning_rate": 0.0002, "epoch": 0.41259500542888167, "step": 570}, {"loss": 1.6753, "grad_norm": 0.6313900351524353, "learning_rate": 0.0002, "epoch": 0.41983351429605503, "step": 580}, {"loss": 1.546, "grad_norm": 0.5291772484779358, "learning_rate": 0.0002, "epoch": 0.4270720231632284, "step": 590}, {"loss": 1.5441, "grad_norm": 0.62503582239151, "learning_rate": 0.0002, "epoch": 0.43431053203040176, "step": 600}, {"loss": 1.6276, "grad_norm": 0.5777305364608765, "learning_rate": 0.0002, "epoch": 0.4415490408975751, "step": 610}, {"loss": 1.4758, "grad_norm": 0.7013497352600098, "learning_rate": 0.0002, "epoch": 0.44878754976474844, "step": 620}, {"loss": 1.4029, "grad_norm": 0.8044822216033936, "learning_rate": 0.0002, "epoch": 0.4560260586319218, "step": 630}, {"loss": 1.7195, "grad_norm": 0.672531247138977, "learning_rate": 0.0002, "epoch": 0.46326456749909517, "step": 640}, {"loss": 1.614, "grad_norm": 0.6233910322189331, "learning_rate": 0.0002, "epoch": 0.47050307636626854, "step": 650}, {"loss": 1.6041, "grad_norm": 0.651524543762207, "learning_rate": 0.0002, "epoch": 0.4777415852334419, "step": 660}, {"loss": 1.5842, "grad_norm": 0.7213939428329468, "learning_rate": 0.0002, "epoch": 0.48498009410061527, "step": 670}, {"loss": 1.5453, "grad_norm": 0.6541454792022705, "learning_rate": 0.0002, "epoch": 0.49221860296778863, "step": 680}, {"loss": 1.662, "grad_norm": 0.6568936109542847, "learning_rate": 0.0002, "epoch": 0.499457111834962, "step": 690}, {"loss": 1.624, "grad_norm": 0.7176415324211121, "learning_rate": 0.0002, "epoch": 0.5066956207021354, "step": 700}, {"loss": 1.6099, "grad_norm": 0.6553855538368225, "learning_rate": 0.0002, "epoch": 0.5139341295693087, "step": 710}, {"loss": 1.5508, "grad_norm": 0.5654335618019104, "learning_rate": 0.0002, "epoch": 0.5211726384364821, "step": 720}, {"loss": 1.392, "grad_norm": 0.5671001672744751, "learning_rate": 0.0002, "epoch": 0.5284111473036555, "step": 730}, {"loss": 1.388, "grad_norm": 0.7914412021636963, "learning_rate": 0.0002, "epoch": 0.5356496561708288, "step": 740}, {"loss": 1.5931, "grad_norm": 0.6172138452529907, "learning_rate": 0.0002, "epoch": 0.5428881650380022, "step": 750}, {"loss": 1.4018, "grad_norm": 0.6132623553276062, "learning_rate": 0.0002, "epoch": 0.5501266739051756, "step": 760}, {"loss": 1.513, "grad_norm": 0.654000461101532, "learning_rate": 0.0002, "epoch": 0.5573651827723489, "step": 770}, {"loss": 1.5035, "grad_norm": 0.5691370964050293, "learning_rate": 0.0002, "epoch": 0.5646036916395223, "step": 780}, {"loss": 1.65, "grad_norm": 0.7922580242156982, "learning_rate": 0.0002, "epoch": 0.5718422005066957, "step": 790}, {"loss": 1.4521, "grad_norm": 0.6831880211830139, "learning_rate": 0.0002, "epoch": 0.579080709373869, "step": 800}, {"loss": 1.4734, "grad_norm": 0.6740124821662903, "learning_rate": 0.0002, "epoch": 0.5863192182410424, "step": 810}, {"loss": 1.6498, "grad_norm": 1.380016803741455, "learning_rate": 0.0002, "epoch": 0.5935577271082157, "step": 820}, {"loss": 1.4642, "grad_norm": 0.6552878022193909, "learning_rate": 0.0002, "epoch": 0.6007962359753891, "step": 830}, {"loss": 1.6271, "grad_norm": 0.6649535298347473, "learning_rate": 0.0002, "epoch": 0.6080347448425625, "step": 840}, {"loss": 1.5886, "grad_norm": 0.561738133430481, "learning_rate": 0.0002, "epoch": 0.6152732537097358, "step": 850}, {"loss": 1.5364, "grad_norm": 0.6133047938346863, "learning_rate": 0.0002, "epoch": 0.6225117625769092, "step": 860}, {"loss": 1.3489, "grad_norm": 0.559843122959137, "learning_rate": 0.0002, "epoch": 0.6297502714440825, "step": 870}, {"loss": 1.4878, "grad_norm": 0.6117811799049377, "learning_rate": 0.0002, "epoch": 0.6369887803112558, "step": 880}, {"loss": 1.56, "grad_norm": 0.6209776401519775, "learning_rate": 0.0002, "epoch": 0.6442272891784292, "step": 890}, {"loss": 1.6747, "grad_norm": 0.6234082579612732, "learning_rate": 0.0002, "epoch": 0.6514657980456026, "step": 900}, {"loss": 1.6963, "grad_norm": 0.7623258233070374, "learning_rate": 0.0002, "epoch": 0.6587043069127759, "step": 910}, {"loss": 1.2424, "grad_norm": 0.6148061752319336, "learning_rate": 0.0002, "epoch": 0.6659428157799493, "step": 920}, {"loss": 1.4319, "grad_norm": 0.6682973504066467, "learning_rate": 0.0002, "epoch": 0.6731813246471227, "step": 930}, {"loss": 1.5377, "grad_norm": 0.5513041615486145, "learning_rate": 0.0002, "epoch": 0.680419833514296, "step": 940}, {"loss": 1.3991, "grad_norm": 0.5197525024414062, "learning_rate": 0.0002, "epoch": 0.6876583423814694, "step": 950}, {"loss": 1.4398, "grad_norm": 0.6490758061408997, "learning_rate": 0.0002, "epoch": 0.6948968512486428, "step": 960}, {"loss": 1.5251, "grad_norm": 0.6450682878494263, "learning_rate": 0.0002, "epoch": 0.7021353601158161, "step": 970}, {"loss": 1.5417, "grad_norm": 0.6203766465187073, "learning_rate": 0.0002, "epoch": 0.7093738689829895, "step": 980}, {"loss": 1.4575, "grad_norm": 0.6023609638214111, "learning_rate": 0.0002, "epoch": 0.7166123778501629, "step": 990}, {"loss": 1.4973, "grad_norm": 0.5765255093574524, "learning_rate": 0.0002, "epoch": 0.7238508867173362, "step": 1000}, {"loss": 1.483, "grad_norm": 0.6650075316429138, "learning_rate": 0.0002, "epoch": 0.7310893955845096, "step": 1010}, {"loss": 1.5959, "grad_norm": 0.5610854029655457, "learning_rate": 0.0002, "epoch": 0.738327904451683, "step": 1020}, {"loss": 1.5248, "grad_norm": 0.7072813510894775, "learning_rate": 0.0002, "epoch": 0.7455664133188563, "step": 1030}, {"loss": 1.5776, "grad_norm": 0.6815407872200012, "learning_rate": 0.0002, "epoch": 0.7528049221860297, "step": 1040}, {"loss": 1.4577, "grad_norm": 0.7932390570640564, "learning_rate": 0.0002, "epoch": 0.760043431053203, "step": 1050}, {"loss": 1.4515, "grad_norm": 0.5798183083534241, "learning_rate": 0.0002, "epoch": 0.7672819399203764, "step": 1060}, {"loss": 1.5053, "grad_norm": 0.7898504137992859, "learning_rate": 0.0002, "epoch": 0.7745204487875498, "step": 1070}, {"loss": 1.4776, "grad_norm": 0.4983280301094055, "learning_rate": 0.0002, "epoch": 0.7817589576547231, "step": 1080}, {"loss": 1.5007, "grad_norm": 0.691403329372406, "learning_rate": 0.0002, "epoch": 0.7889974665218965, "step": 1090}, {"loss": 1.5153, "grad_norm": 0.5394481420516968, "learning_rate": 0.0002, "epoch": 0.7962359753890699, "step": 1100}, {"loss": 1.6892, "grad_norm": 0.5136822462081909, "learning_rate": 0.0002, "epoch": 0.8034744842562432, "step": 1110}, {"loss": 1.4902, "grad_norm": 0.6828126907348633, "learning_rate": 0.0002, "epoch": 0.8107129931234166, "step": 1120}, {"loss": 1.4346, "grad_norm": 0.6799656748771667, "learning_rate": 0.0002, "epoch": 0.81795150199059, "step": 1130}, {"loss": 1.2678, "grad_norm": 0.5428406000137329, "learning_rate": 0.0002, "epoch": 0.8251900108577633, "step": 1140}, {"loss": 1.4072, "grad_norm": 0.4811290502548218, "learning_rate": 0.0002, "epoch": 0.8324285197249367, "step": 1150}, {"loss": 1.4512, "grad_norm": 0.5519434809684753, "learning_rate": 0.0002, "epoch": 0.8396670285921101, "step": 1160}, {"loss": 1.4072, "grad_norm": 0.9748060703277588, "learning_rate": 0.0002, "epoch": 0.8469055374592834, "step": 1170}, {"loss": 1.4309, "grad_norm": 0.712609589099884, "learning_rate": 0.0002, "epoch": 0.8541440463264568, "step": 1180}, {"loss": 1.434, "grad_norm": 0.6866157054901123, "learning_rate": 0.0002, "epoch": 0.8613825551936302, "step": 1190}, {"loss": 1.3704, "grad_norm": 0.5068854093551636, "learning_rate": 0.0002, "epoch": 0.8686210640608035, "step": 1200}, {"loss": 1.5601, "grad_norm": 0.6333245038986206, "learning_rate": 0.0002, "epoch": 0.8758595729279768, "step": 1210}, {"loss": 1.4636, "grad_norm": 0.6424421072006226, "learning_rate": 0.0002, "epoch": 0.8830980817951501, "step": 1220}, {"loss": 1.4186, "grad_norm": 0.4771921932697296, "learning_rate": 0.0002, "epoch": 0.8903365906623235, "step": 1230}, {"loss": 1.6323, "grad_norm": 0.5191764235496521, "learning_rate": 0.0002, "epoch": 0.8975750995294969, "step": 1240}, {"loss": 1.6105, "grad_norm": 0.756222128868103, "learning_rate": 0.0002, "epoch": 0.9048136083966702, "step": 1250}, {"loss": 1.4396, "grad_norm": 0.623823881149292, "learning_rate": 0.0002, "epoch": 0.9120521172638436, "step": 1260}, {"loss": 1.3097, "grad_norm": 0.8166571259498596, "learning_rate": 0.0002, "epoch": 0.919290626131017, "step": 1270}, {"loss": 1.4625, "grad_norm": 0.6059346795082092, "learning_rate": 0.0002, "epoch": 0.9265291349981903, "step": 1280}, {"loss": 1.3555, "grad_norm": 0.5842690467834473, "learning_rate": 0.0002, "epoch": 0.9337676438653637, "step": 1290}, {"loss": 1.5859, "grad_norm": 0.7649800777435303, "learning_rate": 0.0002, "epoch": 0.9410061527325371, "step": 1300}, {"loss": 1.5915, "grad_norm": 0.6420919895172119, "learning_rate": 0.0002, "epoch": 0.9482446615997104, "step": 1310}, {"loss": 1.453, "grad_norm": 0.7011452913284302, "learning_rate": 0.0002, "epoch": 0.9554831704668838, "step": 1320}, {"loss": 1.6766, "grad_norm": 0.5783746242523193, "learning_rate": 0.0002, "epoch": 0.9627216793340572, "step": 1330}, {"loss": 1.6308, "grad_norm": 0.5973192453384399, "learning_rate": 0.0002, "epoch": 0.9699601882012305, "step": 1340}, {"loss": 1.5901, "grad_norm": 0.6181833744049072, "learning_rate": 0.0002, "epoch": 0.9771986970684039, "step": 1350}, {"loss": 1.5258, "grad_norm": 0.5563396215438843, "learning_rate": 0.0002, "epoch": 0.9844372059355773, "step": 1360}, {"loss": 1.4508, "grad_norm": 0.45723360776901245, "learning_rate": 0.0002, "epoch": 0.9916757148027506, "step": 1370}, {"loss": 1.3291, "grad_norm": 0.5947498679161072, "learning_rate": 0.0002, "epoch": 0.998914223669924, "step": 1380}, {"eval_loss": 1.480796456336975, "eval_runtime": 27.3103, "eval_samples_per_second": 15.965, "eval_steps_per_second": 2.014, "epoch": 0.9996380745566413, "step": 1381}, {"loss": 1.3057, "grad_norm": 0.5599952936172485, "learning_rate": 0.0002, "epoch": 1.0061527325370974, "step": 1390}, {"loss": 1.4991, "grad_norm": 0.5932008028030396, "learning_rate": 0.0002, "epoch": 1.0133912414042707, "step": 1400}, {"loss": 1.4506, "grad_norm": 0.6194121837615967, "learning_rate": 0.0002, "epoch": 1.020629750271444, "step": 1410}, {"loss": 1.5966, "grad_norm": 0.6995621919631958, "learning_rate": 0.0002, "epoch": 1.0278682591386175, "step": 1420}, {"loss": 1.4153, "grad_norm": 0.7905810475349426, "learning_rate": 0.0002, "epoch": 1.0351067680057908, "step": 1430}, {"loss": 1.4414, "grad_norm": 0.7221615314483643, "learning_rate": 0.0002, "epoch": 1.0423452768729642, "step": 1440}, {"loss": 1.3859, "grad_norm": 0.6170642375946045, "learning_rate": 0.0002, "epoch": 1.0495837857401376, "step": 1450}, {"loss": 1.3806, "grad_norm": 0.5844094753265381, "learning_rate": 0.0002, "epoch": 1.056822294607311, "step": 1460}, {"loss": 1.4871, "grad_norm": 0.7731822729110718, "learning_rate": 0.0002, "epoch": 1.0640608034744843, "step": 1470}, {"loss": 1.4286, "grad_norm": 0.4554748237133026, "learning_rate": 0.0002, "epoch": 1.0712993123416577, "step": 1480}, {"loss": 1.3977, "grad_norm": 0.6923259496688843, "learning_rate": 0.0002, "epoch": 1.078537821208831, "step": 1490}, {"loss": 1.3936, "grad_norm": 0.6008219122886658, "learning_rate": 0.0002, "epoch": 1.0857763300760044, "step": 1500}, {"loss": 1.4821, "grad_norm": 0.6450045704841614, "learning_rate": 0.0002, "epoch": 1.0930148389431777, "step": 1510}, {"loss": 1.3295, "grad_norm": 0.7833753824234009, "learning_rate": 0.0002, "epoch": 1.1002533478103511, "step": 1520}, {"loss": 1.3424, "grad_norm": 0.5076758861541748, "learning_rate": 0.0002, "epoch": 1.1074918566775245, "step": 1530}, {"loss": 1.4043, "grad_norm": 0.5661332011222839, "learning_rate": 0.0002, "epoch": 1.1147303655446978, "step": 1540}, {"loss": 1.4963, "grad_norm": 0.6526919603347778, "learning_rate": 0.0002, "epoch": 1.1219688744118712, "step": 1550}, {"loss": 1.3671, "grad_norm": 0.5613082647323608, "learning_rate": 0.0002, "epoch": 1.1292073832790446, "step": 1560}, {"loss": 1.4458, "grad_norm": 0.6113885641098022, "learning_rate": 0.0002, "epoch": 1.136445892146218, "step": 1570}, {"loss": 1.3552, "grad_norm": 0.6732510328292847, "learning_rate": 0.0002, "epoch": 1.1436844010133913, "step": 1580}, {"loss": 1.3114, "grad_norm": 0.6146392226219177, "learning_rate": 0.0002, "epoch": 1.1509229098805647, "step": 1590}, {"loss": 1.411, "grad_norm": 0.6766974329948425, "learning_rate": 0.0002, "epoch": 1.158161418747738, "step": 1600}, {"loss": 1.2401, "grad_norm": 0.7621957659721375, "learning_rate": 0.0002, "epoch": 1.1653999276149114, "step": 1610}, {"loss": 1.3758, "grad_norm": 0.6959581971168518, "learning_rate": 0.0002, "epoch": 1.1726384364820848, "step": 1620}, {"loss": 1.382, "grad_norm": 0.6691278219223022, "learning_rate": 0.0002, "epoch": 1.1798769453492581, "step": 1630}, {"loss": 1.4147, "grad_norm": 0.4927774965763092, "learning_rate": 0.0002, "epoch": 1.1871154542164315, "step": 1640}, {"loss": 1.449, "grad_norm": 0.7724234461784363, "learning_rate": 0.0002, "epoch": 1.1943539630836049, "step": 1650}, {"loss": 1.4778, "grad_norm": 0.6817787885665894, "learning_rate": 0.0002, "epoch": 1.2015924719507782, "step": 1660}, {"loss": 1.3776, "grad_norm": 0.6500699520111084, "learning_rate": 0.0002, "epoch": 1.2088309808179516, "step": 1670}, {"loss": 1.3875, "grad_norm": 0.5703568458557129, "learning_rate": 0.0002, "epoch": 1.216069489685125, "step": 1680}, {"loss": 1.4735, "grad_norm": 0.6261579990386963, "learning_rate": 0.0002, "epoch": 1.2233079985522983, "step": 1690}, {"loss": 1.3898, "grad_norm": 0.651713490486145, "learning_rate": 0.0002, "epoch": 1.2305465074194717, "step": 1700}, {"loss": 1.4002, "grad_norm": 0.684399425983429, "learning_rate": 0.0002, "epoch": 1.237785016286645, "step": 1710}, {"loss": 1.5027, "grad_norm": 0.6996857523918152, "learning_rate": 0.0002, "epoch": 1.2450235251538184, "step": 1720}, {"loss": 1.3326, "grad_norm": 0.7102537751197815, "learning_rate": 0.0002, "epoch": 1.2522620340209918, "step": 1730}, {"loss": 1.3675, "grad_norm": 0.45809897780418396, "learning_rate": 0.0002, "epoch": 1.2595005428881652, "step": 1740}, {"loss": 1.4175, "grad_norm": 0.6377046704292297, "learning_rate": 0.0002, "epoch": 1.2667390517553385, "step": 1750}, {"loss": 1.3479, "grad_norm": 0.6965704560279846, "learning_rate": 0.0002, "epoch": 1.2739775606225119, "step": 1760}, {"loss": 1.5647, "grad_norm": 0.5688214302062988, "learning_rate": 0.0002, "epoch": 1.2812160694896852, "step": 1770}, {"loss": 1.3967, "grad_norm": 0.6384190320968628, "learning_rate": 0.0002, "epoch": 1.2884545783568586, "step": 1780}, {"loss": 1.3671, "grad_norm": 0.5629363656044006, "learning_rate": 0.0002, "epoch": 1.295693087224032, "step": 1790}, {"loss": 1.2292, "grad_norm": 0.6148255467414856, "learning_rate": 0.0002, "epoch": 1.3029315960912053, "step": 1800}, {"loss": 1.5806, "grad_norm": 0.655580997467041, "learning_rate": 0.0002, "epoch": 1.3101701049583787, "step": 1810}, {"loss": 1.2398, "grad_norm": 0.5642657279968262, "learning_rate": 0.0002, "epoch": 1.3174086138255519, "step": 1820}, {"loss": 1.3246, "grad_norm": 0.59607994556427, "learning_rate": 0.0002, "epoch": 1.3246471226927252, "step": 1830}, {"loss": 1.3274, "grad_norm": 0.5564199090003967, "learning_rate": 0.0002, "epoch": 1.3318856315598986, "step": 1840}, {"loss": 1.5834, "grad_norm": 0.6949955821037292, "learning_rate": 0.0002, "epoch": 1.339124140427072, "step": 1850}, {"loss": 1.4722, "grad_norm": 0.7036856412887573, "learning_rate": 0.0002, "epoch": 1.3463626492942453, "step": 1860}, {"loss": 1.333, "grad_norm": 0.722062885761261, "learning_rate": 0.0002, "epoch": 1.3536011581614187, "step": 1870}, {"loss": 1.4044, "grad_norm": 0.6098677515983582, "learning_rate": 0.0002, "epoch": 1.360839667028592, "step": 1880}, {"loss": 1.6217, "grad_norm": 0.5376402735710144, "learning_rate": 0.0002, "epoch": 1.3680781758957654, "step": 1890}, {"loss": 1.5071, "grad_norm": 0.6974610090255737, "learning_rate": 0.0002, "epoch": 1.3753166847629388, "step": 1900}, {"loss": 1.5854, "grad_norm": 0.6520763635635376, "learning_rate": 0.0002, "epoch": 1.3825551936301121, "step": 1910}, {"loss": 1.4271, "grad_norm": 0.6604374647140503, "learning_rate": 0.0002, "epoch": 1.3897937024972855, "step": 1920}, {"loss": 1.419, "grad_norm": 0.7364398241043091, "learning_rate": 0.0002, "epoch": 1.3970322113644589, "step": 1930}, {"loss": 1.4585, "grad_norm": 0.6849475502967834, "learning_rate": 0.0002, "epoch": 1.4042707202316322, "step": 1940}, {"loss": 1.5577, "grad_norm": 0.6562670469284058, "learning_rate": 0.0002, "epoch": 1.4115092290988056, "step": 1950}, {"loss": 1.4725, "grad_norm": 0.5695616006851196, "learning_rate": 0.0002, "epoch": 1.418747737965979, "step": 1960}, {"loss": 1.3088, "grad_norm": 0.5244464874267578, "learning_rate": 0.0002, "epoch": 1.4259862468331523, "step": 1970}, {"loss": 1.5069, "grad_norm": 0.6347293257713318, "learning_rate": 0.0002, "epoch": 1.4332247557003257, "step": 1980}, {"loss": 1.3502, "grad_norm": 0.5528361201286316, "learning_rate": 0.0002, "epoch": 1.440463264567499, "step": 1990}, {"loss": 1.3978, "grad_norm": 0.6987585425376892, "learning_rate": 0.0002, "epoch": 1.4477017734346724, "step": 2000}, {"loss": 1.4262, "grad_norm": 0.6568987369537354, "learning_rate": 0.0002, "epoch": 1.4549402823018458, "step": 2010}, {"loss": 1.4175, "grad_norm": 0.7665994763374329, "learning_rate": 0.0002, "epoch": 1.4621787911690192, "step": 2020}, {"loss": 1.244, "grad_norm": 0.5127707123756409, "learning_rate": 0.0002, "epoch": 1.4694173000361925, "step": 2030}, {"loss": 1.3699, "grad_norm": 0.5406824946403503, "learning_rate": 0.0002, "epoch": 1.476655808903366, "step": 2040}, {"loss": 1.3353, "grad_norm": 0.5990166664123535, "learning_rate": 0.0002, "epoch": 1.4838943177705393, "step": 2050}, {"loss": 1.2454, "grad_norm": 0.6186193823814392, "learning_rate": 0.0002, "epoch": 1.4911328266377126, "step": 2060}, {"loss": 1.428, "grad_norm": 0.6154307126998901, "learning_rate": 0.0002, "epoch": 1.498371335504886, "step": 2070}, {"loss": 1.4528, "grad_norm": 0.5606056451797485, "learning_rate": 0.0002, "epoch": 1.5056098443720594, "step": 2080}, {"loss": 1.2405, "grad_norm": 0.5006417036056519, "learning_rate": 0.0002, "epoch": 1.5128483532392327, "step": 2090}, {"loss": 1.4258, "grad_norm": 0.5968486070632935, "learning_rate": 0.0002, "epoch": 1.520086862106406, "step": 2100}, {"loss": 1.2752, "grad_norm": 0.5835496187210083, "learning_rate": 0.0002, "epoch": 1.5273253709735795, "step": 2110}, {"loss": 1.5443, "grad_norm": 0.6753535270690918, "learning_rate": 0.0002, "epoch": 1.5345638798407528, "step": 2120}, {"loss": 1.2139, "grad_norm": 0.7299720644950867, "learning_rate": 0.0002, "epoch": 1.5418023887079262, "step": 2130}, {"loss": 1.2364, "grad_norm": 0.5105988383293152, "learning_rate": 0.0002, "epoch": 1.5490408975750996, "step": 2140}, {"loss": 1.4528, "grad_norm": 0.5675431489944458, "learning_rate": 0.0002, "epoch": 1.556279406442273, "step": 2150}, {"loss": 1.4563, "grad_norm": 0.6246723532676697, "learning_rate": 0.0002, "epoch": 1.5635179153094463, "step": 2160}, {"loss": 1.5255, "grad_norm": 0.7291720509529114, "learning_rate": 0.0002, "epoch": 1.5707564241766196, "step": 2170}, {"loss": 1.5432, "grad_norm": 0.678114116191864, "learning_rate": 0.0002, "epoch": 1.577994933043793, "step": 2180}, {"loss": 1.5212, "grad_norm": 0.5136260986328125, "learning_rate": 0.0002, "epoch": 1.5852334419109664, "step": 2190}, {"loss": 1.3271, "grad_norm": 0.6359935998916626, "learning_rate": 0.0002, "epoch": 1.5924719507781397, "step": 2200}, {"loss": 1.4038, "grad_norm": 0.7650278806686401, "learning_rate": 0.0002, "epoch": 1.599710459645313, "step": 2210}, {"loss": 1.5478, "grad_norm": 0.7256110906600952, "learning_rate": 0.0002, "epoch": 1.6069489685124865, "step": 2220}, {"loss": 1.4387, "grad_norm": 0.688689649105072, "learning_rate": 0.0002, "epoch": 1.6141874773796598, "step": 2230}, {"loss": 1.4096, "grad_norm": 0.6045311093330383, "learning_rate": 0.0002, "epoch": 1.6214259862468332, "step": 2240}, {"loss": 1.4097, "grad_norm": 0.7064604163169861, "learning_rate": 0.0002, "epoch": 1.6286644951140063, "step": 2250}, {"loss": 1.3477, "grad_norm": 0.5309562087059021, "learning_rate": 0.0002, "epoch": 1.6359030039811797, "step": 2260}, {"loss": 1.4022, "grad_norm": 0.5687053203582764, "learning_rate": 0.0002, "epoch": 1.643141512848353, "step": 2270}, {"loss": 1.2977, "grad_norm": 0.535872757434845, "learning_rate": 0.0002, "epoch": 1.6503800217155264, "step": 2280}, {"loss": 1.3844, "grad_norm": 0.5502381920814514, "learning_rate": 0.0002, "epoch": 1.6576185305826998, "step": 2290}, {"loss": 1.3764, "grad_norm": 0.6158602237701416, "learning_rate": 0.0002, "epoch": 1.6648570394498732, "step": 2300}, {"loss": 1.3515, "grad_norm": 0.5804675817489624, "learning_rate": 0.0002, "epoch": 1.6720955483170465, "step": 2310}, {"loss": 1.2532, "grad_norm": 0.600742757320404, "learning_rate": 0.0002, "epoch": 1.67933405718422, "step": 2320}, {"loss": 1.477, "grad_norm": 0.7101941108703613, "learning_rate": 0.0002, "epoch": 1.6865725660513933, "step": 2330}, {"loss": 1.4849, "grad_norm": 0.7507809996604919, "learning_rate": 0.0002, "epoch": 1.6938110749185666, "step": 2340}, {"loss": 1.2703, "grad_norm": 0.768502414226532, "learning_rate": 0.0002, "epoch": 1.70104958378574, "step": 2350}, {"loss": 1.3332, "grad_norm": 0.4801851212978363, "learning_rate": 0.0002, "epoch": 1.7082880926529134, "step": 2360}, {"loss": 1.4158, "grad_norm": 0.5322122573852539, "learning_rate": 0.0002, "epoch": 1.7155266015200867, "step": 2370}, {"loss": 1.4136, "grad_norm": 0.587661862373352, "learning_rate": 0.0002, "epoch": 1.72276511038726, "step": 2380}, {"loss": 1.3771, "grad_norm": 0.6073525547981262, "learning_rate": 0.0002, "epoch": 1.7300036192544335, "step": 2390}, {"loss": 1.2754, "grad_norm": 0.6950460076332092, "learning_rate": 0.0002, "epoch": 1.7372421281216068, "step": 2400}, {"loss": 1.3858, "grad_norm": 0.5981102585792542, "learning_rate": 0.0002, "epoch": 1.7444806369887802, "step": 2410}, {"loss": 1.4075, "grad_norm": 0.544570803642273, "learning_rate": 0.0002, "epoch": 1.7517191458559536, "step": 2420}, {"loss": 1.3861, "grad_norm": 0.5304399728775024, "learning_rate": 0.0002, "epoch": 1.758957654723127, "step": 2430}, {"loss": 1.4244, "grad_norm": 0.7921594977378845, "learning_rate": 0.0002, "epoch": 1.7661961635903003, "step": 2440}, {"loss": 1.3053, "grad_norm": 0.6084808707237244, "learning_rate": 0.0002, "epoch": 1.7734346724574737, "step": 2450}, {"loss": 1.3781, "grad_norm": 0.8844701051712036, "learning_rate": 0.0002, "epoch": 1.780673181324647, "step": 2460}, {"loss": 1.3227, "grad_norm": 0.5729258060455322, "learning_rate": 0.0002, "epoch": 1.7879116901918204, "step": 2470}, {"loss": 1.3422, "grad_norm": 0.6303611993789673, "learning_rate": 0.0002, "epoch": 1.7951501990589938, "step": 2480}, {"loss": 1.3926, "grad_norm": 0.5627942085266113, "learning_rate": 0.0002, "epoch": 1.8023887079261671, "step": 2490}, {"loss": 1.3816, "grad_norm": 0.6724274158477783, "learning_rate": 0.0002, "epoch": 1.8096272167933405, "step": 2500}, {"loss": 1.2951, "grad_norm": 0.5030826330184937, "learning_rate": 0.0002, "epoch": 1.8168657256605139, "step": 2510}, {"loss": 1.2839, "grad_norm": 0.5504099130630493, "learning_rate": 0.0002, "epoch": 1.8241042345276872, "step": 2520}, {"loss": 1.4264, "grad_norm": 0.6338945627212524, "learning_rate": 0.0002, "epoch": 1.8313427433948606, "step": 2530}, {"loss": 1.563, "grad_norm": 0.5902037620544434, "learning_rate": 0.0002, "epoch": 1.838581252262034, "step": 2540}, {"loss": 1.2961, "grad_norm": 0.48814457654953003, "learning_rate": 0.0002, "epoch": 1.8458197611292073, "step": 2550}, {"loss": 1.466, "grad_norm": 0.6216312646865845, "learning_rate": 0.0002, "epoch": 1.8530582699963807, "step": 2560}, {"loss": 1.5123, "grad_norm": 0.635603666305542, "learning_rate": 0.0002, "epoch": 1.860296778863554, "step": 2570}, {"loss": 1.372, "grad_norm": 0.6938216090202332, "learning_rate": 0.0002, "epoch": 1.8675352877307274, "step": 2580}, {"loss": 1.5011, "grad_norm": 0.599557638168335, "learning_rate": 0.0002, "epoch": 1.8747737965979008, "step": 2590}, {"loss": 1.2714, "grad_norm": 0.564424455165863, "learning_rate": 0.0002, "epoch": 1.8820123054650741, "step": 2600}, {"loss": 1.3403, "grad_norm": 0.5430700182914734, "learning_rate": 0.0002, "epoch": 1.8892508143322475, "step": 2610}, {"loss": 1.4347, "grad_norm": 0.6150169372558594, "learning_rate": 0.0002, "epoch": 1.8964893231994209, "step": 2620}, {"loss": 1.2474, "grad_norm": 0.48159119486808777, "learning_rate": 0.0002, "epoch": 1.9037278320665942, "step": 2630}, {"loss": 1.3716, "grad_norm": 0.5608997941017151, "learning_rate": 0.0002, "epoch": 1.9109663409337676, "step": 2640}, {"loss": 1.5787, "grad_norm": 0.6454501748085022, "learning_rate": 0.0002, "epoch": 1.918204849800941, "step": 2650}, {"loss": 1.3238, "grad_norm": 0.5458073616027832, "learning_rate": 0.0002, "epoch": 1.9254433586681143, "step": 2660}, {"loss": 1.3208, "grad_norm": 0.5328490734100342, "learning_rate": 0.0002, "epoch": 1.9326818675352877, "step": 2670}, {"loss": 1.4971, "grad_norm": 0.6444696187973022, "learning_rate": 0.0002, "epoch": 1.939920376402461, "step": 2680}, {"loss": 1.5387, "grad_norm": 0.7126023769378662, "learning_rate": 0.0002, "epoch": 1.9471588852696344, "step": 2690}, {"loss": 1.3637, "grad_norm": 0.5164045095443726, "learning_rate": 0.0002, "epoch": 1.9543973941368078, "step": 2700}, {"loss": 1.5303, "grad_norm": 0.5347061157226562, "learning_rate": 0.0002, "epoch": 1.9616359030039812, "step": 2710}, {"loss": 1.2815, "grad_norm": 0.5297950506210327, "learning_rate": 0.0002, "epoch": 1.9688744118711545, "step": 2720}, {"loss": 1.3566, "grad_norm": 0.6537790298461914, "learning_rate": 0.0002, "epoch": 1.976112920738328, "step": 2730}, {"loss": 1.332, "grad_norm": 0.5536222457885742, "learning_rate": 0.0002, "epoch": 1.9833514296055013, "step": 2740}, {"loss": 1.3333, "grad_norm": 0.4856105446815491, "learning_rate": 0.0002, "epoch": 1.9905899384726746, "step": 2750}, {"loss": 1.3521, "grad_norm": 0.6642730832099915, "learning_rate": 0.0002, "epoch": 1.997828447339848, "step": 2760}, {"eval_loss": 1.4366681575775146, "eval_runtime": 27.3729, "eval_samples_per_second": 15.928, "eval_steps_per_second": 2.009, "epoch": 2.0, "step": 2763}, {"loss": 1.4322, "grad_norm": 0.740253210067749, "learning_rate": 0.0002, "epoch": 2.0050669562070214, "step": 2770}, {"loss": 1.277, "grad_norm": 0.5826276540756226, "learning_rate": 0.0002, "epoch": 2.0123054650741947, "step": 2780}, {"loss": 1.2424, "grad_norm": 0.607356071472168, "learning_rate": 0.0002, "epoch": 2.019543973941368, "step": 2790}, {"loss": 1.2601, "grad_norm": 0.5918063521385193, "learning_rate": 0.0002, "epoch": 2.0267824828085415, "step": 2800}, {"loss": 1.3715, "grad_norm": 0.5610089898109436, "learning_rate": 0.0002, "epoch": 2.034020991675715, "step": 2810}, {"loss": 1.2092, "grad_norm": 0.5869926810264587, "learning_rate": 0.0002, "epoch": 2.041259500542888, "step": 2820}, {"loss": 1.1929, "grad_norm": 0.5753467679023743, "learning_rate": 0.0002, "epoch": 2.0484980094100615, "step": 2830}, {"loss": 1.333, "grad_norm": 0.7096508145332336, "learning_rate": 0.0002, "epoch": 2.055736518277235, "step": 2840}, {"loss": 1.1766, "grad_norm": 0.7653635144233704, "learning_rate": 0.0002, "epoch": 2.0629750271444083, "step": 2850}, {"loss": 1.2331, "grad_norm": 0.6202841997146606, "learning_rate": 0.0002, "epoch": 2.0702135360115816, "step": 2860}, {"loss": 1.3298, "grad_norm": 0.6810227632522583, "learning_rate": 0.0002, "epoch": 2.077452044878755, "step": 2870}, {"loss": 1.2505, "grad_norm": 0.7481493353843689, "learning_rate": 0.0002, "epoch": 2.0846905537459284, "step": 2880}, {"loss": 1.2484, "grad_norm": 0.7089637517929077, "learning_rate": 0.0002, "epoch": 2.0919290626131017, "step": 2890}, {"loss": 1.3095, "grad_norm": 0.7472923398017883, "learning_rate": 0.0002, "epoch": 2.099167571480275, "step": 2900}, {"loss": 1.304, "grad_norm": 0.8135465979576111, "learning_rate": 0.0002, "epoch": 2.1064060803474485, "step": 2910}, {"loss": 1.273, "grad_norm": 0.6097133159637451, "learning_rate": 0.0002, "epoch": 2.113644589214622, "step": 2920}, {"loss": 1.3384, "grad_norm": 0.5970117449760437, "learning_rate": 0.0002, "epoch": 2.120883098081795, "step": 2930}, {"loss": 1.3233, "grad_norm": 0.6169309616088867, "learning_rate": 0.0002, "epoch": 2.1281216069489686, "step": 2940}, {"loss": 1.4246, "grad_norm": 0.9428738355636597, "learning_rate": 0.0002, "epoch": 2.135360115816142, "step": 2950}, {"loss": 1.3527, "grad_norm": 0.5671679973602295, "learning_rate": 0.0002, "epoch": 2.1425986246833153, "step": 2960}, {"loss": 1.1375, "grad_norm": 0.7007262110710144, "learning_rate": 0.0002, "epoch": 2.1498371335504887, "step": 2970}, {"loss": 1.2015, "grad_norm": 0.6294044256210327, "learning_rate": 0.0002, "epoch": 2.157075642417662, "step": 2980}, {"loss": 1.2167, "grad_norm": 0.6105241775512695, "learning_rate": 0.0002, "epoch": 2.1643141512848354, "step": 2990}, {"loss": 1.2065, "grad_norm": 0.557124137878418, "learning_rate": 0.0002, "epoch": 2.1715526601520088, "step": 3000}, {"loss": 1.2515, "grad_norm": 0.6250392198562622, "learning_rate": 0.0002, "epoch": 2.178791169019182, "step": 3010}, {"loss": 1.385, "grad_norm": 0.645218551158905, "learning_rate": 0.0002, "epoch": 2.1860296778863555, "step": 3020}, {"loss": 1.3928, "grad_norm": 0.9033605456352234, "learning_rate": 0.0002, "epoch": 2.193268186753529, "step": 3030}, {"loss": 1.2458, "grad_norm": 0.5325747132301331, "learning_rate": 0.0002, "epoch": 2.2005066956207022, "step": 3040}, {"loss": 1.261, "grad_norm": 0.6334700584411621, "learning_rate": 0.0002, "epoch": 2.2077452044878756, "step": 3050}, {"loss": 1.2385, "grad_norm": 0.5206325054168701, "learning_rate": 0.0002, "epoch": 2.214983713355049, "step": 3060}, {"loss": 1.3103, "grad_norm": 0.5987200140953064, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3070}, {"loss": 1.1756, "grad_norm": 0.5893264412879944, "learning_rate": 0.0002, "epoch": 2.2294607310893957, "step": 3080}, {"loss": 1.235, "grad_norm": 0.6869237422943115, "learning_rate": 0.0002, "epoch": 2.236699239956569, "step": 3090}, {"loss": 1.3285, "grad_norm": 0.5040048360824585, "learning_rate": 0.0002, "epoch": 2.2439377488237424, "step": 3100}, {"loss": 1.3316, "grad_norm": 0.6660613417625427, "learning_rate": 0.0002, "epoch": 2.251176257690916, "step": 3110}, {"loss": 1.3108, "grad_norm": 0.5890918970108032, "learning_rate": 0.0002, "epoch": 2.258414766558089, "step": 3120}, {"loss": 1.248, "grad_norm": 0.6458896994590759, "learning_rate": 0.0002, "epoch": 2.2656532754252625, "step": 3130}, {"loss": 1.4151, "grad_norm": 0.6832690834999084, "learning_rate": 0.0002, "epoch": 2.272891784292436, "step": 3140}, {"loss": 1.4458, "grad_norm": 0.833908200263977, "learning_rate": 0.0002, "epoch": 2.2801302931596092, "step": 3150}, {"loss": 1.2931, "grad_norm": 0.4596034586429596, "learning_rate": 0.0002, "epoch": 2.2873688020267826, "step": 3160}, {"loss": 1.449, "grad_norm": 0.9130966067314148, "learning_rate": 0.0002, "epoch": 2.294607310893956, "step": 3170}, {"loss": 1.3806, "grad_norm": 0.7143292427062988, "learning_rate": 0.0002, "epoch": 2.3018458197611293, "step": 3180}, {"loss": 1.2692, "grad_norm": 0.5388900637626648, "learning_rate": 0.0002, "epoch": 2.3090843286283027, "step": 3190}, {"loss": 1.2402, "grad_norm": 0.5607513189315796, "learning_rate": 0.0002, "epoch": 2.316322837495476, "step": 3200}, {"loss": 1.3874, "grad_norm": 0.6795142292976379, "learning_rate": 0.0002, "epoch": 2.3235613463626494, "step": 3210}, {"loss": 1.3042, "grad_norm": 0.6561070680618286, "learning_rate": 0.0002, "epoch": 2.330799855229823, "step": 3220}, {"loss": 1.4636, "grad_norm": 0.8858118057250977, "learning_rate": 0.0002, "epoch": 2.338038364096996, "step": 3230}, {"loss": 1.3214, "grad_norm": 0.6604151725769043, "learning_rate": 0.0002, "epoch": 2.3452768729641695, "step": 3240}, {"loss": 1.4004, "grad_norm": 0.6755785346031189, "learning_rate": 0.0002, "epoch": 2.352515381831343, "step": 3250}, {"loss": 1.2503, "grad_norm": 0.6981677412986755, "learning_rate": 0.0002, "epoch": 2.3597538906985163, "step": 3260}, {"loss": 1.3078, "grad_norm": 0.6338568329811096, "learning_rate": 0.0002, "epoch": 2.3669923995656896, "step": 3270}, {"loss": 1.285, "grad_norm": 0.5754265785217285, "learning_rate": 0.0002, "epoch": 2.374230908432863, "step": 3280}, {"loss": 1.2924, "grad_norm": 0.7533153295516968, "learning_rate": 0.0002, "epoch": 2.3814694173000364, "step": 3290}, {"loss": 1.3711, "grad_norm": 0.675065279006958, "learning_rate": 0.0002, "epoch": 2.3887079261672097, "step": 3300}, {"loss": 1.3548, "grad_norm": 0.5686452984809875, "learning_rate": 0.0002, "epoch": 2.395946435034383, "step": 3310}, {"loss": 1.1998, "grad_norm": 0.8129481673240662, "learning_rate": 0.0002, "epoch": 2.4031849439015565, "step": 3320}, {"loss": 1.2584, "grad_norm": 0.6615934371948242, "learning_rate": 0.0002, "epoch": 2.41042345276873, "step": 3330}, {"loss": 1.3691, "grad_norm": 0.6678834557533264, "learning_rate": 0.0002, "epoch": 2.417661961635903, "step": 3340}, {"loss": 1.2381, "grad_norm": 0.5581308007240295, "learning_rate": 0.0002, "epoch": 2.4249004705030766, "step": 3350}, {"loss": 1.3853, "grad_norm": 0.6098920106887817, "learning_rate": 0.0002, "epoch": 2.43213897937025, "step": 3360}, {"loss": 1.3692, "grad_norm": 0.8101736903190613, "learning_rate": 0.0002, "epoch": 2.4393774882374233, "step": 3370}, {"loss": 1.4418, "grad_norm": 0.6621488928794861, "learning_rate": 0.0002, "epoch": 2.4466159971045967, "step": 3380}, {"loss": 1.4579, "grad_norm": 0.8693289160728455, "learning_rate": 0.0002, "epoch": 2.45385450597177, "step": 3390}, {"loss": 1.3644, "grad_norm": 0.6724580526351929, "learning_rate": 0.0002, "epoch": 2.4610930148389434, "step": 3400}, {"loss": 1.2006, "grad_norm": 0.6776891946792603, "learning_rate": 0.0002, "epoch": 2.4683315237061167, "step": 3410}, {"loss": 1.2937, "grad_norm": 0.7214453816413879, "learning_rate": 0.0002, "epoch": 2.47557003257329, "step": 3420}, {"loss": 1.4051, "grad_norm": 0.8390451073646545, "learning_rate": 0.0002, "epoch": 2.4828085414404635, "step": 3430}, {"loss": 1.25, "grad_norm": 0.7130982279777527, "learning_rate": 0.0002, "epoch": 2.490047050307637, "step": 3440}, {"loss": 1.2231, "grad_norm": 0.8873937129974365, "learning_rate": 0.0002, "epoch": 2.49728555917481, "step": 3450}, {"loss": 1.1429, "grad_norm": 0.725185751914978, "learning_rate": 0.0002, "epoch": 2.5045240680419836, "step": 3460}, {"loss": 1.2699, "grad_norm": 0.6120352149009705, "learning_rate": 0.0002, "epoch": 2.511762576909157, "step": 3470}, {"loss": 1.2552, "grad_norm": 0.7713613510131836, "learning_rate": 0.0002, "epoch": 2.5190010857763303, "step": 3480}, {"loss": 1.4648, "grad_norm": 0.895309567451477, "learning_rate": 0.0002, "epoch": 2.5262395946435037, "step": 3490}, {"loss": 1.3043, "grad_norm": 0.9631021022796631, "learning_rate": 0.0002, "epoch": 2.533478103510677, "step": 3500}, {"loss": 1.3492, "grad_norm": 0.7475683093070984, "learning_rate": 0.0002, "epoch": 2.5407166123778504, "step": 3510}, {"loss": 1.3637, "grad_norm": 0.7271341681480408, "learning_rate": 0.0002, "epoch": 2.5479551212450238, "step": 3520}, {"loss": 1.304, "grad_norm": 0.6979510188102722, "learning_rate": 0.0002, "epoch": 2.555193630112197, "step": 3530}, {"loss": 1.2353, "grad_norm": 0.6504196524620056, "learning_rate": 0.0002, "epoch": 2.5624321389793705, "step": 3540}, {"loss": 1.2699, "grad_norm": 0.7226675748825073, "learning_rate": 0.0002, "epoch": 2.569670647846544, "step": 3550}, {"loss": 1.3002, "grad_norm": 0.6143222451210022, "learning_rate": 0.0002, "epoch": 2.5769091567137172, "step": 3560}, {"loss": 1.1585, "grad_norm": 0.7245154976844788, "learning_rate": 0.0002, "epoch": 2.5841476655808906, "step": 3570}, {"loss": 1.3651, "grad_norm": 0.943540632724762, "learning_rate": 0.0002, "epoch": 2.591386174448064, "step": 3580}, {"loss": 1.3034, "grad_norm": 0.7707241773605347, "learning_rate": 0.0002, "epoch": 2.5986246833152373, "step": 3590}, {"loss": 1.3063, "grad_norm": 0.6705001592636108, "learning_rate": 0.0002, "epoch": 2.6058631921824107, "step": 3600}, {"loss": 1.2437, "grad_norm": 0.6360933780670166, "learning_rate": 0.0002, "epoch": 2.613101701049584, "step": 3610}, {"loss": 1.1844, "grad_norm": 0.5846424698829651, "learning_rate": 0.0002, "epoch": 2.6203402099167574, "step": 3620}, {"loss": 1.3674, "grad_norm": 0.5958625674247742, "learning_rate": 0.0002, "epoch": 2.6275787187839303, "step": 3630}, {"loss": 1.3599, "grad_norm": 0.6819243431091309, "learning_rate": 0.0002, "epoch": 2.6348172276511037, "step": 3640}, {"loss": 1.3884, "grad_norm": 0.7033445835113525, "learning_rate": 0.0002, "epoch": 2.642055736518277, "step": 3650}, {"loss": 1.3392, "grad_norm": 0.6134849786758423, "learning_rate": 0.0002, "epoch": 2.6492942453854504, "step": 3660}, {"loss": 1.2661, "grad_norm": 0.658009946346283, "learning_rate": 0.0002, "epoch": 2.656532754252624, "step": 3670}, {"loss": 1.3987, "grad_norm": 0.6280999779701233, "learning_rate": 0.0002, "epoch": 2.663771263119797, "step": 3680}, {"loss": 1.2995, "grad_norm": 0.5536085963249207, "learning_rate": 0.0002, "epoch": 2.6710097719869705, "step": 3690}, {"loss": 1.2044, "grad_norm": 0.8603981733322144, "learning_rate": 0.0002, "epoch": 2.678248280854144, "step": 3700}, {"loss": 1.3879, "grad_norm": 0.5509994626045227, "learning_rate": 0.0002, "epoch": 2.6854867897213173, "step": 3710}, {"loss": 1.3253, "grad_norm": 0.9093621969223022, "learning_rate": 0.0002, "epoch": 2.6927252985884906, "step": 3720}, {"loss": 1.2668, "grad_norm": 0.7525952458381653, "learning_rate": 0.0002, "epoch": 2.699963807455664, "step": 3730}, {"loss": 1.248, "grad_norm": 0.6737023591995239, "learning_rate": 0.0002, "epoch": 2.7072023163228374, "step": 3740}, {"loss": 1.2981, "grad_norm": 0.8656924962997437, "learning_rate": 0.0002, "epoch": 2.7144408251900107, "step": 3750}, {"loss": 1.2342, "grad_norm": 0.7494133114814758, "learning_rate": 0.0002, "epoch": 2.721679334057184, "step": 3760}, {"loss": 1.2417, "grad_norm": 0.5725520849227905, "learning_rate": 0.0002, "epoch": 2.7289178429243575, "step": 3770}, {"loss": 1.28, "grad_norm": 0.836412787437439, "learning_rate": 0.0002, "epoch": 2.736156351791531, "step": 3780}, {"loss": 1.3784, "grad_norm": 0.6893242597579956, "learning_rate": 0.0002, "epoch": 2.743394860658704, "step": 3790}, {"loss": 1.2929, "grad_norm": 0.6696223020553589, "learning_rate": 0.0002, "epoch": 2.7506333695258776, "step": 3800}, {"loss": 1.2449, "grad_norm": 0.6483015418052673, "learning_rate": 0.0002, "epoch": 2.757871878393051, "step": 3810}, {"loss": 1.3282, "grad_norm": 0.8084456920623779, "learning_rate": 0.0002, "epoch": 2.7651103872602243, "step": 3820}, {"loss": 1.3694, "grad_norm": 0.6601949334144592, "learning_rate": 0.0002, "epoch": 2.7723488961273977, "step": 3830}, {"loss": 1.3568, "grad_norm": 0.6905533671379089, "learning_rate": 0.0002, "epoch": 2.779587404994571, "step": 3840}, {"loss": 1.3854, "grad_norm": 0.619318425655365, "learning_rate": 0.0002, "epoch": 2.7868259138617444, "step": 3850}, {"loss": 1.2551, "grad_norm": 0.5994023084640503, "learning_rate": 0.0002, "epoch": 2.7940644227289178, "step": 3860}, {"loss": 1.2022, "grad_norm": 0.5627168416976929, "learning_rate": 0.0002, "epoch": 2.801302931596091, "step": 3870}, {"loss": 1.3921, "grad_norm": 0.6001605987548828, "learning_rate": 0.0002, "epoch": 2.8085414404632645, "step": 3880}, {"loss": 1.3026, "grad_norm": 0.6022412776947021, "learning_rate": 0.0002, "epoch": 2.815779949330438, "step": 3890}, {"loss": 1.2765, "grad_norm": 0.6832426190376282, "learning_rate": 0.0002, "epoch": 2.823018458197611, "step": 3900}, {"loss": 1.1363, "grad_norm": 0.5936811566352844, "learning_rate": 0.0002, "epoch": 2.8302569670647846, "step": 3910}, {"loss": 1.1707, "grad_norm": 0.6960572600364685, "learning_rate": 0.0002, "epoch": 2.837495475931958, "step": 3920}, {"loss": 1.4063, "grad_norm": 0.5913406610488892, "learning_rate": 0.0002, "epoch": 2.8447339847991313, "step": 3930}, {"loss": 1.3245, "grad_norm": 0.678154706954956, "learning_rate": 0.0002, "epoch": 2.8519724936663047, "step": 3940}, {"loss": 1.366, "grad_norm": 0.7898936867713928, "learning_rate": 0.0002, "epoch": 2.859211002533478, "step": 3950}, {"loss": 1.3948, "grad_norm": 0.9234195351600647, "learning_rate": 0.0002, "epoch": 2.8664495114006514, "step": 3960}, {"loss": 1.2773, "grad_norm": 0.5960825085639954, "learning_rate": 0.0002, "epoch": 2.8736880202678248, "step": 3970}, {"loss": 1.3127, "grad_norm": 0.677118182182312, "learning_rate": 0.0002, "epoch": 2.880926529134998, "step": 3980}, {"loss": 1.2652, "grad_norm": 0.6505142450332642, "learning_rate": 0.0002, "epoch": 2.8881650380021715, "step": 3990}, {"loss": 1.2078, "grad_norm": 0.550826907157898, "learning_rate": 0.0002, "epoch": 2.895403546869345, "step": 4000}, {"loss": 1.1811, "grad_norm": 0.6209215521812439, "learning_rate": 0.0002, "epoch": 2.9026420557365182, "step": 4010}, {"loss": 1.4001, "grad_norm": 0.6549018025398254, "learning_rate": 0.0002, "epoch": 2.9098805646036916, "step": 4020}, {"loss": 1.2285, "grad_norm": 0.570682168006897, "learning_rate": 0.0002, "epoch": 2.917119073470865, "step": 4030}, {"loss": 1.0832, "grad_norm": 1.1807632446289062, "learning_rate": 0.0002, "epoch": 2.9243575823380383, "step": 4040}, {"loss": 1.2693, "grad_norm": 0.7058857679367065, "learning_rate": 0.0002, "epoch": 2.9315960912052117, "step": 4050}, {"loss": 1.2905, "grad_norm": 0.5542812943458557, "learning_rate": 0.0002, "epoch": 2.938834600072385, "step": 4060}, {"loss": 1.33, "grad_norm": 0.63167804479599, "learning_rate": 0.0002, "epoch": 2.9460731089395584, "step": 4070}, {"loss": 1.3075, "grad_norm": 0.5702962279319763, "learning_rate": 0.0002, "epoch": 2.953311617806732, "step": 4080}, {"loss": 1.2007, "grad_norm": 0.620944082736969, "learning_rate": 0.0002, "epoch": 2.960550126673905, "step": 4090}, {"loss": 1.2864, "grad_norm": 0.5866289734840393, "learning_rate": 0.0002, "epoch": 2.9677886355410785, "step": 4100}, {"loss": 1.3293, "grad_norm": 0.560170590877533, "learning_rate": 0.0002, "epoch": 2.975027144408252, "step": 4110}, {"loss": 1.2071, "grad_norm": 0.675082802772522, "learning_rate": 0.0002, "epoch": 2.9822656532754253, "step": 4120}, {"loss": 1.2981, "grad_norm": 0.62708580493927, "learning_rate": 0.0002, "epoch": 2.9895041621425986, "step": 4130}, {"loss": 1.2758, "grad_norm": 0.7893929481506348, "learning_rate": 0.0002, "epoch": 2.996742671009772, "step": 4140}]} +{"epoch": 4.0, "step": 5526, "epoch_duration": 1327.1966397762299, "total_accumulated_duration": 5338.999227523804, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15079.2998046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7061, "grad_norm": 1.2523442506790161, "learning_rate": 0.0002, "epoch": 0.007238508867173362, "step": 10}, {"loss": 3.3493, "grad_norm": 1.8887330293655396, "learning_rate": 0.0002, "epoch": 0.014477017734346724, "step": 20}, {"loss": 2.7585, "grad_norm": 0.9668035507202148, "learning_rate": 0.0002, "epoch": 0.021715526601520086, "step": 30}, {"loss": 2.3699, "grad_norm": 2.9167306423187256, "learning_rate": 0.0002, "epoch": 0.028954035468693448, "step": 40}, {"loss": 2.2679, "grad_norm": 2.649867296218872, "learning_rate": 0.0002, "epoch": 0.036192544335866814, "step": 50}, {"loss": 2.2202, "grad_norm": 1.5120655298233032, "learning_rate": 0.0002, "epoch": 0.04343105320304017, "step": 60}, {"loss": 2.2026, "grad_norm": 0.7879868149757385, "learning_rate": 0.0002, "epoch": 0.05066956207021354, "step": 70}, {"loss": 1.9447, "grad_norm": 0.7616953253746033, "learning_rate": 0.0002, "epoch": 0.057908070937386896, "step": 80}, {"loss": 2.0112, "grad_norm": 1.8809149265289307, "learning_rate": 0.0002, "epoch": 0.06514657980456026, "step": 90}, {"loss": 1.8337, "grad_norm": 0.9294016361236572, "learning_rate": 0.0002, "epoch": 0.07238508867173363, "step": 100}, {"loss": 1.8419, "grad_norm": 0.7145281434059143, "learning_rate": 0.0002, "epoch": 0.07962359753890698, "step": 110}, {"loss": 2.0036, "grad_norm": 0.7564446330070496, "learning_rate": 0.0002, "epoch": 0.08686210640608034, "step": 120}, {"loss": 1.9306, "grad_norm": 1.1681925058364868, "learning_rate": 0.0002, "epoch": 0.09410061527325371, "step": 130}, {"loss": 1.7875, "grad_norm": 0.6708641648292542, "learning_rate": 0.0002, "epoch": 0.10133912414042708, "step": 140}, {"loss": 1.786, "grad_norm": 0.7625647783279419, "learning_rate": 0.0002, "epoch": 0.10857763300760044, "step": 150}, {"loss": 1.6687, "grad_norm": 0.8463464975357056, "learning_rate": 0.0002, "epoch": 0.11581614187477379, "step": 160}, {"loss": 1.6214, "grad_norm": 0.7502335906028748, "learning_rate": 0.0002, "epoch": 0.12305465074194716, "step": 170}, {"loss": 1.7433, "grad_norm": 0.6929958462715149, "learning_rate": 0.0002, "epoch": 0.13029315960912052, "step": 180}, {"loss": 1.6009, "grad_norm": 0.6798707842826843, "learning_rate": 0.0002, "epoch": 0.1375316684762939, "step": 190}, {"loss": 1.6208, "grad_norm": 0.7566508650779724, "learning_rate": 0.0002, "epoch": 0.14477017734346725, "step": 200}, {"loss": 1.5823, "grad_norm": 0.7196869850158691, "learning_rate": 0.0002, "epoch": 0.15200868621064062, "step": 210}, {"loss": 1.738, "grad_norm": 0.8401045799255371, "learning_rate": 0.0002, "epoch": 0.15924719507781396, "step": 220}, {"loss": 1.7574, "grad_norm": 0.8503773212432861, "learning_rate": 0.0002, "epoch": 0.16648570394498732, "step": 230}, {"loss": 1.7861, "grad_norm": 0.7183733582496643, "learning_rate": 0.0002, "epoch": 0.1737242128121607, "step": 240}, {"loss": 1.6693, "grad_norm": 0.7082605957984924, "learning_rate": 0.0002, "epoch": 0.18096272167933405, "step": 250}, {"loss": 1.619, "grad_norm": 0.9386326670646667, "learning_rate": 0.0002, "epoch": 0.18820123054650742, "step": 260}, {"loss": 1.6511, "grad_norm": 0.7332451939582825, "learning_rate": 0.0002, "epoch": 0.19543973941368079, "step": 270}, {"loss": 1.6353, "grad_norm": 0.7092869877815247, "learning_rate": 0.0002, "epoch": 0.20267824828085415, "step": 280}, {"loss": 1.5996, "grad_norm": 0.7256413698196411, "learning_rate": 0.0002, "epoch": 0.20991675714802752, "step": 290}, {"loss": 1.6754, "grad_norm": 0.6398681402206421, "learning_rate": 0.0002, "epoch": 0.21715526601520088, "step": 300}, {"loss": 1.397, "grad_norm": 0.6273287534713745, "learning_rate": 0.0002, "epoch": 0.22439377488237422, "step": 310}, {"loss": 1.5115, "grad_norm": 0.511648416519165, "learning_rate": 0.0002, "epoch": 0.23163228374954759, "step": 320}, {"loss": 1.5424, "grad_norm": 0.8677352070808411, "learning_rate": 0.0002, "epoch": 0.23887079261672095, "step": 330}, {"loss": 1.6779, "grad_norm": 0.6270743012428284, "learning_rate": 0.0002, "epoch": 0.24610930148389432, "step": 340}, {"loss": 1.626, "grad_norm": 0.7980281114578247, "learning_rate": 0.0002, "epoch": 0.2533478103510677, "step": 350}, {"loss": 1.5238, "grad_norm": 0.632486879825592, "learning_rate": 0.0002, "epoch": 0.26058631921824105, "step": 360}, {"loss": 1.5175, "grad_norm": 0.6527034640312195, "learning_rate": 0.0002, "epoch": 0.2678248280854144, "step": 370}, {"loss": 1.627, "grad_norm": 0.7672118544578552, "learning_rate": 0.0002, "epoch": 0.2750633369525878, "step": 380}, {"loss": 1.5605, "grad_norm": 0.6035117506980896, "learning_rate": 0.0002, "epoch": 0.28230184581976114, "step": 390}, {"loss": 1.4603, "grad_norm": 0.5955103039741516, "learning_rate": 0.0002, "epoch": 0.2895403546869345, "step": 400}, {"loss": 1.558, "grad_norm": 0.6015191674232483, "learning_rate": 0.0002, "epoch": 0.2967788635541079, "step": 410}, {"loss": 1.6091, "grad_norm": 0.6380982398986816, "learning_rate": 0.0002, "epoch": 0.30401737242128124, "step": 420}, {"loss": 1.5292, "grad_norm": 0.6707863211631775, "learning_rate": 0.0002, "epoch": 0.3112558812884546, "step": 430}, {"loss": 1.4426, "grad_norm": 0.7010176777839661, "learning_rate": 0.0002, "epoch": 0.3184943901556279, "step": 440}, {"loss": 1.5572, "grad_norm": 0.8263739943504333, "learning_rate": 0.0002, "epoch": 0.3257328990228013, "step": 450}, {"loss": 1.5188, "grad_norm": 0.7253276109695435, "learning_rate": 0.0002, "epoch": 0.33297140788997465, "step": 460}, {"loss": 1.584, "grad_norm": 0.5238934755325317, "learning_rate": 0.0002, "epoch": 0.340209916757148, "step": 470}, {"loss": 1.7035, "grad_norm": 0.7869495749473572, "learning_rate": 0.0002, "epoch": 0.3474484256243214, "step": 480}, {"loss": 1.5776, "grad_norm": 0.7485215663909912, "learning_rate": 0.0002, "epoch": 0.35468693449149474, "step": 490}, {"loss": 1.6274, "grad_norm": 0.5413193106651306, "learning_rate": 0.0002, "epoch": 0.3619254433586681, "step": 500}, {"loss": 1.7323, "grad_norm": 0.7615048885345459, "learning_rate": 0.0002, "epoch": 0.3691639522258415, "step": 510}, {"loss": 1.532, "grad_norm": 0.7685340046882629, "learning_rate": 0.0002, "epoch": 0.37640246109301484, "step": 520}, {"loss": 1.6312, "grad_norm": 0.6379081010818481, "learning_rate": 0.0002, "epoch": 0.3836409699601882, "step": 530}, {"loss": 1.5645, "grad_norm": 0.7946939468383789, "learning_rate": 0.0002, "epoch": 0.39087947882736157, "step": 540}, {"loss": 1.4001, "grad_norm": 0.6287278532981873, "learning_rate": 0.0002, "epoch": 0.39811798769453494, "step": 550}, {"loss": 1.5982, "grad_norm": 0.6811642646789551, "learning_rate": 0.0002, "epoch": 0.4053564965617083, "step": 560}, {"loss": 1.4953, "grad_norm": 0.671073317527771, "learning_rate": 0.0002, "epoch": 0.41259500542888167, "step": 570}, {"loss": 1.6753, "grad_norm": 0.6313900351524353, "learning_rate": 0.0002, "epoch": 0.41983351429605503, "step": 580}, {"loss": 1.546, "grad_norm": 0.5291772484779358, "learning_rate": 0.0002, "epoch": 0.4270720231632284, "step": 590}, {"loss": 1.5441, "grad_norm": 0.62503582239151, "learning_rate": 0.0002, "epoch": 0.43431053203040176, "step": 600}, {"loss": 1.6276, "grad_norm": 0.5777305364608765, "learning_rate": 0.0002, "epoch": 0.4415490408975751, "step": 610}, {"loss": 1.4758, "grad_norm": 0.7013497352600098, "learning_rate": 0.0002, "epoch": 0.44878754976474844, "step": 620}, {"loss": 1.4029, "grad_norm": 0.8044822216033936, "learning_rate": 0.0002, "epoch": 0.4560260586319218, "step": 630}, {"loss": 1.7195, "grad_norm": 0.672531247138977, "learning_rate": 0.0002, "epoch": 0.46326456749909517, "step": 640}, {"loss": 1.614, "grad_norm": 0.6233910322189331, "learning_rate": 0.0002, "epoch": 0.47050307636626854, "step": 650}, {"loss": 1.6041, "grad_norm": 0.651524543762207, "learning_rate": 0.0002, "epoch": 0.4777415852334419, "step": 660}, {"loss": 1.5842, "grad_norm": 0.7213939428329468, "learning_rate": 0.0002, "epoch": 0.48498009410061527, "step": 670}, {"loss": 1.5453, "grad_norm": 0.6541454792022705, "learning_rate": 0.0002, "epoch": 0.49221860296778863, "step": 680}, {"loss": 1.662, "grad_norm": 0.6568936109542847, "learning_rate": 0.0002, "epoch": 0.499457111834962, "step": 690}, {"loss": 1.624, "grad_norm": 0.7176415324211121, "learning_rate": 0.0002, "epoch": 0.5066956207021354, "step": 700}, {"loss": 1.6099, "grad_norm": 0.6553855538368225, "learning_rate": 0.0002, "epoch": 0.5139341295693087, "step": 710}, {"loss": 1.5508, "grad_norm": 0.5654335618019104, "learning_rate": 0.0002, "epoch": 0.5211726384364821, "step": 720}, {"loss": 1.392, "grad_norm": 0.5671001672744751, "learning_rate": 0.0002, "epoch": 0.5284111473036555, "step": 730}, {"loss": 1.388, "grad_norm": 0.7914412021636963, "learning_rate": 0.0002, "epoch": 0.5356496561708288, "step": 740}, {"loss": 1.5931, "grad_norm": 0.6172138452529907, "learning_rate": 0.0002, "epoch": 0.5428881650380022, "step": 750}, {"loss": 1.4018, "grad_norm": 0.6132623553276062, "learning_rate": 0.0002, "epoch": 0.5501266739051756, "step": 760}, {"loss": 1.513, "grad_norm": 0.654000461101532, "learning_rate": 0.0002, "epoch": 0.5573651827723489, "step": 770}, {"loss": 1.5035, "grad_norm": 0.5691370964050293, "learning_rate": 0.0002, "epoch": 0.5646036916395223, "step": 780}, {"loss": 1.65, "grad_norm": 0.7922580242156982, "learning_rate": 0.0002, "epoch": 0.5718422005066957, "step": 790}, {"loss": 1.4521, "grad_norm": 0.6831880211830139, "learning_rate": 0.0002, "epoch": 0.579080709373869, "step": 800}, {"loss": 1.4734, "grad_norm": 0.6740124821662903, "learning_rate": 0.0002, "epoch": 0.5863192182410424, "step": 810}, {"loss": 1.6498, "grad_norm": 1.380016803741455, "learning_rate": 0.0002, "epoch": 0.5935577271082157, "step": 820}, {"loss": 1.4642, "grad_norm": 0.6552878022193909, "learning_rate": 0.0002, "epoch": 0.6007962359753891, "step": 830}, {"loss": 1.6271, "grad_norm": 0.6649535298347473, "learning_rate": 0.0002, "epoch": 0.6080347448425625, "step": 840}, {"loss": 1.5886, "grad_norm": 0.561738133430481, "learning_rate": 0.0002, "epoch": 0.6152732537097358, "step": 850}, {"loss": 1.5364, "grad_norm": 0.6133047938346863, "learning_rate": 0.0002, "epoch": 0.6225117625769092, "step": 860}, {"loss": 1.3489, "grad_norm": 0.559843122959137, "learning_rate": 0.0002, "epoch": 0.6297502714440825, "step": 870}, {"loss": 1.4878, "grad_norm": 0.6117811799049377, "learning_rate": 0.0002, "epoch": 0.6369887803112558, "step": 880}, {"loss": 1.56, "grad_norm": 0.6209776401519775, "learning_rate": 0.0002, "epoch": 0.6442272891784292, "step": 890}, {"loss": 1.6747, "grad_norm": 0.6234082579612732, "learning_rate": 0.0002, "epoch": 0.6514657980456026, "step": 900}, {"loss": 1.6963, "grad_norm": 0.7623258233070374, "learning_rate": 0.0002, "epoch": 0.6587043069127759, "step": 910}, {"loss": 1.2424, "grad_norm": 0.6148061752319336, "learning_rate": 0.0002, "epoch": 0.6659428157799493, "step": 920}, {"loss": 1.4319, "grad_norm": 0.6682973504066467, "learning_rate": 0.0002, "epoch": 0.6731813246471227, "step": 930}, {"loss": 1.5377, "grad_norm": 0.5513041615486145, "learning_rate": 0.0002, "epoch": 0.680419833514296, "step": 940}, {"loss": 1.3991, "grad_norm": 0.5197525024414062, "learning_rate": 0.0002, "epoch": 0.6876583423814694, "step": 950}, {"loss": 1.4398, "grad_norm": 0.6490758061408997, "learning_rate": 0.0002, "epoch": 0.6948968512486428, "step": 960}, {"loss": 1.5251, "grad_norm": 0.6450682878494263, "learning_rate": 0.0002, "epoch": 0.7021353601158161, "step": 970}, {"loss": 1.5417, "grad_norm": 0.6203766465187073, "learning_rate": 0.0002, "epoch": 0.7093738689829895, "step": 980}, {"loss": 1.4575, "grad_norm": 0.6023609638214111, "learning_rate": 0.0002, "epoch": 0.7166123778501629, "step": 990}, {"loss": 1.4973, "grad_norm": 0.5765255093574524, "learning_rate": 0.0002, "epoch": 0.7238508867173362, "step": 1000}, {"loss": 1.483, "grad_norm": 0.6650075316429138, "learning_rate": 0.0002, "epoch": 0.7310893955845096, "step": 1010}, {"loss": 1.5959, "grad_norm": 0.5610854029655457, "learning_rate": 0.0002, "epoch": 0.738327904451683, "step": 1020}, {"loss": 1.5248, "grad_norm": 0.7072813510894775, "learning_rate": 0.0002, "epoch": 0.7455664133188563, "step": 1030}, {"loss": 1.5776, "grad_norm": 0.6815407872200012, "learning_rate": 0.0002, "epoch": 0.7528049221860297, "step": 1040}, {"loss": 1.4577, "grad_norm": 0.7932390570640564, "learning_rate": 0.0002, "epoch": 0.760043431053203, "step": 1050}, {"loss": 1.4515, "grad_norm": 0.5798183083534241, "learning_rate": 0.0002, "epoch": 0.7672819399203764, "step": 1060}, {"loss": 1.5053, "grad_norm": 0.7898504137992859, "learning_rate": 0.0002, "epoch": 0.7745204487875498, "step": 1070}, {"loss": 1.4776, "grad_norm": 0.4983280301094055, "learning_rate": 0.0002, "epoch": 0.7817589576547231, "step": 1080}, {"loss": 1.5007, "grad_norm": 0.691403329372406, "learning_rate": 0.0002, "epoch": 0.7889974665218965, "step": 1090}, {"loss": 1.5153, "grad_norm": 0.5394481420516968, "learning_rate": 0.0002, "epoch": 0.7962359753890699, "step": 1100}, {"loss": 1.6892, "grad_norm": 0.5136822462081909, "learning_rate": 0.0002, "epoch": 0.8034744842562432, "step": 1110}, {"loss": 1.4902, "grad_norm": 0.6828126907348633, "learning_rate": 0.0002, "epoch": 0.8107129931234166, "step": 1120}, {"loss": 1.4346, "grad_norm": 0.6799656748771667, "learning_rate": 0.0002, "epoch": 0.81795150199059, "step": 1130}, {"loss": 1.2678, "grad_norm": 0.5428406000137329, "learning_rate": 0.0002, "epoch": 0.8251900108577633, "step": 1140}, {"loss": 1.4072, "grad_norm": 0.4811290502548218, "learning_rate": 0.0002, "epoch": 0.8324285197249367, "step": 1150}, {"loss": 1.4512, "grad_norm": 0.5519434809684753, "learning_rate": 0.0002, "epoch": 0.8396670285921101, "step": 1160}, {"loss": 1.4072, "grad_norm": 0.9748060703277588, "learning_rate": 0.0002, "epoch": 0.8469055374592834, "step": 1170}, {"loss": 1.4309, "grad_norm": 0.712609589099884, "learning_rate": 0.0002, "epoch": 0.8541440463264568, "step": 1180}, {"loss": 1.434, "grad_norm": 0.6866157054901123, "learning_rate": 0.0002, "epoch": 0.8613825551936302, "step": 1190}, {"loss": 1.3704, "grad_norm": 0.5068854093551636, "learning_rate": 0.0002, "epoch": 0.8686210640608035, "step": 1200}, {"loss": 1.5601, "grad_norm": 0.6333245038986206, "learning_rate": 0.0002, "epoch": 0.8758595729279768, "step": 1210}, {"loss": 1.4636, "grad_norm": 0.6424421072006226, "learning_rate": 0.0002, "epoch": 0.8830980817951501, "step": 1220}, {"loss": 1.4186, "grad_norm": 0.4771921932697296, "learning_rate": 0.0002, "epoch": 0.8903365906623235, "step": 1230}, {"loss": 1.6323, "grad_norm": 0.5191764235496521, "learning_rate": 0.0002, "epoch": 0.8975750995294969, "step": 1240}, {"loss": 1.6105, "grad_norm": 0.756222128868103, "learning_rate": 0.0002, "epoch": 0.9048136083966702, "step": 1250}, {"loss": 1.4396, "grad_norm": 0.623823881149292, "learning_rate": 0.0002, "epoch": 0.9120521172638436, "step": 1260}, {"loss": 1.3097, "grad_norm": 0.8166571259498596, "learning_rate": 0.0002, "epoch": 0.919290626131017, "step": 1270}, {"loss": 1.4625, "grad_norm": 0.6059346795082092, "learning_rate": 0.0002, "epoch": 0.9265291349981903, "step": 1280}, {"loss": 1.3555, "grad_norm": 0.5842690467834473, "learning_rate": 0.0002, "epoch": 0.9337676438653637, "step": 1290}, {"loss": 1.5859, "grad_norm": 0.7649800777435303, "learning_rate": 0.0002, "epoch": 0.9410061527325371, "step": 1300}, {"loss": 1.5915, "grad_norm": 0.6420919895172119, "learning_rate": 0.0002, "epoch": 0.9482446615997104, "step": 1310}, {"loss": 1.453, "grad_norm": 0.7011452913284302, "learning_rate": 0.0002, "epoch": 0.9554831704668838, "step": 1320}, {"loss": 1.6766, "grad_norm": 0.5783746242523193, "learning_rate": 0.0002, "epoch": 0.9627216793340572, "step": 1330}, {"loss": 1.6308, "grad_norm": 0.5973192453384399, "learning_rate": 0.0002, "epoch": 0.9699601882012305, "step": 1340}, {"loss": 1.5901, "grad_norm": 0.6181833744049072, "learning_rate": 0.0002, "epoch": 0.9771986970684039, "step": 1350}, {"loss": 1.5258, "grad_norm": 0.5563396215438843, "learning_rate": 0.0002, "epoch": 0.9844372059355773, "step": 1360}, {"loss": 1.4508, "grad_norm": 0.45723360776901245, "learning_rate": 0.0002, "epoch": 0.9916757148027506, "step": 1370}, {"loss": 1.3291, "grad_norm": 0.5947498679161072, "learning_rate": 0.0002, "epoch": 0.998914223669924, "step": 1380}, {"eval_loss": 1.480796456336975, "eval_runtime": 27.3103, "eval_samples_per_second": 15.965, "eval_steps_per_second": 2.014, "epoch": 0.9996380745566413, "step": 1381}, {"loss": 1.3057, "grad_norm": 0.5599952936172485, "learning_rate": 0.0002, "epoch": 1.0061527325370974, "step": 1390}, {"loss": 1.4991, "grad_norm": 0.5932008028030396, "learning_rate": 0.0002, "epoch": 1.0133912414042707, "step": 1400}, {"loss": 1.4506, "grad_norm": 0.6194121837615967, "learning_rate": 0.0002, "epoch": 1.020629750271444, "step": 1410}, {"loss": 1.5966, "grad_norm": 0.6995621919631958, "learning_rate": 0.0002, "epoch": 1.0278682591386175, "step": 1420}, {"loss": 1.4153, "grad_norm": 0.7905810475349426, "learning_rate": 0.0002, "epoch": 1.0351067680057908, "step": 1430}, {"loss": 1.4414, "grad_norm": 0.7221615314483643, "learning_rate": 0.0002, "epoch": 1.0423452768729642, "step": 1440}, {"loss": 1.3859, "grad_norm": 0.6170642375946045, "learning_rate": 0.0002, "epoch": 1.0495837857401376, "step": 1450}, {"loss": 1.3806, "grad_norm": 0.5844094753265381, "learning_rate": 0.0002, "epoch": 1.056822294607311, "step": 1460}, {"loss": 1.4871, "grad_norm": 0.7731822729110718, "learning_rate": 0.0002, "epoch": 1.0640608034744843, "step": 1470}, {"loss": 1.4286, "grad_norm": 0.4554748237133026, "learning_rate": 0.0002, "epoch": 1.0712993123416577, "step": 1480}, {"loss": 1.3977, "grad_norm": 0.6923259496688843, "learning_rate": 0.0002, "epoch": 1.078537821208831, "step": 1490}, {"loss": 1.3936, "grad_norm": 0.6008219122886658, "learning_rate": 0.0002, "epoch": 1.0857763300760044, "step": 1500}, {"loss": 1.4821, "grad_norm": 0.6450045704841614, "learning_rate": 0.0002, "epoch": 1.0930148389431777, "step": 1510}, {"loss": 1.3295, "grad_norm": 0.7833753824234009, "learning_rate": 0.0002, "epoch": 1.1002533478103511, "step": 1520}, {"loss": 1.3424, "grad_norm": 0.5076758861541748, "learning_rate": 0.0002, "epoch": 1.1074918566775245, "step": 1530}, {"loss": 1.4043, "grad_norm": 0.5661332011222839, "learning_rate": 0.0002, "epoch": 1.1147303655446978, "step": 1540}, {"loss": 1.4963, "grad_norm": 0.6526919603347778, "learning_rate": 0.0002, "epoch": 1.1219688744118712, "step": 1550}, {"loss": 1.3671, "grad_norm": 0.5613082647323608, "learning_rate": 0.0002, "epoch": 1.1292073832790446, "step": 1560}, {"loss": 1.4458, "grad_norm": 0.6113885641098022, "learning_rate": 0.0002, "epoch": 1.136445892146218, "step": 1570}, {"loss": 1.3552, "grad_norm": 0.6732510328292847, "learning_rate": 0.0002, "epoch": 1.1436844010133913, "step": 1580}, {"loss": 1.3114, "grad_norm": 0.6146392226219177, "learning_rate": 0.0002, "epoch": 1.1509229098805647, "step": 1590}, {"loss": 1.411, "grad_norm": 0.6766974329948425, "learning_rate": 0.0002, "epoch": 1.158161418747738, "step": 1600}, {"loss": 1.2401, "grad_norm": 0.7621957659721375, "learning_rate": 0.0002, "epoch": 1.1653999276149114, "step": 1610}, {"loss": 1.3758, "grad_norm": 0.6959581971168518, "learning_rate": 0.0002, "epoch": 1.1726384364820848, "step": 1620}, {"loss": 1.382, "grad_norm": 0.6691278219223022, "learning_rate": 0.0002, "epoch": 1.1798769453492581, "step": 1630}, {"loss": 1.4147, "grad_norm": 0.4927774965763092, "learning_rate": 0.0002, "epoch": 1.1871154542164315, "step": 1640}, {"loss": 1.449, "grad_norm": 0.7724234461784363, "learning_rate": 0.0002, "epoch": 1.1943539630836049, "step": 1650}, {"loss": 1.4778, "grad_norm": 0.6817787885665894, "learning_rate": 0.0002, "epoch": 1.2015924719507782, "step": 1660}, {"loss": 1.3776, "grad_norm": 0.6500699520111084, "learning_rate": 0.0002, "epoch": 1.2088309808179516, "step": 1670}, {"loss": 1.3875, "grad_norm": 0.5703568458557129, "learning_rate": 0.0002, "epoch": 1.216069489685125, "step": 1680}, {"loss": 1.4735, "grad_norm": 0.6261579990386963, "learning_rate": 0.0002, "epoch": 1.2233079985522983, "step": 1690}, {"loss": 1.3898, "grad_norm": 0.651713490486145, "learning_rate": 0.0002, "epoch": 1.2305465074194717, "step": 1700}, {"loss": 1.4002, "grad_norm": 0.684399425983429, "learning_rate": 0.0002, "epoch": 1.237785016286645, "step": 1710}, {"loss": 1.5027, "grad_norm": 0.6996857523918152, "learning_rate": 0.0002, "epoch": 1.2450235251538184, "step": 1720}, {"loss": 1.3326, "grad_norm": 0.7102537751197815, "learning_rate": 0.0002, "epoch": 1.2522620340209918, "step": 1730}, {"loss": 1.3675, "grad_norm": 0.45809897780418396, "learning_rate": 0.0002, "epoch": 1.2595005428881652, "step": 1740}, {"loss": 1.4175, "grad_norm": 0.6377046704292297, "learning_rate": 0.0002, "epoch": 1.2667390517553385, "step": 1750}, {"loss": 1.3479, "grad_norm": 0.6965704560279846, "learning_rate": 0.0002, "epoch": 1.2739775606225119, "step": 1760}, {"loss": 1.5647, "grad_norm": 0.5688214302062988, "learning_rate": 0.0002, "epoch": 1.2812160694896852, "step": 1770}, {"loss": 1.3967, "grad_norm": 0.6384190320968628, "learning_rate": 0.0002, "epoch": 1.2884545783568586, "step": 1780}, {"loss": 1.3671, "grad_norm": 0.5629363656044006, "learning_rate": 0.0002, "epoch": 1.295693087224032, "step": 1790}, {"loss": 1.2292, "grad_norm": 0.6148255467414856, "learning_rate": 0.0002, "epoch": 1.3029315960912053, "step": 1800}, {"loss": 1.5806, "grad_norm": 0.655580997467041, "learning_rate": 0.0002, "epoch": 1.3101701049583787, "step": 1810}, {"loss": 1.2398, "grad_norm": 0.5642657279968262, "learning_rate": 0.0002, "epoch": 1.3174086138255519, "step": 1820}, {"loss": 1.3246, "grad_norm": 0.59607994556427, "learning_rate": 0.0002, "epoch": 1.3246471226927252, "step": 1830}, {"loss": 1.3274, "grad_norm": 0.5564199090003967, "learning_rate": 0.0002, "epoch": 1.3318856315598986, "step": 1840}, {"loss": 1.5834, "grad_norm": 0.6949955821037292, "learning_rate": 0.0002, "epoch": 1.339124140427072, "step": 1850}, {"loss": 1.4722, "grad_norm": 0.7036856412887573, "learning_rate": 0.0002, "epoch": 1.3463626492942453, "step": 1860}, {"loss": 1.333, "grad_norm": 0.722062885761261, "learning_rate": 0.0002, "epoch": 1.3536011581614187, "step": 1870}, {"loss": 1.4044, "grad_norm": 0.6098677515983582, "learning_rate": 0.0002, "epoch": 1.360839667028592, "step": 1880}, {"loss": 1.6217, "grad_norm": 0.5376402735710144, "learning_rate": 0.0002, "epoch": 1.3680781758957654, "step": 1890}, {"loss": 1.5071, "grad_norm": 0.6974610090255737, "learning_rate": 0.0002, "epoch": 1.3753166847629388, "step": 1900}, {"loss": 1.5854, "grad_norm": 0.6520763635635376, "learning_rate": 0.0002, "epoch": 1.3825551936301121, "step": 1910}, {"loss": 1.4271, "grad_norm": 0.6604374647140503, "learning_rate": 0.0002, "epoch": 1.3897937024972855, "step": 1920}, {"loss": 1.419, "grad_norm": 0.7364398241043091, "learning_rate": 0.0002, "epoch": 1.3970322113644589, "step": 1930}, {"loss": 1.4585, "grad_norm": 0.6849475502967834, "learning_rate": 0.0002, "epoch": 1.4042707202316322, "step": 1940}, {"loss": 1.5577, "grad_norm": 0.6562670469284058, "learning_rate": 0.0002, "epoch": 1.4115092290988056, "step": 1950}, {"loss": 1.4725, "grad_norm": 0.5695616006851196, "learning_rate": 0.0002, "epoch": 1.418747737965979, "step": 1960}, {"loss": 1.3088, "grad_norm": 0.5244464874267578, "learning_rate": 0.0002, "epoch": 1.4259862468331523, "step": 1970}, {"loss": 1.5069, "grad_norm": 0.6347293257713318, "learning_rate": 0.0002, "epoch": 1.4332247557003257, "step": 1980}, {"loss": 1.3502, "grad_norm": 0.5528361201286316, "learning_rate": 0.0002, "epoch": 1.440463264567499, "step": 1990}, {"loss": 1.3978, "grad_norm": 0.6987585425376892, "learning_rate": 0.0002, "epoch": 1.4477017734346724, "step": 2000}, {"loss": 1.4262, "grad_norm": 0.6568987369537354, "learning_rate": 0.0002, "epoch": 1.4549402823018458, "step": 2010}, {"loss": 1.4175, "grad_norm": 0.7665994763374329, "learning_rate": 0.0002, "epoch": 1.4621787911690192, "step": 2020}, {"loss": 1.244, "grad_norm": 0.5127707123756409, "learning_rate": 0.0002, "epoch": 1.4694173000361925, "step": 2030}, {"loss": 1.3699, "grad_norm": 0.5406824946403503, "learning_rate": 0.0002, "epoch": 1.476655808903366, "step": 2040}, {"loss": 1.3353, "grad_norm": 0.5990166664123535, "learning_rate": 0.0002, "epoch": 1.4838943177705393, "step": 2050}, {"loss": 1.2454, "grad_norm": 0.6186193823814392, "learning_rate": 0.0002, "epoch": 1.4911328266377126, "step": 2060}, {"loss": 1.428, "grad_norm": 0.6154307126998901, "learning_rate": 0.0002, "epoch": 1.498371335504886, "step": 2070}, {"loss": 1.4528, "grad_norm": 0.5606056451797485, "learning_rate": 0.0002, "epoch": 1.5056098443720594, "step": 2080}, {"loss": 1.2405, "grad_norm": 0.5006417036056519, "learning_rate": 0.0002, "epoch": 1.5128483532392327, "step": 2090}, {"loss": 1.4258, "grad_norm": 0.5968486070632935, "learning_rate": 0.0002, "epoch": 1.520086862106406, "step": 2100}, {"loss": 1.2752, "grad_norm": 0.5835496187210083, "learning_rate": 0.0002, "epoch": 1.5273253709735795, "step": 2110}, {"loss": 1.5443, "grad_norm": 0.6753535270690918, "learning_rate": 0.0002, "epoch": 1.5345638798407528, "step": 2120}, {"loss": 1.2139, "grad_norm": 0.7299720644950867, "learning_rate": 0.0002, "epoch": 1.5418023887079262, "step": 2130}, {"loss": 1.2364, "grad_norm": 0.5105988383293152, "learning_rate": 0.0002, "epoch": 1.5490408975750996, "step": 2140}, {"loss": 1.4528, "grad_norm": 0.5675431489944458, "learning_rate": 0.0002, "epoch": 1.556279406442273, "step": 2150}, {"loss": 1.4563, "grad_norm": 0.6246723532676697, "learning_rate": 0.0002, "epoch": 1.5635179153094463, "step": 2160}, {"loss": 1.5255, "grad_norm": 0.7291720509529114, "learning_rate": 0.0002, "epoch": 1.5707564241766196, "step": 2170}, {"loss": 1.5432, "grad_norm": 0.678114116191864, "learning_rate": 0.0002, "epoch": 1.577994933043793, "step": 2180}, {"loss": 1.5212, "grad_norm": 0.5136260986328125, "learning_rate": 0.0002, "epoch": 1.5852334419109664, "step": 2190}, {"loss": 1.3271, "grad_norm": 0.6359935998916626, "learning_rate": 0.0002, "epoch": 1.5924719507781397, "step": 2200}, {"loss": 1.4038, "grad_norm": 0.7650278806686401, "learning_rate": 0.0002, "epoch": 1.599710459645313, "step": 2210}, {"loss": 1.5478, "grad_norm": 0.7256110906600952, "learning_rate": 0.0002, "epoch": 1.6069489685124865, "step": 2220}, {"loss": 1.4387, "grad_norm": 0.688689649105072, "learning_rate": 0.0002, "epoch": 1.6141874773796598, "step": 2230}, {"loss": 1.4096, "grad_norm": 0.6045311093330383, "learning_rate": 0.0002, "epoch": 1.6214259862468332, "step": 2240}, {"loss": 1.4097, "grad_norm": 0.7064604163169861, "learning_rate": 0.0002, "epoch": 1.6286644951140063, "step": 2250}, {"loss": 1.3477, "grad_norm": 0.5309562087059021, "learning_rate": 0.0002, "epoch": 1.6359030039811797, "step": 2260}, {"loss": 1.4022, "grad_norm": 0.5687053203582764, "learning_rate": 0.0002, "epoch": 1.643141512848353, "step": 2270}, {"loss": 1.2977, "grad_norm": 0.535872757434845, "learning_rate": 0.0002, "epoch": 1.6503800217155264, "step": 2280}, {"loss": 1.3844, "grad_norm": 0.5502381920814514, "learning_rate": 0.0002, "epoch": 1.6576185305826998, "step": 2290}, {"loss": 1.3764, "grad_norm": 0.6158602237701416, "learning_rate": 0.0002, "epoch": 1.6648570394498732, "step": 2300}, {"loss": 1.3515, "grad_norm": 0.5804675817489624, "learning_rate": 0.0002, "epoch": 1.6720955483170465, "step": 2310}, {"loss": 1.2532, "grad_norm": 0.600742757320404, "learning_rate": 0.0002, "epoch": 1.67933405718422, "step": 2320}, {"loss": 1.477, "grad_norm": 0.7101941108703613, "learning_rate": 0.0002, "epoch": 1.6865725660513933, "step": 2330}, {"loss": 1.4849, "grad_norm": 0.7507809996604919, "learning_rate": 0.0002, "epoch": 1.6938110749185666, "step": 2340}, {"loss": 1.2703, "grad_norm": 0.768502414226532, "learning_rate": 0.0002, "epoch": 1.70104958378574, "step": 2350}, {"loss": 1.3332, "grad_norm": 0.4801851212978363, "learning_rate": 0.0002, "epoch": 1.7082880926529134, "step": 2360}, {"loss": 1.4158, "grad_norm": 0.5322122573852539, "learning_rate": 0.0002, "epoch": 1.7155266015200867, "step": 2370}, {"loss": 1.4136, "grad_norm": 0.587661862373352, "learning_rate": 0.0002, "epoch": 1.72276511038726, "step": 2380}, {"loss": 1.3771, "grad_norm": 0.6073525547981262, "learning_rate": 0.0002, "epoch": 1.7300036192544335, "step": 2390}, {"loss": 1.2754, "grad_norm": 0.6950460076332092, "learning_rate": 0.0002, "epoch": 1.7372421281216068, "step": 2400}, {"loss": 1.3858, "grad_norm": 0.5981102585792542, "learning_rate": 0.0002, "epoch": 1.7444806369887802, "step": 2410}, {"loss": 1.4075, "grad_norm": 0.544570803642273, "learning_rate": 0.0002, "epoch": 1.7517191458559536, "step": 2420}, {"loss": 1.3861, "grad_norm": 0.5304399728775024, "learning_rate": 0.0002, "epoch": 1.758957654723127, "step": 2430}, {"loss": 1.4244, "grad_norm": 0.7921594977378845, "learning_rate": 0.0002, "epoch": 1.7661961635903003, "step": 2440}, {"loss": 1.3053, "grad_norm": 0.6084808707237244, "learning_rate": 0.0002, "epoch": 1.7734346724574737, "step": 2450}, {"loss": 1.3781, "grad_norm": 0.8844701051712036, "learning_rate": 0.0002, "epoch": 1.780673181324647, "step": 2460}, {"loss": 1.3227, "grad_norm": 0.5729258060455322, "learning_rate": 0.0002, "epoch": 1.7879116901918204, "step": 2470}, {"loss": 1.3422, "grad_norm": 0.6303611993789673, "learning_rate": 0.0002, "epoch": 1.7951501990589938, "step": 2480}, {"loss": 1.3926, "grad_norm": 0.5627942085266113, "learning_rate": 0.0002, "epoch": 1.8023887079261671, "step": 2490}, {"loss": 1.3816, "grad_norm": 0.6724274158477783, "learning_rate": 0.0002, "epoch": 1.8096272167933405, "step": 2500}, {"loss": 1.2951, "grad_norm": 0.5030826330184937, "learning_rate": 0.0002, "epoch": 1.8168657256605139, "step": 2510}, {"loss": 1.2839, "grad_norm": 0.5504099130630493, "learning_rate": 0.0002, "epoch": 1.8241042345276872, "step": 2520}, {"loss": 1.4264, "grad_norm": 0.6338945627212524, "learning_rate": 0.0002, "epoch": 1.8313427433948606, "step": 2530}, {"loss": 1.563, "grad_norm": 0.5902037620544434, "learning_rate": 0.0002, "epoch": 1.838581252262034, "step": 2540}, {"loss": 1.2961, "grad_norm": 0.48814457654953003, "learning_rate": 0.0002, "epoch": 1.8458197611292073, "step": 2550}, {"loss": 1.466, "grad_norm": 0.6216312646865845, "learning_rate": 0.0002, "epoch": 1.8530582699963807, "step": 2560}, {"loss": 1.5123, "grad_norm": 0.635603666305542, "learning_rate": 0.0002, "epoch": 1.860296778863554, "step": 2570}, {"loss": 1.372, "grad_norm": 0.6938216090202332, "learning_rate": 0.0002, "epoch": 1.8675352877307274, "step": 2580}, {"loss": 1.5011, "grad_norm": 0.599557638168335, "learning_rate": 0.0002, "epoch": 1.8747737965979008, "step": 2590}, {"loss": 1.2714, "grad_norm": 0.564424455165863, "learning_rate": 0.0002, "epoch": 1.8820123054650741, "step": 2600}, {"loss": 1.3403, "grad_norm": 0.5430700182914734, "learning_rate": 0.0002, "epoch": 1.8892508143322475, "step": 2610}, {"loss": 1.4347, "grad_norm": 0.6150169372558594, "learning_rate": 0.0002, "epoch": 1.8964893231994209, "step": 2620}, {"loss": 1.2474, "grad_norm": 0.48159119486808777, "learning_rate": 0.0002, "epoch": 1.9037278320665942, "step": 2630}, {"loss": 1.3716, "grad_norm": 0.5608997941017151, "learning_rate": 0.0002, "epoch": 1.9109663409337676, "step": 2640}, {"loss": 1.5787, "grad_norm": 0.6454501748085022, "learning_rate": 0.0002, "epoch": 1.918204849800941, "step": 2650}, {"loss": 1.3238, "grad_norm": 0.5458073616027832, "learning_rate": 0.0002, "epoch": 1.9254433586681143, "step": 2660}, {"loss": 1.3208, "grad_norm": 0.5328490734100342, "learning_rate": 0.0002, "epoch": 1.9326818675352877, "step": 2670}, {"loss": 1.4971, "grad_norm": 0.6444696187973022, "learning_rate": 0.0002, "epoch": 1.939920376402461, "step": 2680}, {"loss": 1.5387, "grad_norm": 0.7126023769378662, "learning_rate": 0.0002, "epoch": 1.9471588852696344, "step": 2690}, {"loss": 1.3637, "grad_norm": 0.5164045095443726, "learning_rate": 0.0002, "epoch": 1.9543973941368078, "step": 2700}, {"loss": 1.5303, "grad_norm": 0.5347061157226562, "learning_rate": 0.0002, "epoch": 1.9616359030039812, "step": 2710}, {"loss": 1.2815, "grad_norm": 0.5297950506210327, "learning_rate": 0.0002, "epoch": 1.9688744118711545, "step": 2720}, {"loss": 1.3566, "grad_norm": 0.6537790298461914, "learning_rate": 0.0002, "epoch": 1.976112920738328, "step": 2730}, {"loss": 1.332, "grad_norm": 0.5536222457885742, "learning_rate": 0.0002, "epoch": 1.9833514296055013, "step": 2740}, {"loss": 1.3333, "grad_norm": 0.4856105446815491, "learning_rate": 0.0002, "epoch": 1.9905899384726746, "step": 2750}, {"loss": 1.3521, "grad_norm": 0.6642730832099915, "learning_rate": 0.0002, "epoch": 1.997828447339848, "step": 2760}, {"eval_loss": 1.4366681575775146, "eval_runtime": 27.3729, "eval_samples_per_second": 15.928, "eval_steps_per_second": 2.009, "epoch": 2.0, "step": 2763}, {"loss": 1.4322, "grad_norm": 0.740253210067749, "learning_rate": 0.0002, "epoch": 2.0050669562070214, "step": 2770}, {"loss": 1.277, "grad_norm": 0.5826276540756226, "learning_rate": 0.0002, "epoch": 2.0123054650741947, "step": 2780}, {"loss": 1.2424, "grad_norm": 0.607356071472168, "learning_rate": 0.0002, "epoch": 2.019543973941368, "step": 2790}, {"loss": 1.2601, "grad_norm": 0.5918063521385193, "learning_rate": 0.0002, "epoch": 2.0267824828085415, "step": 2800}, {"loss": 1.3715, "grad_norm": 0.5610089898109436, "learning_rate": 0.0002, "epoch": 2.034020991675715, "step": 2810}, {"loss": 1.2092, "grad_norm": 0.5869926810264587, "learning_rate": 0.0002, "epoch": 2.041259500542888, "step": 2820}, {"loss": 1.1929, "grad_norm": 0.5753467679023743, "learning_rate": 0.0002, "epoch": 2.0484980094100615, "step": 2830}, {"loss": 1.333, "grad_norm": 0.7096508145332336, "learning_rate": 0.0002, "epoch": 2.055736518277235, "step": 2840}, {"loss": 1.1766, "grad_norm": 0.7653635144233704, "learning_rate": 0.0002, "epoch": 2.0629750271444083, "step": 2850}, {"loss": 1.2331, "grad_norm": 0.6202841997146606, "learning_rate": 0.0002, "epoch": 2.0702135360115816, "step": 2860}, {"loss": 1.3298, "grad_norm": 0.6810227632522583, "learning_rate": 0.0002, "epoch": 2.077452044878755, "step": 2870}, {"loss": 1.2505, "grad_norm": 0.7481493353843689, "learning_rate": 0.0002, "epoch": 2.0846905537459284, "step": 2880}, {"loss": 1.2484, "grad_norm": 0.7089637517929077, "learning_rate": 0.0002, "epoch": 2.0919290626131017, "step": 2890}, {"loss": 1.3095, "grad_norm": 0.7472923398017883, "learning_rate": 0.0002, "epoch": 2.099167571480275, "step": 2900}, {"loss": 1.304, "grad_norm": 0.8135465979576111, "learning_rate": 0.0002, "epoch": 2.1064060803474485, "step": 2910}, {"loss": 1.273, "grad_norm": 0.6097133159637451, "learning_rate": 0.0002, "epoch": 2.113644589214622, "step": 2920}, {"loss": 1.3384, "grad_norm": 0.5970117449760437, "learning_rate": 0.0002, "epoch": 2.120883098081795, "step": 2930}, {"loss": 1.3233, "grad_norm": 0.6169309616088867, "learning_rate": 0.0002, "epoch": 2.1281216069489686, "step": 2940}, {"loss": 1.4246, "grad_norm": 0.9428738355636597, "learning_rate": 0.0002, "epoch": 2.135360115816142, "step": 2950}, {"loss": 1.3527, "grad_norm": 0.5671679973602295, "learning_rate": 0.0002, "epoch": 2.1425986246833153, "step": 2960}, {"loss": 1.1375, "grad_norm": 0.7007262110710144, "learning_rate": 0.0002, "epoch": 2.1498371335504887, "step": 2970}, {"loss": 1.2015, "grad_norm": 0.6294044256210327, "learning_rate": 0.0002, "epoch": 2.157075642417662, "step": 2980}, {"loss": 1.2167, "grad_norm": 0.6105241775512695, "learning_rate": 0.0002, "epoch": 2.1643141512848354, "step": 2990}, {"loss": 1.2065, "grad_norm": 0.557124137878418, "learning_rate": 0.0002, "epoch": 2.1715526601520088, "step": 3000}, {"loss": 1.2515, "grad_norm": 0.6250392198562622, "learning_rate": 0.0002, "epoch": 2.178791169019182, "step": 3010}, {"loss": 1.385, "grad_norm": 0.645218551158905, "learning_rate": 0.0002, "epoch": 2.1860296778863555, "step": 3020}, {"loss": 1.3928, "grad_norm": 0.9033605456352234, "learning_rate": 0.0002, "epoch": 2.193268186753529, "step": 3030}, {"loss": 1.2458, "grad_norm": 0.5325747132301331, "learning_rate": 0.0002, "epoch": 2.2005066956207022, "step": 3040}, {"loss": 1.261, "grad_norm": 0.6334700584411621, "learning_rate": 0.0002, "epoch": 2.2077452044878756, "step": 3050}, {"loss": 1.2385, "grad_norm": 0.5206325054168701, "learning_rate": 0.0002, "epoch": 2.214983713355049, "step": 3060}, {"loss": 1.3103, "grad_norm": 0.5987200140953064, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3070}, {"loss": 1.1756, "grad_norm": 0.5893264412879944, "learning_rate": 0.0002, "epoch": 2.2294607310893957, "step": 3080}, {"loss": 1.235, "grad_norm": 0.6869237422943115, "learning_rate": 0.0002, "epoch": 2.236699239956569, "step": 3090}, {"loss": 1.3285, "grad_norm": 0.5040048360824585, "learning_rate": 0.0002, "epoch": 2.2439377488237424, "step": 3100}, {"loss": 1.3316, "grad_norm": 0.6660613417625427, "learning_rate": 0.0002, "epoch": 2.251176257690916, "step": 3110}, {"loss": 1.3108, "grad_norm": 0.5890918970108032, "learning_rate": 0.0002, "epoch": 2.258414766558089, "step": 3120}, {"loss": 1.248, "grad_norm": 0.6458896994590759, "learning_rate": 0.0002, "epoch": 2.2656532754252625, "step": 3130}, {"loss": 1.4151, "grad_norm": 0.6832690834999084, "learning_rate": 0.0002, "epoch": 2.272891784292436, "step": 3140}, {"loss": 1.4458, "grad_norm": 0.833908200263977, "learning_rate": 0.0002, "epoch": 2.2801302931596092, "step": 3150}, {"loss": 1.2931, "grad_norm": 0.4596034586429596, "learning_rate": 0.0002, "epoch": 2.2873688020267826, "step": 3160}, {"loss": 1.449, "grad_norm": 0.9130966067314148, "learning_rate": 0.0002, "epoch": 2.294607310893956, "step": 3170}, {"loss": 1.3806, "grad_norm": 0.7143292427062988, "learning_rate": 0.0002, "epoch": 2.3018458197611293, "step": 3180}, {"loss": 1.2692, "grad_norm": 0.5388900637626648, "learning_rate": 0.0002, "epoch": 2.3090843286283027, "step": 3190}, {"loss": 1.2402, "grad_norm": 0.5607513189315796, "learning_rate": 0.0002, "epoch": 2.316322837495476, "step": 3200}, {"loss": 1.3874, "grad_norm": 0.6795142292976379, "learning_rate": 0.0002, "epoch": 2.3235613463626494, "step": 3210}, {"loss": 1.3042, "grad_norm": 0.6561070680618286, "learning_rate": 0.0002, "epoch": 2.330799855229823, "step": 3220}, {"loss": 1.4636, "grad_norm": 0.8858118057250977, "learning_rate": 0.0002, "epoch": 2.338038364096996, "step": 3230}, {"loss": 1.3214, "grad_norm": 0.6604151725769043, "learning_rate": 0.0002, "epoch": 2.3452768729641695, "step": 3240}, {"loss": 1.4004, "grad_norm": 0.6755785346031189, "learning_rate": 0.0002, "epoch": 2.352515381831343, "step": 3250}, {"loss": 1.2503, "grad_norm": 0.6981677412986755, "learning_rate": 0.0002, "epoch": 2.3597538906985163, "step": 3260}, {"loss": 1.3078, "grad_norm": 0.6338568329811096, "learning_rate": 0.0002, "epoch": 2.3669923995656896, "step": 3270}, {"loss": 1.285, "grad_norm": 0.5754265785217285, "learning_rate": 0.0002, "epoch": 2.374230908432863, "step": 3280}, {"loss": 1.2924, "grad_norm": 0.7533153295516968, "learning_rate": 0.0002, "epoch": 2.3814694173000364, "step": 3290}, {"loss": 1.3711, "grad_norm": 0.675065279006958, "learning_rate": 0.0002, "epoch": 2.3887079261672097, "step": 3300}, {"loss": 1.3548, "grad_norm": 0.5686452984809875, "learning_rate": 0.0002, "epoch": 2.395946435034383, "step": 3310}, {"loss": 1.1998, "grad_norm": 0.8129481673240662, "learning_rate": 0.0002, "epoch": 2.4031849439015565, "step": 3320}, {"loss": 1.2584, "grad_norm": 0.6615934371948242, "learning_rate": 0.0002, "epoch": 2.41042345276873, "step": 3330}, {"loss": 1.3691, "grad_norm": 0.6678834557533264, "learning_rate": 0.0002, "epoch": 2.417661961635903, "step": 3340}, {"loss": 1.2381, "grad_norm": 0.5581308007240295, "learning_rate": 0.0002, "epoch": 2.4249004705030766, "step": 3350}, {"loss": 1.3853, "grad_norm": 0.6098920106887817, "learning_rate": 0.0002, "epoch": 2.43213897937025, "step": 3360}, {"loss": 1.3692, "grad_norm": 0.8101736903190613, "learning_rate": 0.0002, "epoch": 2.4393774882374233, "step": 3370}, {"loss": 1.4418, "grad_norm": 0.6621488928794861, "learning_rate": 0.0002, "epoch": 2.4466159971045967, "step": 3380}, {"loss": 1.4579, "grad_norm": 0.8693289160728455, "learning_rate": 0.0002, "epoch": 2.45385450597177, "step": 3390}, {"loss": 1.3644, "grad_norm": 0.6724580526351929, "learning_rate": 0.0002, "epoch": 2.4610930148389434, "step": 3400}, {"loss": 1.2006, "grad_norm": 0.6776891946792603, "learning_rate": 0.0002, "epoch": 2.4683315237061167, "step": 3410}, {"loss": 1.2937, "grad_norm": 0.7214453816413879, "learning_rate": 0.0002, "epoch": 2.47557003257329, "step": 3420}, {"loss": 1.4051, "grad_norm": 0.8390451073646545, "learning_rate": 0.0002, "epoch": 2.4828085414404635, "step": 3430}, {"loss": 1.25, "grad_norm": 0.7130982279777527, "learning_rate": 0.0002, "epoch": 2.490047050307637, "step": 3440}, {"loss": 1.2231, "grad_norm": 0.8873937129974365, "learning_rate": 0.0002, "epoch": 2.49728555917481, "step": 3450}, {"loss": 1.1429, "grad_norm": 0.725185751914978, "learning_rate": 0.0002, "epoch": 2.5045240680419836, "step": 3460}, {"loss": 1.2699, "grad_norm": 0.6120352149009705, "learning_rate": 0.0002, "epoch": 2.511762576909157, "step": 3470}, {"loss": 1.2552, "grad_norm": 0.7713613510131836, "learning_rate": 0.0002, "epoch": 2.5190010857763303, "step": 3480}, {"loss": 1.4648, "grad_norm": 0.895309567451477, "learning_rate": 0.0002, "epoch": 2.5262395946435037, "step": 3490}, {"loss": 1.3043, "grad_norm": 0.9631021022796631, "learning_rate": 0.0002, "epoch": 2.533478103510677, "step": 3500}, {"loss": 1.3492, "grad_norm": 0.7475683093070984, "learning_rate": 0.0002, "epoch": 2.5407166123778504, "step": 3510}, {"loss": 1.3637, "grad_norm": 0.7271341681480408, "learning_rate": 0.0002, "epoch": 2.5479551212450238, "step": 3520}, {"loss": 1.304, "grad_norm": 0.6979510188102722, "learning_rate": 0.0002, "epoch": 2.555193630112197, "step": 3530}, {"loss": 1.2353, "grad_norm": 0.6504196524620056, "learning_rate": 0.0002, "epoch": 2.5624321389793705, "step": 3540}, {"loss": 1.2699, "grad_norm": 0.7226675748825073, "learning_rate": 0.0002, "epoch": 2.569670647846544, "step": 3550}, {"loss": 1.3002, "grad_norm": 0.6143222451210022, "learning_rate": 0.0002, "epoch": 2.5769091567137172, "step": 3560}, {"loss": 1.1585, "grad_norm": 0.7245154976844788, "learning_rate": 0.0002, "epoch": 2.5841476655808906, "step": 3570}, {"loss": 1.3651, "grad_norm": 0.943540632724762, "learning_rate": 0.0002, "epoch": 2.591386174448064, "step": 3580}, {"loss": 1.3034, "grad_norm": 0.7707241773605347, "learning_rate": 0.0002, "epoch": 2.5986246833152373, "step": 3590}, {"loss": 1.3063, "grad_norm": 0.6705001592636108, "learning_rate": 0.0002, "epoch": 2.6058631921824107, "step": 3600}, {"loss": 1.2437, "grad_norm": 0.6360933780670166, "learning_rate": 0.0002, "epoch": 2.613101701049584, "step": 3610}, {"loss": 1.1844, "grad_norm": 0.5846424698829651, "learning_rate": 0.0002, "epoch": 2.6203402099167574, "step": 3620}, {"loss": 1.3674, "grad_norm": 0.5958625674247742, "learning_rate": 0.0002, "epoch": 2.6275787187839303, "step": 3630}, {"loss": 1.3599, "grad_norm": 0.6819243431091309, "learning_rate": 0.0002, "epoch": 2.6348172276511037, "step": 3640}, {"loss": 1.3884, "grad_norm": 0.7033445835113525, "learning_rate": 0.0002, "epoch": 2.642055736518277, "step": 3650}, {"loss": 1.3392, "grad_norm": 0.6134849786758423, "learning_rate": 0.0002, "epoch": 2.6492942453854504, "step": 3660}, {"loss": 1.2661, "grad_norm": 0.658009946346283, "learning_rate": 0.0002, "epoch": 2.656532754252624, "step": 3670}, {"loss": 1.3987, "grad_norm": 0.6280999779701233, "learning_rate": 0.0002, "epoch": 2.663771263119797, "step": 3680}, {"loss": 1.2995, "grad_norm": 0.5536085963249207, "learning_rate": 0.0002, "epoch": 2.6710097719869705, "step": 3690}, {"loss": 1.2044, "grad_norm": 0.8603981733322144, "learning_rate": 0.0002, "epoch": 2.678248280854144, "step": 3700}, {"loss": 1.3879, "grad_norm": 0.5509994626045227, "learning_rate": 0.0002, "epoch": 2.6854867897213173, "step": 3710}, {"loss": 1.3253, "grad_norm": 0.9093621969223022, "learning_rate": 0.0002, "epoch": 2.6927252985884906, "step": 3720}, {"loss": 1.2668, "grad_norm": 0.7525952458381653, "learning_rate": 0.0002, "epoch": 2.699963807455664, "step": 3730}, {"loss": 1.248, "grad_norm": 0.6737023591995239, "learning_rate": 0.0002, "epoch": 2.7072023163228374, "step": 3740}, {"loss": 1.2981, "grad_norm": 0.8656924962997437, "learning_rate": 0.0002, "epoch": 2.7144408251900107, "step": 3750}, {"loss": 1.2342, "grad_norm": 0.7494133114814758, "learning_rate": 0.0002, "epoch": 2.721679334057184, "step": 3760}, {"loss": 1.2417, "grad_norm": 0.5725520849227905, "learning_rate": 0.0002, "epoch": 2.7289178429243575, "step": 3770}, {"loss": 1.28, "grad_norm": 0.836412787437439, "learning_rate": 0.0002, "epoch": 2.736156351791531, "step": 3780}, {"loss": 1.3784, "grad_norm": 0.6893242597579956, "learning_rate": 0.0002, "epoch": 2.743394860658704, "step": 3790}, {"loss": 1.2929, "grad_norm": 0.6696223020553589, "learning_rate": 0.0002, "epoch": 2.7506333695258776, "step": 3800}, {"loss": 1.2449, "grad_norm": 0.6483015418052673, "learning_rate": 0.0002, "epoch": 2.757871878393051, "step": 3810}, {"loss": 1.3282, "grad_norm": 0.8084456920623779, "learning_rate": 0.0002, "epoch": 2.7651103872602243, "step": 3820}, {"loss": 1.3694, "grad_norm": 0.6601949334144592, "learning_rate": 0.0002, "epoch": 2.7723488961273977, "step": 3830}, {"loss": 1.3568, "grad_norm": 0.6905533671379089, "learning_rate": 0.0002, "epoch": 2.779587404994571, "step": 3840}, {"loss": 1.3854, "grad_norm": 0.619318425655365, "learning_rate": 0.0002, "epoch": 2.7868259138617444, "step": 3850}, {"loss": 1.2551, "grad_norm": 0.5994023084640503, "learning_rate": 0.0002, "epoch": 2.7940644227289178, "step": 3860}, {"loss": 1.2022, "grad_norm": 0.5627168416976929, "learning_rate": 0.0002, "epoch": 2.801302931596091, "step": 3870}, {"loss": 1.3921, "grad_norm": 0.6001605987548828, "learning_rate": 0.0002, "epoch": 2.8085414404632645, "step": 3880}, {"loss": 1.3026, "grad_norm": 0.6022412776947021, "learning_rate": 0.0002, "epoch": 2.815779949330438, "step": 3890}, {"loss": 1.2765, "grad_norm": 0.6832426190376282, "learning_rate": 0.0002, "epoch": 2.823018458197611, "step": 3900}, {"loss": 1.1363, "grad_norm": 0.5936811566352844, "learning_rate": 0.0002, "epoch": 2.8302569670647846, "step": 3910}, {"loss": 1.1707, "grad_norm": 0.6960572600364685, "learning_rate": 0.0002, "epoch": 2.837495475931958, "step": 3920}, {"loss": 1.4063, "grad_norm": 0.5913406610488892, "learning_rate": 0.0002, "epoch": 2.8447339847991313, "step": 3930}, {"loss": 1.3245, "grad_norm": 0.678154706954956, "learning_rate": 0.0002, "epoch": 2.8519724936663047, "step": 3940}, {"loss": 1.366, "grad_norm": 0.7898936867713928, "learning_rate": 0.0002, "epoch": 2.859211002533478, "step": 3950}, {"loss": 1.3948, "grad_norm": 0.9234195351600647, "learning_rate": 0.0002, "epoch": 2.8664495114006514, "step": 3960}, {"loss": 1.2773, "grad_norm": 0.5960825085639954, "learning_rate": 0.0002, "epoch": 2.8736880202678248, "step": 3970}, {"loss": 1.3127, "grad_norm": 0.677118182182312, "learning_rate": 0.0002, "epoch": 2.880926529134998, "step": 3980}, {"loss": 1.2652, "grad_norm": 0.6505142450332642, "learning_rate": 0.0002, "epoch": 2.8881650380021715, "step": 3990}, {"loss": 1.2078, "grad_norm": 0.550826907157898, "learning_rate": 0.0002, "epoch": 2.895403546869345, "step": 4000}, {"loss": 1.1811, "grad_norm": 0.6209215521812439, "learning_rate": 0.0002, "epoch": 2.9026420557365182, "step": 4010}, {"loss": 1.4001, "grad_norm": 0.6549018025398254, "learning_rate": 0.0002, "epoch": 2.9098805646036916, "step": 4020}, {"loss": 1.2285, "grad_norm": 0.570682168006897, "learning_rate": 0.0002, "epoch": 2.917119073470865, "step": 4030}, {"loss": 1.0832, "grad_norm": 1.1807632446289062, "learning_rate": 0.0002, "epoch": 2.9243575823380383, "step": 4040}, {"loss": 1.2693, "grad_norm": 0.7058857679367065, "learning_rate": 0.0002, "epoch": 2.9315960912052117, "step": 4050}, {"loss": 1.2905, "grad_norm": 0.5542812943458557, "learning_rate": 0.0002, "epoch": 2.938834600072385, "step": 4060}, {"loss": 1.33, "grad_norm": 0.63167804479599, "learning_rate": 0.0002, "epoch": 2.9460731089395584, "step": 4070}, {"loss": 1.3075, "grad_norm": 0.5702962279319763, "learning_rate": 0.0002, "epoch": 2.953311617806732, "step": 4080}, {"loss": 1.2007, "grad_norm": 0.620944082736969, "learning_rate": 0.0002, "epoch": 2.960550126673905, "step": 4090}, {"loss": 1.2864, "grad_norm": 0.5866289734840393, "learning_rate": 0.0002, "epoch": 2.9677886355410785, "step": 4100}, {"loss": 1.3293, "grad_norm": 0.560170590877533, "learning_rate": 0.0002, "epoch": 2.975027144408252, "step": 4110}, {"loss": 1.2071, "grad_norm": 0.675082802772522, "learning_rate": 0.0002, "epoch": 2.9822656532754253, "step": 4120}, {"loss": 1.2981, "grad_norm": 0.62708580493927, "learning_rate": 0.0002, "epoch": 2.9895041621425986, "step": 4130}, {"loss": 1.2758, "grad_norm": 0.7893929481506348, "learning_rate": 0.0002, "epoch": 2.996742671009772, "step": 4140}, {"eval_loss": 1.4217946529388428, "eval_runtime": 27.1596, "eval_samples_per_second": 16.053, "eval_steps_per_second": 2.025, "epoch": 2.9996380745566413, "step": 4144}, {"loss": 1.2152, "grad_norm": 0.7043836116790771, "learning_rate": 0.0002, "epoch": 3.0039811798769454, "step": 4150}, {"loss": 1.1664, "grad_norm": 0.6806283593177795, "learning_rate": 0.0002, "epoch": 3.0112196887441187, "step": 4160}, {"loss": 1.292, "grad_norm": 0.7684550285339355, "learning_rate": 0.0002, "epoch": 3.018458197611292, "step": 4170}, {"loss": 1.3467, "grad_norm": 0.7895237803459167, "learning_rate": 0.0002, "epoch": 3.0256967064784654, "step": 4180}, {"loss": 1.1324, "grad_norm": 0.7464531064033508, "learning_rate": 0.0002, "epoch": 3.032935215345639, "step": 4190}, {"loss": 1.1614, "grad_norm": 0.9358500838279724, "learning_rate": 0.0002, "epoch": 3.040173724212812, "step": 4200}, {"loss": 1.1834, "grad_norm": 1.1066628694534302, "learning_rate": 0.0002, "epoch": 3.0474122330799855, "step": 4210}, {"loss": 1.1557, "grad_norm": 0.6663267612457275, "learning_rate": 0.0002, "epoch": 3.054650741947159, "step": 4220}, {"loss": 1.1707, "grad_norm": 0.6669464707374573, "learning_rate": 0.0002, "epoch": 3.0618892508143323, "step": 4230}, {"loss": 1.1841, "grad_norm": 0.7052164077758789, "learning_rate": 0.0002, "epoch": 3.0691277596815056, "step": 4240}, {"loss": 1.2913, "grad_norm": 0.6118432879447937, "learning_rate": 0.0002, "epoch": 3.076366268548679, "step": 4250}, {"loss": 1.1526, "grad_norm": 0.6915903687477112, "learning_rate": 0.0002, "epoch": 3.0836047774158524, "step": 4260}, {"loss": 1.1348, "grad_norm": 0.7441644668579102, "learning_rate": 0.0002, "epoch": 3.0908432862830257, "step": 4270}, {"loss": 1.1672, "grad_norm": 0.823850691318512, "learning_rate": 0.0002, "epoch": 3.098081795150199, "step": 4280}, {"loss": 1.2655, "grad_norm": 0.9677883386611938, "learning_rate": 0.0002, "epoch": 3.1053203040173725, "step": 4290}, {"loss": 1.1794, "grad_norm": 0.7002579569816589, "learning_rate": 0.0002, "epoch": 3.112558812884546, "step": 4300}, {"loss": 1.135, "grad_norm": 0.778789758682251, "learning_rate": 0.0002, "epoch": 3.119797321751719, "step": 4310}, {"loss": 1.0818, "grad_norm": 0.7236007452011108, "learning_rate": 0.0002, "epoch": 3.1270358306188926, "step": 4320}, {"loss": 1.1803, "grad_norm": 0.8809133768081665, "learning_rate": 0.0002, "epoch": 3.134274339486066, "step": 4330}, {"loss": 1.2571, "grad_norm": 0.7924913167953491, "learning_rate": 0.0002, "epoch": 3.1415128483532393, "step": 4340}, {"loss": 1.1413, "grad_norm": 0.7437422275543213, "learning_rate": 0.0002, "epoch": 3.1487513572204127, "step": 4350}, {"loss": 1.2088, "grad_norm": 0.6428450345993042, "learning_rate": 0.0002, "epoch": 3.155989866087586, "step": 4360}, {"loss": 1.3032, "grad_norm": 0.7922873497009277, "learning_rate": 0.0002, "epoch": 3.1632283749547594, "step": 4370}, {"loss": 1.216, "grad_norm": 0.5252506732940674, "learning_rate": 0.0002, "epoch": 3.1704668838219328, "step": 4380}, {"loss": 1.1297, "grad_norm": 0.8570457696914673, "learning_rate": 0.0002, "epoch": 3.177705392689106, "step": 4390}, {"loss": 1.0994, "grad_norm": 0.7218987345695496, "learning_rate": 0.0002, "epoch": 3.1849439015562795, "step": 4400}, {"loss": 1.2891, "grad_norm": 0.6921393275260925, "learning_rate": 0.0002, "epoch": 3.192182410423453, "step": 4410}, {"loss": 1.2668, "grad_norm": 0.7386137843132019, "learning_rate": 0.0002, "epoch": 3.199420919290626, "step": 4420}, {"loss": 1.1654, "grad_norm": 0.6227759122848511, "learning_rate": 0.0002, "epoch": 3.2066594281577996, "step": 4430}, {"loss": 1.1752, "grad_norm": 0.7180278897285461, "learning_rate": 0.0002, "epoch": 3.213897937024973, "step": 4440}, {"loss": 1.1757, "grad_norm": 0.745830774307251, "learning_rate": 0.0002, "epoch": 3.2211364458921463, "step": 4450}, {"loss": 1.234, "grad_norm": 0.6766072511672974, "learning_rate": 0.0002, "epoch": 3.2283749547593197, "step": 4460}, {"loss": 1.1999, "grad_norm": 0.8325067162513733, "learning_rate": 0.0002, "epoch": 3.235613463626493, "step": 4470}, {"loss": 1.1606, "grad_norm": 0.7148305177688599, "learning_rate": 0.0002, "epoch": 3.2428519724936664, "step": 4480}, {"loss": 1.1383, "grad_norm": 0.7752676010131836, "learning_rate": 0.0002, "epoch": 3.25009048136084, "step": 4490}, {"loss": 1.3006, "grad_norm": 0.6776860952377319, "learning_rate": 0.0002, "epoch": 3.257328990228013, "step": 4500}, {"loss": 1.0796, "grad_norm": 0.704359769821167, "learning_rate": 0.0002, "epoch": 3.2645674990951865, "step": 4510}, {"loss": 1.2496, "grad_norm": 0.6880282163619995, "learning_rate": 0.0002, "epoch": 3.27180600796236, "step": 4520}, {"loss": 1.0947, "grad_norm": 0.8179270029067993, "learning_rate": 0.0002, "epoch": 3.2790445168295332, "step": 4530}, {"loss": 1.1909, "grad_norm": 0.6718448996543884, "learning_rate": 0.0002, "epoch": 3.2862830256967066, "step": 4540}, {"loss": 1.2708, "grad_norm": 0.8300657868385315, "learning_rate": 0.0002, "epoch": 3.29352153456388, "step": 4550}, {"loss": 1.2594, "grad_norm": 0.6433690786361694, "learning_rate": 0.0002, "epoch": 3.3007600434310533, "step": 4560}, {"loss": 1.2479, "grad_norm": 0.690262496471405, "learning_rate": 0.0002, "epoch": 3.3079985522982267, "step": 4570}, {"loss": 1.1342, "grad_norm": 0.7022852301597595, "learning_rate": 0.0002, "epoch": 3.3152370611654, "step": 4580}, {"loss": 1.0844, "grad_norm": 0.6438387632369995, "learning_rate": 0.0002, "epoch": 3.3224755700325734, "step": 4590}, {"loss": 1.17, "grad_norm": 0.6866899132728577, "learning_rate": 0.0002, "epoch": 3.329714078899747, "step": 4600}, {"loss": 1.1289, "grad_norm": 0.8233968019485474, "learning_rate": 0.0002, "epoch": 3.33695258776692, "step": 4610}, {"loss": 1.1855, "grad_norm": 0.7251574993133545, "learning_rate": 0.0002, "epoch": 3.3441910966340935, "step": 4620}, {"loss": 1.3403, "grad_norm": 0.7855110168457031, "learning_rate": 0.0002, "epoch": 3.351429605501267, "step": 4630}, {"loss": 1.2922, "grad_norm": 0.8487356305122375, "learning_rate": 0.0002, "epoch": 3.3586681143684403, "step": 4640}, {"loss": 1.2462, "grad_norm": 0.6429011225700378, "learning_rate": 0.0002, "epoch": 3.3659066232356136, "step": 4650}, {"loss": 1.129, "grad_norm": 0.7095270156860352, "learning_rate": 0.0002, "epoch": 3.373145132102787, "step": 4660}, {"loss": 1.262, "grad_norm": 0.6792303323745728, "learning_rate": 0.0002, "epoch": 3.3803836409699604, "step": 4670}, {"loss": 1.256, "grad_norm": 0.6784825921058655, "learning_rate": 0.0002, "epoch": 3.3876221498371337, "step": 4680}, {"loss": 1.0838, "grad_norm": 0.6362888216972351, "learning_rate": 0.0002, "epoch": 3.394860658704307, "step": 4690}, {"loss": 1.2165, "grad_norm": 0.7794778943061829, "learning_rate": 0.0002, "epoch": 3.4020991675714805, "step": 4700}, {"loss": 1.0644, "grad_norm": 0.7287485003471375, "learning_rate": 0.0002, "epoch": 3.409337676438654, "step": 4710}, {"loss": 1.2925, "grad_norm": 0.6481451392173767, "learning_rate": 0.0002, "epoch": 3.416576185305827, "step": 4720}, {"loss": 1.2121, "grad_norm": 0.9200371503829956, "learning_rate": 0.0002, "epoch": 3.4238146941730006, "step": 4730}, {"loss": 1.072, "grad_norm": 1.074180245399475, "learning_rate": 0.0002, "epoch": 3.431053203040174, "step": 4740}, {"loss": 1.0421, "grad_norm": 0.6722986698150635, "learning_rate": 0.0002, "epoch": 3.438291711907347, "step": 4750}, {"loss": 1.2258, "grad_norm": 0.7945933938026428, "learning_rate": 0.0002, "epoch": 3.44553022077452, "step": 4760}, {"loss": 1.0927, "grad_norm": 0.7624640464782715, "learning_rate": 0.0002, "epoch": 3.4527687296416936, "step": 4770}, {"loss": 1.2428, "grad_norm": 0.7763656377792358, "learning_rate": 0.0002, "epoch": 3.460007238508867, "step": 4780}, {"loss": 1.2584, "grad_norm": 0.7736947536468506, "learning_rate": 0.0002, "epoch": 3.4672457473760403, "step": 4790}, {"loss": 1.1953, "grad_norm": 0.8450354933738708, "learning_rate": 0.0002, "epoch": 3.4744842562432137, "step": 4800}, {"loss": 1.1362, "grad_norm": 0.6480133533477783, "learning_rate": 0.0002, "epoch": 3.481722765110387, "step": 4810}, {"loss": 1.1882, "grad_norm": 0.8437445759773254, "learning_rate": 0.0002, "epoch": 3.4889612739775604, "step": 4820}, {"loss": 1.1519, "grad_norm": 0.7781730890274048, "learning_rate": 0.0002, "epoch": 3.4961997828447338, "step": 4830}, {"loss": 1.1836, "grad_norm": 0.8523228168487549, "learning_rate": 0.0002, "epoch": 3.503438291711907, "step": 4840}, {"loss": 1.1672, "grad_norm": 0.6236732006072998, "learning_rate": 0.0002, "epoch": 3.5106768005790805, "step": 4850}, {"loss": 1.1926, "grad_norm": 0.7500787377357483, "learning_rate": 0.0002, "epoch": 3.517915309446254, "step": 4860}, {"loss": 1.1998, "grad_norm": 0.7665374875068665, "learning_rate": 0.0002, "epoch": 3.5251538183134272, "step": 4870}, {"loss": 1.1551, "grad_norm": 0.787857711315155, "learning_rate": 0.0002, "epoch": 3.5323923271806006, "step": 4880}, {"loss": 1.2758, "grad_norm": 0.970595121383667, "learning_rate": 0.0002, "epoch": 3.539630836047774, "step": 4890}, {"loss": 1.1274, "grad_norm": 0.6409347057342529, "learning_rate": 0.0002, "epoch": 3.5468693449149473, "step": 4900}, {"loss": 1.1596, "grad_norm": 0.888551652431488, "learning_rate": 0.0002, "epoch": 3.5541078537821207, "step": 4910}, {"loss": 1.1644, "grad_norm": 1.0808377265930176, "learning_rate": 0.0002, "epoch": 3.561346362649294, "step": 4920}, {"loss": 1.2564, "grad_norm": 0.7501053214073181, "learning_rate": 0.0002, "epoch": 3.5685848715164674, "step": 4930}, {"loss": 1.2351, "grad_norm": 0.7375240325927734, "learning_rate": 0.0002, "epoch": 3.575823380383641, "step": 4940}, {"loss": 1.3568, "grad_norm": 0.7075039744377136, "learning_rate": 0.0002, "epoch": 3.583061889250814, "step": 4950}, {"loss": 1.3355, "grad_norm": 0.939337432384491, "learning_rate": 0.0002, "epoch": 3.5903003981179875, "step": 4960}, {"loss": 1.1722, "grad_norm": 0.6717396974563599, "learning_rate": 0.0002, "epoch": 3.597538906985161, "step": 4970}, {"loss": 1.1186, "grad_norm": 0.7141643762588501, "learning_rate": 0.0002, "epoch": 3.6047774158523342, "step": 4980}, {"loss": 1.1011, "grad_norm": 0.7109216451644897, "learning_rate": 0.0002, "epoch": 3.6120159247195076, "step": 4990}, {"loss": 1.2178, "grad_norm": 0.7020776867866516, "learning_rate": 0.0002, "epoch": 3.619254433586681, "step": 5000}, {"loss": 1.1939, "grad_norm": 0.7158873677253723, "learning_rate": 0.0002, "epoch": 3.6264929424538543, "step": 5010}, {"loss": 1.2624, "grad_norm": 0.7062035202980042, "learning_rate": 0.0002, "epoch": 3.6337314513210277, "step": 5020}, {"loss": 1.0224, "grad_norm": 0.7081155776977539, "learning_rate": 0.0002, "epoch": 3.640969960188201, "step": 5030}, {"loss": 1.2195, "grad_norm": 1.2210607528686523, "learning_rate": 0.0002, "epoch": 3.6482084690553744, "step": 5040}, {"loss": 1.2596, "grad_norm": 0.6650236248970032, "learning_rate": 0.0002, "epoch": 3.655446977922548, "step": 5050}, {"loss": 1.1072, "grad_norm": 0.6884829998016357, "learning_rate": 0.0002, "epoch": 3.662685486789721, "step": 5060}, {"loss": 1.2292, "grad_norm": 0.7317819595336914, "learning_rate": 0.0002, "epoch": 3.6699239956568945, "step": 5070}, {"loss": 1.1917, "grad_norm": 0.7406691908836365, "learning_rate": 0.0002, "epoch": 3.677162504524068, "step": 5080}, {"loss": 1.2949, "grad_norm": 0.9009454250335693, "learning_rate": 0.0002, "epoch": 3.6844010133912413, "step": 5090}, {"loss": 1.1528, "grad_norm": 0.8189385533332825, "learning_rate": 0.0002, "epoch": 3.6916395222584146, "step": 5100}, {"loss": 1.3408, "grad_norm": 1.0793628692626953, "learning_rate": 0.0002, "epoch": 3.698878031125588, "step": 5110}, {"loss": 1.2417, "grad_norm": 0.8593027591705322, "learning_rate": 0.0002, "epoch": 3.7061165399927614, "step": 5120}, {"loss": 1.2141, "grad_norm": 0.8481812477111816, "learning_rate": 0.0002, "epoch": 3.7133550488599347, "step": 5130}, {"loss": 1.125, "grad_norm": 0.6527451276779175, "learning_rate": 0.0002, "epoch": 3.720593557727108, "step": 5140}, {"loss": 1.1584, "grad_norm": 0.9220114350318909, "learning_rate": 0.0002, "epoch": 3.7278320665942815, "step": 5150}, {"loss": 1.2267, "grad_norm": 1.0842019319534302, "learning_rate": 0.0002, "epoch": 3.735070575461455, "step": 5160}, {"loss": 1.3083, "grad_norm": 0.965453565120697, "learning_rate": 0.0002, "epoch": 3.742309084328628, "step": 5170}, {"loss": 1.1772, "grad_norm": 0.9903319478034973, "learning_rate": 0.0002, "epoch": 3.7495475931958016, "step": 5180}, {"loss": 1.2515, "grad_norm": 0.7434818148612976, "learning_rate": 0.0002, "epoch": 3.756786102062975, "step": 5190}, {"loss": 1.2631, "grad_norm": 0.6717280745506287, "learning_rate": 0.0002, "epoch": 3.7640246109301483, "step": 5200}, {"loss": 1.2012, "grad_norm": 0.7754665613174438, "learning_rate": 0.0002, "epoch": 3.7712631197973217, "step": 5210}, {"loss": 1.305, "grad_norm": 1.028374433517456, "learning_rate": 0.0002, "epoch": 3.778501628664495, "step": 5220}, {"loss": 1.1866, "grad_norm": 0.6026996374130249, "learning_rate": 0.0002, "epoch": 3.7857401375316684, "step": 5230}, {"loss": 1.1901, "grad_norm": 0.6978490948677063, "learning_rate": 0.0002, "epoch": 3.7929786463988417, "step": 5240}, {"loss": 1.2576, "grad_norm": 0.7303446531295776, "learning_rate": 0.0002, "epoch": 3.800217155266015, "step": 5250}, {"loss": 1.3173, "grad_norm": 1.0734210014343262, "learning_rate": 0.0002, "epoch": 3.8074556641331885, "step": 5260}, {"loss": 1.1137, "grad_norm": 0.6383201479911804, "learning_rate": 0.0002, "epoch": 3.814694173000362, "step": 5270}, {"loss": 1.0904, "grad_norm": 0.7742630243301392, "learning_rate": 0.0002, "epoch": 3.821932681867535, "step": 5280}, {"loss": 1.2232, "grad_norm": 0.8477074503898621, "learning_rate": 0.0002, "epoch": 3.8291711907347086, "step": 5290}, {"loss": 1.2047, "grad_norm": 0.6675317883491516, "learning_rate": 0.0002, "epoch": 3.836409699601882, "step": 5300}, {"loss": 1.2275, "grad_norm": 0.7515445351600647, "learning_rate": 0.0002, "epoch": 3.8436482084690553, "step": 5310}, {"loss": 1.2569, "grad_norm": 1.1441220045089722, "learning_rate": 0.0002, "epoch": 3.8508867173362287, "step": 5320}, {"loss": 1.1512, "grad_norm": 0.7968795895576477, "learning_rate": 0.0002, "epoch": 3.858125226203402, "step": 5330}, {"loss": 1.232, "grad_norm": 0.7842824459075928, "learning_rate": 0.0002, "epoch": 3.8653637350705754, "step": 5340}, {"loss": 1.1847, "grad_norm": 0.8272225260734558, "learning_rate": 0.0002, "epoch": 3.8726022439377488, "step": 5350}, {"loss": 1.1381, "grad_norm": 0.8413397669792175, "learning_rate": 0.0002, "epoch": 3.879840752804922, "step": 5360}, {"loss": 1.2349, "grad_norm": 1.141764760017395, "learning_rate": 0.0002, "epoch": 3.8870792616720955, "step": 5370}, {"loss": 1.212, "grad_norm": 0.9826975464820862, "learning_rate": 0.0002, "epoch": 3.894317770539269, "step": 5380}, {"loss": 1.1833, "grad_norm": 0.8598255515098572, "learning_rate": 0.0002, "epoch": 3.9015562794064422, "step": 5390}, {"loss": 1.1247, "grad_norm": 0.6271058320999146, "learning_rate": 0.0002, "epoch": 3.9087947882736156, "step": 5400}, {"loss": 1.2212, "grad_norm": 0.6379870772361755, "learning_rate": 0.0002, "epoch": 3.916033297140789, "step": 5410}, {"loss": 1.2481, "grad_norm": 1.0313376188278198, "learning_rate": 0.0002, "epoch": 3.9232718060079623, "step": 5420}, {"loss": 1.1872, "grad_norm": 0.8220619559288025, "learning_rate": 0.0002, "epoch": 3.9305103148751357, "step": 5430}, {"loss": 1.2006, "grad_norm": 0.7576116919517517, "learning_rate": 0.0002, "epoch": 3.937748823742309, "step": 5440}, {"loss": 1.1969, "grad_norm": 1.226235032081604, "learning_rate": 0.0002, "epoch": 3.9449873326094824, "step": 5450}, {"loss": 1.2945, "grad_norm": 0.7979229688644409, "learning_rate": 0.0002, "epoch": 3.952225841476656, "step": 5460}, {"loss": 1.1922, "grad_norm": 0.9911929965019226, "learning_rate": 0.0002, "epoch": 3.959464350343829, "step": 5470}, {"loss": 1.0924, "grad_norm": 0.643738865852356, "learning_rate": 0.0002, "epoch": 3.9667028592110025, "step": 5480}, {"loss": 1.0607, "grad_norm": 0.682305634021759, "learning_rate": 0.0002, "epoch": 3.973941368078176, "step": 5490}, {"loss": 1.2908, "grad_norm": 1.18373441696167, "learning_rate": 0.0002, "epoch": 3.9811798769453492, "step": 5500}, {"loss": 1.0889, "grad_norm": 0.7190203070640564, "learning_rate": 0.0002, "epoch": 3.9884183858125226, "step": 5510}, {"loss": 1.2745, "grad_norm": 0.7516948580741882, "learning_rate": 0.0002, "epoch": 3.995656894679696, "step": 5520}]} +{"epoch": 4.999638074556641, "step": 6907, "epoch_duration": 1329.0368676185608, "total_accumulated_duration": 6668.0360951423645, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3048.73388671875}, "peak_memory_usage": {"GPU_0": 15079.2998046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7061, "grad_norm": 1.2523442506790161, "learning_rate": 0.0002, "epoch": 0.007238508867173362, "step": 10}, {"loss": 3.3493, "grad_norm": 1.8887330293655396, "learning_rate": 0.0002, "epoch": 0.014477017734346724, "step": 20}, {"loss": 2.7585, "grad_norm": 0.9668035507202148, "learning_rate": 0.0002, "epoch": 0.021715526601520086, "step": 30}, {"loss": 2.3699, "grad_norm": 2.9167306423187256, "learning_rate": 0.0002, "epoch": 0.028954035468693448, "step": 40}, {"loss": 2.2679, "grad_norm": 2.649867296218872, "learning_rate": 0.0002, "epoch": 0.036192544335866814, "step": 50}, {"loss": 2.2202, "grad_norm": 1.5120655298233032, "learning_rate": 0.0002, "epoch": 0.04343105320304017, "step": 60}, {"loss": 2.2026, "grad_norm": 0.7879868149757385, "learning_rate": 0.0002, "epoch": 0.05066956207021354, "step": 70}, {"loss": 1.9447, "grad_norm": 0.7616953253746033, "learning_rate": 0.0002, "epoch": 0.057908070937386896, "step": 80}, {"loss": 2.0112, "grad_norm": 1.8809149265289307, "learning_rate": 0.0002, "epoch": 0.06514657980456026, "step": 90}, {"loss": 1.8337, "grad_norm": 0.9294016361236572, "learning_rate": 0.0002, "epoch": 0.07238508867173363, "step": 100}, {"loss": 1.8419, "grad_norm": 0.7145281434059143, "learning_rate": 0.0002, "epoch": 0.07962359753890698, "step": 110}, {"loss": 2.0036, "grad_norm": 0.7564446330070496, "learning_rate": 0.0002, "epoch": 0.08686210640608034, "step": 120}, {"loss": 1.9306, "grad_norm": 1.1681925058364868, "learning_rate": 0.0002, "epoch": 0.09410061527325371, "step": 130}, {"loss": 1.7875, "grad_norm": 0.6708641648292542, "learning_rate": 0.0002, "epoch": 0.10133912414042708, "step": 140}, {"loss": 1.786, "grad_norm": 0.7625647783279419, "learning_rate": 0.0002, "epoch": 0.10857763300760044, "step": 150}, {"loss": 1.6687, "grad_norm": 0.8463464975357056, "learning_rate": 0.0002, "epoch": 0.11581614187477379, "step": 160}, {"loss": 1.6214, "grad_norm": 0.7502335906028748, "learning_rate": 0.0002, "epoch": 0.12305465074194716, "step": 170}, {"loss": 1.7433, "grad_norm": 0.6929958462715149, "learning_rate": 0.0002, "epoch": 0.13029315960912052, "step": 180}, {"loss": 1.6009, "grad_norm": 0.6798707842826843, "learning_rate": 0.0002, "epoch": 0.1375316684762939, "step": 190}, {"loss": 1.6208, "grad_norm": 0.7566508650779724, "learning_rate": 0.0002, "epoch": 0.14477017734346725, "step": 200}, {"loss": 1.5823, "grad_norm": 0.7196869850158691, "learning_rate": 0.0002, "epoch": 0.15200868621064062, "step": 210}, {"loss": 1.738, "grad_norm": 0.8401045799255371, "learning_rate": 0.0002, "epoch": 0.15924719507781396, "step": 220}, {"loss": 1.7574, "grad_norm": 0.8503773212432861, "learning_rate": 0.0002, "epoch": 0.16648570394498732, "step": 230}, {"loss": 1.7861, "grad_norm": 0.7183733582496643, "learning_rate": 0.0002, "epoch": 0.1737242128121607, "step": 240}, {"loss": 1.6693, "grad_norm": 0.7082605957984924, "learning_rate": 0.0002, "epoch": 0.18096272167933405, "step": 250}, {"loss": 1.619, "grad_norm": 0.9386326670646667, "learning_rate": 0.0002, "epoch": 0.18820123054650742, "step": 260}, {"loss": 1.6511, "grad_norm": 0.7332451939582825, "learning_rate": 0.0002, "epoch": 0.19543973941368079, "step": 270}, {"loss": 1.6353, "grad_norm": 0.7092869877815247, "learning_rate": 0.0002, "epoch": 0.20267824828085415, "step": 280}, {"loss": 1.5996, "grad_norm": 0.7256413698196411, "learning_rate": 0.0002, "epoch": 0.20991675714802752, "step": 290}, {"loss": 1.6754, "grad_norm": 0.6398681402206421, "learning_rate": 0.0002, "epoch": 0.21715526601520088, "step": 300}, {"loss": 1.397, "grad_norm": 0.6273287534713745, "learning_rate": 0.0002, "epoch": 0.22439377488237422, "step": 310}, {"loss": 1.5115, "grad_norm": 0.511648416519165, "learning_rate": 0.0002, "epoch": 0.23163228374954759, "step": 320}, {"loss": 1.5424, "grad_norm": 0.8677352070808411, "learning_rate": 0.0002, "epoch": 0.23887079261672095, "step": 330}, {"loss": 1.6779, "grad_norm": 0.6270743012428284, "learning_rate": 0.0002, "epoch": 0.24610930148389432, "step": 340}, {"loss": 1.626, "grad_norm": 0.7980281114578247, "learning_rate": 0.0002, "epoch": 0.2533478103510677, "step": 350}, {"loss": 1.5238, "grad_norm": 0.632486879825592, "learning_rate": 0.0002, "epoch": 0.26058631921824105, "step": 360}, {"loss": 1.5175, "grad_norm": 0.6527034640312195, "learning_rate": 0.0002, "epoch": 0.2678248280854144, "step": 370}, {"loss": 1.627, "grad_norm": 0.7672118544578552, "learning_rate": 0.0002, "epoch": 0.2750633369525878, "step": 380}, {"loss": 1.5605, "grad_norm": 0.6035117506980896, "learning_rate": 0.0002, "epoch": 0.28230184581976114, "step": 390}, {"loss": 1.4603, "grad_norm": 0.5955103039741516, "learning_rate": 0.0002, "epoch": 0.2895403546869345, "step": 400}, {"loss": 1.558, "grad_norm": 0.6015191674232483, "learning_rate": 0.0002, "epoch": 0.2967788635541079, "step": 410}, {"loss": 1.6091, "grad_norm": 0.6380982398986816, "learning_rate": 0.0002, "epoch": 0.30401737242128124, "step": 420}, {"loss": 1.5292, "grad_norm": 0.6707863211631775, "learning_rate": 0.0002, "epoch": 0.3112558812884546, "step": 430}, {"loss": 1.4426, "grad_norm": 0.7010176777839661, "learning_rate": 0.0002, "epoch": 0.3184943901556279, "step": 440}, {"loss": 1.5572, "grad_norm": 0.8263739943504333, "learning_rate": 0.0002, "epoch": 0.3257328990228013, "step": 450}, {"loss": 1.5188, "grad_norm": 0.7253276109695435, "learning_rate": 0.0002, "epoch": 0.33297140788997465, "step": 460}, {"loss": 1.584, "grad_norm": 0.5238934755325317, "learning_rate": 0.0002, "epoch": 0.340209916757148, "step": 470}, {"loss": 1.7035, "grad_norm": 0.7869495749473572, "learning_rate": 0.0002, "epoch": 0.3474484256243214, "step": 480}, {"loss": 1.5776, "grad_norm": 0.7485215663909912, "learning_rate": 0.0002, "epoch": 0.35468693449149474, "step": 490}, {"loss": 1.6274, "grad_norm": 0.5413193106651306, "learning_rate": 0.0002, "epoch": 0.3619254433586681, "step": 500}, {"loss": 1.7323, "grad_norm": 0.7615048885345459, "learning_rate": 0.0002, "epoch": 0.3691639522258415, "step": 510}, {"loss": 1.532, "grad_norm": 0.7685340046882629, "learning_rate": 0.0002, "epoch": 0.37640246109301484, "step": 520}, {"loss": 1.6312, "grad_norm": 0.6379081010818481, "learning_rate": 0.0002, "epoch": 0.3836409699601882, "step": 530}, {"loss": 1.5645, "grad_norm": 0.7946939468383789, "learning_rate": 0.0002, "epoch": 0.39087947882736157, "step": 540}, {"loss": 1.4001, "grad_norm": 0.6287278532981873, "learning_rate": 0.0002, "epoch": 0.39811798769453494, "step": 550}, {"loss": 1.5982, "grad_norm": 0.6811642646789551, "learning_rate": 0.0002, "epoch": 0.4053564965617083, "step": 560}, {"loss": 1.4953, "grad_norm": 0.671073317527771, "learning_rate": 0.0002, "epoch": 0.41259500542888167, "step": 570}, {"loss": 1.6753, "grad_norm": 0.6313900351524353, "learning_rate": 0.0002, "epoch": 0.41983351429605503, "step": 580}, {"loss": 1.546, "grad_norm": 0.5291772484779358, "learning_rate": 0.0002, "epoch": 0.4270720231632284, "step": 590}, {"loss": 1.5441, "grad_norm": 0.62503582239151, "learning_rate": 0.0002, "epoch": 0.43431053203040176, "step": 600}, {"loss": 1.6276, "grad_norm": 0.5777305364608765, "learning_rate": 0.0002, "epoch": 0.4415490408975751, "step": 610}, {"loss": 1.4758, "grad_norm": 0.7013497352600098, "learning_rate": 0.0002, "epoch": 0.44878754976474844, "step": 620}, {"loss": 1.4029, "grad_norm": 0.8044822216033936, "learning_rate": 0.0002, "epoch": 0.4560260586319218, "step": 630}, {"loss": 1.7195, "grad_norm": 0.672531247138977, "learning_rate": 0.0002, "epoch": 0.46326456749909517, "step": 640}, {"loss": 1.614, "grad_norm": 0.6233910322189331, "learning_rate": 0.0002, "epoch": 0.47050307636626854, "step": 650}, {"loss": 1.6041, "grad_norm": 0.651524543762207, "learning_rate": 0.0002, "epoch": 0.4777415852334419, "step": 660}, {"loss": 1.5842, "grad_norm": 0.7213939428329468, "learning_rate": 0.0002, "epoch": 0.48498009410061527, "step": 670}, {"loss": 1.5453, "grad_norm": 0.6541454792022705, "learning_rate": 0.0002, "epoch": 0.49221860296778863, "step": 680}, {"loss": 1.662, "grad_norm": 0.6568936109542847, "learning_rate": 0.0002, "epoch": 0.499457111834962, "step": 690}, {"loss": 1.624, "grad_norm": 0.7176415324211121, "learning_rate": 0.0002, "epoch": 0.5066956207021354, "step": 700}, {"loss": 1.6099, "grad_norm": 0.6553855538368225, "learning_rate": 0.0002, "epoch": 0.5139341295693087, "step": 710}, {"loss": 1.5508, "grad_norm": 0.5654335618019104, "learning_rate": 0.0002, "epoch": 0.5211726384364821, "step": 720}, {"loss": 1.392, "grad_norm": 0.5671001672744751, "learning_rate": 0.0002, "epoch": 0.5284111473036555, "step": 730}, {"loss": 1.388, "grad_norm": 0.7914412021636963, "learning_rate": 0.0002, "epoch": 0.5356496561708288, "step": 740}, {"loss": 1.5931, "grad_norm": 0.6172138452529907, "learning_rate": 0.0002, "epoch": 0.5428881650380022, "step": 750}, {"loss": 1.4018, "grad_norm": 0.6132623553276062, "learning_rate": 0.0002, "epoch": 0.5501266739051756, "step": 760}, {"loss": 1.513, "grad_norm": 0.654000461101532, "learning_rate": 0.0002, "epoch": 0.5573651827723489, "step": 770}, {"loss": 1.5035, "grad_norm": 0.5691370964050293, "learning_rate": 0.0002, "epoch": 0.5646036916395223, "step": 780}, {"loss": 1.65, "grad_norm": 0.7922580242156982, "learning_rate": 0.0002, "epoch": 0.5718422005066957, "step": 790}, {"loss": 1.4521, "grad_norm": 0.6831880211830139, "learning_rate": 0.0002, "epoch": 0.579080709373869, "step": 800}, {"loss": 1.4734, "grad_norm": 0.6740124821662903, "learning_rate": 0.0002, "epoch": 0.5863192182410424, "step": 810}, {"loss": 1.6498, "grad_norm": 1.380016803741455, "learning_rate": 0.0002, "epoch": 0.5935577271082157, "step": 820}, {"loss": 1.4642, "grad_norm": 0.6552878022193909, "learning_rate": 0.0002, "epoch": 0.6007962359753891, "step": 830}, {"loss": 1.6271, "grad_norm": 0.6649535298347473, "learning_rate": 0.0002, "epoch": 0.6080347448425625, "step": 840}, {"loss": 1.5886, "grad_norm": 0.561738133430481, "learning_rate": 0.0002, "epoch": 0.6152732537097358, "step": 850}, {"loss": 1.5364, "grad_norm": 0.6133047938346863, "learning_rate": 0.0002, "epoch": 0.6225117625769092, "step": 860}, {"loss": 1.3489, "grad_norm": 0.559843122959137, "learning_rate": 0.0002, "epoch": 0.6297502714440825, "step": 870}, {"loss": 1.4878, "grad_norm": 0.6117811799049377, "learning_rate": 0.0002, "epoch": 0.6369887803112558, "step": 880}, {"loss": 1.56, "grad_norm": 0.6209776401519775, "learning_rate": 0.0002, "epoch": 0.6442272891784292, "step": 890}, {"loss": 1.6747, "grad_norm": 0.6234082579612732, "learning_rate": 0.0002, "epoch": 0.6514657980456026, "step": 900}, {"loss": 1.6963, "grad_norm": 0.7623258233070374, "learning_rate": 0.0002, "epoch": 0.6587043069127759, "step": 910}, {"loss": 1.2424, "grad_norm": 0.6148061752319336, "learning_rate": 0.0002, "epoch": 0.6659428157799493, "step": 920}, {"loss": 1.4319, "grad_norm": 0.6682973504066467, "learning_rate": 0.0002, "epoch": 0.6731813246471227, "step": 930}, {"loss": 1.5377, "grad_norm": 0.5513041615486145, "learning_rate": 0.0002, "epoch": 0.680419833514296, "step": 940}, {"loss": 1.3991, "grad_norm": 0.5197525024414062, "learning_rate": 0.0002, "epoch": 0.6876583423814694, "step": 950}, {"loss": 1.4398, "grad_norm": 0.6490758061408997, "learning_rate": 0.0002, "epoch": 0.6948968512486428, "step": 960}, {"loss": 1.5251, "grad_norm": 0.6450682878494263, "learning_rate": 0.0002, "epoch": 0.7021353601158161, "step": 970}, {"loss": 1.5417, "grad_norm": 0.6203766465187073, "learning_rate": 0.0002, "epoch": 0.7093738689829895, "step": 980}, {"loss": 1.4575, "grad_norm": 0.6023609638214111, "learning_rate": 0.0002, "epoch": 0.7166123778501629, "step": 990}, {"loss": 1.4973, "grad_norm": 0.5765255093574524, "learning_rate": 0.0002, "epoch": 0.7238508867173362, "step": 1000}, {"loss": 1.483, "grad_norm": 0.6650075316429138, "learning_rate": 0.0002, "epoch": 0.7310893955845096, "step": 1010}, {"loss": 1.5959, "grad_norm": 0.5610854029655457, "learning_rate": 0.0002, "epoch": 0.738327904451683, "step": 1020}, {"loss": 1.5248, "grad_norm": 0.7072813510894775, "learning_rate": 0.0002, "epoch": 0.7455664133188563, "step": 1030}, {"loss": 1.5776, "grad_norm": 0.6815407872200012, "learning_rate": 0.0002, "epoch": 0.7528049221860297, "step": 1040}, {"loss": 1.4577, "grad_norm": 0.7932390570640564, "learning_rate": 0.0002, "epoch": 0.760043431053203, "step": 1050}, {"loss": 1.4515, "grad_norm": 0.5798183083534241, "learning_rate": 0.0002, "epoch": 0.7672819399203764, "step": 1060}, {"loss": 1.5053, "grad_norm": 0.7898504137992859, "learning_rate": 0.0002, "epoch": 0.7745204487875498, "step": 1070}, {"loss": 1.4776, "grad_norm": 0.4983280301094055, "learning_rate": 0.0002, "epoch": 0.7817589576547231, "step": 1080}, {"loss": 1.5007, "grad_norm": 0.691403329372406, "learning_rate": 0.0002, "epoch": 0.7889974665218965, "step": 1090}, {"loss": 1.5153, "grad_norm": 0.5394481420516968, "learning_rate": 0.0002, "epoch": 0.7962359753890699, "step": 1100}, {"loss": 1.6892, "grad_norm": 0.5136822462081909, "learning_rate": 0.0002, "epoch": 0.8034744842562432, "step": 1110}, {"loss": 1.4902, "grad_norm": 0.6828126907348633, "learning_rate": 0.0002, "epoch": 0.8107129931234166, "step": 1120}, {"loss": 1.4346, "grad_norm": 0.6799656748771667, "learning_rate": 0.0002, "epoch": 0.81795150199059, "step": 1130}, {"loss": 1.2678, "grad_norm": 0.5428406000137329, "learning_rate": 0.0002, "epoch": 0.8251900108577633, "step": 1140}, {"loss": 1.4072, "grad_norm": 0.4811290502548218, "learning_rate": 0.0002, "epoch": 0.8324285197249367, "step": 1150}, {"loss": 1.4512, "grad_norm": 0.5519434809684753, "learning_rate": 0.0002, "epoch": 0.8396670285921101, "step": 1160}, {"loss": 1.4072, "grad_norm": 0.9748060703277588, "learning_rate": 0.0002, "epoch": 0.8469055374592834, "step": 1170}, {"loss": 1.4309, "grad_norm": 0.712609589099884, "learning_rate": 0.0002, "epoch": 0.8541440463264568, "step": 1180}, {"loss": 1.434, "grad_norm": 0.6866157054901123, "learning_rate": 0.0002, "epoch": 0.8613825551936302, "step": 1190}, {"loss": 1.3704, "grad_norm": 0.5068854093551636, "learning_rate": 0.0002, "epoch": 0.8686210640608035, "step": 1200}, {"loss": 1.5601, "grad_norm": 0.6333245038986206, "learning_rate": 0.0002, "epoch": 0.8758595729279768, "step": 1210}, {"loss": 1.4636, "grad_norm": 0.6424421072006226, "learning_rate": 0.0002, "epoch": 0.8830980817951501, "step": 1220}, {"loss": 1.4186, "grad_norm": 0.4771921932697296, "learning_rate": 0.0002, "epoch": 0.8903365906623235, "step": 1230}, {"loss": 1.6323, "grad_norm": 0.5191764235496521, "learning_rate": 0.0002, "epoch": 0.8975750995294969, "step": 1240}, {"loss": 1.6105, "grad_norm": 0.756222128868103, "learning_rate": 0.0002, "epoch": 0.9048136083966702, "step": 1250}, {"loss": 1.4396, "grad_norm": 0.623823881149292, "learning_rate": 0.0002, "epoch": 0.9120521172638436, "step": 1260}, {"loss": 1.3097, "grad_norm": 0.8166571259498596, "learning_rate": 0.0002, "epoch": 0.919290626131017, "step": 1270}, {"loss": 1.4625, "grad_norm": 0.6059346795082092, "learning_rate": 0.0002, "epoch": 0.9265291349981903, "step": 1280}, {"loss": 1.3555, "grad_norm": 0.5842690467834473, "learning_rate": 0.0002, "epoch": 0.9337676438653637, "step": 1290}, {"loss": 1.5859, "grad_norm": 0.7649800777435303, "learning_rate": 0.0002, "epoch": 0.9410061527325371, "step": 1300}, {"loss": 1.5915, "grad_norm": 0.6420919895172119, "learning_rate": 0.0002, "epoch": 0.9482446615997104, "step": 1310}, {"loss": 1.453, "grad_norm": 0.7011452913284302, "learning_rate": 0.0002, "epoch": 0.9554831704668838, "step": 1320}, {"loss": 1.6766, "grad_norm": 0.5783746242523193, "learning_rate": 0.0002, "epoch": 0.9627216793340572, "step": 1330}, {"loss": 1.6308, "grad_norm": 0.5973192453384399, "learning_rate": 0.0002, "epoch": 0.9699601882012305, "step": 1340}, {"loss": 1.5901, "grad_norm": 0.6181833744049072, "learning_rate": 0.0002, "epoch": 0.9771986970684039, "step": 1350}, {"loss": 1.5258, "grad_norm": 0.5563396215438843, "learning_rate": 0.0002, "epoch": 0.9844372059355773, "step": 1360}, {"loss": 1.4508, "grad_norm": 0.45723360776901245, "learning_rate": 0.0002, "epoch": 0.9916757148027506, "step": 1370}, {"loss": 1.3291, "grad_norm": 0.5947498679161072, "learning_rate": 0.0002, "epoch": 0.998914223669924, "step": 1380}, {"eval_loss": 1.480796456336975, "eval_runtime": 27.3103, "eval_samples_per_second": 15.965, "eval_steps_per_second": 2.014, "epoch": 0.9996380745566413, "step": 1381}, {"loss": 1.3057, "grad_norm": 0.5599952936172485, "learning_rate": 0.0002, "epoch": 1.0061527325370974, "step": 1390}, {"loss": 1.4991, "grad_norm": 0.5932008028030396, "learning_rate": 0.0002, "epoch": 1.0133912414042707, "step": 1400}, {"loss": 1.4506, "grad_norm": 0.6194121837615967, "learning_rate": 0.0002, "epoch": 1.020629750271444, "step": 1410}, {"loss": 1.5966, "grad_norm": 0.6995621919631958, "learning_rate": 0.0002, "epoch": 1.0278682591386175, "step": 1420}, {"loss": 1.4153, "grad_norm": 0.7905810475349426, "learning_rate": 0.0002, "epoch": 1.0351067680057908, "step": 1430}, {"loss": 1.4414, "grad_norm": 0.7221615314483643, "learning_rate": 0.0002, "epoch": 1.0423452768729642, "step": 1440}, {"loss": 1.3859, "grad_norm": 0.6170642375946045, "learning_rate": 0.0002, "epoch": 1.0495837857401376, "step": 1450}, {"loss": 1.3806, "grad_norm": 0.5844094753265381, "learning_rate": 0.0002, "epoch": 1.056822294607311, "step": 1460}, {"loss": 1.4871, "grad_norm": 0.7731822729110718, "learning_rate": 0.0002, "epoch": 1.0640608034744843, "step": 1470}, {"loss": 1.4286, "grad_norm": 0.4554748237133026, "learning_rate": 0.0002, "epoch": 1.0712993123416577, "step": 1480}, {"loss": 1.3977, "grad_norm": 0.6923259496688843, "learning_rate": 0.0002, "epoch": 1.078537821208831, "step": 1490}, {"loss": 1.3936, "grad_norm": 0.6008219122886658, "learning_rate": 0.0002, "epoch": 1.0857763300760044, "step": 1500}, {"loss": 1.4821, "grad_norm": 0.6450045704841614, "learning_rate": 0.0002, "epoch": 1.0930148389431777, "step": 1510}, {"loss": 1.3295, "grad_norm": 0.7833753824234009, "learning_rate": 0.0002, "epoch": 1.1002533478103511, "step": 1520}, {"loss": 1.3424, "grad_norm": 0.5076758861541748, "learning_rate": 0.0002, "epoch": 1.1074918566775245, "step": 1530}, {"loss": 1.4043, "grad_norm": 0.5661332011222839, "learning_rate": 0.0002, "epoch": 1.1147303655446978, "step": 1540}, {"loss": 1.4963, "grad_norm": 0.6526919603347778, "learning_rate": 0.0002, "epoch": 1.1219688744118712, "step": 1550}, {"loss": 1.3671, "grad_norm": 0.5613082647323608, "learning_rate": 0.0002, "epoch": 1.1292073832790446, "step": 1560}, {"loss": 1.4458, "grad_norm": 0.6113885641098022, "learning_rate": 0.0002, "epoch": 1.136445892146218, "step": 1570}, {"loss": 1.3552, "grad_norm": 0.6732510328292847, "learning_rate": 0.0002, "epoch": 1.1436844010133913, "step": 1580}, {"loss": 1.3114, "grad_norm": 0.6146392226219177, "learning_rate": 0.0002, "epoch": 1.1509229098805647, "step": 1590}, {"loss": 1.411, "grad_norm": 0.6766974329948425, "learning_rate": 0.0002, "epoch": 1.158161418747738, "step": 1600}, {"loss": 1.2401, "grad_norm": 0.7621957659721375, "learning_rate": 0.0002, "epoch": 1.1653999276149114, "step": 1610}, {"loss": 1.3758, "grad_norm": 0.6959581971168518, "learning_rate": 0.0002, "epoch": 1.1726384364820848, "step": 1620}, {"loss": 1.382, "grad_norm": 0.6691278219223022, "learning_rate": 0.0002, "epoch": 1.1798769453492581, "step": 1630}, {"loss": 1.4147, "grad_norm": 0.4927774965763092, "learning_rate": 0.0002, "epoch": 1.1871154542164315, "step": 1640}, {"loss": 1.449, "grad_norm": 0.7724234461784363, "learning_rate": 0.0002, "epoch": 1.1943539630836049, "step": 1650}, {"loss": 1.4778, "grad_norm": 0.6817787885665894, "learning_rate": 0.0002, "epoch": 1.2015924719507782, "step": 1660}, {"loss": 1.3776, "grad_norm": 0.6500699520111084, "learning_rate": 0.0002, "epoch": 1.2088309808179516, "step": 1670}, {"loss": 1.3875, "grad_norm": 0.5703568458557129, "learning_rate": 0.0002, "epoch": 1.216069489685125, "step": 1680}, {"loss": 1.4735, "grad_norm": 0.6261579990386963, "learning_rate": 0.0002, "epoch": 1.2233079985522983, "step": 1690}, {"loss": 1.3898, "grad_norm": 0.651713490486145, "learning_rate": 0.0002, "epoch": 1.2305465074194717, "step": 1700}, {"loss": 1.4002, "grad_norm": 0.684399425983429, "learning_rate": 0.0002, "epoch": 1.237785016286645, "step": 1710}, {"loss": 1.5027, "grad_norm": 0.6996857523918152, "learning_rate": 0.0002, "epoch": 1.2450235251538184, "step": 1720}, {"loss": 1.3326, "grad_norm": 0.7102537751197815, "learning_rate": 0.0002, "epoch": 1.2522620340209918, "step": 1730}, {"loss": 1.3675, "grad_norm": 0.45809897780418396, "learning_rate": 0.0002, "epoch": 1.2595005428881652, "step": 1740}, {"loss": 1.4175, "grad_norm": 0.6377046704292297, "learning_rate": 0.0002, "epoch": 1.2667390517553385, "step": 1750}, {"loss": 1.3479, "grad_norm": 0.6965704560279846, "learning_rate": 0.0002, "epoch": 1.2739775606225119, "step": 1760}, {"loss": 1.5647, "grad_norm": 0.5688214302062988, "learning_rate": 0.0002, "epoch": 1.2812160694896852, "step": 1770}, {"loss": 1.3967, "grad_norm": 0.6384190320968628, "learning_rate": 0.0002, "epoch": 1.2884545783568586, "step": 1780}, {"loss": 1.3671, "grad_norm": 0.5629363656044006, "learning_rate": 0.0002, "epoch": 1.295693087224032, "step": 1790}, {"loss": 1.2292, "grad_norm": 0.6148255467414856, "learning_rate": 0.0002, "epoch": 1.3029315960912053, "step": 1800}, {"loss": 1.5806, "grad_norm": 0.655580997467041, "learning_rate": 0.0002, "epoch": 1.3101701049583787, "step": 1810}, {"loss": 1.2398, "grad_norm": 0.5642657279968262, "learning_rate": 0.0002, "epoch": 1.3174086138255519, "step": 1820}, {"loss": 1.3246, "grad_norm": 0.59607994556427, "learning_rate": 0.0002, "epoch": 1.3246471226927252, "step": 1830}, {"loss": 1.3274, "grad_norm": 0.5564199090003967, "learning_rate": 0.0002, "epoch": 1.3318856315598986, "step": 1840}, {"loss": 1.5834, "grad_norm": 0.6949955821037292, "learning_rate": 0.0002, "epoch": 1.339124140427072, "step": 1850}, {"loss": 1.4722, "grad_norm": 0.7036856412887573, "learning_rate": 0.0002, "epoch": 1.3463626492942453, "step": 1860}, {"loss": 1.333, "grad_norm": 0.722062885761261, "learning_rate": 0.0002, "epoch": 1.3536011581614187, "step": 1870}, {"loss": 1.4044, "grad_norm": 0.6098677515983582, "learning_rate": 0.0002, "epoch": 1.360839667028592, "step": 1880}, {"loss": 1.6217, "grad_norm": 0.5376402735710144, "learning_rate": 0.0002, "epoch": 1.3680781758957654, "step": 1890}, {"loss": 1.5071, "grad_norm": 0.6974610090255737, "learning_rate": 0.0002, "epoch": 1.3753166847629388, "step": 1900}, {"loss": 1.5854, "grad_norm": 0.6520763635635376, "learning_rate": 0.0002, "epoch": 1.3825551936301121, "step": 1910}, {"loss": 1.4271, "grad_norm": 0.6604374647140503, "learning_rate": 0.0002, "epoch": 1.3897937024972855, "step": 1920}, {"loss": 1.419, "grad_norm": 0.7364398241043091, "learning_rate": 0.0002, "epoch": 1.3970322113644589, "step": 1930}, {"loss": 1.4585, "grad_norm": 0.6849475502967834, "learning_rate": 0.0002, "epoch": 1.4042707202316322, "step": 1940}, {"loss": 1.5577, "grad_norm": 0.6562670469284058, "learning_rate": 0.0002, "epoch": 1.4115092290988056, "step": 1950}, {"loss": 1.4725, "grad_norm": 0.5695616006851196, "learning_rate": 0.0002, "epoch": 1.418747737965979, "step": 1960}, {"loss": 1.3088, "grad_norm": 0.5244464874267578, "learning_rate": 0.0002, "epoch": 1.4259862468331523, "step": 1970}, {"loss": 1.5069, "grad_norm": 0.6347293257713318, "learning_rate": 0.0002, "epoch": 1.4332247557003257, "step": 1980}, {"loss": 1.3502, "grad_norm": 0.5528361201286316, "learning_rate": 0.0002, "epoch": 1.440463264567499, "step": 1990}, {"loss": 1.3978, "grad_norm": 0.6987585425376892, "learning_rate": 0.0002, "epoch": 1.4477017734346724, "step": 2000}, {"loss": 1.4262, "grad_norm": 0.6568987369537354, "learning_rate": 0.0002, "epoch": 1.4549402823018458, "step": 2010}, {"loss": 1.4175, "grad_norm": 0.7665994763374329, "learning_rate": 0.0002, "epoch": 1.4621787911690192, "step": 2020}, {"loss": 1.244, "grad_norm": 0.5127707123756409, "learning_rate": 0.0002, "epoch": 1.4694173000361925, "step": 2030}, {"loss": 1.3699, "grad_norm": 0.5406824946403503, "learning_rate": 0.0002, "epoch": 1.476655808903366, "step": 2040}, {"loss": 1.3353, "grad_norm": 0.5990166664123535, "learning_rate": 0.0002, "epoch": 1.4838943177705393, "step": 2050}, {"loss": 1.2454, "grad_norm": 0.6186193823814392, "learning_rate": 0.0002, "epoch": 1.4911328266377126, "step": 2060}, {"loss": 1.428, "grad_norm": 0.6154307126998901, "learning_rate": 0.0002, "epoch": 1.498371335504886, "step": 2070}, {"loss": 1.4528, "grad_norm": 0.5606056451797485, "learning_rate": 0.0002, "epoch": 1.5056098443720594, "step": 2080}, {"loss": 1.2405, "grad_norm": 0.5006417036056519, "learning_rate": 0.0002, "epoch": 1.5128483532392327, "step": 2090}, {"loss": 1.4258, "grad_norm": 0.5968486070632935, "learning_rate": 0.0002, "epoch": 1.520086862106406, "step": 2100}, {"loss": 1.2752, "grad_norm": 0.5835496187210083, "learning_rate": 0.0002, "epoch": 1.5273253709735795, "step": 2110}, {"loss": 1.5443, "grad_norm": 0.6753535270690918, "learning_rate": 0.0002, "epoch": 1.5345638798407528, "step": 2120}, {"loss": 1.2139, "grad_norm": 0.7299720644950867, "learning_rate": 0.0002, "epoch": 1.5418023887079262, "step": 2130}, {"loss": 1.2364, "grad_norm": 0.5105988383293152, "learning_rate": 0.0002, "epoch": 1.5490408975750996, "step": 2140}, {"loss": 1.4528, "grad_norm": 0.5675431489944458, "learning_rate": 0.0002, "epoch": 1.556279406442273, "step": 2150}, {"loss": 1.4563, "grad_norm": 0.6246723532676697, "learning_rate": 0.0002, "epoch": 1.5635179153094463, "step": 2160}, {"loss": 1.5255, "grad_norm": 0.7291720509529114, "learning_rate": 0.0002, "epoch": 1.5707564241766196, "step": 2170}, {"loss": 1.5432, "grad_norm": 0.678114116191864, "learning_rate": 0.0002, "epoch": 1.577994933043793, "step": 2180}, {"loss": 1.5212, "grad_norm": 0.5136260986328125, "learning_rate": 0.0002, "epoch": 1.5852334419109664, "step": 2190}, {"loss": 1.3271, "grad_norm": 0.6359935998916626, "learning_rate": 0.0002, "epoch": 1.5924719507781397, "step": 2200}, {"loss": 1.4038, "grad_norm": 0.7650278806686401, "learning_rate": 0.0002, "epoch": 1.599710459645313, "step": 2210}, {"loss": 1.5478, "grad_norm": 0.7256110906600952, "learning_rate": 0.0002, "epoch": 1.6069489685124865, "step": 2220}, {"loss": 1.4387, "grad_norm": 0.688689649105072, "learning_rate": 0.0002, "epoch": 1.6141874773796598, "step": 2230}, {"loss": 1.4096, "grad_norm": 0.6045311093330383, "learning_rate": 0.0002, "epoch": 1.6214259862468332, "step": 2240}, {"loss": 1.4097, "grad_norm": 0.7064604163169861, "learning_rate": 0.0002, "epoch": 1.6286644951140063, "step": 2250}, {"loss": 1.3477, "grad_norm": 0.5309562087059021, "learning_rate": 0.0002, "epoch": 1.6359030039811797, "step": 2260}, {"loss": 1.4022, "grad_norm": 0.5687053203582764, "learning_rate": 0.0002, "epoch": 1.643141512848353, "step": 2270}, {"loss": 1.2977, "grad_norm": 0.535872757434845, "learning_rate": 0.0002, "epoch": 1.6503800217155264, "step": 2280}, {"loss": 1.3844, "grad_norm": 0.5502381920814514, "learning_rate": 0.0002, "epoch": 1.6576185305826998, "step": 2290}, {"loss": 1.3764, "grad_norm": 0.6158602237701416, "learning_rate": 0.0002, "epoch": 1.6648570394498732, "step": 2300}, {"loss": 1.3515, "grad_norm": 0.5804675817489624, "learning_rate": 0.0002, "epoch": 1.6720955483170465, "step": 2310}, {"loss": 1.2532, "grad_norm": 0.600742757320404, "learning_rate": 0.0002, "epoch": 1.67933405718422, "step": 2320}, {"loss": 1.477, "grad_norm": 0.7101941108703613, "learning_rate": 0.0002, "epoch": 1.6865725660513933, "step": 2330}, {"loss": 1.4849, "grad_norm": 0.7507809996604919, "learning_rate": 0.0002, "epoch": 1.6938110749185666, "step": 2340}, {"loss": 1.2703, "grad_norm": 0.768502414226532, "learning_rate": 0.0002, "epoch": 1.70104958378574, "step": 2350}, {"loss": 1.3332, "grad_norm": 0.4801851212978363, "learning_rate": 0.0002, "epoch": 1.7082880926529134, "step": 2360}, {"loss": 1.4158, "grad_norm": 0.5322122573852539, "learning_rate": 0.0002, "epoch": 1.7155266015200867, "step": 2370}, {"loss": 1.4136, "grad_norm": 0.587661862373352, "learning_rate": 0.0002, "epoch": 1.72276511038726, "step": 2380}, {"loss": 1.3771, "grad_norm": 0.6073525547981262, "learning_rate": 0.0002, "epoch": 1.7300036192544335, "step": 2390}, {"loss": 1.2754, "grad_norm": 0.6950460076332092, "learning_rate": 0.0002, "epoch": 1.7372421281216068, "step": 2400}, {"loss": 1.3858, "grad_norm": 0.5981102585792542, "learning_rate": 0.0002, "epoch": 1.7444806369887802, "step": 2410}, {"loss": 1.4075, "grad_norm": 0.544570803642273, "learning_rate": 0.0002, "epoch": 1.7517191458559536, "step": 2420}, {"loss": 1.3861, "grad_norm": 0.5304399728775024, "learning_rate": 0.0002, "epoch": 1.758957654723127, "step": 2430}, {"loss": 1.4244, "grad_norm": 0.7921594977378845, "learning_rate": 0.0002, "epoch": 1.7661961635903003, "step": 2440}, {"loss": 1.3053, "grad_norm": 0.6084808707237244, "learning_rate": 0.0002, "epoch": 1.7734346724574737, "step": 2450}, {"loss": 1.3781, "grad_norm": 0.8844701051712036, "learning_rate": 0.0002, "epoch": 1.780673181324647, "step": 2460}, {"loss": 1.3227, "grad_norm": 0.5729258060455322, "learning_rate": 0.0002, "epoch": 1.7879116901918204, "step": 2470}, {"loss": 1.3422, "grad_norm": 0.6303611993789673, "learning_rate": 0.0002, "epoch": 1.7951501990589938, "step": 2480}, {"loss": 1.3926, "grad_norm": 0.5627942085266113, "learning_rate": 0.0002, "epoch": 1.8023887079261671, "step": 2490}, {"loss": 1.3816, "grad_norm": 0.6724274158477783, "learning_rate": 0.0002, "epoch": 1.8096272167933405, "step": 2500}, {"loss": 1.2951, "grad_norm": 0.5030826330184937, "learning_rate": 0.0002, "epoch": 1.8168657256605139, "step": 2510}, {"loss": 1.2839, "grad_norm": 0.5504099130630493, "learning_rate": 0.0002, "epoch": 1.8241042345276872, "step": 2520}, {"loss": 1.4264, "grad_norm": 0.6338945627212524, "learning_rate": 0.0002, "epoch": 1.8313427433948606, "step": 2530}, {"loss": 1.563, "grad_norm": 0.5902037620544434, "learning_rate": 0.0002, "epoch": 1.838581252262034, "step": 2540}, {"loss": 1.2961, "grad_norm": 0.48814457654953003, "learning_rate": 0.0002, "epoch": 1.8458197611292073, "step": 2550}, {"loss": 1.466, "grad_norm": 0.6216312646865845, "learning_rate": 0.0002, "epoch": 1.8530582699963807, "step": 2560}, {"loss": 1.5123, "grad_norm": 0.635603666305542, "learning_rate": 0.0002, "epoch": 1.860296778863554, "step": 2570}, {"loss": 1.372, "grad_norm": 0.6938216090202332, "learning_rate": 0.0002, "epoch": 1.8675352877307274, "step": 2580}, {"loss": 1.5011, "grad_norm": 0.599557638168335, "learning_rate": 0.0002, "epoch": 1.8747737965979008, "step": 2590}, {"loss": 1.2714, "grad_norm": 0.564424455165863, "learning_rate": 0.0002, "epoch": 1.8820123054650741, "step": 2600}, {"loss": 1.3403, "grad_norm": 0.5430700182914734, "learning_rate": 0.0002, "epoch": 1.8892508143322475, "step": 2610}, {"loss": 1.4347, "grad_norm": 0.6150169372558594, "learning_rate": 0.0002, "epoch": 1.8964893231994209, "step": 2620}, {"loss": 1.2474, "grad_norm": 0.48159119486808777, "learning_rate": 0.0002, "epoch": 1.9037278320665942, "step": 2630}, {"loss": 1.3716, "grad_norm": 0.5608997941017151, "learning_rate": 0.0002, "epoch": 1.9109663409337676, "step": 2640}, {"loss": 1.5787, "grad_norm": 0.6454501748085022, "learning_rate": 0.0002, "epoch": 1.918204849800941, "step": 2650}, {"loss": 1.3238, "grad_norm": 0.5458073616027832, "learning_rate": 0.0002, "epoch": 1.9254433586681143, "step": 2660}, {"loss": 1.3208, "grad_norm": 0.5328490734100342, "learning_rate": 0.0002, "epoch": 1.9326818675352877, "step": 2670}, {"loss": 1.4971, "grad_norm": 0.6444696187973022, "learning_rate": 0.0002, "epoch": 1.939920376402461, "step": 2680}, {"loss": 1.5387, "grad_norm": 0.7126023769378662, "learning_rate": 0.0002, "epoch": 1.9471588852696344, "step": 2690}, {"loss": 1.3637, "grad_norm": 0.5164045095443726, "learning_rate": 0.0002, "epoch": 1.9543973941368078, "step": 2700}, {"loss": 1.5303, "grad_norm": 0.5347061157226562, "learning_rate": 0.0002, "epoch": 1.9616359030039812, "step": 2710}, {"loss": 1.2815, "grad_norm": 0.5297950506210327, "learning_rate": 0.0002, "epoch": 1.9688744118711545, "step": 2720}, {"loss": 1.3566, "grad_norm": 0.6537790298461914, "learning_rate": 0.0002, "epoch": 1.976112920738328, "step": 2730}, {"loss": 1.332, "grad_norm": 0.5536222457885742, "learning_rate": 0.0002, "epoch": 1.9833514296055013, "step": 2740}, {"loss": 1.3333, "grad_norm": 0.4856105446815491, "learning_rate": 0.0002, "epoch": 1.9905899384726746, "step": 2750}, {"loss": 1.3521, "grad_norm": 0.6642730832099915, "learning_rate": 0.0002, "epoch": 1.997828447339848, "step": 2760}, {"eval_loss": 1.4366681575775146, "eval_runtime": 27.3729, "eval_samples_per_second": 15.928, "eval_steps_per_second": 2.009, "epoch": 2.0, "step": 2763}, {"loss": 1.4322, "grad_norm": 0.740253210067749, "learning_rate": 0.0002, "epoch": 2.0050669562070214, "step": 2770}, {"loss": 1.277, "grad_norm": 0.5826276540756226, "learning_rate": 0.0002, "epoch": 2.0123054650741947, "step": 2780}, {"loss": 1.2424, "grad_norm": 0.607356071472168, "learning_rate": 0.0002, "epoch": 2.019543973941368, "step": 2790}, {"loss": 1.2601, "grad_norm": 0.5918063521385193, "learning_rate": 0.0002, "epoch": 2.0267824828085415, "step": 2800}, {"loss": 1.3715, "grad_norm": 0.5610089898109436, "learning_rate": 0.0002, "epoch": 2.034020991675715, "step": 2810}, {"loss": 1.2092, "grad_norm": 0.5869926810264587, "learning_rate": 0.0002, "epoch": 2.041259500542888, "step": 2820}, {"loss": 1.1929, "grad_norm": 0.5753467679023743, "learning_rate": 0.0002, "epoch": 2.0484980094100615, "step": 2830}, {"loss": 1.333, "grad_norm": 0.7096508145332336, "learning_rate": 0.0002, "epoch": 2.055736518277235, "step": 2840}, {"loss": 1.1766, "grad_norm": 0.7653635144233704, "learning_rate": 0.0002, "epoch": 2.0629750271444083, "step": 2850}, {"loss": 1.2331, "grad_norm": 0.6202841997146606, "learning_rate": 0.0002, "epoch": 2.0702135360115816, "step": 2860}, {"loss": 1.3298, "grad_norm": 0.6810227632522583, "learning_rate": 0.0002, "epoch": 2.077452044878755, "step": 2870}, {"loss": 1.2505, "grad_norm": 0.7481493353843689, "learning_rate": 0.0002, "epoch": 2.0846905537459284, "step": 2880}, {"loss": 1.2484, "grad_norm": 0.7089637517929077, "learning_rate": 0.0002, "epoch": 2.0919290626131017, "step": 2890}, {"loss": 1.3095, "grad_norm": 0.7472923398017883, "learning_rate": 0.0002, "epoch": 2.099167571480275, "step": 2900}, {"loss": 1.304, "grad_norm": 0.8135465979576111, "learning_rate": 0.0002, "epoch": 2.1064060803474485, "step": 2910}, {"loss": 1.273, "grad_norm": 0.6097133159637451, "learning_rate": 0.0002, "epoch": 2.113644589214622, "step": 2920}, {"loss": 1.3384, "grad_norm": 0.5970117449760437, "learning_rate": 0.0002, "epoch": 2.120883098081795, "step": 2930}, {"loss": 1.3233, "grad_norm": 0.6169309616088867, "learning_rate": 0.0002, "epoch": 2.1281216069489686, "step": 2940}, {"loss": 1.4246, "grad_norm": 0.9428738355636597, "learning_rate": 0.0002, "epoch": 2.135360115816142, "step": 2950}, {"loss": 1.3527, "grad_norm": 0.5671679973602295, "learning_rate": 0.0002, "epoch": 2.1425986246833153, "step": 2960}, {"loss": 1.1375, "grad_norm": 0.7007262110710144, "learning_rate": 0.0002, "epoch": 2.1498371335504887, "step": 2970}, {"loss": 1.2015, "grad_norm": 0.6294044256210327, "learning_rate": 0.0002, "epoch": 2.157075642417662, "step": 2980}, {"loss": 1.2167, "grad_norm": 0.6105241775512695, "learning_rate": 0.0002, "epoch": 2.1643141512848354, "step": 2990}, {"loss": 1.2065, "grad_norm": 0.557124137878418, "learning_rate": 0.0002, "epoch": 2.1715526601520088, "step": 3000}, {"loss": 1.2515, "grad_norm": 0.6250392198562622, "learning_rate": 0.0002, "epoch": 2.178791169019182, "step": 3010}, {"loss": 1.385, "grad_norm": 0.645218551158905, "learning_rate": 0.0002, "epoch": 2.1860296778863555, "step": 3020}, {"loss": 1.3928, "grad_norm": 0.9033605456352234, "learning_rate": 0.0002, "epoch": 2.193268186753529, "step": 3030}, {"loss": 1.2458, "grad_norm": 0.5325747132301331, "learning_rate": 0.0002, "epoch": 2.2005066956207022, "step": 3040}, {"loss": 1.261, "grad_norm": 0.6334700584411621, "learning_rate": 0.0002, "epoch": 2.2077452044878756, "step": 3050}, {"loss": 1.2385, "grad_norm": 0.5206325054168701, "learning_rate": 0.0002, "epoch": 2.214983713355049, "step": 3060}, {"loss": 1.3103, "grad_norm": 0.5987200140953064, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3070}, {"loss": 1.1756, "grad_norm": 0.5893264412879944, "learning_rate": 0.0002, "epoch": 2.2294607310893957, "step": 3080}, {"loss": 1.235, "grad_norm": 0.6869237422943115, "learning_rate": 0.0002, "epoch": 2.236699239956569, "step": 3090}, {"loss": 1.3285, "grad_norm": 0.5040048360824585, "learning_rate": 0.0002, "epoch": 2.2439377488237424, "step": 3100}, {"loss": 1.3316, "grad_norm": 0.6660613417625427, "learning_rate": 0.0002, "epoch": 2.251176257690916, "step": 3110}, {"loss": 1.3108, "grad_norm": 0.5890918970108032, "learning_rate": 0.0002, "epoch": 2.258414766558089, "step": 3120}, {"loss": 1.248, "grad_norm": 0.6458896994590759, "learning_rate": 0.0002, "epoch": 2.2656532754252625, "step": 3130}, {"loss": 1.4151, "grad_norm": 0.6832690834999084, "learning_rate": 0.0002, "epoch": 2.272891784292436, "step": 3140}, {"loss": 1.4458, "grad_norm": 0.833908200263977, "learning_rate": 0.0002, "epoch": 2.2801302931596092, "step": 3150}, {"loss": 1.2931, "grad_norm": 0.4596034586429596, "learning_rate": 0.0002, "epoch": 2.2873688020267826, "step": 3160}, {"loss": 1.449, "grad_norm": 0.9130966067314148, "learning_rate": 0.0002, "epoch": 2.294607310893956, "step": 3170}, {"loss": 1.3806, "grad_norm": 0.7143292427062988, "learning_rate": 0.0002, "epoch": 2.3018458197611293, "step": 3180}, {"loss": 1.2692, "grad_norm": 0.5388900637626648, "learning_rate": 0.0002, "epoch": 2.3090843286283027, "step": 3190}, {"loss": 1.2402, "grad_norm": 0.5607513189315796, "learning_rate": 0.0002, "epoch": 2.316322837495476, "step": 3200}, {"loss": 1.3874, "grad_norm": 0.6795142292976379, "learning_rate": 0.0002, "epoch": 2.3235613463626494, "step": 3210}, {"loss": 1.3042, "grad_norm": 0.6561070680618286, "learning_rate": 0.0002, "epoch": 2.330799855229823, "step": 3220}, {"loss": 1.4636, "grad_norm": 0.8858118057250977, "learning_rate": 0.0002, "epoch": 2.338038364096996, "step": 3230}, {"loss": 1.3214, "grad_norm": 0.6604151725769043, "learning_rate": 0.0002, "epoch": 2.3452768729641695, "step": 3240}, {"loss": 1.4004, "grad_norm": 0.6755785346031189, "learning_rate": 0.0002, "epoch": 2.352515381831343, "step": 3250}, {"loss": 1.2503, "grad_norm": 0.6981677412986755, "learning_rate": 0.0002, "epoch": 2.3597538906985163, "step": 3260}, {"loss": 1.3078, "grad_norm": 0.6338568329811096, "learning_rate": 0.0002, "epoch": 2.3669923995656896, "step": 3270}, {"loss": 1.285, "grad_norm": 0.5754265785217285, "learning_rate": 0.0002, "epoch": 2.374230908432863, "step": 3280}, {"loss": 1.2924, "grad_norm": 0.7533153295516968, "learning_rate": 0.0002, "epoch": 2.3814694173000364, "step": 3290}, {"loss": 1.3711, "grad_norm": 0.675065279006958, "learning_rate": 0.0002, "epoch": 2.3887079261672097, "step": 3300}, {"loss": 1.3548, "grad_norm": 0.5686452984809875, "learning_rate": 0.0002, "epoch": 2.395946435034383, "step": 3310}, {"loss": 1.1998, "grad_norm": 0.8129481673240662, "learning_rate": 0.0002, "epoch": 2.4031849439015565, "step": 3320}, {"loss": 1.2584, "grad_norm": 0.6615934371948242, "learning_rate": 0.0002, "epoch": 2.41042345276873, "step": 3330}, {"loss": 1.3691, "grad_norm": 0.6678834557533264, "learning_rate": 0.0002, "epoch": 2.417661961635903, "step": 3340}, {"loss": 1.2381, "grad_norm": 0.5581308007240295, "learning_rate": 0.0002, "epoch": 2.4249004705030766, "step": 3350}, {"loss": 1.3853, "grad_norm": 0.6098920106887817, "learning_rate": 0.0002, "epoch": 2.43213897937025, "step": 3360}, {"loss": 1.3692, "grad_norm": 0.8101736903190613, "learning_rate": 0.0002, "epoch": 2.4393774882374233, "step": 3370}, {"loss": 1.4418, "grad_norm": 0.6621488928794861, "learning_rate": 0.0002, "epoch": 2.4466159971045967, "step": 3380}, {"loss": 1.4579, "grad_norm": 0.8693289160728455, "learning_rate": 0.0002, "epoch": 2.45385450597177, "step": 3390}, {"loss": 1.3644, "grad_norm": 0.6724580526351929, "learning_rate": 0.0002, "epoch": 2.4610930148389434, "step": 3400}, {"loss": 1.2006, "grad_norm": 0.6776891946792603, "learning_rate": 0.0002, "epoch": 2.4683315237061167, "step": 3410}, {"loss": 1.2937, "grad_norm": 0.7214453816413879, "learning_rate": 0.0002, "epoch": 2.47557003257329, "step": 3420}, {"loss": 1.4051, "grad_norm": 0.8390451073646545, "learning_rate": 0.0002, "epoch": 2.4828085414404635, "step": 3430}, {"loss": 1.25, "grad_norm": 0.7130982279777527, "learning_rate": 0.0002, "epoch": 2.490047050307637, "step": 3440}, {"loss": 1.2231, "grad_norm": 0.8873937129974365, "learning_rate": 0.0002, "epoch": 2.49728555917481, "step": 3450}, {"loss": 1.1429, "grad_norm": 0.725185751914978, "learning_rate": 0.0002, "epoch": 2.5045240680419836, "step": 3460}, {"loss": 1.2699, "grad_norm": 0.6120352149009705, "learning_rate": 0.0002, "epoch": 2.511762576909157, "step": 3470}, {"loss": 1.2552, "grad_norm": 0.7713613510131836, "learning_rate": 0.0002, "epoch": 2.5190010857763303, "step": 3480}, {"loss": 1.4648, "grad_norm": 0.895309567451477, "learning_rate": 0.0002, "epoch": 2.5262395946435037, "step": 3490}, {"loss": 1.3043, "grad_norm": 0.9631021022796631, "learning_rate": 0.0002, "epoch": 2.533478103510677, "step": 3500}, {"loss": 1.3492, "grad_norm": 0.7475683093070984, "learning_rate": 0.0002, "epoch": 2.5407166123778504, "step": 3510}, {"loss": 1.3637, "grad_norm": 0.7271341681480408, "learning_rate": 0.0002, "epoch": 2.5479551212450238, "step": 3520}, {"loss": 1.304, "grad_norm": 0.6979510188102722, "learning_rate": 0.0002, "epoch": 2.555193630112197, "step": 3530}, {"loss": 1.2353, "grad_norm": 0.6504196524620056, "learning_rate": 0.0002, "epoch": 2.5624321389793705, "step": 3540}, {"loss": 1.2699, "grad_norm": 0.7226675748825073, "learning_rate": 0.0002, "epoch": 2.569670647846544, "step": 3550}, {"loss": 1.3002, "grad_norm": 0.6143222451210022, "learning_rate": 0.0002, "epoch": 2.5769091567137172, "step": 3560}, {"loss": 1.1585, "grad_norm": 0.7245154976844788, "learning_rate": 0.0002, "epoch": 2.5841476655808906, "step": 3570}, {"loss": 1.3651, "grad_norm": 0.943540632724762, "learning_rate": 0.0002, "epoch": 2.591386174448064, "step": 3580}, {"loss": 1.3034, "grad_norm": 0.7707241773605347, "learning_rate": 0.0002, "epoch": 2.5986246833152373, "step": 3590}, {"loss": 1.3063, "grad_norm": 0.6705001592636108, "learning_rate": 0.0002, "epoch": 2.6058631921824107, "step": 3600}, {"loss": 1.2437, "grad_norm": 0.6360933780670166, "learning_rate": 0.0002, "epoch": 2.613101701049584, "step": 3610}, {"loss": 1.1844, "grad_norm": 0.5846424698829651, "learning_rate": 0.0002, "epoch": 2.6203402099167574, "step": 3620}, {"loss": 1.3674, "grad_norm": 0.5958625674247742, "learning_rate": 0.0002, "epoch": 2.6275787187839303, "step": 3630}, {"loss": 1.3599, "grad_norm": 0.6819243431091309, "learning_rate": 0.0002, "epoch": 2.6348172276511037, "step": 3640}, {"loss": 1.3884, "grad_norm": 0.7033445835113525, "learning_rate": 0.0002, "epoch": 2.642055736518277, "step": 3650}, {"loss": 1.3392, "grad_norm": 0.6134849786758423, "learning_rate": 0.0002, "epoch": 2.6492942453854504, "step": 3660}, {"loss": 1.2661, "grad_norm": 0.658009946346283, "learning_rate": 0.0002, "epoch": 2.656532754252624, "step": 3670}, {"loss": 1.3987, "grad_norm": 0.6280999779701233, "learning_rate": 0.0002, "epoch": 2.663771263119797, "step": 3680}, {"loss": 1.2995, "grad_norm": 0.5536085963249207, "learning_rate": 0.0002, "epoch": 2.6710097719869705, "step": 3690}, {"loss": 1.2044, "grad_norm": 0.8603981733322144, "learning_rate": 0.0002, "epoch": 2.678248280854144, "step": 3700}, {"loss": 1.3879, "grad_norm": 0.5509994626045227, "learning_rate": 0.0002, "epoch": 2.6854867897213173, "step": 3710}, {"loss": 1.3253, "grad_norm": 0.9093621969223022, "learning_rate": 0.0002, "epoch": 2.6927252985884906, "step": 3720}, {"loss": 1.2668, "grad_norm": 0.7525952458381653, "learning_rate": 0.0002, "epoch": 2.699963807455664, "step": 3730}, {"loss": 1.248, "grad_norm": 0.6737023591995239, "learning_rate": 0.0002, "epoch": 2.7072023163228374, "step": 3740}, {"loss": 1.2981, "grad_norm": 0.8656924962997437, "learning_rate": 0.0002, "epoch": 2.7144408251900107, "step": 3750}, {"loss": 1.2342, "grad_norm": 0.7494133114814758, "learning_rate": 0.0002, "epoch": 2.721679334057184, "step": 3760}, {"loss": 1.2417, "grad_norm": 0.5725520849227905, "learning_rate": 0.0002, "epoch": 2.7289178429243575, "step": 3770}, {"loss": 1.28, "grad_norm": 0.836412787437439, "learning_rate": 0.0002, "epoch": 2.736156351791531, "step": 3780}, {"loss": 1.3784, "grad_norm": 0.6893242597579956, "learning_rate": 0.0002, "epoch": 2.743394860658704, "step": 3790}, {"loss": 1.2929, "grad_norm": 0.6696223020553589, "learning_rate": 0.0002, "epoch": 2.7506333695258776, "step": 3800}, {"loss": 1.2449, "grad_norm": 0.6483015418052673, "learning_rate": 0.0002, "epoch": 2.757871878393051, "step": 3810}, {"loss": 1.3282, "grad_norm": 0.8084456920623779, "learning_rate": 0.0002, "epoch": 2.7651103872602243, "step": 3820}, {"loss": 1.3694, "grad_norm": 0.6601949334144592, "learning_rate": 0.0002, "epoch": 2.7723488961273977, "step": 3830}, {"loss": 1.3568, "grad_norm": 0.6905533671379089, "learning_rate": 0.0002, "epoch": 2.779587404994571, "step": 3840}, {"loss": 1.3854, "grad_norm": 0.619318425655365, "learning_rate": 0.0002, "epoch": 2.7868259138617444, "step": 3850}, {"loss": 1.2551, "grad_norm": 0.5994023084640503, "learning_rate": 0.0002, "epoch": 2.7940644227289178, "step": 3860}, {"loss": 1.2022, "grad_norm": 0.5627168416976929, "learning_rate": 0.0002, "epoch": 2.801302931596091, "step": 3870}, {"loss": 1.3921, "grad_norm": 0.6001605987548828, "learning_rate": 0.0002, "epoch": 2.8085414404632645, "step": 3880}, {"loss": 1.3026, "grad_norm": 0.6022412776947021, "learning_rate": 0.0002, "epoch": 2.815779949330438, "step": 3890}, {"loss": 1.2765, "grad_norm": 0.6832426190376282, "learning_rate": 0.0002, "epoch": 2.823018458197611, "step": 3900}, {"loss": 1.1363, "grad_norm": 0.5936811566352844, "learning_rate": 0.0002, "epoch": 2.8302569670647846, "step": 3910}, {"loss": 1.1707, "grad_norm": 0.6960572600364685, "learning_rate": 0.0002, "epoch": 2.837495475931958, "step": 3920}, {"loss": 1.4063, "grad_norm": 0.5913406610488892, "learning_rate": 0.0002, "epoch": 2.8447339847991313, "step": 3930}, {"loss": 1.3245, "grad_norm": 0.678154706954956, "learning_rate": 0.0002, "epoch": 2.8519724936663047, "step": 3940}, {"loss": 1.366, "grad_norm": 0.7898936867713928, "learning_rate": 0.0002, "epoch": 2.859211002533478, "step": 3950}, {"loss": 1.3948, "grad_norm": 0.9234195351600647, "learning_rate": 0.0002, "epoch": 2.8664495114006514, "step": 3960}, {"loss": 1.2773, "grad_norm": 0.5960825085639954, "learning_rate": 0.0002, "epoch": 2.8736880202678248, "step": 3970}, {"loss": 1.3127, "grad_norm": 0.677118182182312, "learning_rate": 0.0002, "epoch": 2.880926529134998, "step": 3980}, {"loss": 1.2652, "grad_norm": 0.6505142450332642, "learning_rate": 0.0002, "epoch": 2.8881650380021715, "step": 3990}, {"loss": 1.2078, "grad_norm": 0.550826907157898, "learning_rate": 0.0002, "epoch": 2.895403546869345, "step": 4000}, {"loss": 1.1811, "grad_norm": 0.6209215521812439, "learning_rate": 0.0002, "epoch": 2.9026420557365182, "step": 4010}, {"loss": 1.4001, "grad_norm": 0.6549018025398254, "learning_rate": 0.0002, "epoch": 2.9098805646036916, "step": 4020}, {"loss": 1.2285, "grad_norm": 0.570682168006897, "learning_rate": 0.0002, "epoch": 2.917119073470865, "step": 4030}, {"loss": 1.0832, "grad_norm": 1.1807632446289062, "learning_rate": 0.0002, "epoch": 2.9243575823380383, "step": 4040}, {"loss": 1.2693, "grad_norm": 0.7058857679367065, "learning_rate": 0.0002, "epoch": 2.9315960912052117, "step": 4050}, {"loss": 1.2905, "grad_norm": 0.5542812943458557, "learning_rate": 0.0002, "epoch": 2.938834600072385, "step": 4060}, {"loss": 1.33, "grad_norm": 0.63167804479599, "learning_rate": 0.0002, "epoch": 2.9460731089395584, "step": 4070}, {"loss": 1.3075, "grad_norm": 0.5702962279319763, "learning_rate": 0.0002, "epoch": 2.953311617806732, "step": 4080}, {"loss": 1.2007, "grad_norm": 0.620944082736969, "learning_rate": 0.0002, "epoch": 2.960550126673905, "step": 4090}, {"loss": 1.2864, "grad_norm": 0.5866289734840393, "learning_rate": 0.0002, "epoch": 2.9677886355410785, "step": 4100}, {"loss": 1.3293, "grad_norm": 0.560170590877533, "learning_rate": 0.0002, "epoch": 2.975027144408252, "step": 4110}, {"loss": 1.2071, "grad_norm": 0.675082802772522, "learning_rate": 0.0002, "epoch": 2.9822656532754253, "step": 4120}, {"loss": 1.2981, "grad_norm": 0.62708580493927, "learning_rate": 0.0002, "epoch": 2.9895041621425986, "step": 4130}, {"loss": 1.2758, "grad_norm": 0.7893929481506348, "learning_rate": 0.0002, "epoch": 2.996742671009772, "step": 4140}, {"eval_loss": 1.4217946529388428, "eval_runtime": 27.1596, "eval_samples_per_second": 16.053, "eval_steps_per_second": 2.025, "epoch": 2.9996380745566413, "step": 4144}, {"loss": 1.2152, "grad_norm": 0.7043836116790771, "learning_rate": 0.0002, "epoch": 3.0039811798769454, "step": 4150}, {"loss": 1.1664, "grad_norm": 0.6806283593177795, "learning_rate": 0.0002, "epoch": 3.0112196887441187, "step": 4160}, {"loss": 1.292, "grad_norm": 0.7684550285339355, "learning_rate": 0.0002, "epoch": 3.018458197611292, "step": 4170}, {"loss": 1.3467, "grad_norm": 0.7895237803459167, "learning_rate": 0.0002, "epoch": 3.0256967064784654, "step": 4180}, {"loss": 1.1324, "grad_norm": 0.7464531064033508, "learning_rate": 0.0002, "epoch": 3.032935215345639, "step": 4190}, {"loss": 1.1614, "grad_norm": 0.9358500838279724, "learning_rate": 0.0002, "epoch": 3.040173724212812, "step": 4200}, {"loss": 1.1834, "grad_norm": 1.1066628694534302, "learning_rate": 0.0002, "epoch": 3.0474122330799855, "step": 4210}, {"loss": 1.1557, "grad_norm": 0.6663267612457275, "learning_rate": 0.0002, "epoch": 3.054650741947159, "step": 4220}, {"loss": 1.1707, "grad_norm": 0.6669464707374573, "learning_rate": 0.0002, "epoch": 3.0618892508143323, "step": 4230}, {"loss": 1.1841, "grad_norm": 0.7052164077758789, "learning_rate": 0.0002, "epoch": 3.0691277596815056, "step": 4240}, {"loss": 1.2913, "grad_norm": 0.6118432879447937, "learning_rate": 0.0002, "epoch": 3.076366268548679, "step": 4250}, {"loss": 1.1526, "grad_norm": 0.6915903687477112, "learning_rate": 0.0002, "epoch": 3.0836047774158524, "step": 4260}, {"loss": 1.1348, "grad_norm": 0.7441644668579102, "learning_rate": 0.0002, "epoch": 3.0908432862830257, "step": 4270}, {"loss": 1.1672, "grad_norm": 0.823850691318512, "learning_rate": 0.0002, "epoch": 3.098081795150199, "step": 4280}, {"loss": 1.2655, "grad_norm": 0.9677883386611938, "learning_rate": 0.0002, "epoch": 3.1053203040173725, "step": 4290}, {"loss": 1.1794, "grad_norm": 0.7002579569816589, "learning_rate": 0.0002, "epoch": 3.112558812884546, "step": 4300}, {"loss": 1.135, "grad_norm": 0.778789758682251, "learning_rate": 0.0002, "epoch": 3.119797321751719, "step": 4310}, {"loss": 1.0818, "grad_norm": 0.7236007452011108, "learning_rate": 0.0002, "epoch": 3.1270358306188926, "step": 4320}, {"loss": 1.1803, "grad_norm": 0.8809133768081665, "learning_rate": 0.0002, "epoch": 3.134274339486066, "step": 4330}, {"loss": 1.2571, "grad_norm": 0.7924913167953491, "learning_rate": 0.0002, "epoch": 3.1415128483532393, "step": 4340}, {"loss": 1.1413, "grad_norm": 0.7437422275543213, "learning_rate": 0.0002, "epoch": 3.1487513572204127, "step": 4350}, {"loss": 1.2088, "grad_norm": 0.6428450345993042, "learning_rate": 0.0002, "epoch": 3.155989866087586, "step": 4360}, {"loss": 1.3032, "grad_norm": 0.7922873497009277, "learning_rate": 0.0002, "epoch": 3.1632283749547594, "step": 4370}, {"loss": 1.216, "grad_norm": 0.5252506732940674, "learning_rate": 0.0002, "epoch": 3.1704668838219328, "step": 4380}, {"loss": 1.1297, "grad_norm": 0.8570457696914673, "learning_rate": 0.0002, "epoch": 3.177705392689106, "step": 4390}, {"loss": 1.0994, "grad_norm": 0.7218987345695496, "learning_rate": 0.0002, "epoch": 3.1849439015562795, "step": 4400}, {"loss": 1.2891, "grad_norm": 0.6921393275260925, "learning_rate": 0.0002, "epoch": 3.192182410423453, "step": 4410}, {"loss": 1.2668, "grad_norm": 0.7386137843132019, "learning_rate": 0.0002, "epoch": 3.199420919290626, "step": 4420}, {"loss": 1.1654, "grad_norm": 0.6227759122848511, "learning_rate": 0.0002, "epoch": 3.2066594281577996, "step": 4430}, {"loss": 1.1752, "grad_norm": 0.7180278897285461, "learning_rate": 0.0002, "epoch": 3.213897937024973, "step": 4440}, {"loss": 1.1757, "grad_norm": 0.745830774307251, "learning_rate": 0.0002, "epoch": 3.2211364458921463, "step": 4450}, {"loss": 1.234, "grad_norm": 0.6766072511672974, "learning_rate": 0.0002, "epoch": 3.2283749547593197, "step": 4460}, {"loss": 1.1999, "grad_norm": 0.8325067162513733, "learning_rate": 0.0002, "epoch": 3.235613463626493, "step": 4470}, {"loss": 1.1606, "grad_norm": 0.7148305177688599, "learning_rate": 0.0002, "epoch": 3.2428519724936664, "step": 4480}, {"loss": 1.1383, "grad_norm": 0.7752676010131836, "learning_rate": 0.0002, "epoch": 3.25009048136084, "step": 4490}, {"loss": 1.3006, "grad_norm": 0.6776860952377319, "learning_rate": 0.0002, "epoch": 3.257328990228013, "step": 4500}, {"loss": 1.0796, "grad_norm": 0.704359769821167, "learning_rate": 0.0002, "epoch": 3.2645674990951865, "step": 4510}, {"loss": 1.2496, "grad_norm": 0.6880282163619995, "learning_rate": 0.0002, "epoch": 3.27180600796236, "step": 4520}, {"loss": 1.0947, "grad_norm": 0.8179270029067993, "learning_rate": 0.0002, "epoch": 3.2790445168295332, "step": 4530}, {"loss": 1.1909, "grad_norm": 0.6718448996543884, "learning_rate": 0.0002, "epoch": 3.2862830256967066, "step": 4540}, {"loss": 1.2708, "grad_norm": 0.8300657868385315, "learning_rate": 0.0002, "epoch": 3.29352153456388, "step": 4550}, {"loss": 1.2594, "grad_norm": 0.6433690786361694, "learning_rate": 0.0002, "epoch": 3.3007600434310533, "step": 4560}, {"loss": 1.2479, "grad_norm": 0.690262496471405, "learning_rate": 0.0002, "epoch": 3.3079985522982267, "step": 4570}, {"loss": 1.1342, "grad_norm": 0.7022852301597595, "learning_rate": 0.0002, "epoch": 3.3152370611654, "step": 4580}, {"loss": 1.0844, "grad_norm": 0.6438387632369995, "learning_rate": 0.0002, "epoch": 3.3224755700325734, "step": 4590}, {"loss": 1.17, "grad_norm": 0.6866899132728577, "learning_rate": 0.0002, "epoch": 3.329714078899747, "step": 4600}, {"loss": 1.1289, "grad_norm": 0.8233968019485474, "learning_rate": 0.0002, "epoch": 3.33695258776692, "step": 4610}, {"loss": 1.1855, "grad_norm": 0.7251574993133545, "learning_rate": 0.0002, "epoch": 3.3441910966340935, "step": 4620}, {"loss": 1.3403, "grad_norm": 0.7855110168457031, "learning_rate": 0.0002, "epoch": 3.351429605501267, "step": 4630}, {"loss": 1.2922, "grad_norm": 0.8487356305122375, "learning_rate": 0.0002, "epoch": 3.3586681143684403, "step": 4640}, {"loss": 1.2462, "grad_norm": 0.6429011225700378, "learning_rate": 0.0002, "epoch": 3.3659066232356136, "step": 4650}, {"loss": 1.129, "grad_norm": 0.7095270156860352, "learning_rate": 0.0002, "epoch": 3.373145132102787, "step": 4660}, {"loss": 1.262, "grad_norm": 0.6792303323745728, "learning_rate": 0.0002, "epoch": 3.3803836409699604, "step": 4670}, {"loss": 1.256, "grad_norm": 0.6784825921058655, "learning_rate": 0.0002, "epoch": 3.3876221498371337, "step": 4680}, {"loss": 1.0838, "grad_norm": 0.6362888216972351, "learning_rate": 0.0002, "epoch": 3.394860658704307, "step": 4690}, {"loss": 1.2165, "grad_norm": 0.7794778943061829, "learning_rate": 0.0002, "epoch": 3.4020991675714805, "step": 4700}, {"loss": 1.0644, "grad_norm": 0.7287485003471375, "learning_rate": 0.0002, "epoch": 3.409337676438654, "step": 4710}, {"loss": 1.2925, "grad_norm": 0.6481451392173767, "learning_rate": 0.0002, "epoch": 3.416576185305827, "step": 4720}, {"loss": 1.2121, "grad_norm": 0.9200371503829956, "learning_rate": 0.0002, "epoch": 3.4238146941730006, "step": 4730}, {"loss": 1.072, "grad_norm": 1.074180245399475, "learning_rate": 0.0002, "epoch": 3.431053203040174, "step": 4740}, {"loss": 1.0421, "grad_norm": 0.6722986698150635, "learning_rate": 0.0002, "epoch": 3.438291711907347, "step": 4750}, {"loss": 1.2258, "grad_norm": 0.7945933938026428, "learning_rate": 0.0002, "epoch": 3.44553022077452, "step": 4760}, {"loss": 1.0927, "grad_norm": 0.7624640464782715, "learning_rate": 0.0002, "epoch": 3.4527687296416936, "step": 4770}, {"loss": 1.2428, "grad_norm": 0.7763656377792358, "learning_rate": 0.0002, "epoch": 3.460007238508867, "step": 4780}, {"loss": 1.2584, "grad_norm": 0.7736947536468506, "learning_rate": 0.0002, "epoch": 3.4672457473760403, "step": 4790}, {"loss": 1.1953, "grad_norm": 0.8450354933738708, "learning_rate": 0.0002, "epoch": 3.4744842562432137, "step": 4800}, {"loss": 1.1362, "grad_norm": 0.6480133533477783, "learning_rate": 0.0002, "epoch": 3.481722765110387, "step": 4810}, {"loss": 1.1882, "grad_norm": 0.8437445759773254, "learning_rate": 0.0002, "epoch": 3.4889612739775604, "step": 4820}, {"loss": 1.1519, "grad_norm": 0.7781730890274048, "learning_rate": 0.0002, "epoch": 3.4961997828447338, "step": 4830}, {"loss": 1.1836, "grad_norm": 0.8523228168487549, "learning_rate": 0.0002, "epoch": 3.503438291711907, "step": 4840}, {"loss": 1.1672, "grad_norm": 0.6236732006072998, "learning_rate": 0.0002, "epoch": 3.5106768005790805, "step": 4850}, {"loss": 1.1926, "grad_norm": 0.7500787377357483, "learning_rate": 0.0002, "epoch": 3.517915309446254, "step": 4860}, {"loss": 1.1998, "grad_norm": 0.7665374875068665, "learning_rate": 0.0002, "epoch": 3.5251538183134272, "step": 4870}, {"loss": 1.1551, "grad_norm": 0.787857711315155, "learning_rate": 0.0002, "epoch": 3.5323923271806006, "step": 4880}, {"loss": 1.2758, "grad_norm": 0.970595121383667, "learning_rate": 0.0002, "epoch": 3.539630836047774, "step": 4890}, {"loss": 1.1274, "grad_norm": 0.6409347057342529, "learning_rate": 0.0002, "epoch": 3.5468693449149473, "step": 4900}, {"loss": 1.1596, "grad_norm": 0.888551652431488, "learning_rate": 0.0002, "epoch": 3.5541078537821207, "step": 4910}, {"loss": 1.1644, "grad_norm": 1.0808377265930176, "learning_rate": 0.0002, "epoch": 3.561346362649294, "step": 4920}, {"loss": 1.2564, "grad_norm": 0.7501053214073181, "learning_rate": 0.0002, "epoch": 3.5685848715164674, "step": 4930}, {"loss": 1.2351, "grad_norm": 0.7375240325927734, "learning_rate": 0.0002, "epoch": 3.575823380383641, "step": 4940}, {"loss": 1.3568, "grad_norm": 0.7075039744377136, "learning_rate": 0.0002, "epoch": 3.583061889250814, "step": 4950}, {"loss": 1.3355, "grad_norm": 0.939337432384491, "learning_rate": 0.0002, "epoch": 3.5903003981179875, "step": 4960}, {"loss": 1.1722, "grad_norm": 0.6717396974563599, "learning_rate": 0.0002, "epoch": 3.597538906985161, "step": 4970}, {"loss": 1.1186, "grad_norm": 0.7141643762588501, "learning_rate": 0.0002, "epoch": 3.6047774158523342, "step": 4980}, {"loss": 1.1011, "grad_norm": 0.7109216451644897, "learning_rate": 0.0002, "epoch": 3.6120159247195076, "step": 4990}, {"loss": 1.2178, "grad_norm": 0.7020776867866516, "learning_rate": 0.0002, "epoch": 3.619254433586681, "step": 5000}, {"loss": 1.1939, "grad_norm": 0.7158873677253723, "learning_rate": 0.0002, "epoch": 3.6264929424538543, "step": 5010}, {"loss": 1.2624, "grad_norm": 0.7062035202980042, "learning_rate": 0.0002, "epoch": 3.6337314513210277, "step": 5020}, {"loss": 1.0224, "grad_norm": 0.7081155776977539, "learning_rate": 0.0002, "epoch": 3.640969960188201, "step": 5030}, {"loss": 1.2195, "grad_norm": 1.2210607528686523, "learning_rate": 0.0002, "epoch": 3.6482084690553744, "step": 5040}, {"loss": 1.2596, "grad_norm": 0.6650236248970032, "learning_rate": 0.0002, "epoch": 3.655446977922548, "step": 5050}, {"loss": 1.1072, "grad_norm": 0.6884829998016357, "learning_rate": 0.0002, "epoch": 3.662685486789721, "step": 5060}, {"loss": 1.2292, "grad_norm": 0.7317819595336914, "learning_rate": 0.0002, "epoch": 3.6699239956568945, "step": 5070}, {"loss": 1.1917, "grad_norm": 0.7406691908836365, "learning_rate": 0.0002, "epoch": 3.677162504524068, "step": 5080}, {"loss": 1.2949, "grad_norm": 0.9009454250335693, "learning_rate": 0.0002, "epoch": 3.6844010133912413, "step": 5090}, {"loss": 1.1528, "grad_norm": 0.8189385533332825, "learning_rate": 0.0002, "epoch": 3.6916395222584146, "step": 5100}, {"loss": 1.3408, "grad_norm": 1.0793628692626953, "learning_rate": 0.0002, "epoch": 3.698878031125588, "step": 5110}, {"loss": 1.2417, "grad_norm": 0.8593027591705322, "learning_rate": 0.0002, "epoch": 3.7061165399927614, "step": 5120}, {"loss": 1.2141, "grad_norm": 0.8481812477111816, "learning_rate": 0.0002, "epoch": 3.7133550488599347, "step": 5130}, {"loss": 1.125, "grad_norm": 0.6527451276779175, "learning_rate": 0.0002, "epoch": 3.720593557727108, "step": 5140}, {"loss": 1.1584, "grad_norm": 0.9220114350318909, "learning_rate": 0.0002, "epoch": 3.7278320665942815, "step": 5150}, {"loss": 1.2267, "grad_norm": 1.0842019319534302, "learning_rate": 0.0002, "epoch": 3.735070575461455, "step": 5160}, {"loss": 1.3083, "grad_norm": 0.965453565120697, "learning_rate": 0.0002, "epoch": 3.742309084328628, "step": 5170}, {"loss": 1.1772, "grad_norm": 0.9903319478034973, "learning_rate": 0.0002, "epoch": 3.7495475931958016, "step": 5180}, {"loss": 1.2515, "grad_norm": 0.7434818148612976, "learning_rate": 0.0002, "epoch": 3.756786102062975, "step": 5190}, {"loss": 1.2631, "grad_norm": 0.6717280745506287, "learning_rate": 0.0002, "epoch": 3.7640246109301483, "step": 5200}, {"loss": 1.2012, "grad_norm": 0.7754665613174438, "learning_rate": 0.0002, "epoch": 3.7712631197973217, "step": 5210}, {"loss": 1.305, "grad_norm": 1.028374433517456, "learning_rate": 0.0002, "epoch": 3.778501628664495, "step": 5220}, {"loss": 1.1866, "grad_norm": 0.6026996374130249, "learning_rate": 0.0002, "epoch": 3.7857401375316684, "step": 5230}, {"loss": 1.1901, "grad_norm": 0.6978490948677063, "learning_rate": 0.0002, "epoch": 3.7929786463988417, "step": 5240}, {"loss": 1.2576, "grad_norm": 0.7303446531295776, "learning_rate": 0.0002, "epoch": 3.800217155266015, "step": 5250}, {"loss": 1.3173, "grad_norm": 1.0734210014343262, "learning_rate": 0.0002, "epoch": 3.8074556641331885, "step": 5260}, {"loss": 1.1137, "grad_norm": 0.6383201479911804, "learning_rate": 0.0002, "epoch": 3.814694173000362, "step": 5270}, {"loss": 1.0904, "grad_norm": 0.7742630243301392, "learning_rate": 0.0002, "epoch": 3.821932681867535, "step": 5280}, {"loss": 1.2232, "grad_norm": 0.8477074503898621, "learning_rate": 0.0002, "epoch": 3.8291711907347086, "step": 5290}, {"loss": 1.2047, "grad_norm": 0.6675317883491516, "learning_rate": 0.0002, "epoch": 3.836409699601882, "step": 5300}, {"loss": 1.2275, "grad_norm": 0.7515445351600647, "learning_rate": 0.0002, "epoch": 3.8436482084690553, "step": 5310}, {"loss": 1.2569, "grad_norm": 1.1441220045089722, "learning_rate": 0.0002, "epoch": 3.8508867173362287, "step": 5320}, {"loss": 1.1512, "grad_norm": 0.7968795895576477, "learning_rate": 0.0002, "epoch": 3.858125226203402, "step": 5330}, {"loss": 1.232, "grad_norm": 0.7842824459075928, "learning_rate": 0.0002, "epoch": 3.8653637350705754, "step": 5340}, {"loss": 1.1847, "grad_norm": 0.8272225260734558, "learning_rate": 0.0002, "epoch": 3.8726022439377488, "step": 5350}, {"loss": 1.1381, "grad_norm": 0.8413397669792175, "learning_rate": 0.0002, "epoch": 3.879840752804922, "step": 5360}, {"loss": 1.2349, "grad_norm": 1.141764760017395, "learning_rate": 0.0002, "epoch": 3.8870792616720955, "step": 5370}, {"loss": 1.212, "grad_norm": 0.9826975464820862, "learning_rate": 0.0002, "epoch": 3.894317770539269, "step": 5380}, {"loss": 1.1833, "grad_norm": 0.8598255515098572, "learning_rate": 0.0002, "epoch": 3.9015562794064422, "step": 5390}, {"loss": 1.1247, "grad_norm": 0.6271058320999146, "learning_rate": 0.0002, "epoch": 3.9087947882736156, "step": 5400}, {"loss": 1.2212, "grad_norm": 0.6379870772361755, "learning_rate": 0.0002, "epoch": 3.916033297140789, "step": 5410}, {"loss": 1.2481, "grad_norm": 1.0313376188278198, "learning_rate": 0.0002, "epoch": 3.9232718060079623, "step": 5420}, {"loss": 1.1872, "grad_norm": 0.8220619559288025, "learning_rate": 0.0002, "epoch": 3.9305103148751357, "step": 5430}, {"loss": 1.2006, "grad_norm": 0.7576116919517517, "learning_rate": 0.0002, "epoch": 3.937748823742309, "step": 5440}, {"loss": 1.1969, "grad_norm": 1.226235032081604, "learning_rate": 0.0002, "epoch": 3.9449873326094824, "step": 5450}, {"loss": 1.2945, "grad_norm": 0.7979229688644409, "learning_rate": 0.0002, "epoch": 3.952225841476656, "step": 5460}, {"loss": 1.1922, "grad_norm": 0.9911929965019226, "learning_rate": 0.0002, "epoch": 3.959464350343829, "step": 5470}, {"loss": 1.0924, "grad_norm": 0.643738865852356, "learning_rate": 0.0002, "epoch": 3.9667028592110025, "step": 5480}, {"loss": 1.0607, "grad_norm": 0.682305634021759, "learning_rate": 0.0002, "epoch": 3.973941368078176, "step": 5490}, {"loss": 1.2908, "grad_norm": 1.18373441696167, "learning_rate": 0.0002, "epoch": 3.9811798769453492, "step": 5500}, {"loss": 1.0889, "grad_norm": 0.7190203070640564, "learning_rate": 0.0002, "epoch": 3.9884183858125226, "step": 5510}, {"loss": 1.2745, "grad_norm": 0.7516948580741882, "learning_rate": 0.0002, "epoch": 3.995656894679696, "step": 5520}, {"eval_loss": 1.4252897500991821, "eval_runtime": 27.235, "eval_samples_per_second": 16.009, "eval_steps_per_second": 2.019, "epoch": 4.0, "step": 5526}, {"loss": 1.0088, "grad_norm": 0.6353074312210083, "learning_rate": 0.0002, "epoch": 4.002895403546869, "step": 5530}, {"loss": 1.0326, "grad_norm": 0.7424906492233276, "learning_rate": 0.0002, "epoch": 4.010133912414043, "step": 5540}, {"loss": 1.0667, "grad_norm": 0.8856638073921204, "learning_rate": 0.0002, "epoch": 4.017372421281216, "step": 5550}, {"loss": 1.0905, "grad_norm": 0.9627974033355713, "learning_rate": 0.0002, "epoch": 4.024610930148389, "step": 5560}, {"loss": 1.0965, "grad_norm": 0.9048978686332703, "learning_rate": 0.0002, "epoch": 4.031849439015563, "step": 5570}, {"loss": 1.1108, "grad_norm": 0.921119213104248, "learning_rate": 0.0002, "epoch": 4.039087947882736, "step": 5580}, {"loss": 1.1235, "grad_norm": 0.8654361963272095, "learning_rate": 0.0002, "epoch": 4.0463264567499095, "step": 5590}, {"loss": 1.0794, "grad_norm": 0.7947945594787598, "learning_rate": 0.0002, "epoch": 4.053564965617083, "step": 5600}, {"loss": 1.0674, "grad_norm": 0.8307326436042786, "learning_rate": 0.0002, "epoch": 4.060803474484256, "step": 5610}, {"loss": 1.0076, "grad_norm": 0.793273389339447, "learning_rate": 0.0002, "epoch": 4.06804198335143, "step": 5620}, {"loss": 1.0651, "grad_norm": 0.8748673796653748, "learning_rate": 0.0002, "epoch": 4.075280492218603, "step": 5630}, {"loss": 1.111, "grad_norm": 0.7926856279373169, "learning_rate": 0.0002, "epoch": 4.082519001085776, "step": 5640}, {"loss": 1.044, "grad_norm": 0.922645092010498, "learning_rate": 0.0002, "epoch": 4.08975750995295, "step": 5650}, {"loss": 1.109, "grad_norm": 0.9539641737937927, "learning_rate": 0.0002, "epoch": 4.096996018820123, "step": 5660}, {"loss": 1.0788, "grad_norm": 0.8674443364143372, "learning_rate": 0.0002, "epoch": 4.1042345276872965, "step": 5670}, {"loss": 0.9867, "grad_norm": 0.7097609043121338, "learning_rate": 0.0002, "epoch": 4.11147303655447, "step": 5680}, {"loss": 1.1154, "grad_norm": 0.8875522613525391, "learning_rate": 0.0002, "epoch": 4.118711545421643, "step": 5690}, {"loss": 1.1217, "grad_norm": 0.8583634495735168, "learning_rate": 0.0002, "epoch": 4.125950054288817, "step": 5700}, {"loss": 1.0973, "grad_norm": 0.6736377477645874, "learning_rate": 0.0002, "epoch": 4.13318856315599, "step": 5710}, {"loss": 1.1199, "grad_norm": 0.9349062442779541, "learning_rate": 0.0002, "epoch": 4.140427072023163, "step": 5720}, {"loss": 1.0508, "grad_norm": 1.0610365867614746, "learning_rate": 0.0002, "epoch": 4.147665580890337, "step": 5730}, {"loss": 1.1146, "grad_norm": 1.5838189125061035, "learning_rate": 0.0002, "epoch": 4.15490408975751, "step": 5740}, {"loss": 1.0222, "grad_norm": 0.747522234916687, "learning_rate": 0.0002, "epoch": 4.162142598624683, "step": 5750}, {"loss": 1.1328, "grad_norm": 1.3247915506362915, "learning_rate": 0.0002, "epoch": 4.169381107491857, "step": 5760}, {"loss": 1.1655, "grad_norm": 0.8750247955322266, "learning_rate": 0.0002, "epoch": 4.17661961635903, "step": 5770}, {"loss": 1.199, "grad_norm": 0.7914144992828369, "learning_rate": 0.0002, "epoch": 4.1838581252262035, "step": 5780}, {"loss": 1.1213, "grad_norm": 0.9493299126625061, "learning_rate": 0.0002, "epoch": 4.191096634093377, "step": 5790}, {"loss": 1.1515, "grad_norm": 0.7802295088768005, "learning_rate": 0.0002, "epoch": 4.19833514296055, "step": 5800}, {"loss": 1.0704, "grad_norm": 0.6987314820289612, "learning_rate": 0.0002, "epoch": 4.205573651827724, "step": 5810}, {"loss": 1.1699, "grad_norm": 0.9220341444015503, "learning_rate": 0.0002, "epoch": 4.212812160694897, "step": 5820}, {"loss": 1.1394, "grad_norm": 0.8932939767837524, "learning_rate": 0.0002, "epoch": 4.22005066956207, "step": 5830}, {"loss": 1.0048, "grad_norm": 0.920002818107605, "learning_rate": 0.0002, "epoch": 4.227289178429244, "step": 5840}, {"loss": 0.964, "grad_norm": 0.6662752032279968, "learning_rate": 0.0002, "epoch": 4.234527687296417, "step": 5850}, {"loss": 0.986, "grad_norm": 0.8679718971252441, "learning_rate": 0.0002, "epoch": 4.24176619616359, "step": 5860}, {"loss": 0.8991, "grad_norm": 0.7020887732505798, "learning_rate": 0.0002, "epoch": 4.249004705030764, "step": 5870}, {"loss": 1.1132, "grad_norm": 0.869611382484436, "learning_rate": 0.0002, "epoch": 4.256243213897937, "step": 5880}, {"loss": 1.1026, "grad_norm": 0.7796585559844971, "learning_rate": 0.0002, "epoch": 4.2634817227651105, "step": 5890}, {"loss": 1.0957, "grad_norm": 0.8978819251060486, "learning_rate": 0.0002, "epoch": 4.270720231632284, "step": 5900}, {"loss": 1.1325, "grad_norm": 1.0837205648422241, "learning_rate": 0.0002, "epoch": 4.277958740499457, "step": 5910}, {"loss": 1.1279, "grad_norm": 0.7584353089332581, "learning_rate": 0.0002, "epoch": 4.285197249366631, "step": 5920}, {"loss": 1.0513, "grad_norm": 0.7313185334205627, "learning_rate": 0.0002, "epoch": 4.292435758233804, "step": 5930}, {"loss": 1.1101, "grad_norm": 0.8004671335220337, "learning_rate": 0.0002, "epoch": 4.299674267100977, "step": 5940}, {"loss": 1.14, "grad_norm": 2.154958724975586, "learning_rate": 0.0002, "epoch": 4.306912775968151, "step": 5950}, {"loss": 1.1206, "grad_norm": 0.9163479804992676, "learning_rate": 0.0002, "epoch": 4.314151284835324, "step": 5960}, {"loss": 0.9941, "grad_norm": 0.9151589274406433, "learning_rate": 0.0002, "epoch": 4.321389793702497, "step": 5970}, {"loss": 1.0606, "grad_norm": 0.8624112010002136, "learning_rate": 0.0002, "epoch": 4.328628302569671, "step": 5980}, {"loss": 1.1625, "grad_norm": 0.9357741475105286, "learning_rate": 0.0002, "epoch": 4.335866811436844, "step": 5990}, {"loss": 1.0712, "grad_norm": 1.3482335805892944, "learning_rate": 0.0002, "epoch": 4.3431053203040175, "step": 6000}, {"loss": 1.1224, "grad_norm": 0.7156149744987488, "learning_rate": 0.0002, "epoch": 4.350343829171191, "step": 6010}, {"loss": 1.0753, "grad_norm": 0.8480049967765808, "learning_rate": 0.0002, "epoch": 4.357582338038364, "step": 6020}, {"loss": 1.051, "grad_norm": 0.8262244462966919, "learning_rate": 0.0002, "epoch": 4.364820846905538, "step": 6030}, {"loss": 0.9966, "grad_norm": 0.7733905911445618, "learning_rate": 0.0002, "epoch": 4.372059355772711, "step": 6040}, {"loss": 1.1008, "grad_norm": 0.8553919792175293, "learning_rate": 0.0002, "epoch": 4.379297864639884, "step": 6050}, {"loss": 1.1777, "grad_norm": 0.8666832447052002, "learning_rate": 0.0002, "epoch": 4.386536373507058, "step": 6060}, {"loss": 1.1934, "grad_norm": 0.9168295860290527, "learning_rate": 0.0002, "epoch": 4.393774882374231, "step": 6070}, {"loss": 1.0988, "grad_norm": 0.7315238118171692, "learning_rate": 0.0002, "epoch": 4.4010133912414044, "step": 6080}, {"loss": 1.1599, "grad_norm": 1.020263433456421, "learning_rate": 0.0002, "epoch": 4.408251900108578, "step": 6090}, {"loss": 1.133, "grad_norm": 0.9978243708610535, "learning_rate": 0.0002, "epoch": 4.415490408975751, "step": 6100}, {"loss": 1.1324, "grad_norm": 0.995453953742981, "learning_rate": 0.0002, "epoch": 4.4227289178429245, "step": 6110}, {"loss": 1.0957, "grad_norm": 0.9360884428024292, "learning_rate": 0.0002, "epoch": 4.429967426710098, "step": 6120}, {"loss": 0.9506, "grad_norm": 0.8099448084831238, "learning_rate": 0.0002, "epoch": 4.437205935577271, "step": 6130}, {"loss": 1.0887, "grad_norm": 0.8173841238021851, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 6140}, {"loss": 1.1219, "grad_norm": 0.7972666025161743, "learning_rate": 0.0002, "epoch": 4.451682953311618, "step": 6150}, {"loss": 1.0226, "grad_norm": 0.7685779333114624, "learning_rate": 0.0002, "epoch": 4.458921462178791, "step": 6160}, {"loss": 1.0732, "grad_norm": 0.7872623801231384, "learning_rate": 0.0002, "epoch": 4.466159971045965, "step": 6170}, {"loss": 0.9911, "grad_norm": 0.7677070498466492, "learning_rate": 0.0002, "epoch": 4.473398479913138, "step": 6180}, {"loss": 1.0919, "grad_norm": 0.7878316044807434, "learning_rate": 0.0002, "epoch": 4.4806369887803115, "step": 6190}, {"loss": 1.018, "grad_norm": 0.8178079724311829, "learning_rate": 0.0002, "epoch": 4.487875497647485, "step": 6200}, {"loss": 1.0517, "grad_norm": 1.2820082902908325, "learning_rate": 0.0002, "epoch": 4.495114006514658, "step": 6210}, {"loss": 1.3101, "grad_norm": 0.9380832314491272, "learning_rate": 0.0002, "epoch": 4.502352515381832, "step": 6220}, {"loss": 0.9818, "grad_norm": 0.7810422778129578, "learning_rate": 0.0002, "epoch": 4.509591024249005, "step": 6230}, {"loss": 1.1677, "grad_norm": 1.1022917032241821, "learning_rate": 0.0002, "epoch": 4.516829533116178, "step": 6240}, {"loss": 1.1579, "grad_norm": 1.4275553226470947, "learning_rate": 0.0002, "epoch": 4.524068041983352, "step": 6250}, {"loss": 1.3237, "grad_norm": 0.7597777247428894, "learning_rate": 0.0002, "epoch": 4.531306550850525, "step": 6260}, {"loss": 1.1529, "grad_norm": 1.10992431640625, "learning_rate": 0.0002, "epoch": 4.538545059717698, "step": 6270}, {"loss": 1.0732, "grad_norm": 0.8981178998947144, "learning_rate": 0.0002, "epoch": 4.545783568584872, "step": 6280}, {"loss": 1.086, "grad_norm": 0.7863979339599609, "learning_rate": 0.0002, "epoch": 4.553022077452045, "step": 6290}, {"loss": 1.2008, "grad_norm": 0.9071474671363831, "learning_rate": 0.0002, "epoch": 4.5602605863192185, "step": 6300}, {"loss": 1.0916, "grad_norm": 0.7429424524307251, "learning_rate": 0.0002, "epoch": 4.567499095186392, "step": 6310}, {"loss": 1.095, "grad_norm": 1.0767850875854492, "learning_rate": 0.0002, "epoch": 4.574737604053565, "step": 6320}, {"loss": 1.1023, "grad_norm": 0.7885915637016296, "learning_rate": 0.0002, "epoch": 4.581976112920739, "step": 6330}, {"loss": 1.1131, "grad_norm": 0.8350457549095154, "learning_rate": 0.0002, "epoch": 4.589214621787912, "step": 6340}, {"loss": 1.0743, "grad_norm": 0.7853530645370483, "learning_rate": 0.0002, "epoch": 4.596453130655085, "step": 6350}, {"loss": 1.1912, "grad_norm": 1.1220661401748657, "learning_rate": 0.0002, "epoch": 4.603691639522259, "step": 6360}, {"loss": 1.0927, "grad_norm": 0.7959423065185547, "learning_rate": 0.0002, "epoch": 4.610930148389432, "step": 6370}, {"loss": 1.1542, "grad_norm": 0.7782652378082275, "learning_rate": 0.0002, "epoch": 4.618168657256605, "step": 6380}, {"loss": 1.0753, "grad_norm": 0.7882203459739685, "learning_rate": 0.0002, "epoch": 4.625407166123779, "step": 6390}, {"loss": 1.0676, "grad_norm": 0.8841899037361145, "learning_rate": 0.0002, "epoch": 4.632645674990952, "step": 6400}, {"loss": 1.0815, "grad_norm": 0.7936127781867981, "learning_rate": 0.0002, "epoch": 4.6398841838581255, "step": 6410}, {"loss": 1.0198, "grad_norm": 0.9213966131210327, "learning_rate": 0.0002, "epoch": 4.647122692725299, "step": 6420}, {"loss": 0.9872, "grad_norm": 0.9246473908424377, "learning_rate": 0.0002, "epoch": 4.654361201592472, "step": 6430}, {"loss": 1.1309, "grad_norm": 0.766572892665863, "learning_rate": 0.0002, "epoch": 4.661599710459646, "step": 6440}, {"loss": 1.1095, "grad_norm": 0.8596171736717224, "learning_rate": 0.0002, "epoch": 4.668838219326819, "step": 6450}, {"loss": 1.1869, "grad_norm": 0.8482751846313477, "learning_rate": 0.0002, "epoch": 4.676076728193992, "step": 6460}, {"loss": 1.0622, "grad_norm": 1.0826905965805054, "learning_rate": 0.0002, "epoch": 4.683315237061166, "step": 6470}, {"loss": 1.0256, "grad_norm": 1.1048457622528076, "learning_rate": 0.0002, "epoch": 4.690553745928339, "step": 6480}, {"loss": 1.0514, "grad_norm": 0.9429134726524353, "learning_rate": 0.0002, "epoch": 4.697792254795512, "step": 6490}, {"loss": 1.1351, "grad_norm": 0.8587502837181091, "learning_rate": 0.0002, "epoch": 4.705030763662686, "step": 6500}, {"loss": 1.0969, "grad_norm": 1.0387083292007446, "learning_rate": 0.0002, "epoch": 4.712269272529859, "step": 6510}, {"loss": 1.0493, "grad_norm": 0.7471951842308044, "learning_rate": 0.0002, "epoch": 4.7195077813970325, "step": 6520}, {"loss": 1.2632, "grad_norm": 0.8800424933433533, "learning_rate": 0.0002, "epoch": 4.726746290264206, "step": 6530}, {"loss": 1.2126, "grad_norm": 0.8136811852455139, "learning_rate": 0.0002, "epoch": 4.733984799131379, "step": 6540}, {"loss": 1.195, "grad_norm": 0.9910339713096619, "learning_rate": 0.0002, "epoch": 4.741223307998553, "step": 6550}, {"loss": 1.1201, "grad_norm": 1.0679163932800293, "learning_rate": 0.0002, "epoch": 4.748461816865726, "step": 6560}, {"loss": 1.0297, "grad_norm": 0.8468248248100281, "learning_rate": 0.0002, "epoch": 4.755700325732899, "step": 6570}, {"loss": 1.0858, "grad_norm": 0.8771235942840576, "learning_rate": 0.0002, "epoch": 4.762938834600073, "step": 6580}, {"loss": 1.077, "grad_norm": 0.7024846076965332, "learning_rate": 0.0002, "epoch": 4.770177343467246, "step": 6590}, {"loss": 1.0876, "grad_norm": 0.7836683392524719, "learning_rate": 0.0002, "epoch": 4.7774158523344195, "step": 6600}, {"loss": 1.1006, "grad_norm": 0.7717288136482239, "learning_rate": 0.0002, "epoch": 4.784654361201593, "step": 6610}, {"loss": 1.0376, "grad_norm": 0.884183943271637, "learning_rate": 0.0002, "epoch": 4.791892870068766, "step": 6620}, {"loss": 1.1757, "grad_norm": 1.383867621421814, "learning_rate": 0.0002, "epoch": 4.7991313789359396, "step": 6630}, {"loss": 1.0861, "grad_norm": 0.9741523861885071, "learning_rate": 0.0002, "epoch": 4.806369887803113, "step": 6640}, {"loss": 1.0884, "grad_norm": 0.9723693132400513, "learning_rate": 0.0002, "epoch": 4.813608396670286, "step": 6650}, {"loss": 1.2203, "grad_norm": 1.8324809074401855, "learning_rate": 0.0002, "epoch": 4.82084690553746, "step": 6660}, {"loss": 1.0292, "grad_norm": 0.904909074306488, "learning_rate": 0.0002, "epoch": 4.828085414404633, "step": 6670}, {"loss": 1.0349, "grad_norm": 0.7355411648750305, "learning_rate": 0.0002, "epoch": 4.835323923271806, "step": 6680}, {"loss": 1.0793, "grad_norm": 0.8934960961341858, "learning_rate": 0.0002, "epoch": 4.84256243213898, "step": 6690}, {"loss": 1.0375, "grad_norm": 1.4596954584121704, "learning_rate": 0.0002, "epoch": 4.849800941006153, "step": 6700}, {"loss": 1.1065, "grad_norm": 0.8310341238975525, "learning_rate": 0.0002, "epoch": 4.8570394498733265, "step": 6710}, {"loss": 1.1089, "grad_norm": 0.9709894061088562, "learning_rate": 0.0002, "epoch": 4.8642779587405, "step": 6720}, {"loss": 1.0069, "grad_norm": 0.852142333984375, "learning_rate": 0.0002, "epoch": 4.871516467607673, "step": 6730}, {"loss": 1.0507, "grad_norm": 1.0643625259399414, "learning_rate": 0.0002, "epoch": 4.878754976474847, "step": 6740}, {"loss": 1.056, "grad_norm": 0.9419508576393127, "learning_rate": 0.0002, "epoch": 4.88599348534202, "step": 6750}, {"loss": 1.1995, "grad_norm": 1.1818498373031616, "learning_rate": 0.0002, "epoch": 4.893231994209193, "step": 6760}, {"loss": 1.0925, "grad_norm": 0.9369569420814514, "learning_rate": 0.0002, "epoch": 4.900470503076367, "step": 6770}, {"loss": 1.1648, "grad_norm": 0.7012579441070557, "learning_rate": 0.0002, "epoch": 4.90770901194354, "step": 6780}, {"loss": 1.0926, "grad_norm": 0.9109319448471069, "learning_rate": 0.0002, "epoch": 4.914947520810713, "step": 6790}, {"loss": 1.0358, "grad_norm": 0.8077534437179565, "learning_rate": 0.0002, "epoch": 4.922186029677887, "step": 6800}, {"loss": 1.2549, "grad_norm": 0.7571148872375488, "learning_rate": 0.0002, "epoch": 4.92942453854506, "step": 6810}, {"loss": 0.9638, "grad_norm": 0.7325633764266968, "learning_rate": 0.0002, "epoch": 4.9366630474122335, "step": 6820}, {"loss": 1.0128, "grad_norm": 0.8465084433555603, "learning_rate": 0.0002, "epoch": 4.943901556279407, "step": 6830}, {"loss": 1.153, "grad_norm": 0.8753737807273865, "learning_rate": 0.0002, "epoch": 4.95114006514658, "step": 6840}, {"loss": 1.0247, "grad_norm": 0.9421748518943787, "learning_rate": 0.0002, "epoch": 4.958378574013754, "step": 6850}, {"loss": 1.1483, "grad_norm": 0.8245896697044373, "learning_rate": 0.0002, "epoch": 4.965617082880927, "step": 6860}, {"loss": 0.9905, "grad_norm": 0.8823089599609375, "learning_rate": 0.0002, "epoch": 4.9728555917481, "step": 6870}, {"loss": 1.1664, "grad_norm": 0.8406389355659485, "learning_rate": 0.0002, "epoch": 4.980094100615274, "step": 6880}, {"loss": 1.0944, "grad_norm": 0.9732868075370789, "learning_rate": 0.0002, "epoch": 4.987332609482447, "step": 6890}, {"loss": 1.1776, "grad_norm": 2.125141143798828, "learning_rate": 0.0002, "epoch": 4.99457111834962, "step": 6900}]} +{"epoch": 6.0, "step": 8289, "epoch_duration": 1227.6048402786255, "total_accumulated_duration": 7895.64093542099, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15079.2998046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7061, "grad_norm": 1.2523442506790161, "learning_rate": 0.0002, "epoch": 0.007238508867173362, "step": 10}, {"loss": 3.3493, "grad_norm": 1.8887330293655396, "learning_rate": 0.0002, "epoch": 0.014477017734346724, "step": 20}, {"loss": 2.7585, "grad_norm": 0.9668035507202148, "learning_rate": 0.0002, "epoch": 0.021715526601520086, "step": 30}, {"loss": 2.3699, "grad_norm": 2.9167306423187256, "learning_rate": 0.0002, "epoch": 0.028954035468693448, "step": 40}, {"loss": 2.2679, "grad_norm": 2.649867296218872, "learning_rate": 0.0002, "epoch": 0.036192544335866814, "step": 50}, {"loss": 2.2202, "grad_norm": 1.5120655298233032, "learning_rate": 0.0002, "epoch": 0.04343105320304017, "step": 60}, {"loss": 2.2026, "grad_norm": 0.7879868149757385, "learning_rate": 0.0002, "epoch": 0.05066956207021354, "step": 70}, {"loss": 1.9447, "grad_norm": 0.7616953253746033, "learning_rate": 0.0002, "epoch": 0.057908070937386896, "step": 80}, {"loss": 2.0112, "grad_norm": 1.8809149265289307, "learning_rate": 0.0002, "epoch": 0.06514657980456026, "step": 90}, {"loss": 1.8337, "grad_norm": 0.9294016361236572, "learning_rate": 0.0002, "epoch": 0.07238508867173363, "step": 100}, {"loss": 1.8419, "grad_norm": 0.7145281434059143, "learning_rate": 0.0002, "epoch": 0.07962359753890698, "step": 110}, {"loss": 2.0036, "grad_norm": 0.7564446330070496, "learning_rate": 0.0002, "epoch": 0.08686210640608034, "step": 120}, {"loss": 1.9306, "grad_norm": 1.1681925058364868, "learning_rate": 0.0002, "epoch": 0.09410061527325371, "step": 130}, {"loss": 1.7875, "grad_norm": 0.6708641648292542, "learning_rate": 0.0002, "epoch": 0.10133912414042708, "step": 140}, {"loss": 1.786, "grad_norm": 0.7625647783279419, "learning_rate": 0.0002, "epoch": 0.10857763300760044, "step": 150}, {"loss": 1.6687, "grad_norm": 0.8463464975357056, "learning_rate": 0.0002, "epoch": 0.11581614187477379, "step": 160}, {"loss": 1.6214, "grad_norm": 0.7502335906028748, "learning_rate": 0.0002, "epoch": 0.12305465074194716, "step": 170}, {"loss": 1.7433, "grad_norm": 0.6929958462715149, "learning_rate": 0.0002, "epoch": 0.13029315960912052, "step": 180}, {"loss": 1.6009, "grad_norm": 0.6798707842826843, "learning_rate": 0.0002, "epoch": 0.1375316684762939, "step": 190}, {"loss": 1.6208, "grad_norm": 0.7566508650779724, "learning_rate": 0.0002, "epoch": 0.14477017734346725, "step": 200}, {"loss": 1.5823, "grad_norm": 0.7196869850158691, "learning_rate": 0.0002, "epoch": 0.15200868621064062, "step": 210}, {"loss": 1.738, "grad_norm": 0.8401045799255371, "learning_rate": 0.0002, "epoch": 0.15924719507781396, "step": 220}, {"loss": 1.7574, "grad_norm": 0.8503773212432861, "learning_rate": 0.0002, "epoch": 0.16648570394498732, "step": 230}, {"loss": 1.7861, "grad_norm": 0.7183733582496643, "learning_rate": 0.0002, "epoch": 0.1737242128121607, "step": 240}, {"loss": 1.6693, "grad_norm": 0.7082605957984924, "learning_rate": 0.0002, "epoch": 0.18096272167933405, "step": 250}, {"loss": 1.619, "grad_norm": 0.9386326670646667, "learning_rate": 0.0002, "epoch": 0.18820123054650742, "step": 260}, {"loss": 1.6511, "grad_norm": 0.7332451939582825, "learning_rate": 0.0002, "epoch": 0.19543973941368079, "step": 270}, {"loss": 1.6353, "grad_norm": 0.7092869877815247, "learning_rate": 0.0002, "epoch": 0.20267824828085415, "step": 280}, {"loss": 1.5996, "grad_norm": 0.7256413698196411, "learning_rate": 0.0002, "epoch": 0.20991675714802752, "step": 290}, {"loss": 1.6754, "grad_norm": 0.6398681402206421, "learning_rate": 0.0002, "epoch": 0.21715526601520088, "step": 300}, {"loss": 1.397, "grad_norm": 0.6273287534713745, "learning_rate": 0.0002, "epoch": 0.22439377488237422, "step": 310}, {"loss": 1.5115, "grad_norm": 0.511648416519165, "learning_rate": 0.0002, "epoch": 0.23163228374954759, "step": 320}, {"loss": 1.5424, "grad_norm": 0.8677352070808411, "learning_rate": 0.0002, "epoch": 0.23887079261672095, "step": 330}, {"loss": 1.6779, "grad_norm": 0.6270743012428284, "learning_rate": 0.0002, "epoch": 0.24610930148389432, "step": 340}, {"loss": 1.626, "grad_norm": 0.7980281114578247, "learning_rate": 0.0002, "epoch": 0.2533478103510677, "step": 350}, {"loss": 1.5238, "grad_norm": 0.632486879825592, "learning_rate": 0.0002, "epoch": 0.26058631921824105, "step": 360}, {"loss": 1.5175, "grad_norm": 0.6527034640312195, "learning_rate": 0.0002, "epoch": 0.2678248280854144, "step": 370}, {"loss": 1.627, "grad_norm": 0.7672118544578552, "learning_rate": 0.0002, "epoch": 0.2750633369525878, "step": 380}, {"loss": 1.5605, "grad_norm": 0.6035117506980896, "learning_rate": 0.0002, "epoch": 0.28230184581976114, "step": 390}, {"loss": 1.4603, "grad_norm": 0.5955103039741516, "learning_rate": 0.0002, "epoch": 0.2895403546869345, "step": 400}, {"loss": 1.558, "grad_norm": 0.6015191674232483, "learning_rate": 0.0002, "epoch": 0.2967788635541079, "step": 410}, {"loss": 1.6091, "grad_norm": 0.6380982398986816, "learning_rate": 0.0002, "epoch": 0.30401737242128124, "step": 420}, {"loss": 1.5292, "grad_norm": 0.6707863211631775, "learning_rate": 0.0002, "epoch": 0.3112558812884546, "step": 430}, {"loss": 1.4426, "grad_norm": 0.7010176777839661, "learning_rate": 0.0002, "epoch": 0.3184943901556279, "step": 440}, {"loss": 1.5572, "grad_norm": 0.8263739943504333, "learning_rate": 0.0002, "epoch": 0.3257328990228013, "step": 450}, {"loss": 1.5188, "grad_norm": 0.7253276109695435, "learning_rate": 0.0002, "epoch": 0.33297140788997465, "step": 460}, {"loss": 1.584, "grad_norm": 0.5238934755325317, "learning_rate": 0.0002, "epoch": 0.340209916757148, "step": 470}, {"loss": 1.7035, "grad_norm": 0.7869495749473572, "learning_rate": 0.0002, "epoch": 0.3474484256243214, "step": 480}, {"loss": 1.5776, "grad_norm": 0.7485215663909912, "learning_rate": 0.0002, "epoch": 0.35468693449149474, "step": 490}, {"loss": 1.6274, "grad_norm": 0.5413193106651306, "learning_rate": 0.0002, "epoch": 0.3619254433586681, "step": 500}, {"loss": 1.7323, "grad_norm": 0.7615048885345459, "learning_rate": 0.0002, "epoch": 0.3691639522258415, "step": 510}, {"loss": 1.532, "grad_norm": 0.7685340046882629, "learning_rate": 0.0002, "epoch": 0.37640246109301484, "step": 520}, {"loss": 1.6312, "grad_norm": 0.6379081010818481, "learning_rate": 0.0002, "epoch": 0.3836409699601882, "step": 530}, {"loss": 1.5645, "grad_norm": 0.7946939468383789, "learning_rate": 0.0002, "epoch": 0.39087947882736157, "step": 540}, {"loss": 1.4001, "grad_norm": 0.6287278532981873, "learning_rate": 0.0002, "epoch": 0.39811798769453494, "step": 550}, {"loss": 1.5982, "grad_norm": 0.6811642646789551, "learning_rate": 0.0002, "epoch": 0.4053564965617083, "step": 560}, {"loss": 1.4953, "grad_norm": 0.671073317527771, "learning_rate": 0.0002, "epoch": 0.41259500542888167, "step": 570}, {"loss": 1.6753, "grad_norm": 0.6313900351524353, "learning_rate": 0.0002, "epoch": 0.41983351429605503, "step": 580}, {"loss": 1.546, "grad_norm": 0.5291772484779358, "learning_rate": 0.0002, "epoch": 0.4270720231632284, "step": 590}, {"loss": 1.5441, "grad_norm": 0.62503582239151, "learning_rate": 0.0002, "epoch": 0.43431053203040176, "step": 600}, {"loss": 1.6276, "grad_norm": 0.5777305364608765, "learning_rate": 0.0002, "epoch": 0.4415490408975751, "step": 610}, {"loss": 1.4758, "grad_norm": 0.7013497352600098, "learning_rate": 0.0002, "epoch": 0.44878754976474844, "step": 620}, {"loss": 1.4029, "grad_norm": 0.8044822216033936, "learning_rate": 0.0002, "epoch": 0.4560260586319218, "step": 630}, {"loss": 1.7195, "grad_norm": 0.672531247138977, "learning_rate": 0.0002, "epoch": 0.46326456749909517, "step": 640}, {"loss": 1.614, "grad_norm": 0.6233910322189331, "learning_rate": 0.0002, "epoch": 0.47050307636626854, "step": 650}, {"loss": 1.6041, "grad_norm": 0.651524543762207, "learning_rate": 0.0002, "epoch": 0.4777415852334419, "step": 660}, {"loss": 1.5842, "grad_norm": 0.7213939428329468, "learning_rate": 0.0002, "epoch": 0.48498009410061527, "step": 670}, {"loss": 1.5453, "grad_norm": 0.6541454792022705, "learning_rate": 0.0002, "epoch": 0.49221860296778863, "step": 680}, {"loss": 1.662, "grad_norm": 0.6568936109542847, "learning_rate": 0.0002, "epoch": 0.499457111834962, "step": 690}, {"loss": 1.624, "grad_norm": 0.7176415324211121, "learning_rate": 0.0002, "epoch": 0.5066956207021354, "step": 700}, {"loss": 1.6099, "grad_norm": 0.6553855538368225, "learning_rate": 0.0002, "epoch": 0.5139341295693087, "step": 710}, {"loss": 1.5508, "grad_norm": 0.5654335618019104, "learning_rate": 0.0002, "epoch": 0.5211726384364821, "step": 720}, {"loss": 1.392, "grad_norm": 0.5671001672744751, "learning_rate": 0.0002, "epoch": 0.5284111473036555, "step": 730}, {"loss": 1.388, "grad_norm": 0.7914412021636963, "learning_rate": 0.0002, "epoch": 0.5356496561708288, "step": 740}, {"loss": 1.5931, "grad_norm": 0.6172138452529907, "learning_rate": 0.0002, "epoch": 0.5428881650380022, "step": 750}, {"loss": 1.4018, "grad_norm": 0.6132623553276062, "learning_rate": 0.0002, "epoch": 0.5501266739051756, "step": 760}, {"loss": 1.513, "grad_norm": 0.654000461101532, "learning_rate": 0.0002, "epoch": 0.5573651827723489, "step": 770}, {"loss": 1.5035, "grad_norm": 0.5691370964050293, "learning_rate": 0.0002, "epoch": 0.5646036916395223, "step": 780}, {"loss": 1.65, "grad_norm": 0.7922580242156982, "learning_rate": 0.0002, "epoch": 0.5718422005066957, "step": 790}, {"loss": 1.4521, "grad_norm": 0.6831880211830139, "learning_rate": 0.0002, "epoch": 0.579080709373869, "step": 800}, {"loss": 1.4734, "grad_norm": 0.6740124821662903, "learning_rate": 0.0002, "epoch": 0.5863192182410424, "step": 810}, {"loss": 1.6498, "grad_norm": 1.380016803741455, "learning_rate": 0.0002, "epoch": 0.5935577271082157, "step": 820}, {"loss": 1.4642, "grad_norm": 0.6552878022193909, "learning_rate": 0.0002, "epoch": 0.6007962359753891, "step": 830}, {"loss": 1.6271, "grad_norm": 0.6649535298347473, "learning_rate": 0.0002, "epoch": 0.6080347448425625, "step": 840}, {"loss": 1.5886, "grad_norm": 0.561738133430481, "learning_rate": 0.0002, "epoch": 0.6152732537097358, "step": 850}, {"loss": 1.5364, "grad_norm": 0.6133047938346863, "learning_rate": 0.0002, "epoch": 0.6225117625769092, "step": 860}, {"loss": 1.3489, "grad_norm": 0.559843122959137, "learning_rate": 0.0002, "epoch": 0.6297502714440825, "step": 870}, {"loss": 1.4878, "grad_norm": 0.6117811799049377, "learning_rate": 0.0002, "epoch": 0.6369887803112558, "step": 880}, {"loss": 1.56, "grad_norm": 0.6209776401519775, "learning_rate": 0.0002, "epoch": 0.6442272891784292, "step": 890}, {"loss": 1.6747, "grad_norm": 0.6234082579612732, "learning_rate": 0.0002, "epoch": 0.6514657980456026, "step": 900}, {"loss": 1.6963, "grad_norm": 0.7623258233070374, "learning_rate": 0.0002, "epoch": 0.6587043069127759, "step": 910}, {"loss": 1.2424, "grad_norm": 0.6148061752319336, "learning_rate": 0.0002, "epoch": 0.6659428157799493, "step": 920}, {"loss": 1.4319, "grad_norm": 0.6682973504066467, "learning_rate": 0.0002, "epoch": 0.6731813246471227, "step": 930}, {"loss": 1.5377, "grad_norm": 0.5513041615486145, "learning_rate": 0.0002, "epoch": 0.680419833514296, "step": 940}, {"loss": 1.3991, "grad_norm": 0.5197525024414062, "learning_rate": 0.0002, "epoch": 0.6876583423814694, "step": 950}, {"loss": 1.4398, "grad_norm": 0.6490758061408997, "learning_rate": 0.0002, "epoch": 0.6948968512486428, "step": 960}, {"loss": 1.5251, "grad_norm": 0.6450682878494263, "learning_rate": 0.0002, "epoch": 0.7021353601158161, "step": 970}, {"loss": 1.5417, "grad_norm": 0.6203766465187073, "learning_rate": 0.0002, "epoch": 0.7093738689829895, "step": 980}, {"loss": 1.4575, "grad_norm": 0.6023609638214111, "learning_rate": 0.0002, "epoch": 0.7166123778501629, "step": 990}, {"loss": 1.4973, "grad_norm": 0.5765255093574524, "learning_rate": 0.0002, "epoch": 0.7238508867173362, "step": 1000}, {"loss": 1.483, "grad_norm": 0.6650075316429138, "learning_rate": 0.0002, "epoch": 0.7310893955845096, "step": 1010}, {"loss": 1.5959, "grad_norm": 0.5610854029655457, "learning_rate": 0.0002, "epoch": 0.738327904451683, "step": 1020}, {"loss": 1.5248, "grad_norm": 0.7072813510894775, "learning_rate": 0.0002, "epoch": 0.7455664133188563, "step": 1030}, {"loss": 1.5776, "grad_norm": 0.6815407872200012, "learning_rate": 0.0002, "epoch": 0.7528049221860297, "step": 1040}, {"loss": 1.4577, "grad_norm": 0.7932390570640564, "learning_rate": 0.0002, "epoch": 0.760043431053203, "step": 1050}, {"loss": 1.4515, "grad_norm": 0.5798183083534241, "learning_rate": 0.0002, "epoch": 0.7672819399203764, "step": 1060}, {"loss": 1.5053, "grad_norm": 0.7898504137992859, "learning_rate": 0.0002, "epoch": 0.7745204487875498, "step": 1070}, {"loss": 1.4776, "grad_norm": 0.4983280301094055, "learning_rate": 0.0002, "epoch": 0.7817589576547231, "step": 1080}, {"loss": 1.5007, "grad_norm": 0.691403329372406, "learning_rate": 0.0002, "epoch": 0.7889974665218965, "step": 1090}, {"loss": 1.5153, "grad_norm": 0.5394481420516968, "learning_rate": 0.0002, "epoch": 0.7962359753890699, "step": 1100}, {"loss": 1.6892, "grad_norm": 0.5136822462081909, "learning_rate": 0.0002, "epoch": 0.8034744842562432, "step": 1110}, {"loss": 1.4902, "grad_norm": 0.6828126907348633, "learning_rate": 0.0002, "epoch": 0.8107129931234166, "step": 1120}, {"loss": 1.4346, "grad_norm": 0.6799656748771667, "learning_rate": 0.0002, "epoch": 0.81795150199059, "step": 1130}, {"loss": 1.2678, "grad_norm": 0.5428406000137329, "learning_rate": 0.0002, "epoch": 0.8251900108577633, "step": 1140}, {"loss": 1.4072, "grad_norm": 0.4811290502548218, "learning_rate": 0.0002, "epoch": 0.8324285197249367, "step": 1150}, {"loss": 1.4512, "grad_norm": 0.5519434809684753, "learning_rate": 0.0002, "epoch": 0.8396670285921101, "step": 1160}, {"loss": 1.4072, "grad_norm": 0.9748060703277588, "learning_rate": 0.0002, "epoch": 0.8469055374592834, "step": 1170}, {"loss": 1.4309, "grad_norm": 0.712609589099884, "learning_rate": 0.0002, "epoch": 0.8541440463264568, "step": 1180}, {"loss": 1.434, "grad_norm": 0.6866157054901123, "learning_rate": 0.0002, "epoch": 0.8613825551936302, "step": 1190}, {"loss": 1.3704, "grad_norm": 0.5068854093551636, "learning_rate": 0.0002, "epoch": 0.8686210640608035, "step": 1200}, {"loss": 1.5601, "grad_norm": 0.6333245038986206, "learning_rate": 0.0002, "epoch": 0.8758595729279768, "step": 1210}, {"loss": 1.4636, "grad_norm": 0.6424421072006226, "learning_rate": 0.0002, "epoch": 0.8830980817951501, "step": 1220}, {"loss": 1.4186, "grad_norm": 0.4771921932697296, "learning_rate": 0.0002, "epoch": 0.8903365906623235, "step": 1230}, {"loss": 1.6323, "grad_norm": 0.5191764235496521, "learning_rate": 0.0002, "epoch": 0.8975750995294969, "step": 1240}, {"loss": 1.6105, "grad_norm": 0.756222128868103, "learning_rate": 0.0002, "epoch": 0.9048136083966702, "step": 1250}, {"loss": 1.4396, "grad_norm": 0.623823881149292, "learning_rate": 0.0002, "epoch": 0.9120521172638436, "step": 1260}, {"loss": 1.3097, "grad_norm": 0.8166571259498596, "learning_rate": 0.0002, "epoch": 0.919290626131017, "step": 1270}, {"loss": 1.4625, "grad_norm": 0.6059346795082092, "learning_rate": 0.0002, "epoch": 0.9265291349981903, "step": 1280}, {"loss": 1.3555, "grad_norm": 0.5842690467834473, "learning_rate": 0.0002, "epoch": 0.9337676438653637, "step": 1290}, {"loss": 1.5859, "grad_norm": 0.7649800777435303, "learning_rate": 0.0002, "epoch": 0.9410061527325371, "step": 1300}, {"loss": 1.5915, "grad_norm": 0.6420919895172119, "learning_rate": 0.0002, "epoch": 0.9482446615997104, "step": 1310}, {"loss": 1.453, "grad_norm": 0.7011452913284302, "learning_rate": 0.0002, "epoch": 0.9554831704668838, "step": 1320}, {"loss": 1.6766, "grad_norm": 0.5783746242523193, "learning_rate": 0.0002, "epoch": 0.9627216793340572, "step": 1330}, {"loss": 1.6308, "grad_norm": 0.5973192453384399, "learning_rate": 0.0002, "epoch": 0.9699601882012305, "step": 1340}, {"loss": 1.5901, "grad_norm": 0.6181833744049072, "learning_rate": 0.0002, "epoch": 0.9771986970684039, "step": 1350}, {"loss": 1.5258, "grad_norm": 0.5563396215438843, "learning_rate": 0.0002, "epoch": 0.9844372059355773, "step": 1360}, {"loss": 1.4508, "grad_norm": 0.45723360776901245, "learning_rate": 0.0002, "epoch": 0.9916757148027506, "step": 1370}, {"loss": 1.3291, "grad_norm": 0.5947498679161072, "learning_rate": 0.0002, "epoch": 0.998914223669924, "step": 1380}, {"eval_loss": 1.480796456336975, "eval_runtime": 27.3103, "eval_samples_per_second": 15.965, "eval_steps_per_second": 2.014, "epoch": 0.9996380745566413, "step": 1381}, {"loss": 1.3057, "grad_norm": 0.5599952936172485, "learning_rate": 0.0002, "epoch": 1.0061527325370974, "step": 1390}, {"loss": 1.4991, "grad_norm": 0.5932008028030396, "learning_rate": 0.0002, "epoch": 1.0133912414042707, "step": 1400}, {"loss": 1.4506, "grad_norm": 0.6194121837615967, "learning_rate": 0.0002, "epoch": 1.020629750271444, "step": 1410}, {"loss": 1.5966, "grad_norm": 0.6995621919631958, "learning_rate": 0.0002, "epoch": 1.0278682591386175, "step": 1420}, {"loss": 1.4153, "grad_norm": 0.7905810475349426, "learning_rate": 0.0002, "epoch": 1.0351067680057908, "step": 1430}, {"loss": 1.4414, "grad_norm": 0.7221615314483643, "learning_rate": 0.0002, "epoch": 1.0423452768729642, "step": 1440}, {"loss": 1.3859, "grad_norm": 0.6170642375946045, "learning_rate": 0.0002, "epoch": 1.0495837857401376, "step": 1450}, {"loss": 1.3806, "grad_norm": 0.5844094753265381, "learning_rate": 0.0002, "epoch": 1.056822294607311, "step": 1460}, {"loss": 1.4871, "grad_norm": 0.7731822729110718, "learning_rate": 0.0002, "epoch": 1.0640608034744843, "step": 1470}, {"loss": 1.4286, "grad_norm": 0.4554748237133026, "learning_rate": 0.0002, "epoch": 1.0712993123416577, "step": 1480}, {"loss": 1.3977, "grad_norm": 0.6923259496688843, "learning_rate": 0.0002, "epoch": 1.078537821208831, "step": 1490}, {"loss": 1.3936, "grad_norm": 0.6008219122886658, "learning_rate": 0.0002, "epoch": 1.0857763300760044, "step": 1500}, {"loss": 1.4821, "grad_norm": 0.6450045704841614, "learning_rate": 0.0002, "epoch": 1.0930148389431777, "step": 1510}, {"loss": 1.3295, "grad_norm": 0.7833753824234009, "learning_rate": 0.0002, "epoch": 1.1002533478103511, "step": 1520}, {"loss": 1.3424, "grad_norm": 0.5076758861541748, "learning_rate": 0.0002, "epoch": 1.1074918566775245, "step": 1530}, {"loss": 1.4043, "grad_norm": 0.5661332011222839, "learning_rate": 0.0002, "epoch": 1.1147303655446978, "step": 1540}, {"loss": 1.4963, "grad_norm": 0.6526919603347778, "learning_rate": 0.0002, "epoch": 1.1219688744118712, "step": 1550}, {"loss": 1.3671, "grad_norm": 0.5613082647323608, "learning_rate": 0.0002, "epoch": 1.1292073832790446, "step": 1560}, {"loss": 1.4458, "grad_norm": 0.6113885641098022, "learning_rate": 0.0002, "epoch": 1.136445892146218, "step": 1570}, {"loss": 1.3552, "grad_norm": 0.6732510328292847, "learning_rate": 0.0002, "epoch": 1.1436844010133913, "step": 1580}, {"loss": 1.3114, "grad_norm": 0.6146392226219177, "learning_rate": 0.0002, "epoch": 1.1509229098805647, "step": 1590}, {"loss": 1.411, "grad_norm": 0.6766974329948425, "learning_rate": 0.0002, "epoch": 1.158161418747738, "step": 1600}, {"loss": 1.2401, "grad_norm": 0.7621957659721375, "learning_rate": 0.0002, "epoch": 1.1653999276149114, "step": 1610}, {"loss": 1.3758, "grad_norm": 0.6959581971168518, "learning_rate": 0.0002, "epoch": 1.1726384364820848, "step": 1620}, {"loss": 1.382, "grad_norm": 0.6691278219223022, "learning_rate": 0.0002, "epoch": 1.1798769453492581, "step": 1630}, {"loss": 1.4147, "grad_norm": 0.4927774965763092, "learning_rate": 0.0002, "epoch": 1.1871154542164315, "step": 1640}, {"loss": 1.449, "grad_norm": 0.7724234461784363, "learning_rate": 0.0002, "epoch": 1.1943539630836049, "step": 1650}, {"loss": 1.4778, "grad_norm": 0.6817787885665894, "learning_rate": 0.0002, "epoch": 1.2015924719507782, "step": 1660}, {"loss": 1.3776, "grad_norm": 0.6500699520111084, "learning_rate": 0.0002, "epoch": 1.2088309808179516, "step": 1670}, {"loss": 1.3875, "grad_norm": 0.5703568458557129, "learning_rate": 0.0002, "epoch": 1.216069489685125, "step": 1680}, {"loss": 1.4735, "grad_norm": 0.6261579990386963, "learning_rate": 0.0002, "epoch": 1.2233079985522983, "step": 1690}, {"loss": 1.3898, "grad_norm": 0.651713490486145, "learning_rate": 0.0002, "epoch": 1.2305465074194717, "step": 1700}, {"loss": 1.4002, "grad_norm": 0.684399425983429, "learning_rate": 0.0002, "epoch": 1.237785016286645, "step": 1710}, {"loss": 1.5027, "grad_norm": 0.6996857523918152, "learning_rate": 0.0002, "epoch": 1.2450235251538184, "step": 1720}, {"loss": 1.3326, "grad_norm": 0.7102537751197815, "learning_rate": 0.0002, "epoch": 1.2522620340209918, "step": 1730}, {"loss": 1.3675, "grad_norm": 0.45809897780418396, "learning_rate": 0.0002, "epoch": 1.2595005428881652, "step": 1740}, {"loss": 1.4175, "grad_norm": 0.6377046704292297, "learning_rate": 0.0002, "epoch": 1.2667390517553385, "step": 1750}, {"loss": 1.3479, "grad_norm": 0.6965704560279846, "learning_rate": 0.0002, "epoch": 1.2739775606225119, "step": 1760}, {"loss": 1.5647, "grad_norm": 0.5688214302062988, "learning_rate": 0.0002, "epoch": 1.2812160694896852, "step": 1770}, {"loss": 1.3967, "grad_norm": 0.6384190320968628, "learning_rate": 0.0002, "epoch": 1.2884545783568586, "step": 1780}, {"loss": 1.3671, "grad_norm": 0.5629363656044006, "learning_rate": 0.0002, "epoch": 1.295693087224032, "step": 1790}, {"loss": 1.2292, "grad_norm": 0.6148255467414856, "learning_rate": 0.0002, "epoch": 1.3029315960912053, "step": 1800}, {"loss": 1.5806, "grad_norm": 0.655580997467041, "learning_rate": 0.0002, "epoch": 1.3101701049583787, "step": 1810}, {"loss": 1.2398, "grad_norm": 0.5642657279968262, "learning_rate": 0.0002, "epoch": 1.3174086138255519, "step": 1820}, {"loss": 1.3246, "grad_norm": 0.59607994556427, "learning_rate": 0.0002, "epoch": 1.3246471226927252, "step": 1830}, {"loss": 1.3274, "grad_norm": 0.5564199090003967, "learning_rate": 0.0002, "epoch": 1.3318856315598986, "step": 1840}, {"loss": 1.5834, "grad_norm": 0.6949955821037292, "learning_rate": 0.0002, "epoch": 1.339124140427072, "step": 1850}, {"loss": 1.4722, "grad_norm": 0.7036856412887573, "learning_rate": 0.0002, "epoch": 1.3463626492942453, "step": 1860}, {"loss": 1.333, "grad_norm": 0.722062885761261, "learning_rate": 0.0002, "epoch": 1.3536011581614187, "step": 1870}, {"loss": 1.4044, "grad_norm": 0.6098677515983582, "learning_rate": 0.0002, "epoch": 1.360839667028592, "step": 1880}, {"loss": 1.6217, "grad_norm": 0.5376402735710144, "learning_rate": 0.0002, "epoch": 1.3680781758957654, "step": 1890}, {"loss": 1.5071, "grad_norm": 0.6974610090255737, "learning_rate": 0.0002, "epoch": 1.3753166847629388, "step": 1900}, {"loss": 1.5854, "grad_norm": 0.6520763635635376, "learning_rate": 0.0002, "epoch": 1.3825551936301121, "step": 1910}, {"loss": 1.4271, "grad_norm": 0.6604374647140503, "learning_rate": 0.0002, "epoch": 1.3897937024972855, "step": 1920}, {"loss": 1.419, "grad_norm": 0.7364398241043091, "learning_rate": 0.0002, "epoch": 1.3970322113644589, "step": 1930}, {"loss": 1.4585, "grad_norm": 0.6849475502967834, "learning_rate": 0.0002, "epoch": 1.4042707202316322, "step": 1940}, {"loss": 1.5577, "grad_norm": 0.6562670469284058, "learning_rate": 0.0002, "epoch": 1.4115092290988056, "step": 1950}, {"loss": 1.4725, "grad_norm": 0.5695616006851196, "learning_rate": 0.0002, "epoch": 1.418747737965979, "step": 1960}, {"loss": 1.3088, "grad_norm": 0.5244464874267578, "learning_rate": 0.0002, "epoch": 1.4259862468331523, "step": 1970}, {"loss": 1.5069, "grad_norm": 0.6347293257713318, "learning_rate": 0.0002, "epoch": 1.4332247557003257, "step": 1980}, {"loss": 1.3502, "grad_norm": 0.5528361201286316, "learning_rate": 0.0002, "epoch": 1.440463264567499, "step": 1990}, {"loss": 1.3978, "grad_norm": 0.6987585425376892, "learning_rate": 0.0002, "epoch": 1.4477017734346724, "step": 2000}, {"loss": 1.4262, "grad_norm": 0.6568987369537354, "learning_rate": 0.0002, "epoch": 1.4549402823018458, "step": 2010}, {"loss": 1.4175, "grad_norm": 0.7665994763374329, "learning_rate": 0.0002, "epoch": 1.4621787911690192, "step": 2020}, {"loss": 1.244, "grad_norm": 0.5127707123756409, "learning_rate": 0.0002, "epoch": 1.4694173000361925, "step": 2030}, {"loss": 1.3699, "grad_norm": 0.5406824946403503, "learning_rate": 0.0002, "epoch": 1.476655808903366, "step": 2040}, {"loss": 1.3353, "grad_norm": 0.5990166664123535, "learning_rate": 0.0002, "epoch": 1.4838943177705393, "step": 2050}, {"loss": 1.2454, "grad_norm": 0.6186193823814392, "learning_rate": 0.0002, "epoch": 1.4911328266377126, "step": 2060}, {"loss": 1.428, "grad_norm": 0.6154307126998901, "learning_rate": 0.0002, "epoch": 1.498371335504886, "step": 2070}, {"loss": 1.4528, "grad_norm": 0.5606056451797485, "learning_rate": 0.0002, "epoch": 1.5056098443720594, "step": 2080}, {"loss": 1.2405, "grad_norm": 0.5006417036056519, "learning_rate": 0.0002, "epoch": 1.5128483532392327, "step": 2090}, {"loss": 1.4258, "grad_norm": 0.5968486070632935, "learning_rate": 0.0002, "epoch": 1.520086862106406, "step": 2100}, {"loss": 1.2752, "grad_norm": 0.5835496187210083, "learning_rate": 0.0002, "epoch": 1.5273253709735795, "step": 2110}, {"loss": 1.5443, "grad_norm": 0.6753535270690918, "learning_rate": 0.0002, "epoch": 1.5345638798407528, "step": 2120}, {"loss": 1.2139, "grad_norm": 0.7299720644950867, "learning_rate": 0.0002, "epoch": 1.5418023887079262, "step": 2130}, {"loss": 1.2364, "grad_norm": 0.5105988383293152, "learning_rate": 0.0002, "epoch": 1.5490408975750996, "step": 2140}, {"loss": 1.4528, "grad_norm": 0.5675431489944458, "learning_rate": 0.0002, "epoch": 1.556279406442273, "step": 2150}, {"loss": 1.4563, "grad_norm": 0.6246723532676697, "learning_rate": 0.0002, "epoch": 1.5635179153094463, "step": 2160}, {"loss": 1.5255, "grad_norm": 0.7291720509529114, "learning_rate": 0.0002, "epoch": 1.5707564241766196, "step": 2170}, {"loss": 1.5432, "grad_norm": 0.678114116191864, "learning_rate": 0.0002, "epoch": 1.577994933043793, "step": 2180}, {"loss": 1.5212, "grad_norm": 0.5136260986328125, "learning_rate": 0.0002, "epoch": 1.5852334419109664, "step": 2190}, {"loss": 1.3271, "grad_norm": 0.6359935998916626, "learning_rate": 0.0002, "epoch": 1.5924719507781397, "step": 2200}, {"loss": 1.4038, "grad_norm": 0.7650278806686401, "learning_rate": 0.0002, "epoch": 1.599710459645313, "step": 2210}, {"loss": 1.5478, "grad_norm": 0.7256110906600952, "learning_rate": 0.0002, "epoch": 1.6069489685124865, "step": 2220}, {"loss": 1.4387, "grad_norm": 0.688689649105072, "learning_rate": 0.0002, "epoch": 1.6141874773796598, "step": 2230}, {"loss": 1.4096, "grad_norm": 0.6045311093330383, "learning_rate": 0.0002, "epoch": 1.6214259862468332, "step": 2240}, {"loss": 1.4097, "grad_norm": 0.7064604163169861, "learning_rate": 0.0002, "epoch": 1.6286644951140063, "step": 2250}, {"loss": 1.3477, "grad_norm": 0.5309562087059021, "learning_rate": 0.0002, "epoch": 1.6359030039811797, "step": 2260}, {"loss": 1.4022, "grad_norm": 0.5687053203582764, "learning_rate": 0.0002, "epoch": 1.643141512848353, "step": 2270}, {"loss": 1.2977, "grad_norm": 0.535872757434845, "learning_rate": 0.0002, "epoch": 1.6503800217155264, "step": 2280}, {"loss": 1.3844, "grad_norm": 0.5502381920814514, "learning_rate": 0.0002, "epoch": 1.6576185305826998, "step": 2290}, {"loss": 1.3764, "grad_norm": 0.6158602237701416, "learning_rate": 0.0002, "epoch": 1.6648570394498732, "step": 2300}, {"loss": 1.3515, "grad_norm": 0.5804675817489624, "learning_rate": 0.0002, "epoch": 1.6720955483170465, "step": 2310}, {"loss": 1.2532, "grad_norm": 0.600742757320404, "learning_rate": 0.0002, "epoch": 1.67933405718422, "step": 2320}, {"loss": 1.477, "grad_norm": 0.7101941108703613, "learning_rate": 0.0002, "epoch": 1.6865725660513933, "step": 2330}, {"loss": 1.4849, "grad_norm": 0.7507809996604919, "learning_rate": 0.0002, "epoch": 1.6938110749185666, "step": 2340}, {"loss": 1.2703, "grad_norm": 0.768502414226532, "learning_rate": 0.0002, "epoch": 1.70104958378574, "step": 2350}, {"loss": 1.3332, "grad_norm": 0.4801851212978363, "learning_rate": 0.0002, "epoch": 1.7082880926529134, "step": 2360}, {"loss": 1.4158, "grad_norm": 0.5322122573852539, "learning_rate": 0.0002, "epoch": 1.7155266015200867, "step": 2370}, {"loss": 1.4136, "grad_norm": 0.587661862373352, "learning_rate": 0.0002, "epoch": 1.72276511038726, "step": 2380}, {"loss": 1.3771, "grad_norm": 0.6073525547981262, "learning_rate": 0.0002, "epoch": 1.7300036192544335, "step": 2390}, {"loss": 1.2754, "grad_norm": 0.6950460076332092, "learning_rate": 0.0002, "epoch": 1.7372421281216068, "step": 2400}, {"loss": 1.3858, "grad_norm": 0.5981102585792542, "learning_rate": 0.0002, "epoch": 1.7444806369887802, "step": 2410}, {"loss": 1.4075, "grad_norm": 0.544570803642273, "learning_rate": 0.0002, "epoch": 1.7517191458559536, "step": 2420}, {"loss": 1.3861, "grad_norm": 0.5304399728775024, "learning_rate": 0.0002, "epoch": 1.758957654723127, "step": 2430}, {"loss": 1.4244, "grad_norm": 0.7921594977378845, "learning_rate": 0.0002, "epoch": 1.7661961635903003, "step": 2440}, {"loss": 1.3053, "grad_norm": 0.6084808707237244, "learning_rate": 0.0002, "epoch": 1.7734346724574737, "step": 2450}, {"loss": 1.3781, "grad_norm": 0.8844701051712036, "learning_rate": 0.0002, "epoch": 1.780673181324647, "step": 2460}, {"loss": 1.3227, "grad_norm": 0.5729258060455322, "learning_rate": 0.0002, "epoch": 1.7879116901918204, "step": 2470}, {"loss": 1.3422, "grad_norm": 0.6303611993789673, "learning_rate": 0.0002, "epoch": 1.7951501990589938, "step": 2480}, {"loss": 1.3926, "grad_norm": 0.5627942085266113, "learning_rate": 0.0002, "epoch": 1.8023887079261671, "step": 2490}, {"loss": 1.3816, "grad_norm": 0.6724274158477783, "learning_rate": 0.0002, "epoch": 1.8096272167933405, "step": 2500}, {"loss": 1.2951, "grad_norm": 0.5030826330184937, "learning_rate": 0.0002, "epoch": 1.8168657256605139, "step": 2510}, {"loss": 1.2839, "grad_norm": 0.5504099130630493, "learning_rate": 0.0002, "epoch": 1.8241042345276872, "step": 2520}, {"loss": 1.4264, "grad_norm": 0.6338945627212524, "learning_rate": 0.0002, "epoch": 1.8313427433948606, "step": 2530}, {"loss": 1.563, "grad_norm": 0.5902037620544434, "learning_rate": 0.0002, "epoch": 1.838581252262034, "step": 2540}, {"loss": 1.2961, "grad_norm": 0.48814457654953003, "learning_rate": 0.0002, "epoch": 1.8458197611292073, "step": 2550}, {"loss": 1.466, "grad_norm": 0.6216312646865845, "learning_rate": 0.0002, "epoch": 1.8530582699963807, "step": 2560}, {"loss": 1.5123, "grad_norm": 0.635603666305542, "learning_rate": 0.0002, "epoch": 1.860296778863554, "step": 2570}, {"loss": 1.372, "grad_norm": 0.6938216090202332, "learning_rate": 0.0002, "epoch": 1.8675352877307274, "step": 2580}, {"loss": 1.5011, "grad_norm": 0.599557638168335, "learning_rate": 0.0002, "epoch": 1.8747737965979008, "step": 2590}, {"loss": 1.2714, "grad_norm": 0.564424455165863, "learning_rate": 0.0002, "epoch": 1.8820123054650741, "step": 2600}, {"loss": 1.3403, "grad_norm": 0.5430700182914734, "learning_rate": 0.0002, "epoch": 1.8892508143322475, "step": 2610}, {"loss": 1.4347, "grad_norm": 0.6150169372558594, "learning_rate": 0.0002, "epoch": 1.8964893231994209, "step": 2620}, {"loss": 1.2474, "grad_norm": 0.48159119486808777, "learning_rate": 0.0002, "epoch": 1.9037278320665942, "step": 2630}, {"loss": 1.3716, "grad_norm": 0.5608997941017151, "learning_rate": 0.0002, "epoch": 1.9109663409337676, "step": 2640}, {"loss": 1.5787, "grad_norm": 0.6454501748085022, "learning_rate": 0.0002, "epoch": 1.918204849800941, "step": 2650}, {"loss": 1.3238, "grad_norm": 0.5458073616027832, "learning_rate": 0.0002, "epoch": 1.9254433586681143, "step": 2660}, {"loss": 1.3208, "grad_norm": 0.5328490734100342, "learning_rate": 0.0002, "epoch": 1.9326818675352877, "step": 2670}, {"loss": 1.4971, "grad_norm": 0.6444696187973022, "learning_rate": 0.0002, "epoch": 1.939920376402461, "step": 2680}, {"loss": 1.5387, "grad_norm": 0.7126023769378662, "learning_rate": 0.0002, "epoch": 1.9471588852696344, "step": 2690}, {"loss": 1.3637, "grad_norm": 0.5164045095443726, "learning_rate": 0.0002, "epoch": 1.9543973941368078, "step": 2700}, {"loss": 1.5303, "grad_norm": 0.5347061157226562, "learning_rate": 0.0002, "epoch": 1.9616359030039812, "step": 2710}, {"loss": 1.2815, "grad_norm": 0.5297950506210327, "learning_rate": 0.0002, "epoch": 1.9688744118711545, "step": 2720}, {"loss": 1.3566, "grad_norm": 0.6537790298461914, "learning_rate": 0.0002, "epoch": 1.976112920738328, "step": 2730}, {"loss": 1.332, "grad_norm": 0.5536222457885742, "learning_rate": 0.0002, "epoch": 1.9833514296055013, "step": 2740}, {"loss": 1.3333, "grad_norm": 0.4856105446815491, "learning_rate": 0.0002, "epoch": 1.9905899384726746, "step": 2750}, {"loss": 1.3521, "grad_norm": 0.6642730832099915, "learning_rate": 0.0002, "epoch": 1.997828447339848, "step": 2760}, {"eval_loss": 1.4366681575775146, "eval_runtime": 27.3729, "eval_samples_per_second": 15.928, "eval_steps_per_second": 2.009, "epoch": 2.0, "step": 2763}, {"loss": 1.4322, "grad_norm": 0.740253210067749, "learning_rate": 0.0002, "epoch": 2.0050669562070214, "step": 2770}, {"loss": 1.277, "grad_norm": 0.5826276540756226, "learning_rate": 0.0002, "epoch": 2.0123054650741947, "step": 2780}, {"loss": 1.2424, "grad_norm": 0.607356071472168, "learning_rate": 0.0002, "epoch": 2.019543973941368, "step": 2790}, {"loss": 1.2601, "grad_norm": 0.5918063521385193, "learning_rate": 0.0002, "epoch": 2.0267824828085415, "step": 2800}, {"loss": 1.3715, "grad_norm": 0.5610089898109436, "learning_rate": 0.0002, "epoch": 2.034020991675715, "step": 2810}, {"loss": 1.2092, "grad_norm": 0.5869926810264587, "learning_rate": 0.0002, "epoch": 2.041259500542888, "step": 2820}, {"loss": 1.1929, "grad_norm": 0.5753467679023743, "learning_rate": 0.0002, "epoch": 2.0484980094100615, "step": 2830}, {"loss": 1.333, "grad_norm": 0.7096508145332336, "learning_rate": 0.0002, "epoch": 2.055736518277235, "step": 2840}, {"loss": 1.1766, "grad_norm": 0.7653635144233704, "learning_rate": 0.0002, "epoch": 2.0629750271444083, "step": 2850}, {"loss": 1.2331, "grad_norm": 0.6202841997146606, "learning_rate": 0.0002, "epoch": 2.0702135360115816, "step": 2860}, {"loss": 1.3298, "grad_norm": 0.6810227632522583, "learning_rate": 0.0002, "epoch": 2.077452044878755, "step": 2870}, {"loss": 1.2505, "grad_norm": 0.7481493353843689, "learning_rate": 0.0002, "epoch": 2.0846905537459284, "step": 2880}, {"loss": 1.2484, "grad_norm": 0.7089637517929077, "learning_rate": 0.0002, "epoch": 2.0919290626131017, "step": 2890}, {"loss": 1.3095, "grad_norm": 0.7472923398017883, "learning_rate": 0.0002, "epoch": 2.099167571480275, "step": 2900}, {"loss": 1.304, "grad_norm": 0.8135465979576111, "learning_rate": 0.0002, "epoch": 2.1064060803474485, "step": 2910}, {"loss": 1.273, "grad_norm": 0.6097133159637451, "learning_rate": 0.0002, "epoch": 2.113644589214622, "step": 2920}, {"loss": 1.3384, "grad_norm": 0.5970117449760437, "learning_rate": 0.0002, "epoch": 2.120883098081795, "step": 2930}, {"loss": 1.3233, "grad_norm": 0.6169309616088867, "learning_rate": 0.0002, "epoch": 2.1281216069489686, "step": 2940}, {"loss": 1.4246, "grad_norm": 0.9428738355636597, "learning_rate": 0.0002, "epoch": 2.135360115816142, "step": 2950}, {"loss": 1.3527, "grad_norm": 0.5671679973602295, "learning_rate": 0.0002, "epoch": 2.1425986246833153, "step": 2960}, {"loss": 1.1375, "grad_norm": 0.7007262110710144, "learning_rate": 0.0002, "epoch": 2.1498371335504887, "step": 2970}, {"loss": 1.2015, "grad_norm": 0.6294044256210327, "learning_rate": 0.0002, "epoch": 2.157075642417662, "step": 2980}, {"loss": 1.2167, "grad_norm": 0.6105241775512695, "learning_rate": 0.0002, "epoch": 2.1643141512848354, "step": 2990}, {"loss": 1.2065, "grad_norm": 0.557124137878418, "learning_rate": 0.0002, "epoch": 2.1715526601520088, "step": 3000}, {"loss": 1.2515, "grad_norm": 0.6250392198562622, "learning_rate": 0.0002, "epoch": 2.178791169019182, "step": 3010}, {"loss": 1.385, "grad_norm": 0.645218551158905, "learning_rate": 0.0002, "epoch": 2.1860296778863555, "step": 3020}, {"loss": 1.3928, "grad_norm": 0.9033605456352234, "learning_rate": 0.0002, "epoch": 2.193268186753529, "step": 3030}, {"loss": 1.2458, "grad_norm": 0.5325747132301331, "learning_rate": 0.0002, "epoch": 2.2005066956207022, "step": 3040}, {"loss": 1.261, "grad_norm": 0.6334700584411621, "learning_rate": 0.0002, "epoch": 2.2077452044878756, "step": 3050}, {"loss": 1.2385, "grad_norm": 0.5206325054168701, "learning_rate": 0.0002, "epoch": 2.214983713355049, "step": 3060}, {"loss": 1.3103, "grad_norm": 0.5987200140953064, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3070}, {"loss": 1.1756, "grad_norm": 0.5893264412879944, "learning_rate": 0.0002, "epoch": 2.2294607310893957, "step": 3080}, {"loss": 1.235, "grad_norm": 0.6869237422943115, "learning_rate": 0.0002, "epoch": 2.236699239956569, "step": 3090}, {"loss": 1.3285, "grad_norm": 0.5040048360824585, "learning_rate": 0.0002, "epoch": 2.2439377488237424, "step": 3100}, {"loss": 1.3316, "grad_norm": 0.6660613417625427, "learning_rate": 0.0002, "epoch": 2.251176257690916, "step": 3110}, {"loss": 1.3108, "grad_norm": 0.5890918970108032, "learning_rate": 0.0002, "epoch": 2.258414766558089, "step": 3120}, {"loss": 1.248, "grad_norm": 0.6458896994590759, "learning_rate": 0.0002, "epoch": 2.2656532754252625, "step": 3130}, {"loss": 1.4151, "grad_norm": 0.6832690834999084, "learning_rate": 0.0002, "epoch": 2.272891784292436, "step": 3140}, {"loss": 1.4458, "grad_norm": 0.833908200263977, "learning_rate": 0.0002, "epoch": 2.2801302931596092, "step": 3150}, {"loss": 1.2931, "grad_norm": 0.4596034586429596, "learning_rate": 0.0002, "epoch": 2.2873688020267826, "step": 3160}, {"loss": 1.449, "grad_norm": 0.9130966067314148, "learning_rate": 0.0002, "epoch": 2.294607310893956, "step": 3170}, {"loss": 1.3806, "grad_norm": 0.7143292427062988, "learning_rate": 0.0002, "epoch": 2.3018458197611293, "step": 3180}, {"loss": 1.2692, "grad_norm": 0.5388900637626648, "learning_rate": 0.0002, "epoch": 2.3090843286283027, "step": 3190}, {"loss": 1.2402, "grad_norm": 0.5607513189315796, "learning_rate": 0.0002, "epoch": 2.316322837495476, "step": 3200}, {"loss": 1.3874, "grad_norm": 0.6795142292976379, "learning_rate": 0.0002, "epoch": 2.3235613463626494, "step": 3210}, {"loss": 1.3042, "grad_norm": 0.6561070680618286, "learning_rate": 0.0002, "epoch": 2.330799855229823, "step": 3220}, {"loss": 1.4636, "grad_norm": 0.8858118057250977, "learning_rate": 0.0002, "epoch": 2.338038364096996, "step": 3230}, {"loss": 1.3214, "grad_norm": 0.6604151725769043, "learning_rate": 0.0002, "epoch": 2.3452768729641695, "step": 3240}, {"loss": 1.4004, "grad_norm": 0.6755785346031189, "learning_rate": 0.0002, "epoch": 2.352515381831343, "step": 3250}, {"loss": 1.2503, "grad_norm": 0.6981677412986755, "learning_rate": 0.0002, "epoch": 2.3597538906985163, "step": 3260}, {"loss": 1.3078, "grad_norm": 0.6338568329811096, "learning_rate": 0.0002, "epoch": 2.3669923995656896, "step": 3270}, {"loss": 1.285, "grad_norm": 0.5754265785217285, "learning_rate": 0.0002, "epoch": 2.374230908432863, "step": 3280}, {"loss": 1.2924, "grad_norm": 0.7533153295516968, "learning_rate": 0.0002, "epoch": 2.3814694173000364, "step": 3290}, {"loss": 1.3711, "grad_norm": 0.675065279006958, "learning_rate": 0.0002, "epoch": 2.3887079261672097, "step": 3300}, {"loss": 1.3548, "grad_norm": 0.5686452984809875, "learning_rate": 0.0002, "epoch": 2.395946435034383, "step": 3310}, {"loss": 1.1998, "grad_norm": 0.8129481673240662, "learning_rate": 0.0002, "epoch": 2.4031849439015565, "step": 3320}, {"loss": 1.2584, "grad_norm": 0.6615934371948242, "learning_rate": 0.0002, "epoch": 2.41042345276873, "step": 3330}, {"loss": 1.3691, "grad_norm": 0.6678834557533264, "learning_rate": 0.0002, "epoch": 2.417661961635903, "step": 3340}, {"loss": 1.2381, "grad_norm": 0.5581308007240295, "learning_rate": 0.0002, "epoch": 2.4249004705030766, "step": 3350}, {"loss": 1.3853, "grad_norm": 0.6098920106887817, "learning_rate": 0.0002, "epoch": 2.43213897937025, "step": 3360}, {"loss": 1.3692, "grad_norm": 0.8101736903190613, "learning_rate": 0.0002, "epoch": 2.4393774882374233, "step": 3370}, {"loss": 1.4418, "grad_norm": 0.6621488928794861, "learning_rate": 0.0002, "epoch": 2.4466159971045967, "step": 3380}, {"loss": 1.4579, "grad_norm": 0.8693289160728455, "learning_rate": 0.0002, "epoch": 2.45385450597177, "step": 3390}, {"loss": 1.3644, "grad_norm": 0.6724580526351929, "learning_rate": 0.0002, "epoch": 2.4610930148389434, "step": 3400}, {"loss": 1.2006, "grad_norm": 0.6776891946792603, "learning_rate": 0.0002, "epoch": 2.4683315237061167, "step": 3410}, {"loss": 1.2937, "grad_norm": 0.7214453816413879, "learning_rate": 0.0002, "epoch": 2.47557003257329, "step": 3420}, {"loss": 1.4051, "grad_norm": 0.8390451073646545, "learning_rate": 0.0002, "epoch": 2.4828085414404635, "step": 3430}, {"loss": 1.25, "grad_norm": 0.7130982279777527, "learning_rate": 0.0002, "epoch": 2.490047050307637, "step": 3440}, {"loss": 1.2231, "grad_norm": 0.8873937129974365, "learning_rate": 0.0002, "epoch": 2.49728555917481, "step": 3450}, {"loss": 1.1429, "grad_norm": 0.725185751914978, "learning_rate": 0.0002, "epoch": 2.5045240680419836, "step": 3460}, {"loss": 1.2699, "grad_norm": 0.6120352149009705, "learning_rate": 0.0002, "epoch": 2.511762576909157, "step": 3470}, {"loss": 1.2552, "grad_norm": 0.7713613510131836, "learning_rate": 0.0002, "epoch": 2.5190010857763303, "step": 3480}, {"loss": 1.4648, "grad_norm": 0.895309567451477, "learning_rate": 0.0002, "epoch": 2.5262395946435037, "step": 3490}, {"loss": 1.3043, "grad_norm": 0.9631021022796631, "learning_rate": 0.0002, "epoch": 2.533478103510677, "step": 3500}, {"loss": 1.3492, "grad_norm": 0.7475683093070984, "learning_rate": 0.0002, "epoch": 2.5407166123778504, "step": 3510}, {"loss": 1.3637, "grad_norm": 0.7271341681480408, "learning_rate": 0.0002, "epoch": 2.5479551212450238, "step": 3520}, {"loss": 1.304, "grad_norm": 0.6979510188102722, "learning_rate": 0.0002, "epoch": 2.555193630112197, "step": 3530}, {"loss": 1.2353, "grad_norm": 0.6504196524620056, "learning_rate": 0.0002, "epoch": 2.5624321389793705, "step": 3540}, {"loss": 1.2699, "grad_norm": 0.7226675748825073, "learning_rate": 0.0002, "epoch": 2.569670647846544, "step": 3550}, {"loss": 1.3002, "grad_norm": 0.6143222451210022, "learning_rate": 0.0002, "epoch": 2.5769091567137172, "step": 3560}, {"loss": 1.1585, "grad_norm": 0.7245154976844788, "learning_rate": 0.0002, "epoch": 2.5841476655808906, "step": 3570}, {"loss": 1.3651, "grad_norm": 0.943540632724762, "learning_rate": 0.0002, "epoch": 2.591386174448064, "step": 3580}, {"loss": 1.3034, "grad_norm": 0.7707241773605347, "learning_rate": 0.0002, "epoch": 2.5986246833152373, "step": 3590}, {"loss": 1.3063, "grad_norm": 0.6705001592636108, "learning_rate": 0.0002, "epoch": 2.6058631921824107, "step": 3600}, {"loss": 1.2437, "grad_norm": 0.6360933780670166, "learning_rate": 0.0002, "epoch": 2.613101701049584, "step": 3610}, {"loss": 1.1844, "grad_norm": 0.5846424698829651, "learning_rate": 0.0002, "epoch": 2.6203402099167574, "step": 3620}, {"loss": 1.3674, "grad_norm": 0.5958625674247742, "learning_rate": 0.0002, "epoch": 2.6275787187839303, "step": 3630}, {"loss": 1.3599, "grad_norm": 0.6819243431091309, "learning_rate": 0.0002, "epoch": 2.6348172276511037, "step": 3640}, {"loss": 1.3884, "grad_norm": 0.7033445835113525, "learning_rate": 0.0002, "epoch": 2.642055736518277, "step": 3650}, {"loss": 1.3392, "grad_norm": 0.6134849786758423, "learning_rate": 0.0002, "epoch": 2.6492942453854504, "step": 3660}, {"loss": 1.2661, "grad_norm": 0.658009946346283, "learning_rate": 0.0002, "epoch": 2.656532754252624, "step": 3670}, {"loss": 1.3987, "grad_norm": 0.6280999779701233, "learning_rate": 0.0002, "epoch": 2.663771263119797, "step": 3680}, {"loss": 1.2995, "grad_norm": 0.5536085963249207, "learning_rate": 0.0002, "epoch": 2.6710097719869705, "step": 3690}, {"loss": 1.2044, "grad_norm": 0.8603981733322144, "learning_rate": 0.0002, "epoch": 2.678248280854144, "step": 3700}, {"loss": 1.3879, "grad_norm": 0.5509994626045227, "learning_rate": 0.0002, "epoch": 2.6854867897213173, "step": 3710}, {"loss": 1.3253, "grad_norm": 0.9093621969223022, "learning_rate": 0.0002, "epoch": 2.6927252985884906, "step": 3720}, {"loss": 1.2668, "grad_norm": 0.7525952458381653, "learning_rate": 0.0002, "epoch": 2.699963807455664, "step": 3730}, {"loss": 1.248, "grad_norm": 0.6737023591995239, "learning_rate": 0.0002, "epoch": 2.7072023163228374, "step": 3740}, {"loss": 1.2981, "grad_norm": 0.8656924962997437, "learning_rate": 0.0002, "epoch": 2.7144408251900107, "step": 3750}, {"loss": 1.2342, "grad_norm": 0.7494133114814758, "learning_rate": 0.0002, "epoch": 2.721679334057184, "step": 3760}, {"loss": 1.2417, "grad_norm": 0.5725520849227905, "learning_rate": 0.0002, "epoch": 2.7289178429243575, "step": 3770}, {"loss": 1.28, "grad_norm": 0.836412787437439, "learning_rate": 0.0002, "epoch": 2.736156351791531, "step": 3780}, {"loss": 1.3784, "grad_norm": 0.6893242597579956, "learning_rate": 0.0002, "epoch": 2.743394860658704, "step": 3790}, {"loss": 1.2929, "grad_norm": 0.6696223020553589, "learning_rate": 0.0002, "epoch": 2.7506333695258776, "step": 3800}, {"loss": 1.2449, "grad_norm": 0.6483015418052673, "learning_rate": 0.0002, "epoch": 2.757871878393051, "step": 3810}, {"loss": 1.3282, "grad_norm": 0.8084456920623779, "learning_rate": 0.0002, "epoch": 2.7651103872602243, "step": 3820}, {"loss": 1.3694, "grad_norm": 0.6601949334144592, "learning_rate": 0.0002, "epoch": 2.7723488961273977, "step": 3830}, {"loss": 1.3568, "grad_norm": 0.6905533671379089, "learning_rate": 0.0002, "epoch": 2.779587404994571, "step": 3840}, {"loss": 1.3854, "grad_norm": 0.619318425655365, "learning_rate": 0.0002, "epoch": 2.7868259138617444, "step": 3850}, {"loss": 1.2551, "grad_norm": 0.5994023084640503, "learning_rate": 0.0002, "epoch": 2.7940644227289178, "step": 3860}, {"loss": 1.2022, "grad_norm": 0.5627168416976929, "learning_rate": 0.0002, "epoch": 2.801302931596091, "step": 3870}, {"loss": 1.3921, "grad_norm": 0.6001605987548828, "learning_rate": 0.0002, "epoch": 2.8085414404632645, "step": 3880}, {"loss": 1.3026, "grad_norm": 0.6022412776947021, "learning_rate": 0.0002, "epoch": 2.815779949330438, "step": 3890}, {"loss": 1.2765, "grad_norm": 0.6832426190376282, "learning_rate": 0.0002, "epoch": 2.823018458197611, "step": 3900}, {"loss": 1.1363, "grad_norm": 0.5936811566352844, "learning_rate": 0.0002, "epoch": 2.8302569670647846, "step": 3910}, {"loss": 1.1707, "grad_norm": 0.6960572600364685, "learning_rate": 0.0002, "epoch": 2.837495475931958, "step": 3920}, {"loss": 1.4063, "grad_norm": 0.5913406610488892, "learning_rate": 0.0002, "epoch": 2.8447339847991313, "step": 3930}, {"loss": 1.3245, "grad_norm": 0.678154706954956, "learning_rate": 0.0002, "epoch": 2.8519724936663047, "step": 3940}, {"loss": 1.366, "grad_norm": 0.7898936867713928, "learning_rate": 0.0002, "epoch": 2.859211002533478, "step": 3950}, {"loss": 1.3948, "grad_norm": 0.9234195351600647, "learning_rate": 0.0002, "epoch": 2.8664495114006514, "step": 3960}, {"loss": 1.2773, "grad_norm": 0.5960825085639954, "learning_rate": 0.0002, "epoch": 2.8736880202678248, "step": 3970}, {"loss": 1.3127, "grad_norm": 0.677118182182312, "learning_rate": 0.0002, "epoch": 2.880926529134998, "step": 3980}, {"loss": 1.2652, "grad_norm": 0.6505142450332642, "learning_rate": 0.0002, "epoch": 2.8881650380021715, "step": 3990}, {"loss": 1.2078, "grad_norm": 0.550826907157898, "learning_rate": 0.0002, "epoch": 2.895403546869345, "step": 4000}, {"loss": 1.1811, "grad_norm": 0.6209215521812439, "learning_rate": 0.0002, "epoch": 2.9026420557365182, "step": 4010}, {"loss": 1.4001, "grad_norm": 0.6549018025398254, "learning_rate": 0.0002, "epoch": 2.9098805646036916, "step": 4020}, {"loss": 1.2285, "grad_norm": 0.570682168006897, "learning_rate": 0.0002, "epoch": 2.917119073470865, "step": 4030}, {"loss": 1.0832, "grad_norm": 1.1807632446289062, "learning_rate": 0.0002, "epoch": 2.9243575823380383, "step": 4040}, {"loss": 1.2693, "grad_norm": 0.7058857679367065, "learning_rate": 0.0002, "epoch": 2.9315960912052117, "step": 4050}, {"loss": 1.2905, "grad_norm": 0.5542812943458557, "learning_rate": 0.0002, "epoch": 2.938834600072385, "step": 4060}, {"loss": 1.33, "grad_norm": 0.63167804479599, "learning_rate": 0.0002, "epoch": 2.9460731089395584, "step": 4070}, {"loss": 1.3075, "grad_norm": 0.5702962279319763, "learning_rate": 0.0002, "epoch": 2.953311617806732, "step": 4080}, {"loss": 1.2007, "grad_norm": 0.620944082736969, "learning_rate": 0.0002, "epoch": 2.960550126673905, "step": 4090}, {"loss": 1.2864, "grad_norm": 0.5866289734840393, "learning_rate": 0.0002, "epoch": 2.9677886355410785, "step": 4100}, {"loss": 1.3293, "grad_norm": 0.560170590877533, "learning_rate": 0.0002, "epoch": 2.975027144408252, "step": 4110}, {"loss": 1.2071, "grad_norm": 0.675082802772522, "learning_rate": 0.0002, "epoch": 2.9822656532754253, "step": 4120}, {"loss": 1.2981, "grad_norm": 0.62708580493927, "learning_rate": 0.0002, "epoch": 2.9895041621425986, "step": 4130}, {"loss": 1.2758, "grad_norm": 0.7893929481506348, "learning_rate": 0.0002, "epoch": 2.996742671009772, "step": 4140}, {"eval_loss": 1.4217946529388428, "eval_runtime": 27.1596, "eval_samples_per_second": 16.053, "eval_steps_per_second": 2.025, "epoch": 2.9996380745566413, "step": 4144}, {"loss": 1.2152, "grad_norm": 0.7043836116790771, "learning_rate": 0.0002, "epoch": 3.0039811798769454, "step": 4150}, {"loss": 1.1664, "grad_norm": 0.6806283593177795, "learning_rate": 0.0002, "epoch": 3.0112196887441187, "step": 4160}, {"loss": 1.292, "grad_norm": 0.7684550285339355, "learning_rate": 0.0002, "epoch": 3.018458197611292, "step": 4170}, {"loss": 1.3467, "grad_norm": 0.7895237803459167, "learning_rate": 0.0002, "epoch": 3.0256967064784654, "step": 4180}, {"loss": 1.1324, "grad_norm": 0.7464531064033508, "learning_rate": 0.0002, "epoch": 3.032935215345639, "step": 4190}, {"loss": 1.1614, "grad_norm": 0.9358500838279724, "learning_rate": 0.0002, "epoch": 3.040173724212812, "step": 4200}, {"loss": 1.1834, "grad_norm": 1.1066628694534302, "learning_rate": 0.0002, "epoch": 3.0474122330799855, "step": 4210}, {"loss": 1.1557, "grad_norm": 0.6663267612457275, "learning_rate": 0.0002, "epoch": 3.054650741947159, "step": 4220}, {"loss": 1.1707, "grad_norm": 0.6669464707374573, "learning_rate": 0.0002, "epoch": 3.0618892508143323, "step": 4230}, {"loss": 1.1841, "grad_norm": 0.7052164077758789, "learning_rate": 0.0002, "epoch": 3.0691277596815056, "step": 4240}, {"loss": 1.2913, "grad_norm": 0.6118432879447937, "learning_rate": 0.0002, "epoch": 3.076366268548679, "step": 4250}, {"loss": 1.1526, "grad_norm": 0.6915903687477112, "learning_rate": 0.0002, "epoch": 3.0836047774158524, "step": 4260}, {"loss": 1.1348, "grad_norm": 0.7441644668579102, "learning_rate": 0.0002, "epoch": 3.0908432862830257, "step": 4270}, {"loss": 1.1672, "grad_norm": 0.823850691318512, "learning_rate": 0.0002, "epoch": 3.098081795150199, "step": 4280}, {"loss": 1.2655, "grad_norm": 0.9677883386611938, "learning_rate": 0.0002, "epoch": 3.1053203040173725, "step": 4290}, {"loss": 1.1794, "grad_norm": 0.7002579569816589, "learning_rate": 0.0002, "epoch": 3.112558812884546, "step": 4300}, {"loss": 1.135, "grad_norm": 0.778789758682251, "learning_rate": 0.0002, "epoch": 3.119797321751719, "step": 4310}, {"loss": 1.0818, "grad_norm": 0.7236007452011108, "learning_rate": 0.0002, "epoch": 3.1270358306188926, "step": 4320}, {"loss": 1.1803, "grad_norm": 0.8809133768081665, "learning_rate": 0.0002, "epoch": 3.134274339486066, "step": 4330}, {"loss": 1.2571, "grad_norm": 0.7924913167953491, "learning_rate": 0.0002, "epoch": 3.1415128483532393, "step": 4340}, {"loss": 1.1413, "grad_norm": 0.7437422275543213, "learning_rate": 0.0002, "epoch": 3.1487513572204127, "step": 4350}, {"loss": 1.2088, "grad_norm": 0.6428450345993042, "learning_rate": 0.0002, "epoch": 3.155989866087586, "step": 4360}, {"loss": 1.3032, "grad_norm": 0.7922873497009277, "learning_rate": 0.0002, "epoch": 3.1632283749547594, "step": 4370}, {"loss": 1.216, "grad_norm": 0.5252506732940674, "learning_rate": 0.0002, "epoch": 3.1704668838219328, "step": 4380}, {"loss": 1.1297, "grad_norm": 0.8570457696914673, "learning_rate": 0.0002, "epoch": 3.177705392689106, "step": 4390}, {"loss": 1.0994, "grad_norm": 0.7218987345695496, "learning_rate": 0.0002, "epoch": 3.1849439015562795, "step": 4400}, {"loss": 1.2891, "grad_norm": 0.6921393275260925, "learning_rate": 0.0002, "epoch": 3.192182410423453, "step": 4410}, {"loss": 1.2668, "grad_norm": 0.7386137843132019, "learning_rate": 0.0002, "epoch": 3.199420919290626, "step": 4420}, {"loss": 1.1654, "grad_norm": 0.6227759122848511, "learning_rate": 0.0002, "epoch": 3.2066594281577996, "step": 4430}, {"loss": 1.1752, "grad_norm": 0.7180278897285461, "learning_rate": 0.0002, "epoch": 3.213897937024973, "step": 4440}, {"loss": 1.1757, "grad_norm": 0.745830774307251, "learning_rate": 0.0002, "epoch": 3.2211364458921463, "step": 4450}, {"loss": 1.234, "grad_norm": 0.6766072511672974, "learning_rate": 0.0002, "epoch": 3.2283749547593197, "step": 4460}, {"loss": 1.1999, "grad_norm": 0.8325067162513733, "learning_rate": 0.0002, "epoch": 3.235613463626493, "step": 4470}, {"loss": 1.1606, "grad_norm": 0.7148305177688599, "learning_rate": 0.0002, "epoch": 3.2428519724936664, "step": 4480}, {"loss": 1.1383, "grad_norm": 0.7752676010131836, "learning_rate": 0.0002, "epoch": 3.25009048136084, "step": 4490}, {"loss": 1.3006, "grad_norm": 0.6776860952377319, "learning_rate": 0.0002, "epoch": 3.257328990228013, "step": 4500}, {"loss": 1.0796, "grad_norm": 0.704359769821167, "learning_rate": 0.0002, "epoch": 3.2645674990951865, "step": 4510}, {"loss": 1.2496, "grad_norm": 0.6880282163619995, "learning_rate": 0.0002, "epoch": 3.27180600796236, "step": 4520}, {"loss": 1.0947, "grad_norm": 0.8179270029067993, "learning_rate": 0.0002, "epoch": 3.2790445168295332, "step": 4530}, {"loss": 1.1909, "grad_norm": 0.6718448996543884, "learning_rate": 0.0002, "epoch": 3.2862830256967066, "step": 4540}, {"loss": 1.2708, "grad_norm": 0.8300657868385315, "learning_rate": 0.0002, "epoch": 3.29352153456388, "step": 4550}, {"loss": 1.2594, "grad_norm": 0.6433690786361694, "learning_rate": 0.0002, "epoch": 3.3007600434310533, "step": 4560}, {"loss": 1.2479, "grad_norm": 0.690262496471405, "learning_rate": 0.0002, "epoch": 3.3079985522982267, "step": 4570}, {"loss": 1.1342, "grad_norm": 0.7022852301597595, "learning_rate": 0.0002, "epoch": 3.3152370611654, "step": 4580}, {"loss": 1.0844, "grad_norm": 0.6438387632369995, "learning_rate": 0.0002, "epoch": 3.3224755700325734, "step": 4590}, {"loss": 1.17, "grad_norm": 0.6866899132728577, "learning_rate": 0.0002, "epoch": 3.329714078899747, "step": 4600}, {"loss": 1.1289, "grad_norm": 0.8233968019485474, "learning_rate": 0.0002, "epoch": 3.33695258776692, "step": 4610}, {"loss": 1.1855, "grad_norm": 0.7251574993133545, "learning_rate": 0.0002, "epoch": 3.3441910966340935, "step": 4620}, {"loss": 1.3403, "grad_norm": 0.7855110168457031, "learning_rate": 0.0002, "epoch": 3.351429605501267, "step": 4630}, {"loss": 1.2922, "grad_norm": 0.8487356305122375, "learning_rate": 0.0002, "epoch": 3.3586681143684403, "step": 4640}, {"loss": 1.2462, "grad_norm": 0.6429011225700378, "learning_rate": 0.0002, "epoch": 3.3659066232356136, "step": 4650}, {"loss": 1.129, "grad_norm": 0.7095270156860352, "learning_rate": 0.0002, "epoch": 3.373145132102787, "step": 4660}, {"loss": 1.262, "grad_norm": 0.6792303323745728, "learning_rate": 0.0002, "epoch": 3.3803836409699604, "step": 4670}, {"loss": 1.256, "grad_norm": 0.6784825921058655, "learning_rate": 0.0002, "epoch": 3.3876221498371337, "step": 4680}, {"loss": 1.0838, "grad_norm": 0.6362888216972351, "learning_rate": 0.0002, "epoch": 3.394860658704307, "step": 4690}, {"loss": 1.2165, "grad_norm": 0.7794778943061829, "learning_rate": 0.0002, "epoch": 3.4020991675714805, "step": 4700}, {"loss": 1.0644, "grad_norm": 0.7287485003471375, "learning_rate": 0.0002, "epoch": 3.409337676438654, "step": 4710}, {"loss": 1.2925, "grad_norm": 0.6481451392173767, "learning_rate": 0.0002, "epoch": 3.416576185305827, "step": 4720}, {"loss": 1.2121, "grad_norm": 0.9200371503829956, "learning_rate": 0.0002, "epoch": 3.4238146941730006, "step": 4730}, {"loss": 1.072, "grad_norm": 1.074180245399475, "learning_rate": 0.0002, "epoch": 3.431053203040174, "step": 4740}, {"loss": 1.0421, "grad_norm": 0.6722986698150635, "learning_rate": 0.0002, "epoch": 3.438291711907347, "step": 4750}, {"loss": 1.2258, "grad_norm": 0.7945933938026428, "learning_rate": 0.0002, "epoch": 3.44553022077452, "step": 4760}, {"loss": 1.0927, "grad_norm": 0.7624640464782715, "learning_rate": 0.0002, "epoch": 3.4527687296416936, "step": 4770}, {"loss": 1.2428, "grad_norm": 0.7763656377792358, "learning_rate": 0.0002, "epoch": 3.460007238508867, "step": 4780}, {"loss": 1.2584, "grad_norm": 0.7736947536468506, "learning_rate": 0.0002, "epoch": 3.4672457473760403, "step": 4790}, {"loss": 1.1953, "grad_norm": 0.8450354933738708, "learning_rate": 0.0002, "epoch": 3.4744842562432137, "step": 4800}, {"loss": 1.1362, "grad_norm": 0.6480133533477783, "learning_rate": 0.0002, "epoch": 3.481722765110387, "step": 4810}, {"loss": 1.1882, "grad_norm": 0.8437445759773254, "learning_rate": 0.0002, "epoch": 3.4889612739775604, "step": 4820}, {"loss": 1.1519, "grad_norm": 0.7781730890274048, "learning_rate": 0.0002, "epoch": 3.4961997828447338, "step": 4830}, {"loss": 1.1836, "grad_norm": 0.8523228168487549, "learning_rate": 0.0002, "epoch": 3.503438291711907, "step": 4840}, {"loss": 1.1672, "grad_norm": 0.6236732006072998, "learning_rate": 0.0002, "epoch": 3.5106768005790805, "step": 4850}, {"loss": 1.1926, "grad_norm": 0.7500787377357483, "learning_rate": 0.0002, "epoch": 3.517915309446254, "step": 4860}, {"loss": 1.1998, "grad_norm": 0.7665374875068665, "learning_rate": 0.0002, "epoch": 3.5251538183134272, "step": 4870}, {"loss": 1.1551, "grad_norm": 0.787857711315155, "learning_rate": 0.0002, "epoch": 3.5323923271806006, "step": 4880}, {"loss": 1.2758, "grad_norm": 0.970595121383667, "learning_rate": 0.0002, "epoch": 3.539630836047774, "step": 4890}, {"loss": 1.1274, "grad_norm": 0.6409347057342529, "learning_rate": 0.0002, "epoch": 3.5468693449149473, "step": 4900}, {"loss": 1.1596, "grad_norm": 0.888551652431488, "learning_rate": 0.0002, "epoch": 3.5541078537821207, "step": 4910}, {"loss": 1.1644, "grad_norm": 1.0808377265930176, "learning_rate": 0.0002, "epoch": 3.561346362649294, "step": 4920}, {"loss": 1.2564, "grad_norm": 0.7501053214073181, "learning_rate": 0.0002, "epoch": 3.5685848715164674, "step": 4930}, {"loss": 1.2351, "grad_norm": 0.7375240325927734, "learning_rate": 0.0002, "epoch": 3.575823380383641, "step": 4940}, {"loss": 1.3568, "grad_norm": 0.7075039744377136, "learning_rate": 0.0002, "epoch": 3.583061889250814, "step": 4950}, {"loss": 1.3355, "grad_norm": 0.939337432384491, "learning_rate": 0.0002, "epoch": 3.5903003981179875, "step": 4960}, {"loss": 1.1722, "grad_norm": 0.6717396974563599, "learning_rate": 0.0002, "epoch": 3.597538906985161, "step": 4970}, {"loss": 1.1186, "grad_norm": 0.7141643762588501, "learning_rate": 0.0002, "epoch": 3.6047774158523342, "step": 4980}, {"loss": 1.1011, "grad_norm": 0.7109216451644897, "learning_rate": 0.0002, "epoch": 3.6120159247195076, "step": 4990}, {"loss": 1.2178, "grad_norm": 0.7020776867866516, "learning_rate": 0.0002, "epoch": 3.619254433586681, "step": 5000}, {"loss": 1.1939, "grad_norm": 0.7158873677253723, "learning_rate": 0.0002, "epoch": 3.6264929424538543, "step": 5010}, {"loss": 1.2624, "grad_norm": 0.7062035202980042, "learning_rate": 0.0002, "epoch": 3.6337314513210277, "step": 5020}, {"loss": 1.0224, "grad_norm": 0.7081155776977539, "learning_rate": 0.0002, "epoch": 3.640969960188201, "step": 5030}, {"loss": 1.2195, "grad_norm": 1.2210607528686523, "learning_rate": 0.0002, "epoch": 3.6482084690553744, "step": 5040}, {"loss": 1.2596, "grad_norm": 0.6650236248970032, "learning_rate": 0.0002, "epoch": 3.655446977922548, "step": 5050}, {"loss": 1.1072, "grad_norm": 0.6884829998016357, "learning_rate": 0.0002, "epoch": 3.662685486789721, "step": 5060}, {"loss": 1.2292, "grad_norm": 0.7317819595336914, "learning_rate": 0.0002, "epoch": 3.6699239956568945, "step": 5070}, {"loss": 1.1917, "grad_norm": 0.7406691908836365, "learning_rate": 0.0002, "epoch": 3.677162504524068, "step": 5080}, {"loss": 1.2949, "grad_norm": 0.9009454250335693, "learning_rate": 0.0002, "epoch": 3.6844010133912413, "step": 5090}, {"loss": 1.1528, "grad_norm": 0.8189385533332825, "learning_rate": 0.0002, "epoch": 3.6916395222584146, "step": 5100}, {"loss": 1.3408, "grad_norm": 1.0793628692626953, "learning_rate": 0.0002, "epoch": 3.698878031125588, "step": 5110}, {"loss": 1.2417, "grad_norm": 0.8593027591705322, "learning_rate": 0.0002, "epoch": 3.7061165399927614, "step": 5120}, {"loss": 1.2141, "grad_norm": 0.8481812477111816, "learning_rate": 0.0002, "epoch": 3.7133550488599347, "step": 5130}, {"loss": 1.125, "grad_norm": 0.6527451276779175, "learning_rate": 0.0002, "epoch": 3.720593557727108, "step": 5140}, {"loss": 1.1584, "grad_norm": 0.9220114350318909, "learning_rate": 0.0002, "epoch": 3.7278320665942815, "step": 5150}, {"loss": 1.2267, "grad_norm": 1.0842019319534302, "learning_rate": 0.0002, "epoch": 3.735070575461455, "step": 5160}, {"loss": 1.3083, "grad_norm": 0.965453565120697, "learning_rate": 0.0002, "epoch": 3.742309084328628, "step": 5170}, {"loss": 1.1772, "grad_norm": 0.9903319478034973, "learning_rate": 0.0002, "epoch": 3.7495475931958016, "step": 5180}, {"loss": 1.2515, "grad_norm": 0.7434818148612976, "learning_rate": 0.0002, "epoch": 3.756786102062975, "step": 5190}, {"loss": 1.2631, "grad_norm": 0.6717280745506287, "learning_rate": 0.0002, "epoch": 3.7640246109301483, "step": 5200}, {"loss": 1.2012, "grad_norm": 0.7754665613174438, "learning_rate": 0.0002, "epoch": 3.7712631197973217, "step": 5210}, {"loss": 1.305, "grad_norm": 1.028374433517456, "learning_rate": 0.0002, "epoch": 3.778501628664495, "step": 5220}, {"loss": 1.1866, "grad_norm": 0.6026996374130249, "learning_rate": 0.0002, "epoch": 3.7857401375316684, "step": 5230}, {"loss": 1.1901, "grad_norm": 0.6978490948677063, "learning_rate": 0.0002, "epoch": 3.7929786463988417, "step": 5240}, {"loss": 1.2576, "grad_norm": 0.7303446531295776, "learning_rate": 0.0002, "epoch": 3.800217155266015, "step": 5250}, {"loss": 1.3173, "grad_norm": 1.0734210014343262, "learning_rate": 0.0002, "epoch": 3.8074556641331885, "step": 5260}, {"loss": 1.1137, "grad_norm": 0.6383201479911804, "learning_rate": 0.0002, "epoch": 3.814694173000362, "step": 5270}, {"loss": 1.0904, "grad_norm": 0.7742630243301392, "learning_rate": 0.0002, "epoch": 3.821932681867535, "step": 5280}, {"loss": 1.2232, "grad_norm": 0.8477074503898621, "learning_rate": 0.0002, "epoch": 3.8291711907347086, "step": 5290}, {"loss": 1.2047, "grad_norm": 0.6675317883491516, "learning_rate": 0.0002, "epoch": 3.836409699601882, "step": 5300}, {"loss": 1.2275, "grad_norm": 0.7515445351600647, "learning_rate": 0.0002, "epoch": 3.8436482084690553, "step": 5310}, {"loss": 1.2569, "grad_norm": 1.1441220045089722, "learning_rate": 0.0002, "epoch": 3.8508867173362287, "step": 5320}, {"loss": 1.1512, "grad_norm": 0.7968795895576477, "learning_rate": 0.0002, "epoch": 3.858125226203402, "step": 5330}, {"loss": 1.232, "grad_norm": 0.7842824459075928, "learning_rate": 0.0002, "epoch": 3.8653637350705754, "step": 5340}, {"loss": 1.1847, "grad_norm": 0.8272225260734558, "learning_rate": 0.0002, "epoch": 3.8726022439377488, "step": 5350}, {"loss": 1.1381, "grad_norm": 0.8413397669792175, "learning_rate": 0.0002, "epoch": 3.879840752804922, "step": 5360}, {"loss": 1.2349, "grad_norm": 1.141764760017395, "learning_rate": 0.0002, "epoch": 3.8870792616720955, "step": 5370}, {"loss": 1.212, "grad_norm": 0.9826975464820862, "learning_rate": 0.0002, "epoch": 3.894317770539269, "step": 5380}, {"loss": 1.1833, "grad_norm": 0.8598255515098572, "learning_rate": 0.0002, "epoch": 3.9015562794064422, "step": 5390}, {"loss": 1.1247, "grad_norm": 0.6271058320999146, "learning_rate": 0.0002, "epoch": 3.9087947882736156, "step": 5400}, {"loss": 1.2212, "grad_norm": 0.6379870772361755, "learning_rate": 0.0002, "epoch": 3.916033297140789, "step": 5410}, {"loss": 1.2481, "grad_norm": 1.0313376188278198, "learning_rate": 0.0002, "epoch": 3.9232718060079623, "step": 5420}, {"loss": 1.1872, "grad_norm": 0.8220619559288025, "learning_rate": 0.0002, "epoch": 3.9305103148751357, "step": 5430}, {"loss": 1.2006, "grad_norm": 0.7576116919517517, "learning_rate": 0.0002, "epoch": 3.937748823742309, "step": 5440}, {"loss": 1.1969, "grad_norm": 1.226235032081604, "learning_rate": 0.0002, "epoch": 3.9449873326094824, "step": 5450}, {"loss": 1.2945, "grad_norm": 0.7979229688644409, "learning_rate": 0.0002, "epoch": 3.952225841476656, "step": 5460}, {"loss": 1.1922, "grad_norm": 0.9911929965019226, "learning_rate": 0.0002, "epoch": 3.959464350343829, "step": 5470}, {"loss": 1.0924, "grad_norm": 0.643738865852356, "learning_rate": 0.0002, "epoch": 3.9667028592110025, "step": 5480}, {"loss": 1.0607, "grad_norm": 0.682305634021759, "learning_rate": 0.0002, "epoch": 3.973941368078176, "step": 5490}, {"loss": 1.2908, "grad_norm": 1.18373441696167, "learning_rate": 0.0002, "epoch": 3.9811798769453492, "step": 5500}, {"loss": 1.0889, "grad_norm": 0.7190203070640564, "learning_rate": 0.0002, "epoch": 3.9884183858125226, "step": 5510}, {"loss": 1.2745, "grad_norm": 0.7516948580741882, "learning_rate": 0.0002, "epoch": 3.995656894679696, "step": 5520}, {"eval_loss": 1.4252897500991821, "eval_runtime": 27.235, "eval_samples_per_second": 16.009, "eval_steps_per_second": 2.019, "epoch": 4.0, "step": 5526}, {"loss": 1.0088, "grad_norm": 0.6353074312210083, "learning_rate": 0.0002, "epoch": 4.002895403546869, "step": 5530}, {"loss": 1.0326, "grad_norm": 0.7424906492233276, "learning_rate": 0.0002, "epoch": 4.010133912414043, "step": 5540}, {"loss": 1.0667, "grad_norm": 0.8856638073921204, "learning_rate": 0.0002, "epoch": 4.017372421281216, "step": 5550}, {"loss": 1.0905, "grad_norm": 0.9627974033355713, "learning_rate": 0.0002, "epoch": 4.024610930148389, "step": 5560}, {"loss": 1.0965, "grad_norm": 0.9048978686332703, "learning_rate": 0.0002, "epoch": 4.031849439015563, "step": 5570}, {"loss": 1.1108, "grad_norm": 0.921119213104248, "learning_rate": 0.0002, "epoch": 4.039087947882736, "step": 5580}, {"loss": 1.1235, "grad_norm": 0.8654361963272095, "learning_rate": 0.0002, "epoch": 4.0463264567499095, "step": 5590}, {"loss": 1.0794, "grad_norm": 0.7947945594787598, "learning_rate": 0.0002, "epoch": 4.053564965617083, "step": 5600}, {"loss": 1.0674, "grad_norm": 0.8307326436042786, "learning_rate": 0.0002, "epoch": 4.060803474484256, "step": 5610}, {"loss": 1.0076, "grad_norm": 0.793273389339447, "learning_rate": 0.0002, "epoch": 4.06804198335143, "step": 5620}, {"loss": 1.0651, "grad_norm": 0.8748673796653748, "learning_rate": 0.0002, "epoch": 4.075280492218603, "step": 5630}, {"loss": 1.111, "grad_norm": 0.7926856279373169, "learning_rate": 0.0002, "epoch": 4.082519001085776, "step": 5640}, {"loss": 1.044, "grad_norm": 0.922645092010498, "learning_rate": 0.0002, "epoch": 4.08975750995295, "step": 5650}, {"loss": 1.109, "grad_norm": 0.9539641737937927, "learning_rate": 0.0002, "epoch": 4.096996018820123, "step": 5660}, {"loss": 1.0788, "grad_norm": 0.8674443364143372, "learning_rate": 0.0002, "epoch": 4.1042345276872965, "step": 5670}, {"loss": 0.9867, "grad_norm": 0.7097609043121338, "learning_rate": 0.0002, "epoch": 4.11147303655447, "step": 5680}, {"loss": 1.1154, "grad_norm": 0.8875522613525391, "learning_rate": 0.0002, "epoch": 4.118711545421643, "step": 5690}, {"loss": 1.1217, "grad_norm": 0.8583634495735168, "learning_rate": 0.0002, "epoch": 4.125950054288817, "step": 5700}, {"loss": 1.0973, "grad_norm": 0.6736377477645874, "learning_rate": 0.0002, "epoch": 4.13318856315599, "step": 5710}, {"loss": 1.1199, "grad_norm": 0.9349062442779541, "learning_rate": 0.0002, "epoch": 4.140427072023163, "step": 5720}, {"loss": 1.0508, "grad_norm": 1.0610365867614746, "learning_rate": 0.0002, "epoch": 4.147665580890337, "step": 5730}, {"loss": 1.1146, "grad_norm": 1.5838189125061035, "learning_rate": 0.0002, "epoch": 4.15490408975751, "step": 5740}, {"loss": 1.0222, "grad_norm": 0.747522234916687, "learning_rate": 0.0002, "epoch": 4.162142598624683, "step": 5750}, {"loss": 1.1328, "grad_norm": 1.3247915506362915, "learning_rate": 0.0002, "epoch": 4.169381107491857, "step": 5760}, {"loss": 1.1655, "grad_norm": 0.8750247955322266, "learning_rate": 0.0002, "epoch": 4.17661961635903, "step": 5770}, {"loss": 1.199, "grad_norm": 0.7914144992828369, "learning_rate": 0.0002, "epoch": 4.1838581252262035, "step": 5780}, {"loss": 1.1213, "grad_norm": 0.9493299126625061, "learning_rate": 0.0002, "epoch": 4.191096634093377, "step": 5790}, {"loss": 1.1515, "grad_norm": 0.7802295088768005, "learning_rate": 0.0002, "epoch": 4.19833514296055, "step": 5800}, {"loss": 1.0704, "grad_norm": 0.6987314820289612, "learning_rate": 0.0002, "epoch": 4.205573651827724, "step": 5810}, {"loss": 1.1699, "grad_norm": 0.9220341444015503, "learning_rate": 0.0002, "epoch": 4.212812160694897, "step": 5820}, {"loss": 1.1394, "grad_norm": 0.8932939767837524, "learning_rate": 0.0002, "epoch": 4.22005066956207, "step": 5830}, {"loss": 1.0048, "grad_norm": 0.920002818107605, "learning_rate": 0.0002, "epoch": 4.227289178429244, "step": 5840}, {"loss": 0.964, "grad_norm": 0.6662752032279968, "learning_rate": 0.0002, "epoch": 4.234527687296417, "step": 5850}, {"loss": 0.986, "grad_norm": 0.8679718971252441, "learning_rate": 0.0002, "epoch": 4.24176619616359, "step": 5860}, {"loss": 0.8991, "grad_norm": 0.7020887732505798, "learning_rate": 0.0002, "epoch": 4.249004705030764, "step": 5870}, {"loss": 1.1132, "grad_norm": 0.869611382484436, "learning_rate": 0.0002, "epoch": 4.256243213897937, "step": 5880}, {"loss": 1.1026, "grad_norm": 0.7796585559844971, "learning_rate": 0.0002, "epoch": 4.2634817227651105, "step": 5890}, {"loss": 1.0957, "grad_norm": 0.8978819251060486, "learning_rate": 0.0002, "epoch": 4.270720231632284, "step": 5900}, {"loss": 1.1325, "grad_norm": 1.0837205648422241, "learning_rate": 0.0002, "epoch": 4.277958740499457, "step": 5910}, {"loss": 1.1279, "grad_norm": 0.7584353089332581, "learning_rate": 0.0002, "epoch": 4.285197249366631, "step": 5920}, {"loss": 1.0513, "grad_norm": 0.7313185334205627, "learning_rate": 0.0002, "epoch": 4.292435758233804, "step": 5930}, {"loss": 1.1101, "grad_norm": 0.8004671335220337, "learning_rate": 0.0002, "epoch": 4.299674267100977, "step": 5940}, {"loss": 1.14, "grad_norm": 2.154958724975586, "learning_rate": 0.0002, "epoch": 4.306912775968151, "step": 5950}, {"loss": 1.1206, "grad_norm": 0.9163479804992676, "learning_rate": 0.0002, "epoch": 4.314151284835324, "step": 5960}, {"loss": 0.9941, "grad_norm": 0.9151589274406433, "learning_rate": 0.0002, "epoch": 4.321389793702497, "step": 5970}, {"loss": 1.0606, "grad_norm": 0.8624112010002136, "learning_rate": 0.0002, "epoch": 4.328628302569671, "step": 5980}, {"loss": 1.1625, "grad_norm": 0.9357741475105286, "learning_rate": 0.0002, "epoch": 4.335866811436844, "step": 5990}, {"loss": 1.0712, "grad_norm": 1.3482335805892944, "learning_rate": 0.0002, "epoch": 4.3431053203040175, "step": 6000}, {"loss": 1.1224, "grad_norm": 0.7156149744987488, "learning_rate": 0.0002, "epoch": 4.350343829171191, "step": 6010}, {"loss": 1.0753, "grad_norm": 0.8480049967765808, "learning_rate": 0.0002, "epoch": 4.357582338038364, "step": 6020}, {"loss": 1.051, "grad_norm": 0.8262244462966919, "learning_rate": 0.0002, "epoch": 4.364820846905538, "step": 6030}, {"loss": 0.9966, "grad_norm": 0.7733905911445618, "learning_rate": 0.0002, "epoch": 4.372059355772711, "step": 6040}, {"loss": 1.1008, "grad_norm": 0.8553919792175293, "learning_rate": 0.0002, "epoch": 4.379297864639884, "step": 6050}, {"loss": 1.1777, "grad_norm": 0.8666832447052002, "learning_rate": 0.0002, "epoch": 4.386536373507058, "step": 6060}, {"loss": 1.1934, "grad_norm": 0.9168295860290527, "learning_rate": 0.0002, "epoch": 4.393774882374231, "step": 6070}, {"loss": 1.0988, "grad_norm": 0.7315238118171692, "learning_rate": 0.0002, "epoch": 4.4010133912414044, "step": 6080}, {"loss": 1.1599, "grad_norm": 1.020263433456421, "learning_rate": 0.0002, "epoch": 4.408251900108578, "step": 6090}, {"loss": 1.133, "grad_norm": 0.9978243708610535, "learning_rate": 0.0002, "epoch": 4.415490408975751, "step": 6100}, {"loss": 1.1324, "grad_norm": 0.995453953742981, "learning_rate": 0.0002, "epoch": 4.4227289178429245, "step": 6110}, {"loss": 1.0957, "grad_norm": 0.9360884428024292, "learning_rate": 0.0002, "epoch": 4.429967426710098, "step": 6120}, {"loss": 0.9506, "grad_norm": 0.8099448084831238, "learning_rate": 0.0002, "epoch": 4.437205935577271, "step": 6130}, {"loss": 1.0887, "grad_norm": 0.8173841238021851, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 6140}, {"loss": 1.1219, "grad_norm": 0.7972666025161743, "learning_rate": 0.0002, "epoch": 4.451682953311618, "step": 6150}, {"loss": 1.0226, "grad_norm": 0.7685779333114624, "learning_rate": 0.0002, "epoch": 4.458921462178791, "step": 6160}, {"loss": 1.0732, "grad_norm": 0.7872623801231384, "learning_rate": 0.0002, "epoch": 4.466159971045965, "step": 6170}, {"loss": 0.9911, "grad_norm": 0.7677070498466492, "learning_rate": 0.0002, "epoch": 4.473398479913138, "step": 6180}, {"loss": 1.0919, "grad_norm": 0.7878316044807434, "learning_rate": 0.0002, "epoch": 4.4806369887803115, "step": 6190}, {"loss": 1.018, "grad_norm": 0.8178079724311829, "learning_rate": 0.0002, "epoch": 4.487875497647485, "step": 6200}, {"loss": 1.0517, "grad_norm": 1.2820082902908325, "learning_rate": 0.0002, "epoch": 4.495114006514658, "step": 6210}, {"loss": 1.3101, "grad_norm": 0.9380832314491272, "learning_rate": 0.0002, "epoch": 4.502352515381832, "step": 6220}, {"loss": 0.9818, "grad_norm": 0.7810422778129578, "learning_rate": 0.0002, "epoch": 4.509591024249005, "step": 6230}, {"loss": 1.1677, "grad_norm": 1.1022917032241821, "learning_rate": 0.0002, "epoch": 4.516829533116178, "step": 6240}, {"loss": 1.1579, "grad_norm": 1.4275553226470947, "learning_rate": 0.0002, "epoch": 4.524068041983352, "step": 6250}, {"loss": 1.3237, "grad_norm": 0.7597777247428894, "learning_rate": 0.0002, "epoch": 4.531306550850525, "step": 6260}, {"loss": 1.1529, "grad_norm": 1.10992431640625, "learning_rate": 0.0002, "epoch": 4.538545059717698, "step": 6270}, {"loss": 1.0732, "grad_norm": 0.8981178998947144, "learning_rate": 0.0002, "epoch": 4.545783568584872, "step": 6280}, {"loss": 1.086, "grad_norm": 0.7863979339599609, "learning_rate": 0.0002, "epoch": 4.553022077452045, "step": 6290}, {"loss": 1.2008, "grad_norm": 0.9071474671363831, "learning_rate": 0.0002, "epoch": 4.5602605863192185, "step": 6300}, {"loss": 1.0916, "grad_norm": 0.7429424524307251, "learning_rate": 0.0002, "epoch": 4.567499095186392, "step": 6310}, {"loss": 1.095, "grad_norm": 1.0767850875854492, "learning_rate": 0.0002, "epoch": 4.574737604053565, "step": 6320}, {"loss": 1.1023, "grad_norm": 0.7885915637016296, "learning_rate": 0.0002, "epoch": 4.581976112920739, "step": 6330}, {"loss": 1.1131, "grad_norm": 0.8350457549095154, "learning_rate": 0.0002, "epoch": 4.589214621787912, "step": 6340}, {"loss": 1.0743, "grad_norm": 0.7853530645370483, "learning_rate": 0.0002, "epoch": 4.596453130655085, "step": 6350}, {"loss": 1.1912, "grad_norm": 1.1220661401748657, "learning_rate": 0.0002, "epoch": 4.603691639522259, "step": 6360}, {"loss": 1.0927, "grad_norm": 0.7959423065185547, "learning_rate": 0.0002, "epoch": 4.610930148389432, "step": 6370}, {"loss": 1.1542, "grad_norm": 0.7782652378082275, "learning_rate": 0.0002, "epoch": 4.618168657256605, "step": 6380}, {"loss": 1.0753, "grad_norm": 0.7882203459739685, "learning_rate": 0.0002, "epoch": 4.625407166123779, "step": 6390}, {"loss": 1.0676, "grad_norm": 0.8841899037361145, "learning_rate": 0.0002, "epoch": 4.632645674990952, "step": 6400}, {"loss": 1.0815, "grad_norm": 0.7936127781867981, "learning_rate": 0.0002, "epoch": 4.6398841838581255, "step": 6410}, {"loss": 1.0198, "grad_norm": 0.9213966131210327, "learning_rate": 0.0002, "epoch": 4.647122692725299, "step": 6420}, {"loss": 0.9872, "grad_norm": 0.9246473908424377, "learning_rate": 0.0002, "epoch": 4.654361201592472, "step": 6430}, {"loss": 1.1309, "grad_norm": 0.766572892665863, "learning_rate": 0.0002, "epoch": 4.661599710459646, "step": 6440}, {"loss": 1.1095, "grad_norm": 0.8596171736717224, "learning_rate": 0.0002, "epoch": 4.668838219326819, "step": 6450}, {"loss": 1.1869, "grad_norm": 0.8482751846313477, "learning_rate": 0.0002, "epoch": 4.676076728193992, "step": 6460}, {"loss": 1.0622, "grad_norm": 1.0826905965805054, "learning_rate": 0.0002, "epoch": 4.683315237061166, "step": 6470}, {"loss": 1.0256, "grad_norm": 1.1048457622528076, "learning_rate": 0.0002, "epoch": 4.690553745928339, "step": 6480}, {"loss": 1.0514, "grad_norm": 0.9429134726524353, "learning_rate": 0.0002, "epoch": 4.697792254795512, "step": 6490}, {"loss": 1.1351, "grad_norm": 0.8587502837181091, "learning_rate": 0.0002, "epoch": 4.705030763662686, "step": 6500}, {"loss": 1.0969, "grad_norm": 1.0387083292007446, "learning_rate": 0.0002, "epoch": 4.712269272529859, "step": 6510}, {"loss": 1.0493, "grad_norm": 0.7471951842308044, "learning_rate": 0.0002, "epoch": 4.7195077813970325, "step": 6520}, {"loss": 1.2632, "grad_norm": 0.8800424933433533, "learning_rate": 0.0002, "epoch": 4.726746290264206, "step": 6530}, {"loss": 1.2126, "grad_norm": 0.8136811852455139, "learning_rate": 0.0002, "epoch": 4.733984799131379, "step": 6540}, {"loss": 1.195, "grad_norm": 0.9910339713096619, "learning_rate": 0.0002, "epoch": 4.741223307998553, "step": 6550}, {"loss": 1.1201, "grad_norm": 1.0679163932800293, "learning_rate": 0.0002, "epoch": 4.748461816865726, "step": 6560}, {"loss": 1.0297, "grad_norm": 0.8468248248100281, "learning_rate": 0.0002, "epoch": 4.755700325732899, "step": 6570}, {"loss": 1.0858, "grad_norm": 0.8771235942840576, "learning_rate": 0.0002, "epoch": 4.762938834600073, "step": 6580}, {"loss": 1.077, "grad_norm": 0.7024846076965332, "learning_rate": 0.0002, "epoch": 4.770177343467246, "step": 6590}, {"loss": 1.0876, "grad_norm": 0.7836683392524719, "learning_rate": 0.0002, "epoch": 4.7774158523344195, "step": 6600}, {"loss": 1.1006, "grad_norm": 0.7717288136482239, "learning_rate": 0.0002, "epoch": 4.784654361201593, "step": 6610}, {"loss": 1.0376, "grad_norm": 0.884183943271637, "learning_rate": 0.0002, "epoch": 4.791892870068766, "step": 6620}, {"loss": 1.1757, "grad_norm": 1.383867621421814, "learning_rate": 0.0002, "epoch": 4.7991313789359396, "step": 6630}, {"loss": 1.0861, "grad_norm": 0.9741523861885071, "learning_rate": 0.0002, "epoch": 4.806369887803113, "step": 6640}, {"loss": 1.0884, "grad_norm": 0.9723693132400513, "learning_rate": 0.0002, "epoch": 4.813608396670286, "step": 6650}, {"loss": 1.2203, "grad_norm": 1.8324809074401855, "learning_rate": 0.0002, "epoch": 4.82084690553746, "step": 6660}, {"loss": 1.0292, "grad_norm": 0.904909074306488, "learning_rate": 0.0002, "epoch": 4.828085414404633, "step": 6670}, {"loss": 1.0349, "grad_norm": 0.7355411648750305, "learning_rate": 0.0002, "epoch": 4.835323923271806, "step": 6680}, {"loss": 1.0793, "grad_norm": 0.8934960961341858, "learning_rate": 0.0002, "epoch": 4.84256243213898, "step": 6690}, {"loss": 1.0375, "grad_norm": 1.4596954584121704, "learning_rate": 0.0002, "epoch": 4.849800941006153, "step": 6700}, {"loss": 1.1065, "grad_norm": 0.8310341238975525, "learning_rate": 0.0002, "epoch": 4.8570394498733265, "step": 6710}, {"loss": 1.1089, "grad_norm": 0.9709894061088562, "learning_rate": 0.0002, "epoch": 4.8642779587405, "step": 6720}, {"loss": 1.0069, "grad_norm": 0.852142333984375, "learning_rate": 0.0002, "epoch": 4.871516467607673, "step": 6730}, {"loss": 1.0507, "grad_norm": 1.0643625259399414, "learning_rate": 0.0002, "epoch": 4.878754976474847, "step": 6740}, {"loss": 1.056, "grad_norm": 0.9419508576393127, "learning_rate": 0.0002, "epoch": 4.88599348534202, "step": 6750}, {"loss": 1.1995, "grad_norm": 1.1818498373031616, "learning_rate": 0.0002, "epoch": 4.893231994209193, "step": 6760}, {"loss": 1.0925, "grad_norm": 0.9369569420814514, "learning_rate": 0.0002, "epoch": 4.900470503076367, "step": 6770}, {"loss": 1.1648, "grad_norm": 0.7012579441070557, "learning_rate": 0.0002, "epoch": 4.90770901194354, "step": 6780}, {"loss": 1.0926, "grad_norm": 0.9109319448471069, "learning_rate": 0.0002, "epoch": 4.914947520810713, "step": 6790}, {"loss": 1.0358, "grad_norm": 0.8077534437179565, "learning_rate": 0.0002, "epoch": 4.922186029677887, "step": 6800}, {"loss": 1.2549, "grad_norm": 0.7571148872375488, "learning_rate": 0.0002, "epoch": 4.92942453854506, "step": 6810}, {"loss": 0.9638, "grad_norm": 0.7325633764266968, "learning_rate": 0.0002, "epoch": 4.9366630474122335, "step": 6820}, {"loss": 1.0128, "grad_norm": 0.8465084433555603, "learning_rate": 0.0002, "epoch": 4.943901556279407, "step": 6830}, {"loss": 1.153, "grad_norm": 0.8753737807273865, "learning_rate": 0.0002, "epoch": 4.95114006514658, "step": 6840}, {"loss": 1.0247, "grad_norm": 0.9421748518943787, "learning_rate": 0.0002, "epoch": 4.958378574013754, "step": 6850}, {"loss": 1.1483, "grad_norm": 0.8245896697044373, "learning_rate": 0.0002, "epoch": 4.965617082880927, "step": 6860}, {"loss": 0.9905, "grad_norm": 0.8823089599609375, "learning_rate": 0.0002, "epoch": 4.9728555917481, "step": 6870}, {"loss": 1.1664, "grad_norm": 0.8406389355659485, "learning_rate": 0.0002, "epoch": 4.980094100615274, "step": 6880}, {"loss": 1.0944, "grad_norm": 0.9732868075370789, "learning_rate": 0.0002, "epoch": 4.987332609482447, "step": 6890}, {"loss": 1.1776, "grad_norm": 2.125141143798828, "learning_rate": 0.0002, "epoch": 4.99457111834962, "step": 6900}, {"eval_loss": 1.445176601409912, "eval_runtime": 27.2351, "eval_samples_per_second": 16.009, "eval_steps_per_second": 2.019, "epoch": 4.999638074556641, "step": 6907}, {"loss": 1.1362, "grad_norm": 0.9465792775154114, "learning_rate": 0.0002, "epoch": 5.001809627216793, "step": 6910}, {"loss": 0.982, "grad_norm": 1.2834891080856323, "learning_rate": 0.0002, "epoch": 5.009048136083966, "step": 6920}, {"loss": 0.9803, "grad_norm": 1.0297378301620483, "learning_rate": 0.0002, "epoch": 5.01628664495114, "step": 6930}, {"loss": 1.0447, "grad_norm": 1.1705161333084106, "learning_rate": 0.0002, "epoch": 5.023525153818313, "step": 6940}, {"loss": 1.0113, "grad_norm": 0.8293961882591248, "learning_rate": 0.0002, "epoch": 5.030763662685486, "step": 6950}, {"loss": 0.9203, "grad_norm": 1.0422210693359375, "learning_rate": 0.0002, "epoch": 5.03800217155266, "step": 6960}, {"loss": 1.0553, "grad_norm": 1.116104245185852, "learning_rate": 0.0002, "epoch": 5.045240680419833, "step": 6970}, {"loss": 0.9011, "grad_norm": 1.5118416547775269, "learning_rate": 0.0002, "epoch": 5.0524791892870065, "step": 6980}, {"loss": 0.9969, "grad_norm": 0.8383979797363281, "learning_rate": 0.0002, "epoch": 5.05971769815418, "step": 6990}, {"loss": 0.9659, "grad_norm": 1.3378649950027466, "learning_rate": 0.0002, "epoch": 5.066956207021353, "step": 7000}, {"loss": 1.0212, "grad_norm": 1.1840510368347168, "learning_rate": 0.0002, "epoch": 5.0741947158885266, "step": 7010}, {"loss": 0.9939, "grad_norm": 1.2354751825332642, "learning_rate": 0.0002, "epoch": 5.0814332247557, "step": 7020}, {"loss": 0.9831, "grad_norm": 1.3830451965332031, "learning_rate": 0.0002, "epoch": 5.088671733622873, "step": 7030}, {"loss": 1.1827, "grad_norm": 0.8101674318313599, "learning_rate": 0.0002, "epoch": 5.095910242490047, "step": 7040}, {"loss": 0.9255, "grad_norm": 0.897982656955719, "learning_rate": 0.0002, "epoch": 5.10314875135722, "step": 7050}, {"loss": 0.8784, "grad_norm": 1.2049678564071655, "learning_rate": 0.0002, "epoch": 5.110387260224393, "step": 7060}, {"loss": 1.0182, "grad_norm": 1.5912116765975952, "learning_rate": 0.0002, "epoch": 5.117625769091567, "step": 7070}, {"loss": 1.0909, "grad_norm": 0.9261530041694641, "learning_rate": 0.0002, "epoch": 5.12486427795874, "step": 7080}, {"loss": 0.9603, "grad_norm": 1.1454812288284302, "learning_rate": 0.0002, "epoch": 5.1321027868259135, "step": 7090}, {"loss": 0.9149, "grad_norm": 1.0049978494644165, "learning_rate": 0.0002, "epoch": 5.139341295693087, "step": 7100}, {"loss": 0.9463, "grad_norm": 1.4513251781463623, "learning_rate": 0.0002, "epoch": 5.14657980456026, "step": 7110}, {"loss": 0.8995, "grad_norm": 0.9800849556922913, "learning_rate": 0.0002, "epoch": 5.153818313427434, "step": 7120}, {"loss": 0.9835, "grad_norm": 0.9698708653450012, "learning_rate": 0.0002, "epoch": 5.161056822294607, "step": 7130}, {"loss": 0.9672, "grad_norm": 1.1126646995544434, "learning_rate": 0.0002, "epoch": 5.16829533116178, "step": 7140}, {"loss": 0.9384, "grad_norm": 0.9248330593109131, "learning_rate": 0.0002, "epoch": 5.175533840028954, "step": 7150}, {"loss": 0.826, "grad_norm": 0.7967255711555481, "learning_rate": 0.0002, "epoch": 5.182772348896127, "step": 7160}, {"loss": 1.0078, "grad_norm": 0.9933333992958069, "learning_rate": 0.0002, "epoch": 5.1900108577633, "step": 7170}, {"loss": 1.0276, "grad_norm": 1.0080649852752686, "learning_rate": 0.0002, "epoch": 5.197249366630474, "step": 7180}, {"loss": 1.0201, "grad_norm": 1.3954921960830688, "learning_rate": 0.0002, "epoch": 5.204487875497647, "step": 7190}, {"loss": 1.0863, "grad_norm": 1.2386271953582764, "learning_rate": 0.0002, "epoch": 5.2117263843648205, "step": 7200}, {"loss": 0.8863, "grad_norm": 1.2379488945007324, "learning_rate": 0.0002, "epoch": 5.218964893231994, "step": 7210}, {"loss": 1.0518, "grad_norm": 0.9882503747940063, "learning_rate": 0.0002, "epoch": 5.226203402099167, "step": 7220}, {"loss": 0.9834, "grad_norm": 1.1728729009628296, "learning_rate": 0.0002, "epoch": 5.233441910966341, "step": 7230}, {"loss": 0.9269, "grad_norm": 0.9849673509597778, "learning_rate": 0.0002, "epoch": 5.240680419833514, "step": 7240}, {"loss": 0.9935, "grad_norm": 1.177639365196228, "learning_rate": 0.0002, "epoch": 5.247918928700687, "step": 7250}, {"loss": 1.0639, "grad_norm": 1.2395055294036865, "learning_rate": 0.0002, "epoch": 5.255157437567861, "step": 7260}, {"loss": 1.0138, "grad_norm": 1.3999171257019043, "learning_rate": 0.0002, "epoch": 5.262395946435034, "step": 7270}, {"loss": 0.9745, "grad_norm": 0.7698732018470764, "learning_rate": 0.0002, "epoch": 5.269634455302207, "step": 7280}, {"loss": 1.0389, "grad_norm": 0.9167453646659851, "learning_rate": 0.0002, "epoch": 5.276872964169381, "step": 7290}, {"loss": 0.9858, "grad_norm": 1.113830804824829, "learning_rate": 0.0002, "epoch": 5.284111473036554, "step": 7300}, {"loss": 0.9577, "grad_norm": 0.9644396901130676, "learning_rate": 0.0002, "epoch": 5.2913499819037275, "step": 7310}, {"loss": 1.0556, "grad_norm": 1.462435007095337, "learning_rate": 0.0002, "epoch": 5.298588490770901, "step": 7320}, {"loss": 0.871, "grad_norm": 0.9406287670135498, "learning_rate": 0.0002, "epoch": 5.305826999638074, "step": 7330}, {"loss": 1.0022, "grad_norm": 0.9698247909545898, "learning_rate": 0.0002, "epoch": 5.313065508505248, "step": 7340}, {"loss": 0.915, "grad_norm": 1.12003755569458, "learning_rate": 0.0002, "epoch": 5.320304017372421, "step": 7350}, {"loss": 0.9838, "grad_norm": 1.598681926727295, "learning_rate": 0.0002, "epoch": 5.327542526239594, "step": 7360}, {"loss": 1.0, "grad_norm": 1.0450010299682617, "learning_rate": 0.0002, "epoch": 5.334781035106768, "step": 7370}, {"loss": 0.9983, "grad_norm": 0.8680008053779602, "learning_rate": 0.0002, "epoch": 5.342019543973941, "step": 7380}, {"loss": 0.9851, "grad_norm": 1.0115476846694946, "learning_rate": 0.0002, "epoch": 5.349258052841114, "step": 7390}, {"loss": 1.0702, "grad_norm": 0.9589748382568359, "learning_rate": 0.0002, "epoch": 5.356496561708288, "step": 7400}, {"loss": 0.9366, "grad_norm": 0.6729998588562012, "learning_rate": 0.0002, "epoch": 5.363735070575461, "step": 7410}, {"loss": 1.0126, "grad_norm": 0.9246699213981628, "learning_rate": 0.0002, "epoch": 5.3709735794426345, "step": 7420}, {"loss": 0.9815, "grad_norm": 1.1266791820526123, "learning_rate": 0.0002, "epoch": 5.378212088309808, "step": 7430}, {"loss": 1.1166, "grad_norm": 1.8056942224502563, "learning_rate": 0.0002, "epoch": 5.385450597176981, "step": 7440}, {"loss": 0.9604, "grad_norm": 0.9802932739257812, "learning_rate": 0.0002, "epoch": 5.392689106044155, "step": 7450}, {"loss": 0.9656, "grad_norm": 1.0504707098007202, "learning_rate": 0.0002, "epoch": 5.399927614911328, "step": 7460}, {"loss": 1.0132, "grad_norm": 1.1915022134780884, "learning_rate": 0.0002, "epoch": 5.407166123778501, "step": 7470}, {"loss": 1.0041, "grad_norm": 1.1856611967086792, "learning_rate": 0.0002, "epoch": 5.414404632645675, "step": 7480}, {"loss": 0.9747, "grad_norm": 1.292152762413025, "learning_rate": 0.0002, "epoch": 5.421643141512848, "step": 7490}, {"loss": 0.9659, "grad_norm": 1.2675740718841553, "learning_rate": 0.0002, "epoch": 5.4288816503800215, "step": 7500}, {"loss": 1.0271, "grad_norm": 1.4034695625305176, "learning_rate": 0.0002, "epoch": 5.436120159247195, "step": 7510}, {"loss": 1.0318, "grad_norm": 0.984588623046875, "learning_rate": 0.0002, "epoch": 5.443358668114368, "step": 7520}, {"loss": 1.0726, "grad_norm": 0.8419108390808105, "learning_rate": 0.0002, "epoch": 5.450597176981542, "step": 7530}, {"loss": 1.0499, "grad_norm": 1.0270143747329712, "learning_rate": 0.0002, "epoch": 5.457835685848715, "step": 7540}, {"loss": 0.9804, "grad_norm": 2.2158689498901367, "learning_rate": 0.0002, "epoch": 5.465074194715888, "step": 7550}, {"loss": 0.9856, "grad_norm": 1.0740524530410767, "learning_rate": 0.0002, "epoch": 5.472312703583062, "step": 7560}, {"loss": 1.0522, "grad_norm": 1.3804482221603394, "learning_rate": 0.0002, "epoch": 5.479551212450235, "step": 7570}, {"loss": 1.0297, "grad_norm": 0.9428979754447937, "learning_rate": 0.0002, "epoch": 5.486789721317408, "step": 7580}, {"loss": 1.0906, "grad_norm": 0.9548295736312866, "learning_rate": 0.0002, "epoch": 5.494028230184582, "step": 7590}, {"loss": 0.8853, "grad_norm": 1.0691065788269043, "learning_rate": 0.0002, "epoch": 5.501266739051755, "step": 7600}, {"loss": 1.0375, "grad_norm": 1.0987380743026733, "learning_rate": 0.0002, "epoch": 5.5085052479189285, "step": 7610}, {"loss": 1.0162, "grad_norm": 0.9483979344367981, "learning_rate": 0.0002, "epoch": 5.515743756786102, "step": 7620}, {"loss": 1.105, "grad_norm": 1.16624915599823, "learning_rate": 0.0002, "epoch": 5.522982265653275, "step": 7630}, {"loss": 0.8695, "grad_norm": 0.8563777208328247, "learning_rate": 0.0002, "epoch": 5.530220774520449, "step": 7640}, {"loss": 0.9297, "grad_norm": 1.268186092376709, "learning_rate": 0.0002, "epoch": 5.537459283387622, "step": 7650}, {"loss": 1.1152, "grad_norm": 1.0752092599868774, "learning_rate": 0.0002, "epoch": 5.544697792254795, "step": 7660}, {"loss": 0.9344, "grad_norm": 1.210389256477356, "learning_rate": 0.0002, "epoch": 5.551936301121969, "step": 7670}, {"loss": 1.0349, "grad_norm": 1.669063925743103, "learning_rate": 0.0002, "epoch": 5.559174809989142, "step": 7680}, {"loss": 0.9833, "grad_norm": 1.038020133972168, "learning_rate": 0.0002, "epoch": 5.566413318856315, "step": 7690}, {"loss": 0.8907, "grad_norm": 1.316673994064331, "learning_rate": 0.0002, "epoch": 5.573651827723489, "step": 7700}, {"loss": 0.9614, "grad_norm": 1.029935359954834, "learning_rate": 0.0002, "epoch": 5.580890336590662, "step": 7710}, {"loss": 1.0409, "grad_norm": 0.9401940703392029, "learning_rate": 0.0002, "epoch": 5.5881288454578355, "step": 7720}, {"loss": 0.9272, "grad_norm": 2.4811816215515137, "learning_rate": 0.0002, "epoch": 5.595367354325009, "step": 7730}, {"loss": 0.992, "grad_norm": 1.0329105854034424, "learning_rate": 0.0002, "epoch": 5.602605863192182, "step": 7740}, {"loss": 0.9493, "grad_norm": 1.479629635810852, "learning_rate": 0.0002, "epoch": 5.609844372059356, "step": 7750}, {"loss": 1.0727, "grad_norm": 1.9232319593429565, "learning_rate": 0.0002, "epoch": 5.617082880926529, "step": 7760}, {"loss": 1.0741, "grad_norm": 1.0055509805679321, "learning_rate": 0.0002, "epoch": 5.624321389793702, "step": 7770}, {"loss": 1.0731, "grad_norm": 1.0037437677383423, "learning_rate": 0.0002, "epoch": 5.631559898660876, "step": 7780}, {"loss": 1.0913, "grad_norm": 1.4245030879974365, "learning_rate": 0.0002, "epoch": 5.638798407528049, "step": 7790}, {"loss": 0.9711, "grad_norm": 1.080687403678894, "learning_rate": 0.0002, "epoch": 5.646036916395222, "step": 7800}, {"loss": 1.0276, "grad_norm": 1.354953408241272, "learning_rate": 0.0002, "epoch": 5.653275425262396, "step": 7810}, {"loss": 1.0534, "grad_norm": 0.8966761231422424, "learning_rate": 0.0002, "epoch": 5.660513934129569, "step": 7820}, {"loss": 1.0662, "grad_norm": 1.0675480365753174, "learning_rate": 0.0002, "epoch": 5.6677524429967425, "step": 7830}, {"loss": 1.1077, "grad_norm": 1.2104216814041138, "learning_rate": 0.0002, "epoch": 5.674990951863916, "step": 7840}, {"loss": 0.9627, "grad_norm": 1.105790376663208, "learning_rate": 0.0002, "epoch": 5.682229460731089, "step": 7850}, {"loss": 1.0483, "grad_norm": 1.0915391445159912, "learning_rate": 0.0002, "epoch": 5.689467969598263, "step": 7860}, {"loss": 1.0291, "grad_norm": 0.8957812786102295, "learning_rate": 0.0002, "epoch": 5.696706478465436, "step": 7870}, {"loss": 0.9785, "grad_norm": 1.9189311265945435, "learning_rate": 0.0002, "epoch": 5.703944987332609, "step": 7880}, {"loss": 1.0076, "grad_norm": 1.0867321491241455, "learning_rate": 0.0002, "epoch": 5.711183496199783, "step": 7890}, {"loss": 1.0236, "grad_norm": 1.0233147144317627, "learning_rate": 0.0002, "epoch": 5.718422005066956, "step": 7900}, {"loss": 0.9872, "grad_norm": 1.16460382938385, "learning_rate": 0.0002, "epoch": 5.7256605139341294, "step": 7910}, {"loss": 1.0762, "grad_norm": 1.1098358631134033, "learning_rate": 0.0002, "epoch": 5.732899022801303, "step": 7920}, {"loss": 0.9937, "grad_norm": 0.8555701375007629, "learning_rate": 0.0002, "epoch": 5.740137531668476, "step": 7930}, {"loss": 1.0081, "grad_norm": 0.9885705709457397, "learning_rate": 0.0002, "epoch": 5.7473760405356495, "step": 7940}, {"loss": 0.9909, "grad_norm": 0.9184203147888184, "learning_rate": 0.0002, "epoch": 5.754614549402823, "step": 7950}, {"loss": 1.0767, "grad_norm": 0.9653698205947876, "learning_rate": 0.0002, "epoch": 5.761853058269996, "step": 7960}, {"loss": 0.9317, "grad_norm": 1.0014251470565796, "learning_rate": 0.0002, "epoch": 5.76909156713717, "step": 7970}, {"loss": 1.0271, "grad_norm": 1.004701018333435, "learning_rate": 0.0002, "epoch": 5.776330076004343, "step": 7980}, {"loss": 1.0397, "grad_norm": 0.950577974319458, "learning_rate": 0.0002, "epoch": 5.783568584871516, "step": 7990}, {"loss": 0.9725, "grad_norm": 1.2986834049224854, "learning_rate": 0.0002, "epoch": 5.79080709373869, "step": 8000}, {"loss": 1.039, "grad_norm": 1.3353424072265625, "learning_rate": 0.0002, "epoch": 5.798045602605863, "step": 8010}, {"loss": 1.0626, "grad_norm": 0.7650562524795532, "learning_rate": 0.0002, "epoch": 5.8052841114730365, "step": 8020}, {"loss": 1.0802, "grad_norm": 1.0156235694885254, "learning_rate": 0.0002, "epoch": 5.81252262034021, "step": 8030}, {"loss": 1.0185, "grad_norm": 1.3092900514602661, "learning_rate": 0.0002, "epoch": 5.819761129207383, "step": 8040}, {"loss": 0.9905, "grad_norm": 1.184428095817566, "learning_rate": 0.0002, "epoch": 5.826999638074557, "step": 8050}, {"loss": 1.0548, "grad_norm": 0.979401707649231, "learning_rate": 0.0002, "epoch": 5.83423814694173, "step": 8060}, {"loss": 0.9721, "grad_norm": 1.3557400703430176, "learning_rate": 0.0002, "epoch": 5.841476655808903, "step": 8070}, {"loss": 1.0235, "grad_norm": 0.8429333567619324, "learning_rate": 0.0002, "epoch": 5.848715164676077, "step": 8080}, {"loss": 0.952, "grad_norm": 1.3167692422866821, "learning_rate": 0.0002, "epoch": 5.85595367354325, "step": 8090}, {"loss": 0.9609, "grad_norm": 0.9750998020172119, "learning_rate": 0.0002, "epoch": 5.863192182410423, "step": 8100}, {"loss": 1.0789, "grad_norm": 1.1869813203811646, "learning_rate": 0.0002, "epoch": 5.870430691277597, "step": 8110}, {"loss": 1.0331, "grad_norm": 1.508615255355835, "learning_rate": 0.0002, "epoch": 5.87766920014477, "step": 8120}, {"loss": 1.0171, "grad_norm": 0.9439908266067505, "learning_rate": 0.0002, "epoch": 5.8849077090119435, "step": 8130}, {"loss": 0.9682, "grad_norm": 0.910508930683136, "learning_rate": 0.0002, "epoch": 5.892146217879117, "step": 8140}, {"loss": 1.0032, "grad_norm": 1.111501932144165, "learning_rate": 0.0002, "epoch": 5.89938472674629, "step": 8150}, {"loss": 1.0266, "grad_norm": 0.726554274559021, "learning_rate": 0.0002, "epoch": 5.906623235613464, "step": 8160}, {"loss": 1.0681, "grad_norm": 1.1084556579589844, "learning_rate": 0.0002, "epoch": 5.913861744480637, "step": 8170}, {"loss": 0.969, "grad_norm": 0.9695167541503906, "learning_rate": 0.0002, "epoch": 5.92110025334781, "step": 8180}, {"loss": 0.9858, "grad_norm": 1.1169592142105103, "learning_rate": 0.0002, "epoch": 5.928338762214984, "step": 8190}, {"loss": 1.0924, "grad_norm": 1.5116780996322632, "learning_rate": 0.0002, "epoch": 5.935577271082157, "step": 8200}, {"loss": 0.878, "grad_norm": 1.0073388814926147, "learning_rate": 0.0002, "epoch": 5.94281577994933, "step": 8210}, {"loss": 1.0462, "grad_norm": 0.9323263168334961, "learning_rate": 0.0002, "epoch": 5.950054288816504, "step": 8220}, {"loss": 1.0291, "grad_norm": 0.9422887563705444, "learning_rate": 0.0002, "epoch": 5.957292797683677, "step": 8230}, {"loss": 0.953, "grad_norm": 0.9691047668457031, "learning_rate": 0.0002, "epoch": 5.9645313065508505, "step": 8240}, {"loss": 0.9842, "grad_norm": 0.9650622606277466, "learning_rate": 0.0002, "epoch": 5.971769815418024, "step": 8250}, {"loss": 0.907, "grad_norm": 1.077958345413208, "learning_rate": 0.0002, "epoch": 5.979008324285197, "step": 8260}, {"loss": 0.9162, "grad_norm": 0.8946306109428406, "learning_rate": 0.0002, "epoch": 5.986246833152371, "step": 8270}, {"loss": 1.0439, "grad_norm": 1.34098219871521, "learning_rate": 0.0002, "epoch": 5.993485342019544, "step": 8280}]} +{"epoch": 6.999638074556641, "step": 9670, "epoch_duration": 1214.7583508491516, "total_accumulated_duration": 9110.399286270142, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3048.73388671875}, "peak_memory_usage": {"GPU_0": 15079.2998046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7061, "grad_norm": 1.2523442506790161, "learning_rate": 0.0002, "epoch": 0.007238508867173362, "step": 10}, {"loss": 3.3493, "grad_norm": 1.8887330293655396, "learning_rate": 0.0002, "epoch": 0.014477017734346724, "step": 20}, {"loss": 2.7585, "grad_norm": 0.9668035507202148, "learning_rate": 0.0002, "epoch": 0.021715526601520086, "step": 30}, {"loss": 2.3699, "grad_norm": 2.9167306423187256, "learning_rate": 0.0002, "epoch": 0.028954035468693448, "step": 40}, {"loss": 2.2679, "grad_norm": 2.649867296218872, "learning_rate": 0.0002, "epoch": 0.036192544335866814, "step": 50}, {"loss": 2.2202, "grad_norm": 1.5120655298233032, "learning_rate": 0.0002, "epoch": 0.04343105320304017, "step": 60}, {"loss": 2.2026, "grad_norm": 0.7879868149757385, "learning_rate": 0.0002, "epoch": 0.05066956207021354, "step": 70}, {"loss": 1.9447, "grad_norm": 0.7616953253746033, "learning_rate": 0.0002, "epoch": 0.057908070937386896, "step": 80}, {"loss": 2.0112, "grad_norm": 1.8809149265289307, "learning_rate": 0.0002, "epoch": 0.06514657980456026, "step": 90}, {"loss": 1.8337, "grad_norm": 0.9294016361236572, "learning_rate": 0.0002, "epoch": 0.07238508867173363, "step": 100}, {"loss": 1.8419, "grad_norm": 0.7145281434059143, "learning_rate": 0.0002, "epoch": 0.07962359753890698, "step": 110}, {"loss": 2.0036, "grad_norm": 0.7564446330070496, "learning_rate": 0.0002, "epoch": 0.08686210640608034, "step": 120}, {"loss": 1.9306, "grad_norm": 1.1681925058364868, "learning_rate": 0.0002, "epoch": 0.09410061527325371, "step": 130}, {"loss": 1.7875, "grad_norm": 0.6708641648292542, "learning_rate": 0.0002, "epoch": 0.10133912414042708, "step": 140}, {"loss": 1.786, "grad_norm": 0.7625647783279419, "learning_rate": 0.0002, "epoch": 0.10857763300760044, "step": 150}, {"loss": 1.6687, "grad_norm": 0.8463464975357056, "learning_rate": 0.0002, "epoch": 0.11581614187477379, "step": 160}, {"loss": 1.6214, "grad_norm": 0.7502335906028748, "learning_rate": 0.0002, "epoch": 0.12305465074194716, "step": 170}, {"loss": 1.7433, "grad_norm": 0.6929958462715149, "learning_rate": 0.0002, "epoch": 0.13029315960912052, "step": 180}, {"loss": 1.6009, "grad_norm": 0.6798707842826843, "learning_rate": 0.0002, "epoch": 0.1375316684762939, "step": 190}, {"loss": 1.6208, "grad_norm": 0.7566508650779724, "learning_rate": 0.0002, "epoch": 0.14477017734346725, "step": 200}, {"loss": 1.5823, "grad_norm": 0.7196869850158691, "learning_rate": 0.0002, "epoch": 0.15200868621064062, "step": 210}, {"loss": 1.738, "grad_norm": 0.8401045799255371, "learning_rate": 0.0002, "epoch": 0.15924719507781396, "step": 220}, {"loss": 1.7574, "grad_norm": 0.8503773212432861, "learning_rate": 0.0002, "epoch": 0.16648570394498732, "step": 230}, {"loss": 1.7861, "grad_norm": 0.7183733582496643, "learning_rate": 0.0002, "epoch": 0.1737242128121607, "step": 240}, {"loss": 1.6693, "grad_norm": 0.7082605957984924, "learning_rate": 0.0002, "epoch": 0.18096272167933405, "step": 250}, {"loss": 1.619, "grad_norm": 0.9386326670646667, "learning_rate": 0.0002, "epoch": 0.18820123054650742, "step": 260}, {"loss": 1.6511, "grad_norm": 0.7332451939582825, "learning_rate": 0.0002, "epoch": 0.19543973941368079, "step": 270}, {"loss": 1.6353, "grad_norm": 0.7092869877815247, "learning_rate": 0.0002, "epoch": 0.20267824828085415, "step": 280}, {"loss": 1.5996, "grad_norm": 0.7256413698196411, "learning_rate": 0.0002, "epoch": 0.20991675714802752, "step": 290}, {"loss": 1.6754, "grad_norm": 0.6398681402206421, "learning_rate": 0.0002, "epoch": 0.21715526601520088, "step": 300}, {"loss": 1.397, "grad_norm": 0.6273287534713745, "learning_rate": 0.0002, "epoch": 0.22439377488237422, "step": 310}, {"loss": 1.5115, "grad_norm": 0.511648416519165, "learning_rate": 0.0002, "epoch": 0.23163228374954759, "step": 320}, {"loss": 1.5424, "grad_norm": 0.8677352070808411, "learning_rate": 0.0002, "epoch": 0.23887079261672095, "step": 330}, {"loss": 1.6779, "grad_norm": 0.6270743012428284, "learning_rate": 0.0002, "epoch": 0.24610930148389432, "step": 340}, {"loss": 1.626, "grad_norm": 0.7980281114578247, "learning_rate": 0.0002, "epoch": 0.2533478103510677, "step": 350}, {"loss": 1.5238, "grad_norm": 0.632486879825592, "learning_rate": 0.0002, "epoch": 0.26058631921824105, "step": 360}, {"loss": 1.5175, "grad_norm": 0.6527034640312195, "learning_rate": 0.0002, "epoch": 0.2678248280854144, "step": 370}, {"loss": 1.627, "grad_norm": 0.7672118544578552, "learning_rate": 0.0002, "epoch": 0.2750633369525878, "step": 380}, {"loss": 1.5605, "grad_norm": 0.6035117506980896, "learning_rate": 0.0002, "epoch": 0.28230184581976114, "step": 390}, {"loss": 1.4603, "grad_norm": 0.5955103039741516, "learning_rate": 0.0002, "epoch": 0.2895403546869345, "step": 400}, {"loss": 1.558, "grad_norm": 0.6015191674232483, "learning_rate": 0.0002, "epoch": 0.2967788635541079, "step": 410}, {"loss": 1.6091, "grad_norm": 0.6380982398986816, "learning_rate": 0.0002, "epoch": 0.30401737242128124, "step": 420}, {"loss": 1.5292, "grad_norm": 0.6707863211631775, "learning_rate": 0.0002, "epoch": 0.3112558812884546, "step": 430}, {"loss": 1.4426, "grad_norm": 0.7010176777839661, "learning_rate": 0.0002, "epoch": 0.3184943901556279, "step": 440}, {"loss": 1.5572, "grad_norm": 0.8263739943504333, "learning_rate": 0.0002, "epoch": 0.3257328990228013, "step": 450}, {"loss": 1.5188, "grad_norm": 0.7253276109695435, "learning_rate": 0.0002, "epoch": 0.33297140788997465, "step": 460}, {"loss": 1.584, "grad_norm": 0.5238934755325317, "learning_rate": 0.0002, "epoch": 0.340209916757148, "step": 470}, {"loss": 1.7035, "grad_norm": 0.7869495749473572, "learning_rate": 0.0002, "epoch": 0.3474484256243214, "step": 480}, {"loss": 1.5776, "grad_norm": 0.7485215663909912, "learning_rate": 0.0002, "epoch": 0.35468693449149474, "step": 490}, {"loss": 1.6274, "grad_norm": 0.5413193106651306, "learning_rate": 0.0002, "epoch": 0.3619254433586681, "step": 500}, {"loss": 1.7323, "grad_norm": 0.7615048885345459, "learning_rate": 0.0002, "epoch": 0.3691639522258415, "step": 510}, {"loss": 1.532, "grad_norm": 0.7685340046882629, "learning_rate": 0.0002, "epoch": 0.37640246109301484, "step": 520}, {"loss": 1.6312, "grad_norm": 0.6379081010818481, "learning_rate": 0.0002, "epoch": 0.3836409699601882, "step": 530}, {"loss": 1.5645, "grad_norm": 0.7946939468383789, "learning_rate": 0.0002, "epoch": 0.39087947882736157, "step": 540}, {"loss": 1.4001, "grad_norm": 0.6287278532981873, "learning_rate": 0.0002, "epoch": 0.39811798769453494, "step": 550}, {"loss": 1.5982, "grad_norm": 0.6811642646789551, "learning_rate": 0.0002, "epoch": 0.4053564965617083, "step": 560}, {"loss": 1.4953, "grad_norm": 0.671073317527771, "learning_rate": 0.0002, "epoch": 0.41259500542888167, "step": 570}, {"loss": 1.6753, "grad_norm": 0.6313900351524353, "learning_rate": 0.0002, "epoch": 0.41983351429605503, "step": 580}, {"loss": 1.546, "grad_norm": 0.5291772484779358, "learning_rate": 0.0002, "epoch": 0.4270720231632284, "step": 590}, {"loss": 1.5441, "grad_norm": 0.62503582239151, "learning_rate": 0.0002, "epoch": 0.43431053203040176, "step": 600}, {"loss": 1.6276, "grad_norm": 0.5777305364608765, "learning_rate": 0.0002, "epoch": 0.4415490408975751, "step": 610}, {"loss": 1.4758, "grad_norm": 0.7013497352600098, "learning_rate": 0.0002, "epoch": 0.44878754976474844, "step": 620}, {"loss": 1.4029, "grad_norm": 0.8044822216033936, "learning_rate": 0.0002, "epoch": 0.4560260586319218, "step": 630}, {"loss": 1.7195, "grad_norm": 0.672531247138977, "learning_rate": 0.0002, "epoch": 0.46326456749909517, "step": 640}, {"loss": 1.614, "grad_norm": 0.6233910322189331, "learning_rate": 0.0002, "epoch": 0.47050307636626854, "step": 650}, {"loss": 1.6041, "grad_norm": 0.651524543762207, "learning_rate": 0.0002, "epoch": 0.4777415852334419, "step": 660}, {"loss": 1.5842, "grad_norm": 0.7213939428329468, "learning_rate": 0.0002, "epoch": 0.48498009410061527, "step": 670}, {"loss": 1.5453, "grad_norm": 0.6541454792022705, "learning_rate": 0.0002, "epoch": 0.49221860296778863, "step": 680}, {"loss": 1.662, "grad_norm": 0.6568936109542847, "learning_rate": 0.0002, "epoch": 0.499457111834962, "step": 690}, {"loss": 1.624, "grad_norm": 0.7176415324211121, "learning_rate": 0.0002, "epoch": 0.5066956207021354, "step": 700}, {"loss": 1.6099, "grad_norm": 0.6553855538368225, "learning_rate": 0.0002, "epoch": 0.5139341295693087, "step": 710}, {"loss": 1.5508, "grad_norm": 0.5654335618019104, "learning_rate": 0.0002, "epoch": 0.5211726384364821, "step": 720}, {"loss": 1.392, "grad_norm": 0.5671001672744751, "learning_rate": 0.0002, "epoch": 0.5284111473036555, "step": 730}, {"loss": 1.388, "grad_norm": 0.7914412021636963, "learning_rate": 0.0002, "epoch": 0.5356496561708288, "step": 740}, {"loss": 1.5931, "grad_norm": 0.6172138452529907, "learning_rate": 0.0002, "epoch": 0.5428881650380022, "step": 750}, {"loss": 1.4018, "grad_norm": 0.6132623553276062, "learning_rate": 0.0002, "epoch": 0.5501266739051756, "step": 760}, {"loss": 1.513, "grad_norm": 0.654000461101532, "learning_rate": 0.0002, "epoch": 0.5573651827723489, "step": 770}, {"loss": 1.5035, "grad_norm": 0.5691370964050293, "learning_rate": 0.0002, "epoch": 0.5646036916395223, "step": 780}, {"loss": 1.65, "grad_norm": 0.7922580242156982, "learning_rate": 0.0002, "epoch": 0.5718422005066957, "step": 790}, {"loss": 1.4521, "grad_norm": 0.6831880211830139, "learning_rate": 0.0002, "epoch": 0.579080709373869, "step": 800}, {"loss": 1.4734, "grad_norm": 0.6740124821662903, "learning_rate": 0.0002, "epoch": 0.5863192182410424, "step": 810}, {"loss": 1.6498, "grad_norm": 1.380016803741455, "learning_rate": 0.0002, "epoch": 0.5935577271082157, "step": 820}, {"loss": 1.4642, "grad_norm": 0.6552878022193909, "learning_rate": 0.0002, "epoch": 0.6007962359753891, "step": 830}, {"loss": 1.6271, "grad_norm": 0.6649535298347473, "learning_rate": 0.0002, "epoch": 0.6080347448425625, "step": 840}, {"loss": 1.5886, "grad_norm": 0.561738133430481, "learning_rate": 0.0002, "epoch": 0.6152732537097358, "step": 850}, {"loss": 1.5364, "grad_norm": 0.6133047938346863, "learning_rate": 0.0002, "epoch": 0.6225117625769092, "step": 860}, {"loss": 1.3489, "grad_norm": 0.559843122959137, "learning_rate": 0.0002, "epoch": 0.6297502714440825, "step": 870}, {"loss": 1.4878, "grad_norm": 0.6117811799049377, "learning_rate": 0.0002, "epoch": 0.6369887803112558, "step": 880}, {"loss": 1.56, "grad_norm": 0.6209776401519775, "learning_rate": 0.0002, "epoch": 0.6442272891784292, "step": 890}, {"loss": 1.6747, "grad_norm": 0.6234082579612732, "learning_rate": 0.0002, "epoch": 0.6514657980456026, "step": 900}, {"loss": 1.6963, "grad_norm": 0.7623258233070374, "learning_rate": 0.0002, "epoch": 0.6587043069127759, "step": 910}, {"loss": 1.2424, "grad_norm": 0.6148061752319336, "learning_rate": 0.0002, "epoch": 0.6659428157799493, "step": 920}, {"loss": 1.4319, "grad_norm": 0.6682973504066467, "learning_rate": 0.0002, "epoch": 0.6731813246471227, "step": 930}, {"loss": 1.5377, "grad_norm": 0.5513041615486145, "learning_rate": 0.0002, "epoch": 0.680419833514296, "step": 940}, {"loss": 1.3991, "grad_norm": 0.5197525024414062, "learning_rate": 0.0002, "epoch": 0.6876583423814694, "step": 950}, {"loss": 1.4398, "grad_norm": 0.6490758061408997, "learning_rate": 0.0002, "epoch": 0.6948968512486428, "step": 960}, {"loss": 1.5251, "grad_norm": 0.6450682878494263, "learning_rate": 0.0002, "epoch": 0.7021353601158161, "step": 970}, {"loss": 1.5417, "grad_norm": 0.6203766465187073, "learning_rate": 0.0002, "epoch": 0.7093738689829895, "step": 980}, {"loss": 1.4575, "grad_norm": 0.6023609638214111, "learning_rate": 0.0002, "epoch": 0.7166123778501629, "step": 990}, {"loss": 1.4973, "grad_norm": 0.5765255093574524, "learning_rate": 0.0002, "epoch": 0.7238508867173362, "step": 1000}, {"loss": 1.483, "grad_norm": 0.6650075316429138, "learning_rate": 0.0002, "epoch": 0.7310893955845096, "step": 1010}, {"loss": 1.5959, "grad_norm": 0.5610854029655457, "learning_rate": 0.0002, "epoch": 0.738327904451683, "step": 1020}, {"loss": 1.5248, "grad_norm": 0.7072813510894775, "learning_rate": 0.0002, "epoch": 0.7455664133188563, "step": 1030}, {"loss": 1.5776, "grad_norm": 0.6815407872200012, "learning_rate": 0.0002, "epoch": 0.7528049221860297, "step": 1040}, {"loss": 1.4577, "grad_norm": 0.7932390570640564, "learning_rate": 0.0002, "epoch": 0.760043431053203, "step": 1050}, {"loss": 1.4515, "grad_norm": 0.5798183083534241, "learning_rate": 0.0002, "epoch": 0.7672819399203764, "step": 1060}, {"loss": 1.5053, "grad_norm": 0.7898504137992859, "learning_rate": 0.0002, "epoch": 0.7745204487875498, "step": 1070}, {"loss": 1.4776, "grad_norm": 0.4983280301094055, "learning_rate": 0.0002, "epoch": 0.7817589576547231, "step": 1080}, {"loss": 1.5007, "grad_norm": 0.691403329372406, "learning_rate": 0.0002, "epoch": 0.7889974665218965, "step": 1090}, {"loss": 1.5153, "grad_norm": 0.5394481420516968, "learning_rate": 0.0002, "epoch": 0.7962359753890699, "step": 1100}, {"loss": 1.6892, "grad_norm": 0.5136822462081909, "learning_rate": 0.0002, "epoch": 0.8034744842562432, "step": 1110}, {"loss": 1.4902, "grad_norm": 0.6828126907348633, "learning_rate": 0.0002, "epoch": 0.8107129931234166, "step": 1120}, {"loss": 1.4346, "grad_norm": 0.6799656748771667, "learning_rate": 0.0002, "epoch": 0.81795150199059, "step": 1130}, {"loss": 1.2678, "grad_norm": 0.5428406000137329, "learning_rate": 0.0002, "epoch": 0.8251900108577633, "step": 1140}, {"loss": 1.4072, "grad_norm": 0.4811290502548218, "learning_rate": 0.0002, "epoch": 0.8324285197249367, "step": 1150}, {"loss": 1.4512, "grad_norm": 0.5519434809684753, "learning_rate": 0.0002, "epoch": 0.8396670285921101, "step": 1160}, {"loss": 1.4072, "grad_norm": 0.9748060703277588, "learning_rate": 0.0002, "epoch": 0.8469055374592834, "step": 1170}, {"loss": 1.4309, "grad_norm": 0.712609589099884, "learning_rate": 0.0002, "epoch": 0.8541440463264568, "step": 1180}, {"loss": 1.434, "grad_norm": 0.6866157054901123, "learning_rate": 0.0002, "epoch": 0.8613825551936302, "step": 1190}, {"loss": 1.3704, "grad_norm": 0.5068854093551636, "learning_rate": 0.0002, "epoch": 0.8686210640608035, "step": 1200}, {"loss": 1.5601, "grad_norm": 0.6333245038986206, "learning_rate": 0.0002, "epoch": 0.8758595729279768, "step": 1210}, {"loss": 1.4636, "grad_norm": 0.6424421072006226, "learning_rate": 0.0002, "epoch": 0.8830980817951501, "step": 1220}, {"loss": 1.4186, "grad_norm": 0.4771921932697296, "learning_rate": 0.0002, "epoch": 0.8903365906623235, "step": 1230}, {"loss": 1.6323, "grad_norm": 0.5191764235496521, "learning_rate": 0.0002, "epoch": 0.8975750995294969, "step": 1240}, {"loss": 1.6105, "grad_norm": 0.756222128868103, "learning_rate": 0.0002, "epoch": 0.9048136083966702, "step": 1250}, {"loss": 1.4396, "grad_norm": 0.623823881149292, "learning_rate": 0.0002, "epoch": 0.9120521172638436, "step": 1260}, {"loss": 1.3097, "grad_norm": 0.8166571259498596, "learning_rate": 0.0002, "epoch": 0.919290626131017, "step": 1270}, {"loss": 1.4625, "grad_norm": 0.6059346795082092, "learning_rate": 0.0002, "epoch": 0.9265291349981903, "step": 1280}, {"loss": 1.3555, "grad_norm": 0.5842690467834473, "learning_rate": 0.0002, "epoch": 0.9337676438653637, "step": 1290}, {"loss": 1.5859, "grad_norm": 0.7649800777435303, "learning_rate": 0.0002, "epoch": 0.9410061527325371, "step": 1300}, {"loss": 1.5915, "grad_norm": 0.6420919895172119, "learning_rate": 0.0002, "epoch": 0.9482446615997104, "step": 1310}, {"loss": 1.453, "grad_norm": 0.7011452913284302, "learning_rate": 0.0002, "epoch": 0.9554831704668838, "step": 1320}, {"loss": 1.6766, "grad_norm": 0.5783746242523193, "learning_rate": 0.0002, "epoch": 0.9627216793340572, "step": 1330}, {"loss": 1.6308, "grad_norm": 0.5973192453384399, "learning_rate": 0.0002, "epoch": 0.9699601882012305, "step": 1340}, {"loss": 1.5901, "grad_norm": 0.6181833744049072, "learning_rate": 0.0002, "epoch": 0.9771986970684039, "step": 1350}, {"loss": 1.5258, "grad_norm": 0.5563396215438843, "learning_rate": 0.0002, "epoch": 0.9844372059355773, "step": 1360}, {"loss": 1.4508, "grad_norm": 0.45723360776901245, "learning_rate": 0.0002, "epoch": 0.9916757148027506, "step": 1370}, {"loss": 1.3291, "grad_norm": 0.5947498679161072, "learning_rate": 0.0002, "epoch": 0.998914223669924, "step": 1380}, {"eval_loss": 1.480796456336975, "eval_runtime": 27.3103, "eval_samples_per_second": 15.965, "eval_steps_per_second": 2.014, "epoch": 0.9996380745566413, "step": 1381}, {"loss": 1.3057, "grad_norm": 0.5599952936172485, "learning_rate": 0.0002, "epoch": 1.0061527325370974, "step": 1390}, {"loss": 1.4991, "grad_norm": 0.5932008028030396, "learning_rate": 0.0002, "epoch": 1.0133912414042707, "step": 1400}, {"loss": 1.4506, "grad_norm": 0.6194121837615967, "learning_rate": 0.0002, "epoch": 1.020629750271444, "step": 1410}, {"loss": 1.5966, "grad_norm": 0.6995621919631958, "learning_rate": 0.0002, "epoch": 1.0278682591386175, "step": 1420}, {"loss": 1.4153, "grad_norm": 0.7905810475349426, "learning_rate": 0.0002, "epoch": 1.0351067680057908, "step": 1430}, {"loss": 1.4414, "grad_norm": 0.7221615314483643, "learning_rate": 0.0002, "epoch": 1.0423452768729642, "step": 1440}, {"loss": 1.3859, "grad_norm": 0.6170642375946045, "learning_rate": 0.0002, "epoch": 1.0495837857401376, "step": 1450}, {"loss": 1.3806, "grad_norm": 0.5844094753265381, "learning_rate": 0.0002, "epoch": 1.056822294607311, "step": 1460}, {"loss": 1.4871, "grad_norm": 0.7731822729110718, "learning_rate": 0.0002, "epoch": 1.0640608034744843, "step": 1470}, {"loss": 1.4286, "grad_norm": 0.4554748237133026, "learning_rate": 0.0002, "epoch": 1.0712993123416577, "step": 1480}, {"loss": 1.3977, "grad_norm": 0.6923259496688843, "learning_rate": 0.0002, "epoch": 1.078537821208831, "step": 1490}, {"loss": 1.3936, "grad_norm": 0.6008219122886658, "learning_rate": 0.0002, "epoch": 1.0857763300760044, "step": 1500}, {"loss": 1.4821, "grad_norm": 0.6450045704841614, "learning_rate": 0.0002, "epoch": 1.0930148389431777, "step": 1510}, {"loss": 1.3295, "grad_norm": 0.7833753824234009, "learning_rate": 0.0002, "epoch": 1.1002533478103511, "step": 1520}, {"loss": 1.3424, "grad_norm": 0.5076758861541748, "learning_rate": 0.0002, "epoch": 1.1074918566775245, "step": 1530}, {"loss": 1.4043, "grad_norm": 0.5661332011222839, "learning_rate": 0.0002, "epoch": 1.1147303655446978, "step": 1540}, {"loss": 1.4963, "grad_norm": 0.6526919603347778, "learning_rate": 0.0002, "epoch": 1.1219688744118712, "step": 1550}, {"loss": 1.3671, "grad_norm": 0.5613082647323608, "learning_rate": 0.0002, "epoch": 1.1292073832790446, "step": 1560}, {"loss": 1.4458, "grad_norm": 0.6113885641098022, "learning_rate": 0.0002, "epoch": 1.136445892146218, "step": 1570}, {"loss": 1.3552, "grad_norm": 0.6732510328292847, "learning_rate": 0.0002, "epoch": 1.1436844010133913, "step": 1580}, {"loss": 1.3114, "grad_norm": 0.6146392226219177, "learning_rate": 0.0002, "epoch": 1.1509229098805647, "step": 1590}, {"loss": 1.411, "grad_norm": 0.6766974329948425, "learning_rate": 0.0002, "epoch": 1.158161418747738, "step": 1600}, {"loss": 1.2401, "grad_norm": 0.7621957659721375, "learning_rate": 0.0002, "epoch": 1.1653999276149114, "step": 1610}, {"loss": 1.3758, "grad_norm": 0.6959581971168518, "learning_rate": 0.0002, "epoch": 1.1726384364820848, "step": 1620}, {"loss": 1.382, "grad_norm": 0.6691278219223022, "learning_rate": 0.0002, "epoch": 1.1798769453492581, "step": 1630}, {"loss": 1.4147, "grad_norm": 0.4927774965763092, "learning_rate": 0.0002, "epoch": 1.1871154542164315, "step": 1640}, {"loss": 1.449, "grad_norm": 0.7724234461784363, "learning_rate": 0.0002, "epoch": 1.1943539630836049, "step": 1650}, {"loss": 1.4778, "grad_norm": 0.6817787885665894, "learning_rate": 0.0002, "epoch": 1.2015924719507782, "step": 1660}, {"loss": 1.3776, "grad_norm": 0.6500699520111084, "learning_rate": 0.0002, "epoch": 1.2088309808179516, "step": 1670}, {"loss": 1.3875, "grad_norm": 0.5703568458557129, "learning_rate": 0.0002, "epoch": 1.216069489685125, "step": 1680}, {"loss": 1.4735, "grad_norm": 0.6261579990386963, "learning_rate": 0.0002, "epoch": 1.2233079985522983, "step": 1690}, {"loss": 1.3898, "grad_norm": 0.651713490486145, "learning_rate": 0.0002, "epoch": 1.2305465074194717, "step": 1700}, {"loss": 1.4002, "grad_norm": 0.684399425983429, "learning_rate": 0.0002, "epoch": 1.237785016286645, "step": 1710}, {"loss": 1.5027, "grad_norm": 0.6996857523918152, "learning_rate": 0.0002, "epoch": 1.2450235251538184, "step": 1720}, {"loss": 1.3326, "grad_norm": 0.7102537751197815, "learning_rate": 0.0002, "epoch": 1.2522620340209918, "step": 1730}, {"loss": 1.3675, "grad_norm": 0.45809897780418396, "learning_rate": 0.0002, "epoch": 1.2595005428881652, "step": 1740}, {"loss": 1.4175, "grad_norm": 0.6377046704292297, "learning_rate": 0.0002, "epoch": 1.2667390517553385, "step": 1750}, {"loss": 1.3479, "grad_norm": 0.6965704560279846, "learning_rate": 0.0002, "epoch": 1.2739775606225119, "step": 1760}, {"loss": 1.5647, "grad_norm": 0.5688214302062988, "learning_rate": 0.0002, "epoch": 1.2812160694896852, "step": 1770}, {"loss": 1.3967, "grad_norm": 0.6384190320968628, "learning_rate": 0.0002, "epoch": 1.2884545783568586, "step": 1780}, {"loss": 1.3671, "grad_norm": 0.5629363656044006, "learning_rate": 0.0002, "epoch": 1.295693087224032, "step": 1790}, {"loss": 1.2292, "grad_norm": 0.6148255467414856, "learning_rate": 0.0002, "epoch": 1.3029315960912053, "step": 1800}, {"loss": 1.5806, "grad_norm": 0.655580997467041, "learning_rate": 0.0002, "epoch": 1.3101701049583787, "step": 1810}, {"loss": 1.2398, "grad_norm": 0.5642657279968262, "learning_rate": 0.0002, "epoch": 1.3174086138255519, "step": 1820}, {"loss": 1.3246, "grad_norm": 0.59607994556427, "learning_rate": 0.0002, "epoch": 1.3246471226927252, "step": 1830}, {"loss": 1.3274, "grad_norm": 0.5564199090003967, "learning_rate": 0.0002, "epoch": 1.3318856315598986, "step": 1840}, {"loss": 1.5834, "grad_norm": 0.6949955821037292, "learning_rate": 0.0002, "epoch": 1.339124140427072, "step": 1850}, {"loss": 1.4722, "grad_norm": 0.7036856412887573, "learning_rate": 0.0002, "epoch": 1.3463626492942453, "step": 1860}, {"loss": 1.333, "grad_norm": 0.722062885761261, "learning_rate": 0.0002, "epoch": 1.3536011581614187, "step": 1870}, {"loss": 1.4044, "grad_norm": 0.6098677515983582, "learning_rate": 0.0002, "epoch": 1.360839667028592, "step": 1880}, {"loss": 1.6217, "grad_norm": 0.5376402735710144, "learning_rate": 0.0002, "epoch": 1.3680781758957654, "step": 1890}, {"loss": 1.5071, "grad_norm": 0.6974610090255737, "learning_rate": 0.0002, "epoch": 1.3753166847629388, "step": 1900}, {"loss": 1.5854, "grad_norm": 0.6520763635635376, "learning_rate": 0.0002, "epoch": 1.3825551936301121, "step": 1910}, {"loss": 1.4271, "grad_norm": 0.6604374647140503, "learning_rate": 0.0002, "epoch": 1.3897937024972855, "step": 1920}, {"loss": 1.419, "grad_norm": 0.7364398241043091, "learning_rate": 0.0002, "epoch": 1.3970322113644589, "step": 1930}, {"loss": 1.4585, "grad_norm": 0.6849475502967834, "learning_rate": 0.0002, "epoch": 1.4042707202316322, "step": 1940}, {"loss": 1.5577, "grad_norm": 0.6562670469284058, "learning_rate": 0.0002, "epoch": 1.4115092290988056, "step": 1950}, {"loss": 1.4725, "grad_norm": 0.5695616006851196, "learning_rate": 0.0002, "epoch": 1.418747737965979, "step": 1960}, {"loss": 1.3088, "grad_norm": 0.5244464874267578, "learning_rate": 0.0002, "epoch": 1.4259862468331523, "step": 1970}, {"loss": 1.5069, "grad_norm": 0.6347293257713318, "learning_rate": 0.0002, "epoch": 1.4332247557003257, "step": 1980}, {"loss": 1.3502, "grad_norm": 0.5528361201286316, "learning_rate": 0.0002, "epoch": 1.440463264567499, "step": 1990}, {"loss": 1.3978, "grad_norm": 0.6987585425376892, "learning_rate": 0.0002, "epoch": 1.4477017734346724, "step": 2000}, {"loss": 1.4262, "grad_norm": 0.6568987369537354, "learning_rate": 0.0002, "epoch": 1.4549402823018458, "step": 2010}, {"loss": 1.4175, "grad_norm": 0.7665994763374329, "learning_rate": 0.0002, "epoch": 1.4621787911690192, "step": 2020}, {"loss": 1.244, "grad_norm": 0.5127707123756409, "learning_rate": 0.0002, "epoch": 1.4694173000361925, "step": 2030}, {"loss": 1.3699, "grad_norm": 0.5406824946403503, "learning_rate": 0.0002, "epoch": 1.476655808903366, "step": 2040}, {"loss": 1.3353, "grad_norm": 0.5990166664123535, "learning_rate": 0.0002, "epoch": 1.4838943177705393, "step": 2050}, {"loss": 1.2454, "grad_norm": 0.6186193823814392, "learning_rate": 0.0002, "epoch": 1.4911328266377126, "step": 2060}, {"loss": 1.428, "grad_norm": 0.6154307126998901, "learning_rate": 0.0002, "epoch": 1.498371335504886, "step": 2070}, {"loss": 1.4528, "grad_norm": 0.5606056451797485, "learning_rate": 0.0002, "epoch": 1.5056098443720594, "step": 2080}, {"loss": 1.2405, "grad_norm": 0.5006417036056519, "learning_rate": 0.0002, "epoch": 1.5128483532392327, "step": 2090}, {"loss": 1.4258, "grad_norm": 0.5968486070632935, "learning_rate": 0.0002, "epoch": 1.520086862106406, "step": 2100}, {"loss": 1.2752, "grad_norm": 0.5835496187210083, "learning_rate": 0.0002, "epoch": 1.5273253709735795, "step": 2110}, {"loss": 1.5443, "grad_norm": 0.6753535270690918, "learning_rate": 0.0002, "epoch": 1.5345638798407528, "step": 2120}, {"loss": 1.2139, "grad_norm": 0.7299720644950867, "learning_rate": 0.0002, "epoch": 1.5418023887079262, "step": 2130}, {"loss": 1.2364, "grad_norm": 0.5105988383293152, "learning_rate": 0.0002, "epoch": 1.5490408975750996, "step": 2140}, {"loss": 1.4528, "grad_norm": 0.5675431489944458, "learning_rate": 0.0002, "epoch": 1.556279406442273, "step": 2150}, {"loss": 1.4563, "grad_norm": 0.6246723532676697, "learning_rate": 0.0002, "epoch": 1.5635179153094463, "step": 2160}, {"loss": 1.5255, "grad_norm": 0.7291720509529114, "learning_rate": 0.0002, "epoch": 1.5707564241766196, "step": 2170}, {"loss": 1.5432, "grad_norm": 0.678114116191864, "learning_rate": 0.0002, "epoch": 1.577994933043793, "step": 2180}, {"loss": 1.5212, "grad_norm": 0.5136260986328125, "learning_rate": 0.0002, "epoch": 1.5852334419109664, "step": 2190}, {"loss": 1.3271, "grad_norm": 0.6359935998916626, "learning_rate": 0.0002, "epoch": 1.5924719507781397, "step": 2200}, {"loss": 1.4038, "grad_norm": 0.7650278806686401, "learning_rate": 0.0002, "epoch": 1.599710459645313, "step": 2210}, {"loss": 1.5478, "grad_norm": 0.7256110906600952, "learning_rate": 0.0002, "epoch": 1.6069489685124865, "step": 2220}, {"loss": 1.4387, "grad_norm": 0.688689649105072, "learning_rate": 0.0002, "epoch": 1.6141874773796598, "step": 2230}, {"loss": 1.4096, "grad_norm": 0.6045311093330383, "learning_rate": 0.0002, "epoch": 1.6214259862468332, "step": 2240}, {"loss": 1.4097, "grad_norm": 0.7064604163169861, "learning_rate": 0.0002, "epoch": 1.6286644951140063, "step": 2250}, {"loss": 1.3477, "grad_norm": 0.5309562087059021, "learning_rate": 0.0002, "epoch": 1.6359030039811797, "step": 2260}, {"loss": 1.4022, "grad_norm": 0.5687053203582764, "learning_rate": 0.0002, "epoch": 1.643141512848353, "step": 2270}, {"loss": 1.2977, "grad_norm": 0.535872757434845, "learning_rate": 0.0002, "epoch": 1.6503800217155264, "step": 2280}, {"loss": 1.3844, "grad_norm": 0.5502381920814514, "learning_rate": 0.0002, "epoch": 1.6576185305826998, "step": 2290}, {"loss": 1.3764, "grad_norm": 0.6158602237701416, "learning_rate": 0.0002, "epoch": 1.6648570394498732, "step": 2300}, {"loss": 1.3515, "grad_norm": 0.5804675817489624, "learning_rate": 0.0002, "epoch": 1.6720955483170465, "step": 2310}, {"loss": 1.2532, "grad_norm": 0.600742757320404, "learning_rate": 0.0002, "epoch": 1.67933405718422, "step": 2320}, {"loss": 1.477, "grad_norm": 0.7101941108703613, "learning_rate": 0.0002, "epoch": 1.6865725660513933, "step": 2330}, {"loss": 1.4849, "grad_norm": 0.7507809996604919, "learning_rate": 0.0002, "epoch": 1.6938110749185666, "step": 2340}, {"loss": 1.2703, "grad_norm": 0.768502414226532, "learning_rate": 0.0002, "epoch": 1.70104958378574, "step": 2350}, {"loss": 1.3332, "grad_norm": 0.4801851212978363, "learning_rate": 0.0002, "epoch": 1.7082880926529134, "step": 2360}, {"loss": 1.4158, "grad_norm": 0.5322122573852539, "learning_rate": 0.0002, "epoch": 1.7155266015200867, "step": 2370}, {"loss": 1.4136, "grad_norm": 0.587661862373352, "learning_rate": 0.0002, "epoch": 1.72276511038726, "step": 2380}, {"loss": 1.3771, "grad_norm": 0.6073525547981262, "learning_rate": 0.0002, "epoch": 1.7300036192544335, "step": 2390}, {"loss": 1.2754, "grad_norm": 0.6950460076332092, "learning_rate": 0.0002, "epoch": 1.7372421281216068, "step": 2400}, {"loss": 1.3858, "grad_norm": 0.5981102585792542, "learning_rate": 0.0002, "epoch": 1.7444806369887802, "step": 2410}, {"loss": 1.4075, "grad_norm": 0.544570803642273, "learning_rate": 0.0002, "epoch": 1.7517191458559536, "step": 2420}, {"loss": 1.3861, "grad_norm": 0.5304399728775024, "learning_rate": 0.0002, "epoch": 1.758957654723127, "step": 2430}, {"loss": 1.4244, "grad_norm": 0.7921594977378845, "learning_rate": 0.0002, "epoch": 1.7661961635903003, "step": 2440}, {"loss": 1.3053, "grad_norm": 0.6084808707237244, "learning_rate": 0.0002, "epoch": 1.7734346724574737, "step": 2450}, {"loss": 1.3781, "grad_norm": 0.8844701051712036, "learning_rate": 0.0002, "epoch": 1.780673181324647, "step": 2460}, {"loss": 1.3227, "grad_norm": 0.5729258060455322, "learning_rate": 0.0002, "epoch": 1.7879116901918204, "step": 2470}, {"loss": 1.3422, "grad_norm": 0.6303611993789673, "learning_rate": 0.0002, "epoch": 1.7951501990589938, "step": 2480}, {"loss": 1.3926, "grad_norm": 0.5627942085266113, "learning_rate": 0.0002, "epoch": 1.8023887079261671, "step": 2490}, {"loss": 1.3816, "grad_norm": 0.6724274158477783, "learning_rate": 0.0002, "epoch": 1.8096272167933405, "step": 2500}, {"loss": 1.2951, "grad_norm": 0.5030826330184937, "learning_rate": 0.0002, "epoch": 1.8168657256605139, "step": 2510}, {"loss": 1.2839, "grad_norm": 0.5504099130630493, "learning_rate": 0.0002, "epoch": 1.8241042345276872, "step": 2520}, {"loss": 1.4264, "grad_norm": 0.6338945627212524, "learning_rate": 0.0002, "epoch": 1.8313427433948606, "step": 2530}, {"loss": 1.563, "grad_norm": 0.5902037620544434, "learning_rate": 0.0002, "epoch": 1.838581252262034, "step": 2540}, {"loss": 1.2961, "grad_norm": 0.48814457654953003, "learning_rate": 0.0002, "epoch": 1.8458197611292073, "step": 2550}, {"loss": 1.466, "grad_norm": 0.6216312646865845, "learning_rate": 0.0002, "epoch": 1.8530582699963807, "step": 2560}, {"loss": 1.5123, "grad_norm": 0.635603666305542, "learning_rate": 0.0002, "epoch": 1.860296778863554, "step": 2570}, {"loss": 1.372, "grad_norm": 0.6938216090202332, "learning_rate": 0.0002, "epoch": 1.8675352877307274, "step": 2580}, {"loss": 1.5011, "grad_norm": 0.599557638168335, "learning_rate": 0.0002, "epoch": 1.8747737965979008, "step": 2590}, {"loss": 1.2714, "grad_norm": 0.564424455165863, "learning_rate": 0.0002, "epoch": 1.8820123054650741, "step": 2600}, {"loss": 1.3403, "grad_norm": 0.5430700182914734, "learning_rate": 0.0002, "epoch": 1.8892508143322475, "step": 2610}, {"loss": 1.4347, "grad_norm": 0.6150169372558594, "learning_rate": 0.0002, "epoch": 1.8964893231994209, "step": 2620}, {"loss": 1.2474, "grad_norm": 0.48159119486808777, "learning_rate": 0.0002, "epoch": 1.9037278320665942, "step": 2630}, {"loss": 1.3716, "grad_norm": 0.5608997941017151, "learning_rate": 0.0002, "epoch": 1.9109663409337676, "step": 2640}, {"loss": 1.5787, "grad_norm": 0.6454501748085022, "learning_rate": 0.0002, "epoch": 1.918204849800941, "step": 2650}, {"loss": 1.3238, "grad_norm": 0.5458073616027832, "learning_rate": 0.0002, "epoch": 1.9254433586681143, "step": 2660}, {"loss": 1.3208, "grad_norm": 0.5328490734100342, "learning_rate": 0.0002, "epoch": 1.9326818675352877, "step": 2670}, {"loss": 1.4971, "grad_norm": 0.6444696187973022, "learning_rate": 0.0002, "epoch": 1.939920376402461, "step": 2680}, {"loss": 1.5387, "grad_norm": 0.7126023769378662, "learning_rate": 0.0002, "epoch": 1.9471588852696344, "step": 2690}, {"loss": 1.3637, "grad_norm": 0.5164045095443726, "learning_rate": 0.0002, "epoch": 1.9543973941368078, "step": 2700}, {"loss": 1.5303, "grad_norm": 0.5347061157226562, "learning_rate": 0.0002, "epoch": 1.9616359030039812, "step": 2710}, {"loss": 1.2815, "grad_norm": 0.5297950506210327, "learning_rate": 0.0002, "epoch": 1.9688744118711545, "step": 2720}, {"loss": 1.3566, "grad_norm": 0.6537790298461914, "learning_rate": 0.0002, "epoch": 1.976112920738328, "step": 2730}, {"loss": 1.332, "grad_norm": 0.5536222457885742, "learning_rate": 0.0002, "epoch": 1.9833514296055013, "step": 2740}, {"loss": 1.3333, "grad_norm": 0.4856105446815491, "learning_rate": 0.0002, "epoch": 1.9905899384726746, "step": 2750}, {"loss": 1.3521, "grad_norm": 0.6642730832099915, "learning_rate": 0.0002, "epoch": 1.997828447339848, "step": 2760}, {"eval_loss": 1.4366681575775146, "eval_runtime": 27.3729, "eval_samples_per_second": 15.928, "eval_steps_per_second": 2.009, "epoch": 2.0, "step": 2763}, {"loss": 1.4322, "grad_norm": 0.740253210067749, "learning_rate": 0.0002, "epoch": 2.0050669562070214, "step": 2770}, {"loss": 1.277, "grad_norm": 0.5826276540756226, "learning_rate": 0.0002, "epoch": 2.0123054650741947, "step": 2780}, {"loss": 1.2424, "grad_norm": 0.607356071472168, "learning_rate": 0.0002, "epoch": 2.019543973941368, "step": 2790}, {"loss": 1.2601, "grad_norm": 0.5918063521385193, "learning_rate": 0.0002, "epoch": 2.0267824828085415, "step": 2800}, {"loss": 1.3715, "grad_norm": 0.5610089898109436, "learning_rate": 0.0002, "epoch": 2.034020991675715, "step": 2810}, {"loss": 1.2092, "grad_norm": 0.5869926810264587, "learning_rate": 0.0002, "epoch": 2.041259500542888, "step": 2820}, {"loss": 1.1929, "grad_norm": 0.5753467679023743, "learning_rate": 0.0002, "epoch": 2.0484980094100615, "step": 2830}, {"loss": 1.333, "grad_norm": 0.7096508145332336, "learning_rate": 0.0002, "epoch": 2.055736518277235, "step": 2840}, {"loss": 1.1766, "grad_norm": 0.7653635144233704, "learning_rate": 0.0002, "epoch": 2.0629750271444083, "step": 2850}, {"loss": 1.2331, "grad_norm": 0.6202841997146606, "learning_rate": 0.0002, "epoch": 2.0702135360115816, "step": 2860}, {"loss": 1.3298, "grad_norm": 0.6810227632522583, "learning_rate": 0.0002, "epoch": 2.077452044878755, "step": 2870}, {"loss": 1.2505, "grad_norm": 0.7481493353843689, "learning_rate": 0.0002, "epoch": 2.0846905537459284, "step": 2880}, {"loss": 1.2484, "grad_norm": 0.7089637517929077, "learning_rate": 0.0002, "epoch": 2.0919290626131017, "step": 2890}, {"loss": 1.3095, "grad_norm": 0.7472923398017883, "learning_rate": 0.0002, "epoch": 2.099167571480275, "step": 2900}, {"loss": 1.304, "grad_norm": 0.8135465979576111, "learning_rate": 0.0002, "epoch": 2.1064060803474485, "step": 2910}, {"loss": 1.273, "grad_norm": 0.6097133159637451, "learning_rate": 0.0002, "epoch": 2.113644589214622, "step": 2920}, {"loss": 1.3384, "grad_norm": 0.5970117449760437, "learning_rate": 0.0002, "epoch": 2.120883098081795, "step": 2930}, {"loss": 1.3233, "grad_norm": 0.6169309616088867, "learning_rate": 0.0002, "epoch": 2.1281216069489686, "step": 2940}, {"loss": 1.4246, "grad_norm": 0.9428738355636597, "learning_rate": 0.0002, "epoch": 2.135360115816142, "step": 2950}, {"loss": 1.3527, "grad_norm": 0.5671679973602295, "learning_rate": 0.0002, "epoch": 2.1425986246833153, "step": 2960}, {"loss": 1.1375, "grad_norm": 0.7007262110710144, "learning_rate": 0.0002, "epoch": 2.1498371335504887, "step": 2970}, {"loss": 1.2015, "grad_norm": 0.6294044256210327, "learning_rate": 0.0002, "epoch": 2.157075642417662, "step": 2980}, {"loss": 1.2167, "grad_norm": 0.6105241775512695, "learning_rate": 0.0002, "epoch": 2.1643141512848354, "step": 2990}, {"loss": 1.2065, "grad_norm": 0.557124137878418, "learning_rate": 0.0002, "epoch": 2.1715526601520088, "step": 3000}, {"loss": 1.2515, "grad_norm": 0.6250392198562622, "learning_rate": 0.0002, "epoch": 2.178791169019182, "step": 3010}, {"loss": 1.385, "grad_norm": 0.645218551158905, "learning_rate": 0.0002, "epoch": 2.1860296778863555, "step": 3020}, {"loss": 1.3928, "grad_norm": 0.9033605456352234, "learning_rate": 0.0002, "epoch": 2.193268186753529, "step": 3030}, {"loss": 1.2458, "grad_norm": 0.5325747132301331, "learning_rate": 0.0002, "epoch": 2.2005066956207022, "step": 3040}, {"loss": 1.261, "grad_norm": 0.6334700584411621, "learning_rate": 0.0002, "epoch": 2.2077452044878756, "step": 3050}, {"loss": 1.2385, "grad_norm": 0.5206325054168701, "learning_rate": 0.0002, "epoch": 2.214983713355049, "step": 3060}, {"loss": 1.3103, "grad_norm": 0.5987200140953064, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3070}, {"loss": 1.1756, "grad_norm": 0.5893264412879944, "learning_rate": 0.0002, "epoch": 2.2294607310893957, "step": 3080}, {"loss": 1.235, "grad_norm": 0.6869237422943115, "learning_rate": 0.0002, "epoch": 2.236699239956569, "step": 3090}, {"loss": 1.3285, "grad_norm": 0.5040048360824585, "learning_rate": 0.0002, "epoch": 2.2439377488237424, "step": 3100}, {"loss": 1.3316, "grad_norm": 0.6660613417625427, "learning_rate": 0.0002, "epoch": 2.251176257690916, "step": 3110}, {"loss": 1.3108, "grad_norm": 0.5890918970108032, "learning_rate": 0.0002, "epoch": 2.258414766558089, "step": 3120}, {"loss": 1.248, "grad_norm": 0.6458896994590759, "learning_rate": 0.0002, "epoch": 2.2656532754252625, "step": 3130}, {"loss": 1.4151, "grad_norm": 0.6832690834999084, "learning_rate": 0.0002, "epoch": 2.272891784292436, "step": 3140}, {"loss": 1.4458, "grad_norm": 0.833908200263977, "learning_rate": 0.0002, "epoch": 2.2801302931596092, "step": 3150}, {"loss": 1.2931, "grad_norm": 0.4596034586429596, "learning_rate": 0.0002, "epoch": 2.2873688020267826, "step": 3160}, {"loss": 1.449, "grad_norm": 0.9130966067314148, "learning_rate": 0.0002, "epoch": 2.294607310893956, "step": 3170}, {"loss": 1.3806, "grad_norm": 0.7143292427062988, "learning_rate": 0.0002, "epoch": 2.3018458197611293, "step": 3180}, {"loss": 1.2692, "grad_norm": 0.5388900637626648, "learning_rate": 0.0002, "epoch": 2.3090843286283027, "step": 3190}, {"loss": 1.2402, "grad_norm": 0.5607513189315796, "learning_rate": 0.0002, "epoch": 2.316322837495476, "step": 3200}, {"loss": 1.3874, "grad_norm": 0.6795142292976379, "learning_rate": 0.0002, "epoch": 2.3235613463626494, "step": 3210}, {"loss": 1.3042, "grad_norm": 0.6561070680618286, "learning_rate": 0.0002, "epoch": 2.330799855229823, "step": 3220}, {"loss": 1.4636, "grad_norm": 0.8858118057250977, "learning_rate": 0.0002, "epoch": 2.338038364096996, "step": 3230}, {"loss": 1.3214, "grad_norm": 0.6604151725769043, "learning_rate": 0.0002, "epoch": 2.3452768729641695, "step": 3240}, {"loss": 1.4004, "grad_norm": 0.6755785346031189, "learning_rate": 0.0002, "epoch": 2.352515381831343, "step": 3250}, {"loss": 1.2503, "grad_norm": 0.6981677412986755, "learning_rate": 0.0002, "epoch": 2.3597538906985163, "step": 3260}, {"loss": 1.3078, "grad_norm": 0.6338568329811096, "learning_rate": 0.0002, "epoch": 2.3669923995656896, "step": 3270}, {"loss": 1.285, "grad_norm": 0.5754265785217285, "learning_rate": 0.0002, "epoch": 2.374230908432863, "step": 3280}, {"loss": 1.2924, "grad_norm": 0.7533153295516968, "learning_rate": 0.0002, "epoch": 2.3814694173000364, "step": 3290}, {"loss": 1.3711, "grad_norm": 0.675065279006958, "learning_rate": 0.0002, "epoch": 2.3887079261672097, "step": 3300}, {"loss": 1.3548, "grad_norm": 0.5686452984809875, "learning_rate": 0.0002, "epoch": 2.395946435034383, "step": 3310}, {"loss": 1.1998, "grad_norm": 0.8129481673240662, "learning_rate": 0.0002, "epoch": 2.4031849439015565, "step": 3320}, {"loss": 1.2584, "grad_norm": 0.6615934371948242, "learning_rate": 0.0002, "epoch": 2.41042345276873, "step": 3330}, {"loss": 1.3691, "grad_norm": 0.6678834557533264, "learning_rate": 0.0002, "epoch": 2.417661961635903, "step": 3340}, {"loss": 1.2381, "grad_norm": 0.5581308007240295, "learning_rate": 0.0002, "epoch": 2.4249004705030766, "step": 3350}, {"loss": 1.3853, "grad_norm": 0.6098920106887817, "learning_rate": 0.0002, "epoch": 2.43213897937025, "step": 3360}, {"loss": 1.3692, "grad_norm": 0.8101736903190613, "learning_rate": 0.0002, "epoch": 2.4393774882374233, "step": 3370}, {"loss": 1.4418, "grad_norm": 0.6621488928794861, "learning_rate": 0.0002, "epoch": 2.4466159971045967, "step": 3380}, {"loss": 1.4579, "grad_norm": 0.8693289160728455, "learning_rate": 0.0002, "epoch": 2.45385450597177, "step": 3390}, {"loss": 1.3644, "grad_norm": 0.6724580526351929, "learning_rate": 0.0002, "epoch": 2.4610930148389434, "step": 3400}, {"loss": 1.2006, "grad_norm": 0.6776891946792603, "learning_rate": 0.0002, "epoch": 2.4683315237061167, "step": 3410}, {"loss": 1.2937, "grad_norm": 0.7214453816413879, "learning_rate": 0.0002, "epoch": 2.47557003257329, "step": 3420}, {"loss": 1.4051, "grad_norm": 0.8390451073646545, "learning_rate": 0.0002, "epoch": 2.4828085414404635, "step": 3430}, {"loss": 1.25, "grad_norm": 0.7130982279777527, "learning_rate": 0.0002, "epoch": 2.490047050307637, "step": 3440}, {"loss": 1.2231, "grad_norm": 0.8873937129974365, "learning_rate": 0.0002, "epoch": 2.49728555917481, "step": 3450}, {"loss": 1.1429, "grad_norm": 0.725185751914978, "learning_rate": 0.0002, "epoch": 2.5045240680419836, "step": 3460}, {"loss": 1.2699, "grad_norm": 0.6120352149009705, "learning_rate": 0.0002, "epoch": 2.511762576909157, "step": 3470}, {"loss": 1.2552, "grad_norm": 0.7713613510131836, "learning_rate": 0.0002, "epoch": 2.5190010857763303, "step": 3480}, {"loss": 1.4648, "grad_norm": 0.895309567451477, "learning_rate": 0.0002, "epoch": 2.5262395946435037, "step": 3490}, {"loss": 1.3043, "grad_norm": 0.9631021022796631, "learning_rate": 0.0002, "epoch": 2.533478103510677, "step": 3500}, {"loss": 1.3492, "grad_norm": 0.7475683093070984, "learning_rate": 0.0002, "epoch": 2.5407166123778504, "step": 3510}, {"loss": 1.3637, "grad_norm": 0.7271341681480408, "learning_rate": 0.0002, "epoch": 2.5479551212450238, "step": 3520}, {"loss": 1.304, "grad_norm": 0.6979510188102722, "learning_rate": 0.0002, "epoch": 2.555193630112197, "step": 3530}, {"loss": 1.2353, "grad_norm": 0.6504196524620056, "learning_rate": 0.0002, "epoch": 2.5624321389793705, "step": 3540}, {"loss": 1.2699, "grad_norm": 0.7226675748825073, "learning_rate": 0.0002, "epoch": 2.569670647846544, "step": 3550}, {"loss": 1.3002, "grad_norm": 0.6143222451210022, "learning_rate": 0.0002, "epoch": 2.5769091567137172, "step": 3560}, {"loss": 1.1585, "grad_norm": 0.7245154976844788, "learning_rate": 0.0002, "epoch": 2.5841476655808906, "step": 3570}, {"loss": 1.3651, "grad_norm": 0.943540632724762, "learning_rate": 0.0002, "epoch": 2.591386174448064, "step": 3580}, {"loss": 1.3034, "grad_norm": 0.7707241773605347, "learning_rate": 0.0002, "epoch": 2.5986246833152373, "step": 3590}, {"loss": 1.3063, "grad_norm": 0.6705001592636108, "learning_rate": 0.0002, "epoch": 2.6058631921824107, "step": 3600}, {"loss": 1.2437, "grad_norm": 0.6360933780670166, "learning_rate": 0.0002, "epoch": 2.613101701049584, "step": 3610}, {"loss": 1.1844, "grad_norm": 0.5846424698829651, "learning_rate": 0.0002, "epoch": 2.6203402099167574, "step": 3620}, {"loss": 1.3674, "grad_norm": 0.5958625674247742, "learning_rate": 0.0002, "epoch": 2.6275787187839303, "step": 3630}, {"loss": 1.3599, "grad_norm": 0.6819243431091309, "learning_rate": 0.0002, "epoch": 2.6348172276511037, "step": 3640}, {"loss": 1.3884, "grad_norm": 0.7033445835113525, "learning_rate": 0.0002, "epoch": 2.642055736518277, "step": 3650}, {"loss": 1.3392, "grad_norm": 0.6134849786758423, "learning_rate": 0.0002, "epoch": 2.6492942453854504, "step": 3660}, {"loss": 1.2661, "grad_norm": 0.658009946346283, "learning_rate": 0.0002, "epoch": 2.656532754252624, "step": 3670}, {"loss": 1.3987, "grad_norm": 0.6280999779701233, "learning_rate": 0.0002, "epoch": 2.663771263119797, "step": 3680}, {"loss": 1.2995, "grad_norm": 0.5536085963249207, "learning_rate": 0.0002, "epoch": 2.6710097719869705, "step": 3690}, {"loss": 1.2044, "grad_norm": 0.8603981733322144, "learning_rate": 0.0002, "epoch": 2.678248280854144, "step": 3700}, {"loss": 1.3879, "grad_norm": 0.5509994626045227, "learning_rate": 0.0002, "epoch": 2.6854867897213173, "step": 3710}, {"loss": 1.3253, "grad_norm": 0.9093621969223022, "learning_rate": 0.0002, "epoch": 2.6927252985884906, "step": 3720}, {"loss": 1.2668, "grad_norm": 0.7525952458381653, "learning_rate": 0.0002, "epoch": 2.699963807455664, "step": 3730}, {"loss": 1.248, "grad_norm": 0.6737023591995239, "learning_rate": 0.0002, "epoch": 2.7072023163228374, "step": 3740}, {"loss": 1.2981, "grad_norm": 0.8656924962997437, "learning_rate": 0.0002, "epoch": 2.7144408251900107, "step": 3750}, {"loss": 1.2342, "grad_norm": 0.7494133114814758, "learning_rate": 0.0002, "epoch": 2.721679334057184, "step": 3760}, {"loss": 1.2417, "grad_norm": 0.5725520849227905, "learning_rate": 0.0002, "epoch": 2.7289178429243575, "step": 3770}, {"loss": 1.28, "grad_norm": 0.836412787437439, "learning_rate": 0.0002, "epoch": 2.736156351791531, "step": 3780}, {"loss": 1.3784, "grad_norm": 0.6893242597579956, "learning_rate": 0.0002, "epoch": 2.743394860658704, "step": 3790}, {"loss": 1.2929, "grad_norm": 0.6696223020553589, "learning_rate": 0.0002, "epoch": 2.7506333695258776, "step": 3800}, {"loss": 1.2449, "grad_norm": 0.6483015418052673, "learning_rate": 0.0002, "epoch": 2.757871878393051, "step": 3810}, {"loss": 1.3282, "grad_norm": 0.8084456920623779, "learning_rate": 0.0002, "epoch": 2.7651103872602243, "step": 3820}, {"loss": 1.3694, "grad_norm": 0.6601949334144592, "learning_rate": 0.0002, "epoch": 2.7723488961273977, "step": 3830}, {"loss": 1.3568, "grad_norm": 0.6905533671379089, "learning_rate": 0.0002, "epoch": 2.779587404994571, "step": 3840}, {"loss": 1.3854, "grad_norm": 0.619318425655365, "learning_rate": 0.0002, "epoch": 2.7868259138617444, "step": 3850}, {"loss": 1.2551, "grad_norm": 0.5994023084640503, "learning_rate": 0.0002, "epoch": 2.7940644227289178, "step": 3860}, {"loss": 1.2022, "grad_norm": 0.5627168416976929, "learning_rate": 0.0002, "epoch": 2.801302931596091, "step": 3870}, {"loss": 1.3921, "grad_norm": 0.6001605987548828, "learning_rate": 0.0002, "epoch": 2.8085414404632645, "step": 3880}, {"loss": 1.3026, "grad_norm": 0.6022412776947021, "learning_rate": 0.0002, "epoch": 2.815779949330438, "step": 3890}, {"loss": 1.2765, "grad_norm": 0.6832426190376282, "learning_rate": 0.0002, "epoch": 2.823018458197611, "step": 3900}, {"loss": 1.1363, "grad_norm": 0.5936811566352844, "learning_rate": 0.0002, "epoch": 2.8302569670647846, "step": 3910}, {"loss": 1.1707, "grad_norm": 0.6960572600364685, "learning_rate": 0.0002, "epoch": 2.837495475931958, "step": 3920}, {"loss": 1.4063, "grad_norm": 0.5913406610488892, "learning_rate": 0.0002, "epoch": 2.8447339847991313, "step": 3930}, {"loss": 1.3245, "grad_norm": 0.678154706954956, "learning_rate": 0.0002, "epoch": 2.8519724936663047, "step": 3940}, {"loss": 1.366, "grad_norm": 0.7898936867713928, "learning_rate": 0.0002, "epoch": 2.859211002533478, "step": 3950}, {"loss": 1.3948, "grad_norm": 0.9234195351600647, "learning_rate": 0.0002, "epoch": 2.8664495114006514, "step": 3960}, {"loss": 1.2773, "grad_norm": 0.5960825085639954, "learning_rate": 0.0002, "epoch": 2.8736880202678248, "step": 3970}, {"loss": 1.3127, "grad_norm": 0.677118182182312, "learning_rate": 0.0002, "epoch": 2.880926529134998, "step": 3980}, {"loss": 1.2652, "grad_norm": 0.6505142450332642, "learning_rate": 0.0002, "epoch": 2.8881650380021715, "step": 3990}, {"loss": 1.2078, "grad_norm": 0.550826907157898, "learning_rate": 0.0002, "epoch": 2.895403546869345, "step": 4000}, {"loss": 1.1811, "grad_norm": 0.6209215521812439, "learning_rate": 0.0002, "epoch": 2.9026420557365182, "step": 4010}, {"loss": 1.4001, "grad_norm": 0.6549018025398254, "learning_rate": 0.0002, "epoch": 2.9098805646036916, "step": 4020}, {"loss": 1.2285, "grad_norm": 0.570682168006897, "learning_rate": 0.0002, "epoch": 2.917119073470865, "step": 4030}, {"loss": 1.0832, "grad_norm": 1.1807632446289062, "learning_rate": 0.0002, "epoch": 2.9243575823380383, "step": 4040}, {"loss": 1.2693, "grad_norm": 0.7058857679367065, "learning_rate": 0.0002, "epoch": 2.9315960912052117, "step": 4050}, {"loss": 1.2905, "grad_norm": 0.5542812943458557, "learning_rate": 0.0002, "epoch": 2.938834600072385, "step": 4060}, {"loss": 1.33, "grad_norm": 0.63167804479599, "learning_rate": 0.0002, "epoch": 2.9460731089395584, "step": 4070}, {"loss": 1.3075, "grad_norm": 0.5702962279319763, "learning_rate": 0.0002, "epoch": 2.953311617806732, "step": 4080}, {"loss": 1.2007, "grad_norm": 0.620944082736969, "learning_rate": 0.0002, "epoch": 2.960550126673905, "step": 4090}, {"loss": 1.2864, "grad_norm": 0.5866289734840393, "learning_rate": 0.0002, "epoch": 2.9677886355410785, "step": 4100}, {"loss": 1.3293, "grad_norm": 0.560170590877533, "learning_rate": 0.0002, "epoch": 2.975027144408252, "step": 4110}, {"loss": 1.2071, "grad_norm": 0.675082802772522, "learning_rate": 0.0002, "epoch": 2.9822656532754253, "step": 4120}, {"loss": 1.2981, "grad_norm": 0.62708580493927, "learning_rate": 0.0002, "epoch": 2.9895041621425986, "step": 4130}, {"loss": 1.2758, "grad_norm": 0.7893929481506348, "learning_rate": 0.0002, "epoch": 2.996742671009772, "step": 4140}, {"eval_loss": 1.4217946529388428, "eval_runtime": 27.1596, "eval_samples_per_second": 16.053, "eval_steps_per_second": 2.025, "epoch": 2.9996380745566413, "step": 4144}, {"loss": 1.2152, "grad_norm": 0.7043836116790771, "learning_rate": 0.0002, "epoch": 3.0039811798769454, "step": 4150}, {"loss": 1.1664, "grad_norm": 0.6806283593177795, "learning_rate": 0.0002, "epoch": 3.0112196887441187, "step": 4160}, {"loss": 1.292, "grad_norm": 0.7684550285339355, "learning_rate": 0.0002, "epoch": 3.018458197611292, "step": 4170}, {"loss": 1.3467, "grad_norm": 0.7895237803459167, "learning_rate": 0.0002, "epoch": 3.0256967064784654, "step": 4180}, {"loss": 1.1324, "grad_norm": 0.7464531064033508, "learning_rate": 0.0002, "epoch": 3.032935215345639, "step": 4190}, {"loss": 1.1614, "grad_norm": 0.9358500838279724, "learning_rate": 0.0002, "epoch": 3.040173724212812, "step": 4200}, {"loss": 1.1834, "grad_norm": 1.1066628694534302, "learning_rate": 0.0002, "epoch": 3.0474122330799855, "step": 4210}, {"loss": 1.1557, "grad_norm": 0.6663267612457275, "learning_rate": 0.0002, "epoch": 3.054650741947159, "step": 4220}, {"loss": 1.1707, "grad_norm": 0.6669464707374573, "learning_rate": 0.0002, "epoch": 3.0618892508143323, "step": 4230}, {"loss": 1.1841, "grad_norm": 0.7052164077758789, "learning_rate": 0.0002, "epoch": 3.0691277596815056, "step": 4240}, {"loss": 1.2913, "grad_norm": 0.6118432879447937, "learning_rate": 0.0002, "epoch": 3.076366268548679, "step": 4250}, {"loss": 1.1526, "grad_norm": 0.6915903687477112, "learning_rate": 0.0002, "epoch": 3.0836047774158524, "step": 4260}, {"loss": 1.1348, "grad_norm": 0.7441644668579102, "learning_rate": 0.0002, "epoch": 3.0908432862830257, "step": 4270}, {"loss": 1.1672, "grad_norm": 0.823850691318512, "learning_rate": 0.0002, "epoch": 3.098081795150199, "step": 4280}, {"loss": 1.2655, "grad_norm": 0.9677883386611938, "learning_rate": 0.0002, "epoch": 3.1053203040173725, "step": 4290}, {"loss": 1.1794, "grad_norm": 0.7002579569816589, "learning_rate": 0.0002, "epoch": 3.112558812884546, "step": 4300}, {"loss": 1.135, "grad_norm": 0.778789758682251, "learning_rate": 0.0002, "epoch": 3.119797321751719, "step": 4310}, {"loss": 1.0818, "grad_norm": 0.7236007452011108, "learning_rate": 0.0002, "epoch": 3.1270358306188926, "step": 4320}, {"loss": 1.1803, "grad_norm": 0.8809133768081665, "learning_rate": 0.0002, "epoch": 3.134274339486066, "step": 4330}, {"loss": 1.2571, "grad_norm": 0.7924913167953491, "learning_rate": 0.0002, "epoch": 3.1415128483532393, "step": 4340}, {"loss": 1.1413, "grad_norm": 0.7437422275543213, "learning_rate": 0.0002, "epoch": 3.1487513572204127, "step": 4350}, {"loss": 1.2088, "grad_norm": 0.6428450345993042, "learning_rate": 0.0002, "epoch": 3.155989866087586, "step": 4360}, {"loss": 1.3032, "grad_norm": 0.7922873497009277, "learning_rate": 0.0002, "epoch": 3.1632283749547594, "step": 4370}, {"loss": 1.216, "grad_norm": 0.5252506732940674, "learning_rate": 0.0002, "epoch": 3.1704668838219328, "step": 4380}, {"loss": 1.1297, "grad_norm": 0.8570457696914673, "learning_rate": 0.0002, "epoch": 3.177705392689106, "step": 4390}, {"loss": 1.0994, "grad_norm": 0.7218987345695496, "learning_rate": 0.0002, "epoch": 3.1849439015562795, "step": 4400}, {"loss": 1.2891, "grad_norm": 0.6921393275260925, "learning_rate": 0.0002, "epoch": 3.192182410423453, "step": 4410}, {"loss": 1.2668, "grad_norm": 0.7386137843132019, "learning_rate": 0.0002, "epoch": 3.199420919290626, "step": 4420}, {"loss": 1.1654, "grad_norm": 0.6227759122848511, "learning_rate": 0.0002, "epoch": 3.2066594281577996, "step": 4430}, {"loss": 1.1752, "grad_norm": 0.7180278897285461, "learning_rate": 0.0002, "epoch": 3.213897937024973, "step": 4440}, {"loss": 1.1757, "grad_norm": 0.745830774307251, "learning_rate": 0.0002, "epoch": 3.2211364458921463, "step": 4450}, {"loss": 1.234, "grad_norm": 0.6766072511672974, "learning_rate": 0.0002, "epoch": 3.2283749547593197, "step": 4460}, {"loss": 1.1999, "grad_norm": 0.8325067162513733, "learning_rate": 0.0002, "epoch": 3.235613463626493, "step": 4470}, {"loss": 1.1606, "grad_norm": 0.7148305177688599, "learning_rate": 0.0002, "epoch": 3.2428519724936664, "step": 4480}, {"loss": 1.1383, "grad_norm": 0.7752676010131836, "learning_rate": 0.0002, "epoch": 3.25009048136084, "step": 4490}, {"loss": 1.3006, "grad_norm": 0.6776860952377319, "learning_rate": 0.0002, "epoch": 3.257328990228013, "step": 4500}, {"loss": 1.0796, "grad_norm": 0.704359769821167, "learning_rate": 0.0002, "epoch": 3.2645674990951865, "step": 4510}, {"loss": 1.2496, "grad_norm": 0.6880282163619995, "learning_rate": 0.0002, "epoch": 3.27180600796236, "step": 4520}, {"loss": 1.0947, "grad_norm": 0.8179270029067993, "learning_rate": 0.0002, "epoch": 3.2790445168295332, "step": 4530}, {"loss": 1.1909, "grad_norm": 0.6718448996543884, "learning_rate": 0.0002, "epoch": 3.2862830256967066, "step": 4540}, {"loss": 1.2708, "grad_norm": 0.8300657868385315, "learning_rate": 0.0002, "epoch": 3.29352153456388, "step": 4550}, {"loss": 1.2594, "grad_norm": 0.6433690786361694, "learning_rate": 0.0002, "epoch": 3.3007600434310533, "step": 4560}, {"loss": 1.2479, "grad_norm": 0.690262496471405, "learning_rate": 0.0002, "epoch": 3.3079985522982267, "step": 4570}, {"loss": 1.1342, "grad_norm": 0.7022852301597595, "learning_rate": 0.0002, "epoch": 3.3152370611654, "step": 4580}, {"loss": 1.0844, "grad_norm": 0.6438387632369995, "learning_rate": 0.0002, "epoch": 3.3224755700325734, "step": 4590}, {"loss": 1.17, "grad_norm": 0.6866899132728577, "learning_rate": 0.0002, "epoch": 3.329714078899747, "step": 4600}, {"loss": 1.1289, "grad_norm": 0.8233968019485474, "learning_rate": 0.0002, "epoch": 3.33695258776692, "step": 4610}, {"loss": 1.1855, "grad_norm": 0.7251574993133545, "learning_rate": 0.0002, "epoch": 3.3441910966340935, "step": 4620}, {"loss": 1.3403, "grad_norm": 0.7855110168457031, "learning_rate": 0.0002, "epoch": 3.351429605501267, "step": 4630}, {"loss": 1.2922, "grad_norm": 0.8487356305122375, "learning_rate": 0.0002, "epoch": 3.3586681143684403, "step": 4640}, {"loss": 1.2462, "grad_norm": 0.6429011225700378, "learning_rate": 0.0002, "epoch": 3.3659066232356136, "step": 4650}, {"loss": 1.129, "grad_norm": 0.7095270156860352, "learning_rate": 0.0002, "epoch": 3.373145132102787, "step": 4660}, {"loss": 1.262, "grad_norm": 0.6792303323745728, "learning_rate": 0.0002, "epoch": 3.3803836409699604, "step": 4670}, {"loss": 1.256, "grad_norm": 0.6784825921058655, "learning_rate": 0.0002, "epoch": 3.3876221498371337, "step": 4680}, {"loss": 1.0838, "grad_norm": 0.6362888216972351, "learning_rate": 0.0002, "epoch": 3.394860658704307, "step": 4690}, {"loss": 1.2165, "grad_norm": 0.7794778943061829, "learning_rate": 0.0002, "epoch": 3.4020991675714805, "step": 4700}, {"loss": 1.0644, "grad_norm": 0.7287485003471375, "learning_rate": 0.0002, "epoch": 3.409337676438654, "step": 4710}, {"loss": 1.2925, "grad_norm": 0.6481451392173767, "learning_rate": 0.0002, "epoch": 3.416576185305827, "step": 4720}, {"loss": 1.2121, "grad_norm": 0.9200371503829956, "learning_rate": 0.0002, "epoch": 3.4238146941730006, "step": 4730}, {"loss": 1.072, "grad_norm": 1.074180245399475, "learning_rate": 0.0002, "epoch": 3.431053203040174, "step": 4740}, {"loss": 1.0421, "grad_norm": 0.6722986698150635, "learning_rate": 0.0002, "epoch": 3.438291711907347, "step": 4750}, {"loss": 1.2258, "grad_norm": 0.7945933938026428, "learning_rate": 0.0002, "epoch": 3.44553022077452, "step": 4760}, {"loss": 1.0927, "grad_norm": 0.7624640464782715, "learning_rate": 0.0002, "epoch": 3.4527687296416936, "step": 4770}, {"loss": 1.2428, "grad_norm": 0.7763656377792358, "learning_rate": 0.0002, "epoch": 3.460007238508867, "step": 4780}, {"loss": 1.2584, "grad_norm": 0.7736947536468506, "learning_rate": 0.0002, "epoch": 3.4672457473760403, "step": 4790}, {"loss": 1.1953, "grad_norm": 0.8450354933738708, "learning_rate": 0.0002, "epoch": 3.4744842562432137, "step": 4800}, {"loss": 1.1362, "grad_norm": 0.6480133533477783, "learning_rate": 0.0002, "epoch": 3.481722765110387, "step": 4810}, {"loss": 1.1882, "grad_norm": 0.8437445759773254, "learning_rate": 0.0002, "epoch": 3.4889612739775604, "step": 4820}, {"loss": 1.1519, "grad_norm": 0.7781730890274048, "learning_rate": 0.0002, "epoch": 3.4961997828447338, "step": 4830}, {"loss": 1.1836, "grad_norm": 0.8523228168487549, "learning_rate": 0.0002, "epoch": 3.503438291711907, "step": 4840}, {"loss": 1.1672, "grad_norm": 0.6236732006072998, "learning_rate": 0.0002, "epoch": 3.5106768005790805, "step": 4850}, {"loss": 1.1926, "grad_norm": 0.7500787377357483, "learning_rate": 0.0002, "epoch": 3.517915309446254, "step": 4860}, {"loss": 1.1998, "grad_norm": 0.7665374875068665, "learning_rate": 0.0002, "epoch": 3.5251538183134272, "step": 4870}, {"loss": 1.1551, "grad_norm": 0.787857711315155, "learning_rate": 0.0002, "epoch": 3.5323923271806006, "step": 4880}, {"loss": 1.2758, "grad_norm": 0.970595121383667, "learning_rate": 0.0002, "epoch": 3.539630836047774, "step": 4890}, {"loss": 1.1274, "grad_norm": 0.6409347057342529, "learning_rate": 0.0002, "epoch": 3.5468693449149473, "step": 4900}, {"loss": 1.1596, "grad_norm": 0.888551652431488, "learning_rate": 0.0002, "epoch": 3.5541078537821207, "step": 4910}, {"loss": 1.1644, "grad_norm": 1.0808377265930176, "learning_rate": 0.0002, "epoch": 3.561346362649294, "step": 4920}, {"loss": 1.2564, "grad_norm": 0.7501053214073181, "learning_rate": 0.0002, "epoch": 3.5685848715164674, "step": 4930}, {"loss": 1.2351, "grad_norm": 0.7375240325927734, "learning_rate": 0.0002, "epoch": 3.575823380383641, "step": 4940}, {"loss": 1.3568, "grad_norm": 0.7075039744377136, "learning_rate": 0.0002, "epoch": 3.583061889250814, "step": 4950}, {"loss": 1.3355, "grad_norm": 0.939337432384491, "learning_rate": 0.0002, "epoch": 3.5903003981179875, "step": 4960}, {"loss": 1.1722, "grad_norm": 0.6717396974563599, "learning_rate": 0.0002, "epoch": 3.597538906985161, "step": 4970}, {"loss": 1.1186, "grad_norm": 0.7141643762588501, "learning_rate": 0.0002, "epoch": 3.6047774158523342, "step": 4980}, {"loss": 1.1011, "grad_norm": 0.7109216451644897, "learning_rate": 0.0002, "epoch": 3.6120159247195076, "step": 4990}, {"loss": 1.2178, "grad_norm": 0.7020776867866516, "learning_rate": 0.0002, "epoch": 3.619254433586681, "step": 5000}, {"loss": 1.1939, "grad_norm": 0.7158873677253723, "learning_rate": 0.0002, "epoch": 3.6264929424538543, "step": 5010}, {"loss": 1.2624, "grad_norm": 0.7062035202980042, "learning_rate": 0.0002, "epoch": 3.6337314513210277, "step": 5020}, {"loss": 1.0224, "grad_norm": 0.7081155776977539, "learning_rate": 0.0002, "epoch": 3.640969960188201, "step": 5030}, {"loss": 1.2195, "grad_norm": 1.2210607528686523, "learning_rate": 0.0002, "epoch": 3.6482084690553744, "step": 5040}, {"loss": 1.2596, "grad_norm": 0.6650236248970032, "learning_rate": 0.0002, "epoch": 3.655446977922548, "step": 5050}, {"loss": 1.1072, "grad_norm": 0.6884829998016357, "learning_rate": 0.0002, "epoch": 3.662685486789721, "step": 5060}, {"loss": 1.2292, "grad_norm": 0.7317819595336914, "learning_rate": 0.0002, "epoch": 3.6699239956568945, "step": 5070}, {"loss": 1.1917, "grad_norm": 0.7406691908836365, "learning_rate": 0.0002, "epoch": 3.677162504524068, "step": 5080}, {"loss": 1.2949, "grad_norm": 0.9009454250335693, "learning_rate": 0.0002, "epoch": 3.6844010133912413, "step": 5090}, {"loss": 1.1528, "grad_norm": 0.8189385533332825, "learning_rate": 0.0002, "epoch": 3.6916395222584146, "step": 5100}, {"loss": 1.3408, "grad_norm": 1.0793628692626953, "learning_rate": 0.0002, "epoch": 3.698878031125588, "step": 5110}, {"loss": 1.2417, "grad_norm": 0.8593027591705322, "learning_rate": 0.0002, "epoch": 3.7061165399927614, "step": 5120}, {"loss": 1.2141, "grad_norm": 0.8481812477111816, "learning_rate": 0.0002, "epoch": 3.7133550488599347, "step": 5130}, {"loss": 1.125, "grad_norm": 0.6527451276779175, "learning_rate": 0.0002, "epoch": 3.720593557727108, "step": 5140}, {"loss": 1.1584, "grad_norm": 0.9220114350318909, "learning_rate": 0.0002, "epoch": 3.7278320665942815, "step": 5150}, {"loss": 1.2267, "grad_norm": 1.0842019319534302, "learning_rate": 0.0002, "epoch": 3.735070575461455, "step": 5160}, {"loss": 1.3083, "grad_norm": 0.965453565120697, "learning_rate": 0.0002, "epoch": 3.742309084328628, "step": 5170}, {"loss": 1.1772, "grad_norm": 0.9903319478034973, "learning_rate": 0.0002, "epoch": 3.7495475931958016, "step": 5180}, {"loss": 1.2515, "grad_norm": 0.7434818148612976, "learning_rate": 0.0002, "epoch": 3.756786102062975, "step": 5190}, {"loss": 1.2631, "grad_norm": 0.6717280745506287, "learning_rate": 0.0002, "epoch": 3.7640246109301483, "step": 5200}, {"loss": 1.2012, "grad_norm": 0.7754665613174438, "learning_rate": 0.0002, "epoch": 3.7712631197973217, "step": 5210}, {"loss": 1.305, "grad_norm": 1.028374433517456, "learning_rate": 0.0002, "epoch": 3.778501628664495, "step": 5220}, {"loss": 1.1866, "grad_norm": 0.6026996374130249, "learning_rate": 0.0002, "epoch": 3.7857401375316684, "step": 5230}, {"loss": 1.1901, "grad_norm": 0.6978490948677063, "learning_rate": 0.0002, "epoch": 3.7929786463988417, "step": 5240}, {"loss": 1.2576, "grad_norm": 0.7303446531295776, "learning_rate": 0.0002, "epoch": 3.800217155266015, "step": 5250}, {"loss": 1.3173, "grad_norm": 1.0734210014343262, "learning_rate": 0.0002, "epoch": 3.8074556641331885, "step": 5260}, {"loss": 1.1137, "grad_norm": 0.6383201479911804, "learning_rate": 0.0002, "epoch": 3.814694173000362, "step": 5270}, {"loss": 1.0904, "grad_norm": 0.7742630243301392, "learning_rate": 0.0002, "epoch": 3.821932681867535, "step": 5280}, {"loss": 1.2232, "grad_norm": 0.8477074503898621, "learning_rate": 0.0002, "epoch": 3.8291711907347086, "step": 5290}, {"loss": 1.2047, "grad_norm": 0.6675317883491516, "learning_rate": 0.0002, "epoch": 3.836409699601882, "step": 5300}, {"loss": 1.2275, "grad_norm": 0.7515445351600647, "learning_rate": 0.0002, "epoch": 3.8436482084690553, "step": 5310}, {"loss": 1.2569, "grad_norm": 1.1441220045089722, "learning_rate": 0.0002, "epoch": 3.8508867173362287, "step": 5320}, {"loss": 1.1512, "grad_norm": 0.7968795895576477, "learning_rate": 0.0002, "epoch": 3.858125226203402, "step": 5330}, {"loss": 1.232, "grad_norm": 0.7842824459075928, "learning_rate": 0.0002, "epoch": 3.8653637350705754, "step": 5340}, {"loss": 1.1847, "grad_norm": 0.8272225260734558, "learning_rate": 0.0002, "epoch": 3.8726022439377488, "step": 5350}, {"loss": 1.1381, "grad_norm": 0.8413397669792175, "learning_rate": 0.0002, "epoch": 3.879840752804922, "step": 5360}, {"loss": 1.2349, "grad_norm": 1.141764760017395, "learning_rate": 0.0002, "epoch": 3.8870792616720955, "step": 5370}, {"loss": 1.212, "grad_norm": 0.9826975464820862, "learning_rate": 0.0002, "epoch": 3.894317770539269, "step": 5380}, {"loss": 1.1833, "grad_norm": 0.8598255515098572, "learning_rate": 0.0002, "epoch": 3.9015562794064422, "step": 5390}, {"loss": 1.1247, "grad_norm": 0.6271058320999146, "learning_rate": 0.0002, "epoch": 3.9087947882736156, "step": 5400}, {"loss": 1.2212, "grad_norm": 0.6379870772361755, "learning_rate": 0.0002, "epoch": 3.916033297140789, "step": 5410}, {"loss": 1.2481, "grad_norm": 1.0313376188278198, "learning_rate": 0.0002, "epoch": 3.9232718060079623, "step": 5420}, {"loss": 1.1872, "grad_norm": 0.8220619559288025, "learning_rate": 0.0002, "epoch": 3.9305103148751357, "step": 5430}, {"loss": 1.2006, "grad_norm": 0.7576116919517517, "learning_rate": 0.0002, "epoch": 3.937748823742309, "step": 5440}, {"loss": 1.1969, "grad_norm": 1.226235032081604, "learning_rate": 0.0002, "epoch": 3.9449873326094824, "step": 5450}, {"loss": 1.2945, "grad_norm": 0.7979229688644409, "learning_rate": 0.0002, "epoch": 3.952225841476656, "step": 5460}, {"loss": 1.1922, "grad_norm": 0.9911929965019226, "learning_rate": 0.0002, "epoch": 3.959464350343829, "step": 5470}, {"loss": 1.0924, "grad_norm": 0.643738865852356, "learning_rate": 0.0002, "epoch": 3.9667028592110025, "step": 5480}, {"loss": 1.0607, "grad_norm": 0.682305634021759, "learning_rate": 0.0002, "epoch": 3.973941368078176, "step": 5490}, {"loss": 1.2908, "grad_norm": 1.18373441696167, "learning_rate": 0.0002, "epoch": 3.9811798769453492, "step": 5500}, {"loss": 1.0889, "grad_norm": 0.7190203070640564, "learning_rate": 0.0002, "epoch": 3.9884183858125226, "step": 5510}, {"loss": 1.2745, "grad_norm": 0.7516948580741882, "learning_rate": 0.0002, "epoch": 3.995656894679696, "step": 5520}, {"eval_loss": 1.4252897500991821, "eval_runtime": 27.235, "eval_samples_per_second": 16.009, "eval_steps_per_second": 2.019, "epoch": 4.0, "step": 5526}, {"loss": 1.0088, "grad_norm": 0.6353074312210083, "learning_rate": 0.0002, "epoch": 4.002895403546869, "step": 5530}, {"loss": 1.0326, "grad_norm": 0.7424906492233276, "learning_rate": 0.0002, "epoch": 4.010133912414043, "step": 5540}, {"loss": 1.0667, "grad_norm": 0.8856638073921204, "learning_rate": 0.0002, "epoch": 4.017372421281216, "step": 5550}, {"loss": 1.0905, "grad_norm": 0.9627974033355713, "learning_rate": 0.0002, "epoch": 4.024610930148389, "step": 5560}, {"loss": 1.0965, "grad_norm": 0.9048978686332703, "learning_rate": 0.0002, "epoch": 4.031849439015563, "step": 5570}, {"loss": 1.1108, "grad_norm": 0.921119213104248, "learning_rate": 0.0002, "epoch": 4.039087947882736, "step": 5580}, {"loss": 1.1235, "grad_norm": 0.8654361963272095, "learning_rate": 0.0002, "epoch": 4.0463264567499095, "step": 5590}, {"loss": 1.0794, "grad_norm": 0.7947945594787598, "learning_rate": 0.0002, "epoch": 4.053564965617083, "step": 5600}, {"loss": 1.0674, "grad_norm": 0.8307326436042786, "learning_rate": 0.0002, "epoch": 4.060803474484256, "step": 5610}, {"loss": 1.0076, "grad_norm": 0.793273389339447, "learning_rate": 0.0002, "epoch": 4.06804198335143, "step": 5620}, {"loss": 1.0651, "grad_norm": 0.8748673796653748, "learning_rate": 0.0002, "epoch": 4.075280492218603, "step": 5630}, {"loss": 1.111, "grad_norm": 0.7926856279373169, "learning_rate": 0.0002, "epoch": 4.082519001085776, "step": 5640}, {"loss": 1.044, "grad_norm": 0.922645092010498, "learning_rate": 0.0002, "epoch": 4.08975750995295, "step": 5650}, {"loss": 1.109, "grad_norm": 0.9539641737937927, "learning_rate": 0.0002, "epoch": 4.096996018820123, "step": 5660}, {"loss": 1.0788, "grad_norm": 0.8674443364143372, "learning_rate": 0.0002, "epoch": 4.1042345276872965, "step": 5670}, {"loss": 0.9867, "grad_norm": 0.7097609043121338, "learning_rate": 0.0002, "epoch": 4.11147303655447, "step": 5680}, {"loss": 1.1154, "grad_norm": 0.8875522613525391, "learning_rate": 0.0002, "epoch": 4.118711545421643, "step": 5690}, {"loss": 1.1217, "grad_norm": 0.8583634495735168, "learning_rate": 0.0002, "epoch": 4.125950054288817, "step": 5700}, {"loss": 1.0973, "grad_norm": 0.6736377477645874, "learning_rate": 0.0002, "epoch": 4.13318856315599, "step": 5710}, {"loss": 1.1199, "grad_norm": 0.9349062442779541, "learning_rate": 0.0002, "epoch": 4.140427072023163, "step": 5720}, {"loss": 1.0508, "grad_norm": 1.0610365867614746, "learning_rate": 0.0002, "epoch": 4.147665580890337, "step": 5730}, {"loss": 1.1146, "grad_norm": 1.5838189125061035, "learning_rate": 0.0002, "epoch": 4.15490408975751, "step": 5740}, {"loss": 1.0222, "grad_norm": 0.747522234916687, "learning_rate": 0.0002, "epoch": 4.162142598624683, "step": 5750}, {"loss": 1.1328, "grad_norm": 1.3247915506362915, "learning_rate": 0.0002, "epoch": 4.169381107491857, "step": 5760}, {"loss": 1.1655, "grad_norm": 0.8750247955322266, "learning_rate": 0.0002, "epoch": 4.17661961635903, "step": 5770}, {"loss": 1.199, "grad_norm": 0.7914144992828369, "learning_rate": 0.0002, "epoch": 4.1838581252262035, "step": 5780}, {"loss": 1.1213, "grad_norm": 0.9493299126625061, "learning_rate": 0.0002, "epoch": 4.191096634093377, "step": 5790}, {"loss": 1.1515, "grad_norm": 0.7802295088768005, "learning_rate": 0.0002, "epoch": 4.19833514296055, "step": 5800}, {"loss": 1.0704, "grad_norm": 0.6987314820289612, "learning_rate": 0.0002, "epoch": 4.205573651827724, "step": 5810}, {"loss": 1.1699, "grad_norm": 0.9220341444015503, "learning_rate": 0.0002, "epoch": 4.212812160694897, "step": 5820}, {"loss": 1.1394, "grad_norm": 0.8932939767837524, "learning_rate": 0.0002, "epoch": 4.22005066956207, "step": 5830}, {"loss": 1.0048, "grad_norm": 0.920002818107605, "learning_rate": 0.0002, "epoch": 4.227289178429244, "step": 5840}, {"loss": 0.964, "grad_norm": 0.6662752032279968, "learning_rate": 0.0002, "epoch": 4.234527687296417, "step": 5850}, {"loss": 0.986, "grad_norm": 0.8679718971252441, "learning_rate": 0.0002, "epoch": 4.24176619616359, "step": 5860}, {"loss": 0.8991, "grad_norm": 0.7020887732505798, "learning_rate": 0.0002, "epoch": 4.249004705030764, "step": 5870}, {"loss": 1.1132, "grad_norm": 0.869611382484436, "learning_rate": 0.0002, "epoch": 4.256243213897937, "step": 5880}, {"loss": 1.1026, "grad_norm": 0.7796585559844971, "learning_rate": 0.0002, "epoch": 4.2634817227651105, "step": 5890}, {"loss": 1.0957, "grad_norm": 0.8978819251060486, "learning_rate": 0.0002, "epoch": 4.270720231632284, "step": 5900}, {"loss": 1.1325, "grad_norm": 1.0837205648422241, "learning_rate": 0.0002, "epoch": 4.277958740499457, "step": 5910}, {"loss": 1.1279, "grad_norm": 0.7584353089332581, "learning_rate": 0.0002, "epoch": 4.285197249366631, "step": 5920}, {"loss": 1.0513, "grad_norm": 0.7313185334205627, "learning_rate": 0.0002, "epoch": 4.292435758233804, "step": 5930}, {"loss": 1.1101, "grad_norm": 0.8004671335220337, "learning_rate": 0.0002, "epoch": 4.299674267100977, "step": 5940}, {"loss": 1.14, "grad_norm": 2.154958724975586, "learning_rate": 0.0002, "epoch": 4.306912775968151, "step": 5950}, {"loss": 1.1206, "grad_norm": 0.9163479804992676, "learning_rate": 0.0002, "epoch": 4.314151284835324, "step": 5960}, {"loss": 0.9941, "grad_norm": 0.9151589274406433, "learning_rate": 0.0002, "epoch": 4.321389793702497, "step": 5970}, {"loss": 1.0606, "grad_norm": 0.8624112010002136, "learning_rate": 0.0002, "epoch": 4.328628302569671, "step": 5980}, {"loss": 1.1625, "grad_norm": 0.9357741475105286, "learning_rate": 0.0002, "epoch": 4.335866811436844, "step": 5990}, {"loss": 1.0712, "grad_norm": 1.3482335805892944, "learning_rate": 0.0002, "epoch": 4.3431053203040175, "step": 6000}, {"loss": 1.1224, "grad_norm": 0.7156149744987488, "learning_rate": 0.0002, "epoch": 4.350343829171191, "step": 6010}, {"loss": 1.0753, "grad_norm": 0.8480049967765808, "learning_rate": 0.0002, "epoch": 4.357582338038364, "step": 6020}, {"loss": 1.051, "grad_norm": 0.8262244462966919, "learning_rate": 0.0002, "epoch": 4.364820846905538, "step": 6030}, {"loss": 0.9966, "grad_norm": 0.7733905911445618, "learning_rate": 0.0002, "epoch": 4.372059355772711, "step": 6040}, {"loss": 1.1008, "grad_norm": 0.8553919792175293, "learning_rate": 0.0002, "epoch": 4.379297864639884, "step": 6050}, {"loss": 1.1777, "grad_norm": 0.8666832447052002, "learning_rate": 0.0002, "epoch": 4.386536373507058, "step": 6060}, {"loss": 1.1934, "grad_norm": 0.9168295860290527, "learning_rate": 0.0002, "epoch": 4.393774882374231, "step": 6070}, {"loss": 1.0988, "grad_norm": 0.7315238118171692, "learning_rate": 0.0002, "epoch": 4.4010133912414044, "step": 6080}, {"loss": 1.1599, "grad_norm": 1.020263433456421, "learning_rate": 0.0002, "epoch": 4.408251900108578, "step": 6090}, {"loss": 1.133, "grad_norm": 0.9978243708610535, "learning_rate": 0.0002, "epoch": 4.415490408975751, "step": 6100}, {"loss": 1.1324, "grad_norm": 0.995453953742981, "learning_rate": 0.0002, "epoch": 4.4227289178429245, "step": 6110}, {"loss": 1.0957, "grad_norm": 0.9360884428024292, "learning_rate": 0.0002, "epoch": 4.429967426710098, "step": 6120}, {"loss": 0.9506, "grad_norm": 0.8099448084831238, "learning_rate": 0.0002, "epoch": 4.437205935577271, "step": 6130}, {"loss": 1.0887, "grad_norm": 0.8173841238021851, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 6140}, {"loss": 1.1219, "grad_norm": 0.7972666025161743, "learning_rate": 0.0002, "epoch": 4.451682953311618, "step": 6150}, {"loss": 1.0226, "grad_norm": 0.7685779333114624, "learning_rate": 0.0002, "epoch": 4.458921462178791, "step": 6160}, {"loss": 1.0732, "grad_norm": 0.7872623801231384, "learning_rate": 0.0002, "epoch": 4.466159971045965, "step": 6170}, {"loss": 0.9911, "grad_norm": 0.7677070498466492, "learning_rate": 0.0002, "epoch": 4.473398479913138, "step": 6180}, {"loss": 1.0919, "grad_norm": 0.7878316044807434, "learning_rate": 0.0002, "epoch": 4.4806369887803115, "step": 6190}, {"loss": 1.018, "grad_norm": 0.8178079724311829, "learning_rate": 0.0002, "epoch": 4.487875497647485, "step": 6200}, {"loss": 1.0517, "grad_norm": 1.2820082902908325, "learning_rate": 0.0002, "epoch": 4.495114006514658, "step": 6210}, {"loss": 1.3101, "grad_norm": 0.9380832314491272, "learning_rate": 0.0002, "epoch": 4.502352515381832, "step": 6220}, {"loss": 0.9818, "grad_norm": 0.7810422778129578, "learning_rate": 0.0002, "epoch": 4.509591024249005, "step": 6230}, {"loss": 1.1677, "grad_norm": 1.1022917032241821, "learning_rate": 0.0002, "epoch": 4.516829533116178, "step": 6240}, {"loss": 1.1579, "grad_norm": 1.4275553226470947, "learning_rate": 0.0002, "epoch": 4.524068041983352, "step": 6250}, {"loss": 1.3237, "grad_norm": 0.7597777247428894, "learning_rate": 0.0002, "epoch": 4.531306550850525, "step": 6260}, {"loss": 1.1529, "grad_norm": 1.10992431640625, "learning_rate": 0.0002, "epoch": 4.538545059717698, "step": 6270}, {"loss": 1.0732, "grad_norm": 0.8981178998947144, "learning_rate": 0.0002, "epoch": 4.545783568584872, "step": 6280}, {"loss": 1.086, "grad_norm": 0.7863979339599609, "learning_rate": 0.0002, "epoch": 4.553022077452045, "step": 6290}, {"loss": 1.2008, "grad_norm": 0.9071474671363831, "learning_rate": 0.0002, "epoch": 4.5602605863192185, "step": 6300}, {"loss": 1.0916, "grad_norm": 0.7429424524307251, "learning_rate": 0.0002, "epoch": 4.567499095186392, "step": 6310}, {"loss": 1.095, "grad_norm": 1.0767850875854492, "learning_rate": 0.0002, "epoch": 4.574737604053565, "step": 6320}, {"loss": 1.1023, "grad_norm": 0.7885915637016296, "learning_rate": 0.0002, "epoch": 4.581976112920739, "step": 6330}, {"loss": 1.1131, "grad_norm": 0.8350457549095154, "learning_rate": 0.0002, "epoch": 4.589214621787912, "step": 6340}, {"loss": 1.0743, "grad_norm": 0.7853530645370483, "learning_rate": 0.0002, "epoch": 4.596453130655085, "step": 6350}, {"loss": 1.1912, "grad_norm": 1.1220661401748657, "learning_rate": 0.0002, "epoch": 4.603691639522259, "step": 6360}, {"loss": 1.0927, "grad_norm": 0.7959423065185547, "learning_rate": 0.0002, "epoch": 4.610930148389432, "step": 6370}, {"loss": 1.1542, "grad_norm": 0.7782652378082275, "learning_rate": 0.0002, "epoch": 4.618168657256605, "step": 6380}, {"loss": 1.0753, "grad_norm": 0.7882203459739685, "learning_rate": 0.0002, "epoch": 4.625407166123779, "step": 6390}, {"loss": 1.0676, "grad_norm": 0.8841899037361145, "learning_rate": 0.0002, "epoch": 4.632645674990952, "step": 6400}, {"loss": 1.0815, "grad_norm": 0.7936127781867981, "learning_rate": 0.0002, "epoch": 4.6398841838581255, "step": 6410}, {"loss": 1.0198, "grad_norm": 0.9213966131210327, "learning_rate": 0.0002, "epoch": 4.647122692725299, "step": 6420}, {"loss": 0.9872, "grad_norm": 0.9246473908424377, "learning_rate": 0.0002, "epoch": 4.654361201592472, "step": 6430}, {"loss": 1.1309, "grad_norm": 0.766572892665863, "learning_rate": 0.0002, "epoch": 4.661599710459646, "step": 6440}, {"loss": 1.1095, "grad_norm": 0.8596171736717224, "learning_rate": 0.0002, "epoch": 4.668838219326819, "step": 6450}, {"loss": 1.1869, "grad_norm": 0.8482751846313477, "learning_rate": 0.0002, "epoch": 4.676076728193992, "step": 6460}, {"loss": 1.0622, "grad_norm": 1.0826905965805054, "learning_rate": 0.0002, "epoch": 4.683315237061166, "step": 6470}, {"loss": 1.0256, "grad_norm": 1.1048457622528076, "learning_rate": 0.0002, "epoch": 4.690553745928339, "step": 6480}, {"loss": 1.0514, "grad_norm": 0.9429134726524353, "learning_rate": 0.0002, "epoch": 4.697792254795512, "step": 6490}, {"loss": 1.1351, "grad_norm": 0.8587502837181091, "learning_rate": 0.0002, "epoch": 4.705030763662686, "step": 6500}, {"loss": 1.0969, "grad_norm": 1.0387083292007446, "learning_rate": 0.0002, "epoch": 4.712269272529859, "step": 6510}, {"loss": 1.0493, "grad_norm": 0.7471951842308044, "learning_rate": 0.0002, "epoch": 4.7195077813970325, "step": 6520}, {"loss": 1.2632, "grad_norm": 0.8800424933433533, "learning_rate": 0.0002, "epoch": 4.726746290264206, "step": 6530}, {"loss": 1.2126, "grad_norm": 0.8136811852455139, "learning_rate": 0.0002, "epoch": 4.733984799131379, "step": 6540}, {"loss": 1.195, "grad_norm": 0.9910339713096619, "learning_rate": 0.0002, "epoch": 4.741223307998553, "step": 6550}, {"loss": 1.1201, "grad_norm": 1.0679163932800293, "learning_rate": 0.0002, "epoch": 4.748461816865726, "step": 6560}, {"loss": 1.0297, "grad_norm": 0.8468248248100281, "learning_rate": 0.0002, "epoch": 4.755700325732899, "step": 6570}, {"loss": 1.0858, "grad_norm": 0.8771235942840576, "learning_rate": 0.0002, "epoch": 4.762938834600073, "step": 6580}, {"loss": 1.077, "grad_norm": 0.7024846076965332, "learning_rate": 0.0002, "epoch": 4.770177343467246, "step": 6590}, {"loss": 1.0876, "grad_norm": 0.7836683392524719, "learning_rate": 0.0002, "epoch": 4.7774158523344195, "step": 6600}, {"loss": 1.1006, "grad_norm": 0.7717288136482239, "learning_rate": 0.0002, "epoch": 4.784654361201593, "step": 6610}, {"loss": 1.0376, "grad_norm": 0.884183943271637, "learning_rate": 0.0002, "epoch": 4.791892870068766, "step": 6620}, {"loss": 1.1757, "grad_norm": 1.383867621421814, "learning_rate": 0.0002, "epoch": 4.7991313789359396, "step": 6630}, {"loss": 1.0861, "grad_norm": 0.9741523861885071, "learning_rate": 0.0002, "epoch": 4.806369887803113, "step": 6640}, {"loss": 1.0884, "grad_norm": 0.9723693132400513, "learning_rate": 0.0002, "epoch": 4.813608396670286, "step": 6650}, {"loss": 1.2203, "grad_norm": 1.8324809074401855, "learning_rate": 0.0002, "epoch": 4.82084690553746, "step": 6660}, {"loss": 1.0292, "grad_norm": 0.904909074306488, "learning_rate": 0.0002, "epoch": 4.828085414404633, "step": 6670}, {"loss": 1.0349, "grad_norm": 0.7355411648750305, "learning_rate": 0.0002, "epoch": 4.835323923271806, "step": 6680}, {"loss": 1.0793, "grad_norm": 0.8934960961341858, "learning_rate": 0.0002, "epoch": 4.84256243213898, "step": 6690}, {"loss": 1.0375, "grad_norm": 1.4596954584121704, "learning_rate": 0.0002, "epoch": 4.849800941006153, "step": 6700}, {"loss": 1.1065, "grad_norm": 0.8310341238975525, "learning_rate": 0.0002, "epoch": 4.8570394498733265, "step": 6710}, {"loss": 1.1089, "grad_norm": 0.9709894061088562, "learning_rate": 0.0002, "epoch": 4.8642779587405, "step": 6720}, {"loss": 1.0069, "grad_norm": 0.852142333984375, "learning_rate": 0.0002, "epoch": 4.871516467607673, "step": 6730}, {"loss": 1.0507, "grad_norm": 1.0643625259399414, "learning_rate": 0.0002, "epoch": 4.878754976474847, "step": 6740}, {"loss": 1.056, "grad_norm": 0.9419508576393127, "learning_rate": 0.0002, "epoch": 4.88599348534202, "step": 6750}, {"loss": 1.1995, "grad_norm": 1.1818498373031616, "learning_rate": 0.0002, "epoch": 4.893231994209193, "step": 6760}, {"loss": 1.0925, "grad_norm": 0.9369569420814514, "learning_rate": 0.0002, "epoch": 4.900470503076367, "step": 6770}, {"loss": 1.1648, "grad_norm": 0.7012579441070557, "learning_rate": 0.0002, "epoch": 4.90770901194354, "step": 6780}, {"loss": 1.0926, "grad_norm": 0.9109319448471069, "learning_rate": 0.0002, "epoch": 4.914947520810713, "step": 6790}, {"loss": 1.0358, "grad_norm": 0.8077534437179565, "learning_rate": 0.0002, "epoch": 4.922186029677887, "step": 6800}, {"loss": 1.2549, "grad_norm": 0.7571148872375488, "learning_rate": 0.0002, "epoch": 4.92942453854506, "step": 6810}, {"loss": 0.9638, "grad_norm": 0.7325633764266968, "learning_rate": 0.0002, "epoch": 4.9366630474122335, "step": 6820}, {"loss": 1.0128, "grad_norm": 0.8465084433555603, "learning_rate": 0.0002, "epoch": 4.943901556279407, "step": 6830}, {"loss": 1.153, "grad_norm": 0.8753737807273865, "learning_rate": 0.0002, "epoch": 4.95114006514658, "step": 6840}, {"loss": 1.0247, "grad_norm": 0.9421748518943787, "learning_rate": 0.0002, "epoch": 4.958378574013754, "step": 6850}, {"loss": 1.1483, "grad_norm": 0.8245896697044373, "learning_rate": 0.0002, "epoch": 4.965617082880927, "step": 6860}, {"loss": 0.9905, "grad_norm": 0.8823089599609375, "learning_rate": 0.0002, "epoch": 4.9728555917481, "step": 6870}, {"loss": 1.1664, "grad_norm": 0.8406389355659485, "learning_rate": 0.0002, "epoch": 4.980094100615274, "step": 6880}, {"loss": 1.0944, "grad_norm": 0.9732868075370789, "learning_rate": 0.0002, "epoch": 4.987332609482447, "step": 6890}, {"loss": 1.1776, "grad_norm": 2.125141143798828, "learning_rate": 0.0002, "epoch": 4.99457111834962, "step": 6900}, {"eval_loss": 1.445176601409912, "eval_runtime": 27.2351, "eval_samples_per_second": 16.009, "eval_steps_per_second": 2.019, "epoch": 4.999638074556641, "step": 6907}, {"loss": 1.1362, "grad_norm": 0.9465792775154114, "learning_rate": 0.0002, "epoch": 5.001809627216793, "step": 6910}, {"loss": 0.982, "grad_norm": 1.2834891080856323, "learning_rate": 0.0002, "epoch": 5.009048136083966, "step": 6920}, {"loss": 0.9803, "grad_norm": 1.0297378301620483, "learning_rate": 0.0002, "epoch": 5.01628664495114, "step": 6930}, {"loss": 1.0447, "grad_norm": 1.1705161333084106, "learning_rate": 0.0002, "epoch": 5.023525153818313, "step": 6940}, {"loss": 1.0113, "grad_norm": 0.8293961882591248, "learning_rate": 0.0002, "epoch": 5.030763662685486, "step": 6950}, {"loss": 0.9203, "grad_norm": 1.0422210693359375, "learning_rate": 0.0002, "epoch": 5.03800217155266, "step": 6960}, {"loss": 1.0553, "grad_norm": 1.116104245185852, "learning_rate": 0.0002, "epoch": 5.045240680419833, "step": 6970}, {"loss": 0.9011, "grad_norm": 1.5118416547775269, "learning_rate": 0.0002, "epoch": 5.0524791892870065, "step": 6980}, {"loss": 0.9969, "grad_norm": 0.8383979797363281, "learning_rate": 0.0002, "epoch": 5.05971769815418, "step": 6990}, {"loss": 0.9659, "grad_norm": 1.3378649950027466, "learning_rate": 0.0002, "epoch": 5.066956207021353, "step": 7000}, {"loss": 1.0212, "grad_norm": 1.1840510368347168, "learning_rate": 0.0002, "epoch": 5.0741947158885266, "step": 7010}, {"loss": 0.9939, "grad_norm": 1.2354751825332642, "learning_rate": 0.0002, "epoch": 5.0814332247557, "step": 7020}, {"loss": 0.9831, "grad_norm": 1.3830451965332031, "learning_rate": 0.0002, "epoch": 5.088671733622873, "step": 7030}, {"loss": 1.1827, "grad_norm": 0.8101674318313599, "learning_rate": 0.0002, "epoch": 5.095910242490047, "step": 7040}, {"loss": 0.9255, "grad_norm": 0.897982656955719, "learning_rate": 0.0002, "epoch": 5.10314875135722, "step": 7050}, {"loss": 0.8784, "grad_norm": 1.2049678564071655, "learning_rate": 0.0002, "epoch": 5.110387260224393, "step": 7060}, {"loss": 1.0182, "grad_norm": 1.5912116765975952, "learning_rate": 0.0002, "epoch": 5.117625769091567, "step": 7070}, {"loss": 1.0909, "grad_norm": 0.9261530041694641, "learning_rate": 0.0002, "epoch": 5.12486427795874, "step": 7080}, {"loss": 0.9603, "grad_norm": 1.1454812288284302, "learning_rate": 0.0002, "epoch": 5.1321027868259135, "step": 7090}, {"loss": 0.9149, "grad_norm": 1.0049978494644165, "learning_rate": 0.0002, "epoch": 5.139341295693087, "step": 7100}, {"loss": 0.9463, "grad_norm": 1.4513251781463623, "learning_rate": 0.0002, "epoch": 5.14657980456026, "step": 7110}, {"loss": 0.8995, "grad_norm": 0.9800849556922913, "learning_rate": 0.0002, "epoch": 5.153818313427434, "step": 7120}, {"loss": 0.9835, "grad_norm": 0.9698708653450012, "learning_rate": 0.0002, "epoch": 5.161056822294607, "step": 7130}, {"loss": 0.9672, "grad_norm": 1.1126646995544434, "learning_rate": 0.0002, "epoch": 5.16829533116178, "step": 7140}, {"loss": 0.9384, "grad_norm": 0.9248330593109131, "learning_rate": 0.0002, "epoch": 5.175533840028954, "step": 7150}, {"loss": 0.826, "grad_norm": 0.7967255711555481, "learning_rate": 0.0002, "epoch": 5.182772348896127, "step": 7160}, {"loss": 1.0078, "grad_norm": 0.9933333992958069, "learning_rate": 0.0002, "epoch": 5.1900108577633, "step": 7170}, {"loss": 1.0276, "grad_norm": 1.0080649852752686, "learning_rate": 0.0002, "epoch": 5.197249366630474, "step": 7180}, {"loss": 1.0201, "grad_norm": 1.3954921960830688, "learning_rate": 0.0002, "epoch": 5.204487875497647, "step": 7190}, {"loss": 1.0863, "grad_norm": 1.2386271953582764, "learning_rate": 0.0002, "epoch": 5.2117263843648205, "step": 7200}, {"loss": 0.8863, "grad_norm": 1.2379488945007324, "learning_rate": 0.0002, "epoch": 5.218964893231994, "step": 7210}, {"loss": 1.0518, "grad_norm": 0.9882503747940063, "learning_rate": 0.0002, "epoch": 5.226203402099167, "step": 7220}, {"loss": 0.9834, "grad_norm": 1.1728729009628296, "learning_rate": 0.0002, "epoch": 5.233441910966341, "step": 7230}, {"loss": 0.9269, "grad_norm": 0.9849673509597778, "learning_rate": 0.0002, "epoch": 5.240680419833514, "step": 7240}, {"loss": 0.9935, "grad_norm": 1.177639365196228, "learning_rate": 0.0002, "epoch": 5.247918928700687, "step": 7250}, {"loss": 1.0639, "grad_norm": 1.2395055294036865, "learning_rate": 0.0002, "epoch": 5.255157437567861, "step": 7260}, {"loss": 1.0138, "grad_norm": 1.3999171257019043, "learning_rate": 0.0002, "epoch": 5.262395946435034, "step": 7270}, {"loss": 0.9745, "grad_norm": 0.7698732018470764, "learning_rate": 0.0002, "epoch": 5.269634455302207, "step": 7280}, {"loss": 1.0389, "grad_norm": 0.9167453646659851, "learning_rate": 0.0002, "epoch": 5.276872964169381, "step": 7290}, {"loss": 0.9858, "grad_norm": 1.113830804824829, "learning_rate": 0.0002, "epoch": 5.284111473036554, "step": 7300}, {"loss": 0.9577, "grad_norm": 0.9644396901130676, "learning_rate": 0.0002, "epoch": 5.2913499819037275, "step": 7310}, {"loss": 1.0556, "grad_norm": 1.462435007095337, "learning_rate": 0.0002, "epoch": 5.298588490770901, "step": 7320}, {"loss": 0.871, "grad_norm": 0.9406287670135498, "learning_rate": 0.0002, "epoch": 5.305826999638074, "step": 7330}, {"loss": 1.0022, "grad_norm": 0.9698247909545898, "learning_rate": 0.0002, "epoch": 5.313065508505248, "step": 7340}, {"loss": 0.915, "grad_norm": 1.12003755569458, "learning_rate": 0.0002, "epoch": 5.320304017372421, "step": 7350}, {"loss": 0.9838, "grad_norm": 1.598681926727295, "learning_rate": 0.0002, "epoch": 5.327542526239594, "step": 7360}, {"loss": 1.0, "grad_norm": 1.0450010299682617, "learning_rate": 0.0002, "epoch": 5.334781035106768, "step": 7370}, {"loss": 0.9983, "grad_norm": 0.8680008053779602, "learning_rate": 0.0002, "epoch": 5.342019543973941, "step": 7380}, {"loss": 0.9851, "grad_norm": 1.0115476846694946, "learning_rate": 0.0002, "epoch": 5.349258052841114, "step": 7390}, {"loss": 1.0702, "grad_norm": 0.9589748382568359, "learning_rate": 0.0002, "epoch": 5.356496561708288, "step": 7400}, {"loss": 0.9366, "grad_norm": 0.6729998588562012, "learning_rate": 0.0002, "epoch": 5.363735070575461, "step": 7410}, {"loss": 1.0126, "grad_norm": 0.9246699213981628, "learning_rate": 0.0002, "epoch": 5.3709735794426345, "step": 7420}, {"loss": 0.9815, "grad_norm": 1.1266791820526123, "learning_rate": 0.0002, "epoch": 5.378212088309808, "step": 7430}, {"loss": 1.1166, "grad_norm": 1.8056942224502563, "learning_rate": 0.0002, "epoch": 5.385450597176981, "step": 7440}, {"loss": 0.9604, "grad_norm": 0.9802932739257812, "learning_rate": 0.0002, "epoch": 5.392689106044155, "step": 7450}, {"loss": 0.9656, "grad_norm": 1.0504707098007202, "learning_rate": 0.0002, "epoch": 5.399927614911328, "step": 7460}, {"loss": 1.0132, "grad_norm": 1.1915022134780884, "learning_rate": 0.0002, "epoch": 5.407166123778501, "step": 7470}, {"loss": 1.0041, "grad_norm": 1.1856611967086792, "learning_rate": 0.0002, "epoch": 5.414404632645675, "step": 7480}, {"loss": 0.9747, "grad_norm": 1.292152762413025, "learning_rate": 0.0002, "epoch": 5.421643141512848, "step": 7490}, {"loss": 0.9659, "grad_norm": 1.2675740718841553, "learning_rate": 0.0002, "epoch": 5.4288816503800215, "step": 7500}, {"loss": 1.0271, "grad_norm": 1.4034695625305176, "learning_rate": 0.0002, "epoch": 5.436120159247195, "step": 7510}, {"loss": 1.0318, "grad_norm": 0.984588623046875, "learning_rate": 0.0002, "epoch": 5.443358668114368, "step": 7520}, {"loss": 1.0726, "grad_norm": 0.8419108390808105, "learning_rate": 0.0002, "epoch": 5.450597176981542, "step": 7530}, {"loss": 1.0499, "grad_norm": 1.0270143747329712, "learning_rate": 0.0002, "epoch": 5.457835685848715, "step": 7540}, {"loss": 0.9804, "grad_norm": 2.2158689498901367, "learning_rate": 0.0002, "epoch": 5.465074194715888, "step": 7550}, {"loss": 0.9856, "grad_norm": 1.0740524530410767, "learning_rate": 0.0002, "epoch": 5.472312703583062, "step": 7560}, {"loss": 1.0522, "grad_norm": 1.3804482221603394, "learning_rate": 0.0002, "epoch": 5.479551212450235, "step": 7570}, {"loss": 1.0297, "grad_norm": 0.9428979754447937, "learning_rate": 0.0002, "epoch": 5.486789721317408, "step": 7580}, {"loss": 1.0906, "grad_norm": 0.9548295736312866, "learning_rate": 0.0002, "epoch": 5.494028230184582, "step": 7590}, {"loss": 0.8853, "grad_norm": 1.0691065788269043, "learning_rate": 0.0002, "epoch": 5.501266739051755, "step": 7600}, {"loss": 1.0375, "grad_norm": 1.0987380743026733, "learning_rate": 0.0002, "epoch": 5.5085052479189285, "step": 7610}, {"loss": 1.0162, "grad_norm": 0.9483979344367981, "learning_rate": 0.0002, "epoch": 5.515743756786102, "step": 7620}, {"loss": 1.105, "grad_norm": 1.16624915599823, "learning_rate": 0.0002, "epoch": 5.522982265653275, "step": 7630}, {"loss": 0.8695, "grad_norm": 0.8563777208328247, "learning_rate": 0.0002, "epoch": 5.530220774520449, "step": 7640}, {"loss": 0.9297, "grad_norm": 1.268186092376709, "learning_rate": 0.0002, "epoch": 5.537459283387622, "step": 7650}, {"loss": 1.1152, "grad_norm": 1.0752092599868774, "learning_rate": 0.0002, "epoch": 5.544697792254795, "step": 7660}, {"loss": 0.9344, "grad_norm": 1.210389256477356, "learning_rate": 0.0002, "epoch": 5.551936301121969, "step": 7670}, {"loss": 1.0349, "grad_norm": 1.669063925743103, "learning_rate": 0.0002, "epoch": 5.559174809989142, "step": 7680}, {"loss": 0.9833, "grad_norm": 1.038020133972168, "learning_rate": 0.0002, "epoch": 5.566413318856315, "step": 7690}, {"loss": 0.8907, "grad_norm": 1.316673994064331, "learning_rate": 0.0002, "epoch": 5.573651827723489, "step": 7700}, {"loss": 0.9614, "grad_norm": 1.029935359954834, "learning_rate": 0.0002, "epoch": 5.580890336590662, "step": 7710}, {"loss": 1.0409, "grad_norm": 0.9401940703392029, "learning_rate": 0.0002, "epoch": 5.5881288454578355, "step": 7720}, {"loss": 0.9272, "grad_norm": 2.4811816215515137, "learning_rate": 0.0002, "epoch": 5.595367354325009, "step": 7730}, {"loss": 0.992, "grad_norm": 1.0329105854034424, "learning_rate": 0.0002, "epoch": 5.602605863192182, "step": 7740}, {"loss": 0.9493, "grad_norm": 1.479629635810852, "learning_rate": 0.0002, "epoch": 5.609844372059356, "step": 7750}, {"loss": 1.0727, "grad_norm": 1.9232319593429565, "learning_rate": 0.0002, "epoch": 5.617082880926529, "step": 7760}, {"loss": 1.0741, "grad_norm": 1.0055509805679321, "learning_rate": 0.0002, "epoch": 5.624321389793702, "step": 7770}, {"loss": 1.0731, "grad_norm": 1.0037437677383423, "learning_rate": 0.0002, "epoch": 5.631559898660876, "step": 7780}, {"loss": 1.0913, "grad_norm": 1.4245030879974365, "learning_rate": 0.0002, "epoch": 5.638798407528049, "step": 7790}, {"loss": 0.9711, "grad_norm": 1.080687403678894, "learning_rate": 0.0002, "epoch": 5.646036916395222, "step": 7800}, {"loss": 1.0276, "grad_norm": 1.354953408241272, "learning_rate": 0.0002, "epoch": 5.653275425262396, "step": 7810}, {"loss": 1.0534, "grad_norm": 0.8966761231422424, "learning_rate": 0.0002, "epoch": 5.660513934129569, "step": 7820}, {"loss": 1.0662, "grad_norm": 1.0675480365753174, "learning_rate": 0.0002, "epoch": 5.6677524429967425, "step": 7830}, {"loss": 1.1077, "grad_norm": 1.2104216814041138, "learning_rate": 0.0002, "epoch": 5.674990951863916, "step": 7840}, {"loss": 0.9627, "grad_norm": 1.105790376663208, "learning_rate": 0.0002, "epoch": 5.682229460731089, "step": 7850}, {"loss": 1.0483, "grad_norm": 1.0915391445159912, "learning_rate": 0.0002, "epoch": 5.689467969598263, "step": 7860}, {"loss": 1.0291, "grad_norm": 0.8957812786102295, "learning_rate": 0.0002, "epoch": 5.696706478465436, "step": 7870}, {"loss": 0.9785, "grad_norm": 1.9189311265945435, "learning_rate": 0.0002, "epoch": 5.703944987332609, "step": 7880}, {"loss": 1.0076, "grad_norm": 1.0867321491241455, "learning_rate": 0.0002, "epoch": 5.711183496199783, "step": 7890}, {"loss": 1.0236, "grad_norm": 1.0233147144317627, "learning_rate": 0.0002, "epoch": 5.718422005066956, "step": 7900}, {"loss": 0.9872, "grad_norm": 1.16460382938385, "learning_rate": 0.0002, "epoch": 5.7256605139341294, "step": 7910}, {"loss": 1.0762, "grad_norm": 1.1098358631134033, "learning_rate": 0.0002, "epoch": 5.732899022801303, "step": 7920}, {"loss": 0.9937, "grad_norm": 0.8555701375007629, "learning_rate": 0.0002, "epoch": 5.740137531668476, "step": 7930}, {"loss": 1.0081, "grad_norm": 0.9885705709457397, "learning_rate": 0.0002, "epoch": 5.7473760405356495, "step": 7940}, {"loss": 0.9909, "grad_norm": 0.9184203147888184, "learning_rate": 0.0002, "epoch": 5.754614549402823, "step": 7950}, {"loss": 1.0767, "grad_norm": 0.9653698205947876, "learning_rate": 0.0002, "epoch": 5.761853058269996, "step": 7960}, {"loss": 0.9317, "grad_norm": 1.0014251470565796, "learning_rate": 0.0002, "epoch": 5.76909156713717, "step": 7970}, {"loss": 1.0271, "grad_norm": 1.004701018333435, "learning_rate": 0.0002, "epoch": 5.776330076004343, "step": 7980}, {"loss": 1.0397, "grad_norm": 0.950577974319458, "learning_rate": 0.0002, "epoch": 5.783568584871516, "step": 7990}, {"loss": 0.9725, "grad_norm": 1.2986834049224854, "learning_rate": 0.0002, "epoch": 5.79080709373869, "step": 8000}, {"loss": 1.039, "grad_norm": 1.3353424072265625, "learning_rate": 0.0002, "epoch": 5.798045602605863, "step": 8010}, {"loss": 1.0626, "grad_norm": 0.7650562524795532, "learning_rate": 0.0002, "epoch": 5.8052841114730365, "step": 8020}, {"loss": 1.0802, "grad_norm": 1.0156235694885254, "learning_rate": 0.0002, "epoch": 5.81252262034021, "step": 8030}, {"loss": 1.0185, "grad_norm": 1.3092900514602661, "learning_rate": 0.0002, "epoch": 5.819761129207383, "step": 8040}, {"loss": 0.9905, "grad_norm": 1.184428095817566, "learning_rate": 0.0002, "epoch": 5.826999638074557, "step": 8050}, {"loss": 1.0548, "grad_norm": 0.979401707649231, "learning_rate": 0.0002, "epoch": 5.83423814694173, "step": 8060}, {"loss": 0.9721, "grad_norm": 1.3557400703430176, "learning_rate": 0.0002, "epoch": 5.841476655808903, "step": 8070}, {"loss": 1.0235, "grad_norm": 0.8429333567619324, "learning_rate": 0.0002, "epoch": 5.848715164676077, "step": 8080}, {"loss": 0.952, "grad_norm": 1.3167692422866821, "learning_rate": 0.0002, "epoch": 5.85595367354325, "step": 8090}, {"loss": 0.9609, "grad_norm": 0.9750998020172119, "learning_rate": 0.0002, "epoch": 5.863192182410423, "step": 8100}, {"loss": 1.0789, "grad_norm": 1.1869813203811646, "learning_rate": 0.0002, "epoch": 5.870430691277597, "step": 8110}, {"loss": 1.0331, "grad_norm": 1.508615255355835, "learning_rate": 0.0002, "epoch": 5.87766920014477, "step": 8120}, {"loss": 1.0171, "grad_norm": 0.9439908266067505, "learning_rate": 0.0002, "epoch": 5.8849077090119435, "step": 8130}, {"loss": 0.9682, "grad_norm": 0.910508930683136, "learning_rate": 0.0002, "epoch": 5.892146217879117, "step": 8140}, {"loss": 1.0032, "grad_norm": 1.111501932144165, "learning_rate": 0.0002, "epoch": 5.89938472674629, "step": 8150}, {"loss": 1.0266, "grad_norm": 0.726554274559021, "learning_rate": 0.0002, "epoch": 5.906623235613464, "step": 8160}, {"loss": 1.0681, "grad_norm": 1.1084556579589844, "learning_rate": 0.0002, "epoch": 5.913861744480637, "step": 8170}, {"loss": 0.969, "grad_norm": 0.9695167541503906, "learning_rate": 0.0002, "epoch": 5.92110025334781, "step": 8180}, {"loss": 0.9858, "grad_norm": 1.1169592142105103, "learning_rate": 0.0002, "epoch": 5.928338762214984, "step": 8190}, {"loss": 1.0924, "grad_norm": 1.5116780996322632, "learning_rate": 0.0002, "epoch": 5.935577271082157, "step": 8200}, {"loss": 0.878, "grad_norm": 1.0073388814926147, "learning_rate": 0.0002, "epoch": 5.94281577994933, "step": 8210}, {"loss": 1.0462, "grad_norm": 0.9323263168334961, "learning_rate": 0.0002, "epoch": 5.950054288816504, "step": 8220}, {"loss": 1.0291, "grad_norm": 0.9422887563705444, "learning_rate": 0.0002, "epoch": 5.957292797683677, "step": 8230}, {"loss": 0.953, "grad_norm": 0.9691047668457031, "learning_rate": 0.0002, "epoch": 5.9645313065508505, "step": 8240}, {"loss": 0.9842, "grad_norm": 0.9650622606277466, "learning_rate": 0.0002, "epoch": 5.971769815418024, "step": 8250}, {"loss": 0.907, "grad_norm": 1.077958345413208, "learning_rate": 0.0002, "epoch": 5.979008324285197, "step": 8260}, {"loss": 0.9162, "grad_norm": 0.8946306109428406, "learning_rate": 0.0002, "epoch": 5.986246833152371, "step": 8270}, {"loss": 1.0439, "grad_norm": 1.34098219871521, "learning_rate": 0.0002, "epoch": 5.993485342019544, "step": 8280}, {"eval_loss": 1.4714229106903076, "eval_runtime": 26.301, "eval_samples_per_second": 16.577, "eval_steps_per_second": 2.091, "epoch": 6.0, "step": 8289}, {"loss": 1.1403, "grad_norm": 0.9737564325332642, "learning_rate": 0.0002, "epoch": 6.000723850886717, "step": 8290}, {"loss": 0.8875, "grad_norm": 1.2205945253372192, "learning_rate": 0.0002, "epoch": 6.007962359753891, "step": 8300}, {"loss": 0.8623, "grad_norm": 1.3529434204101562, "learning_rate": 0.0002, "epoch": 6.015200868621064, "step": 8310}, {"loss": 0.9427, "grad_norm": 1.2300174236297607, "learning_rate": 0.0002, "epoch": 6.022439377488237, "step": 8320}, {"loss": 0.9322, "grad_norm": 0.9248194098472595, "learning_rate": 0.0002, "epoch": 6.029677886355411, "step": 8330}, {"loss": 0.9302, "grad_norm": 1.1140035390853882, "learning_rate": 0.0002, "epoch": 6.036916395222584, "step": 8340}, {"loss": 0.8255, "grad_norm": 1.2097352743148804, "learning_rate": 0.0002, "epoch": 6.0441549040897575, "step": 8350}, {"loss": 0.8792, "grad_norm": 0.9472483396530151, "learning_rate": 0.0002, "epoch": 6.051393412956931, "step": 8360}, {"loss": 0.8865, "grad_norm": 1.0195368528366089, "learning_rate": 0.0002, "epoch": 6.058631921824104, "step": 8370}, {"loss": 0.8858, "grad_norm": 1.182735562324524, "learning_rate": 0.0002, "epoch": 6.065870430691278, "step": 8380}, {"loss": 0.9455, "grad_norm": 1.1042858362197876, "learning_rate": 0.0002, "epoch": 6.073108939558451, "step": 8390}, {"loss": 0.9723, "grad_norm": 0.8606401085853577, "learning_rate": 0.0002, "epoch": 6.080347448425624, "step": 8400}, {"loss": 0.8436, "grad_norm": 1.1015676259994507, "learning_rate": 0.0002, "epoch": 6.087585957292798, "step": 8410}, {"loss": 0.8845, "grad_norm": 1.690224289894104, "learning_rate": 0.0002, "epoch": 6.094824466159971, "step": 8420}, {"loss": 0.8484, "grad_norm": 1.1928749084472656, "learning_rate": 0.0002, "epoch": 6.1020629750271445, "step": 8430}, {"loss": 0.9546, "grad_norm": 1.0816864967346191, "learning_rate": 0.0002, "epoch": 6.109301483894318, "step": 8440}, {"loss": 0.8286, "grad_norm": 1.1638226509094238, "learning_rate": 0.0002, "epoch": 6.116539992761491, "step": 8450}, {"loss": 0.8749, "grad_norm": 1.3782968521118164, "learning_rate": 0.0002, "epoch": 6.1237785016286646, "step": 8460}, {"loss": 0.7956, "grad_norm": 1.2030094861984253, "learning_rate": 0.0002, "epoch": 6.131017010495838, "step": 8470}, {"loss": 0.8393, "grad_norm": 1.3227659463882446, "learning_rate": 0.0002, "epoch": 6.138255519363011, "step": 8480}, {"loss": 0.9175, "grad_norm": 1.104384422302246, "learning_rate": 0.0002, "epoch": 6.145494028230185, "step": 8490}, {"loss": 0.861, "grad_norm": 1.518805980682373, "learning_rate": 0.0002, "epoch": 6.152732537097358, "step": 8500}, {"loss": 0.9169, "grad_norm": 1.2029093503952026, "learning_rate": 0.0002, "epoch": 6.159971045964531, "step": 8510}, {"loss": 0.8701, "grad_norm": 1.2991217374801636, "learning_rate": 0.0002, "epoch": 6.167209554831705, "step": 8520}, {"loss": 0.9748, "grad_norm": 1.7002956867218018, "learning_rate": 0.0002, "epoch": 6.174448063698878, "step": 8530}, {"loss": 0.8881, "grad_norm": 1.6653581857681274, "learning_rate": 0.0002, "epoch": 6.1816865725660515, "step": 8540}, {"loss": 0.817, "grad_norm": 1.0493303537368774, "learning_rate": 0.0002, "epoch": 6.188925081433225, "step": 8550}, {"loss": 0.8726, "grad_norm": 1.539345622062683, "learning_rate": 0.0002, "epoch": 6.196163590300398, "step": 8560}, {"loss": 0.9452, "grad_norm": 1.2757070064544678, "learning_rate": 0.0002, "epoch": 6.203402099167572, "step": 8570}, {"loss": 0.8773, "grad_norm": 1.2416890859603882, "learning_rate": 0.0002, "epoch": 6.210640608034745, "step": 8580}, {"loss": 0.815, "grad_norm": 1.617621898651123, "learning_rate": 0.0002, "epoch": 6.217879116901918, "step": 8590}, {"loss": 0.9137, "grad_norm": 1.058962106704712, "learning_rate": 0.0002, "epoch": 6.225117625769092, "step": 8600}, {"loss": 0.8164, "grad_norm": 1.1489088535308838, "learning_rate": 0.0002, "epoch": 6.232356134636265, "step": 8610}, {"loss": 0.9476, "grad_norm": 0.9391577243804932, "learning_rate": 0.0002, "epoch": 6.239594643503438, "step": 8620}, {"loss": 0.932, "grad_norm": 1.363706111907959, "learning_rate": 0.0002, "epoch": 6.246833152370612, "step": 8630}, {"loss": 0.8917, "grad_norm": 0.779502809047699, "learning_rate": 0.0002, "epoch": 6.254071661237785, "step": 8640}, {"loss": 0.9196, "grad_norm": 2.000821590423584, "learning_rate": 0.0002, "epoch": 6.2613101701049585, "step": 8650}, {"loss": 0.9794, "grad_norm": 1.1521023511886597, "learning_rate": 0.0002, "epoch": 6.268548678972132, "step": 8660}, {"loss": 0.9147, "grad_norm": 1.3734570741653442, "learning_rate": 0.0002, "epoch": 6.275787187839305, "step": 8670}, {"loss": 0.795, "grad_norm": 0.9550670385360718, "learning_rate": 0.0002, "epoch": 6.283025696706479, "step": 8680}, {"loss": 0.9049, "grad_norm": 0.8937032222747803, "learning_rate": 0.0002, "epoch": 6.290264205573652, "step": 8690}, {"loss": 0.8526, "grad_norm": 1.3352779150009155, "learning_rate": 0.0002, "epoch": 6.297502714440825, "step": 8700}, {"loss": 0.8572, "grad_norm": 1.3057222366333008, "learning_rate": 0.0002, "epoch": 6.304741223307999, "step": 8710}, {"loss": 0.8825, "grad_norm": 0.9078314304351807, "learning_rate": 0.0002, "epoch": 6.311979732175172, "step": 8720}, {"loss": 0.8666, "grad_norm": 1.6663457155227661, "learning_rate": 0.0002, "epoch": 6.319218241042345, "step": 8730}, {"loss": 0.927, "grad_norm": 1.2043739557266235, "learning_rate": 0.0002, "epoch": 6.326456749909519, "step": 8740}, {"loss": 0.8014, "grad_norm": 0.9165967702865601, "learning_rate": 0.0002, "epoch": 6.333695258776692, "step": 8750}, {"loss": 0.9761, "grad_norm": 1.016452670097351, "learning_rate": 0.0002, "epoch": 6.3409337676438655, "step": 8760}, {"loss": 1.022, "grad_norm": 1.2209261655807495, "learning_rate": 0.0002, "epoch": 6.348172276511039, "step": 8770}, {"loss": 0.8012, "grad_norm": 1.3380663394927979, "learning_rate": 0.0002, "epoch": 6.355410785378212, "step": 8780}, {"loss": 0.9553, "grad_norm": 2.3311562538146973, "learning_rate": 0.0002, "epoch": 6.362649294245386, "step": 8790}, {"loss": 0.8676, "grad_norm": 1.0330604314804077, "learning_rate": 0.0002, "epoch": 6.369887803112559, "step": 8800}, {"loss": 0.98, "grad_norm": 0.9655511975288391, "learning_rate": 0.0002, "epoch": 6.377126311979732, "step": 8810}, {"loss": 1.0324, "grad_norm": 1.1065765619277954, "learning_rate": 0.0002, "epoch": 6.384364820846906, "step": 8820}, {"loss": 1.0078, "grad_norm": 1.2631285190582275, "learning_rate": 0.0002, "epoch": 6.391603329714079, "step": 8830}, {"loss": 0.8989, "grad_norm": 0.92459636926651, "learning_rate": 0.0002, "epoch": 6.398841838581252, "step": 8840}, {"loss": 0.8536, "grad_norm": 0.9982633590698242, "learning_rate": 0.0002, "epoch": 6.406080347448426, "step": 8850}, {"loss": 0.8949, "grad_norm": 1.0746768712997437, "learning_rate": 0.0002, "epoch": 6.413318856315599, "step": 8860}, {"loss": 0.8547, "grad_norm": 1.3024073839187622, "learning_rate": 0.0002, "epoch": 6.4205573651827725, "step": 8870}, {"loss": 0.9618, "grad_norm": 1.2764527797698975, "learning_rate": 0.0002, "epoch": 6.427795874049946, "step": 8880}, {"loss": 0.8905, "grad_norm": 0.8318809270858765, "learning_rate": 0.0002, "epoch": 6.435034382917119, "step": 8890}, {"loss": 0.917, "grad_norm": 1.7350783348083496, "learning_rate": 0.0002, "epoch": 6.442272891784293, "step": 8900}, {"loss": 1.0229, "grad_norm": 1.3430488109588623, "learning_rate": 0.0002, "epoch": 6.449511400651466, "step": 8910}, {"loss": 0.9678, "grad_norm": 1.5907495021820068, "learning_rate": 0.0002, "epoch": 6.456749909518639, "step": 8920}, {"loss": 0.9639, "grad_norm": 1.8579202890396118, "learning_rate": 0.0002, "epoch": 6.463988418385813, "step": 8930}, {"loss": 0.9302, "grad_norm": 1.2233413457870483, "learning_rate": 0.0002, "epoch": 6.471226927252986, "step": 8940}, {"loss": 0.9169, "grad_norm": 1.009103775024414, "learning_rate": 0.0002, "epoch": 6.4784654361201595, "step": 8950}, {"loss": 0.8969, "grad_norm": 1.1265181303024292, "learning_rate": 0.0002, "epoch": 6.485703944987333, "step": 8960}, {"loss": 0.8374, "grad_norm": 1.1733338832855225, "learning_rate": 0.0002, "epoch": 6.492942453854506, "step": 8970}, {"loss": 0.8764, "grad_norm": 1.0444518327713013, "learning_rate": 0.0002, "epoch": 6.50018096272168, "step": 8980}, {"loss": 0.9582, "grad_norm": 1.2296479940414429, "learning_rate": 0.0002, "epoch": 6.507419471588853, "step": 8990}, {"loss": 0.8557, "grad_norm": 1.370417833328247, "learning_rate": 0.0002, "epoch": 6.514657980456026, "step": 9000}, {"loss": 0.9787, "grad_norm": 1.4787620306015015, "learning_rate": 0.0002, "epoch": 6.5218964893232, "step": 9010}, {"loss": 0.967, "grad_norm": 0.8550514578819275, "learning_rate": 0.0002, "epoch": 6.529134998190373, "step": 9020}, {"loss": 0.9755, "grad_norm": 1.2327991724014282, "learning_rate": 0.0002, "epoch": 6.536373507057546, "step": 9030}, {"loss": 0.9248, "grad_norm": 1.0915621519088745, "learning_rate": 0.0002, "epoch": 6.54361201592472, "step": 9040}, {"loss": 1.0024, "grad_norm": 1.7243309020996094, "learning_rate": 0.0002, "epoch": 6.550850524791893, "step": 9050}, {"loss": 1.0123, "grad_norm": 0.954359769821167, "learning_rate": 0.0002, "epoch": 6.5580890336590665, "step": 9060}, {"loss": 0.8261, "grad_norm": 1.066051959991455, "learning_rate": 0.0002, "epoch": 6.56532754252624, "step": 9070}, {"loss": 0.944, "grad_norm": 1.200271487236023, "learning_rate": 0.0002, "epoch": 6.572566051393413, "step": 9080}, {"loss": 0.9788, "grad_norm": 1.4331457614898682, "learning_rate": 0.0002, "epoch": 6.579804560260587, "step": 9090}, {"loss": 1.0216, "grad_norm": 1.0892444849014282, "learning_rate": 0.0002, "epoch": 6.58704306912776, "step": 9100}, {"loss": 0.8557, "grad_norm": 1.849726915359497, "learning_rate": 0.0002, "epoch": 6.594281577994933, "step": 9110}, {"loss": 0.9495, "grad_norm": 1.1228708028793335, "learning_rate": 0.0002, "epoch": 6.601520086862107, "step": 9120}, {"loss": 1.0169, "grad_norm": 1.0928595066070557, "learning_rate": 0.0002, "epoch": 6.60875859572928, "step": 9130}, {"loss": 0.9342, "grad_norm": 1.2138155698776245, "learning_rate": 0.0002, "epoch": 6.615997104596453, "step": 9140}, {"loss": 0.8715, "grad_norm": 1.5155235528945923, "learning_rate": 0.0002, "epoch": 6.623235613463627, "step": 9150}, {"loss": 0.9806, "grad_norm": 1.3194212913513184, "learning_rate": 0.0002, "epoch": 6.6304741223308, "step": 9160}, {"loss": 0.8958, "grad_norm": 1.045623779296875, "learning_rate": 0.0002, "epoch": 6.6377126311979735, "step": 9170}, {"loss": 0.8698, "grad_norm": 0.9647570252418518, "learning_rate": 0.0002, "epoch": 6.644951140065147, "step": 9180}, {"loss": 0.8829, "grad_norm": 1.0818220376968384, "learning_rate": 0.0002, "epoch": 6.65218964893232, "step": 9190}, {"loss": 0.9745, "grad_norm": 1.2792822122573853, "learning_rate": 0.0002, "epoch": 6.659428157799494, "step": 9200}, {"loss": 0.8854, "grad_norm": 1.2764191627502441, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 9210}, {"loss": 0.9709, "grad_norm": 1.0552066564559937, "learning_rate": 0.0002, "epoch": 6.67390517553384, "step": 9220}, {"loss": 0.8855, "grad_norm": 1.082476019859314, "learning_rate": 0.0002, "epoch": 6.681143684401014, "step": 9230}, {"loss": 0.9779, "grad_norm": 1.3313323259353638, "learning_rate": 0.0002, "epoch": 6.688382193268187, "step": 9240}, {"loss": 1.005, "grad_norm": 1.130048394203186, "learning_rate": 0.0002, "epoch": 6.69562070213536, "step": 9250}, {"loss": 0.9969, "grad_norm": 1.1997296810150146, "learning_rate": 0.0002, "epoch": 6.702859211002534, "step": 9260}, {"loss": 0.8691, "grad_norm": 1.0591834783554077, "learning_rate": 0.0002, "epoch": 6.710097719869707, "step": 9270}, {"loss": 0.9603, "grad_norm": 1.2722901105880737, "learning_rate": 0.0002, "epoch": 6.7173362287368805, "step": 9280}, {"loss": 0.9227, "grad_norm": 1.1150950193405151, "learning_rate": 0.0002, "epoch": 6.724574737604054, "step": 9290}, {"loss": 0.95, "grad_norm": 1.1575992107391357, "learning_rate": 0.0002, "epoch": 6.731813246471227, "step": 9300}, {"loss": 0.9822, "grad_norm": 0.9371691346168518, "learning_rate": 0.0002, "epoch": 6.739051755338401, "step": 9310}, {"loss": 0.9773, "grad_norm": 1.4924226999282837, "learning_rate": 0.0002, "epoch": 6.746290264205574, "step": 9320}, {"loss": 0.969, "grad_norm": 1.1524218320846558, "learning_rate": 0.0002, "epoch": 6.753528773072747, "step": 9330}, {"loss": 0.9271, "grad_norm": 0.9500471949577332, "learning_rate": 0.0002, "epoch": 6.760767281939921, "step": 9340}, {"loss": 0.9029, "grad_norm": 1.2062290906906128, "learning_rate": 0.0002, "epoch": 6.768005790807094, "step": 9350}, {"loss": 0.9121, "grad_norm": 1.212631106376648, "learning_rate": 0.0002, "epoch": 6.7752442996742674, "step": 9360}, {"loss": 0.8486, "grad_norm": 1.9135472774505615, "learning_rate": 0.0002, "epoch": 6.782482808541441, "step": 9370}, {"loss": 0.9332, "grad_norm": 0.9682775139808655, "learning_rate": 0.0002, "epoch": 6.789721317408614, "step": 9380}, {"loss": 0.8548, "grad_norm": 1.1405237913131714, "learning_rate": 0.0002, "epoch": 6.7969598262757875, "step": 9390}, {"loss": 0.8922, "grad_norm": 1.6855751276016235, "learning_rate": 0.0002, "epoch": 6.804198335142961, "step": 9400}, {"loss": 0.9417, "grad_norm": 1.6590169668197632, "learning_rate": 0.0002, "epoch": 6.811436844010134, "step": 9410}, {"loss": 0.868, "grad_norm": 1.8795170783996582, "learning_rate": 0.0002, "epoch": 6.818675352877308, "step": 9420}, {"loss": 0.9142, "grad_norm": 1.1087183952331543, "learning_rate": 0.0002, "epoch": 6.825913861744481, "step": 9430}, {"loss": 1.1427, "grad_norm": 1.4178446531295776, "learning_rate": 0.0002, "epoch": 6.833152370611654, "step": 9440}, {"loss": 0.8325, "grad_norm": 1.0792350769042969, "learning_rate": 0.0002, "epoch": 6.840390879478828, "step": 9450}, {"loss": 1.0078, "grad_norm": 1.2159196138381958, "learning_rate": 0.0002, "epoch": 6.847629388346001, "step": 9460}, {"loss": 0.9536, "grad_norm": 0.9998821020126343, "learning_rate": 0.0002, "epoch": 6.8548678972131745, "step": 9470}, {"loss": 0.9277, "grad_norm": 0.7940687537193298, "learning_rate": 0.0002, "epoch": 6.862106406080348, "step": 9480}, {"loss": 0.8612, "grad_norm": 0.9572826027870178, "learning_rate": 0.0002, "epoch": 6.869344914947521, "step": 9490}, {"loss": 0.9611, "grad_norm": 1.1086537837982178, "learning_rate": 0.0002, "epoch": 6.876583423814694, "step": 9500}, {"loss": 0.9276, "grad_norm": 1.1934887170791626, "learning_rate": 0.0002, "epoch": 6.883821932681867, "step": 9510}, {"loss": 0.8416, "grad_norm": 1.207324504852295, "learning_rate": 0.0002, "epoch": 6.89106044154904, "step": 9520}, {"loss": 0.9378, "grad_norm": 1.1303677558898926, "learning_rate": 0.0002, "epoch": 6.898298950416214, "step": 9530}, {"loss": 0.9599, "grad_norm": 1.4958926439285278, "learning_rate": 0.0002, "epoch": 6.905537459283387, "step": 9540}, {"loss": 0.9365, "grad_norm": 1.2141553163528442, "learning_rate": 0.0002, "epoch": 6.9127759681505605, "step": 9550}, {"loss": 1.0291, "grad_norm": 1.6544346809387207, "learning_rate": 0.0002, "epoch": 6.920014477017734, "step": 9560}, {"loss": 0.8439, "grad_norm": 1.0540320873260498, "learning_rate": 0.0002, "epoch": 6.927252985884907, "step": 9570}, {"loss": 0.9831, "grad_norm": 1.3095581531524658, "learning_rate": 0.0002, "epoch": 6.934491494752081, "step": 9580}, {"loss": 0.8694, "grad_norm": 1.4509341716766357, "learning_rate": 0.0002, "epoch": 6.941730003619254, "step": 9590}, {"loss": 0.983, "grad_norm": 1.1091740131378174, "learning_rate": 0.0002, "epoch": 6.948968512486427, "step": 9600}, {"loss": 0.9126, "grad_norm": 1.102929949760437, "learning_rate": 0.0002, "epoch": 6.956207021353601, "step": 9610}, {"loss": 0.9622, "grad_norm": 1.1377743482589722, "learning_rate": 0.0002, "epoch": 6.963445530220774, "step": 9620}, {"loss": 0.9045, "grad_norm": 1.2070361375808716, "learning_rate": 0.0002, "epoch": 6.970684039087947, "step": 9630}, {"loss": 0.9714, "grad_norm": 1.30153489112854, "learning_rate": 0.0002, "epoch": 6.977922547955121, "step": 9640}, {"loss": 0.9555, "grad_norm": 1.4641543626785278, "learning_rate": 0.0002, "epoch": 6.985161056822294, "step": 9650}, {"loss": 0.9177, "grad_norm": 1.0497819185256958, "learning_rate": 0.0002, "epoch": 6.9923995656894675, "step": 9660}, {"loss": 0.8369, "grad_norm": 1.2500354051589966, "learning_rate": 0.0002, "epoch": 6.999638074556641, "step": 9670}]} +{"epoch": 7.997104596453131, "step": 11048, "epoch_duration": 1212.7061741352081, "total_accumulated_duration": 10323.10546040535, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15079.2998046875}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-4144", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 4.7061, "grad_norm": 1.2523442506790161, "learning_rate": 0.0002, "epoch": 0.007238508867173362, "step": 10}, {"loss": 3.3493, "grad_norm": 1.8887330293655396, "learning_rate": 0.0002, "epoch": 0.014477017734346724, "step": 20}, {"loss": 2.7585, "grad_norm": 0.9668035507202148, "learning_rate": 0.0002, "epoch": 0.021715526601520086, "step": 30}, {"loss": 2.3699, "grad_norm": 2.9167306423187256, "learning_rate": 0.0002, "epoch": 0.028954035468693448, "step": 40}, {"loss": 2.2679, "grad_norm": 2.649867296218872, "learning_rate": 0.0002, "epoch": 0.036192544335866814, "step": 50}, {"loss": 2.2202, "grad_norm": 1.5120655298233032, "learning_rate": 0.0002, "epoch": 0.04343105320304017, "step": 60}, {"loss": 2.2026, "grad_norm": 0.7879868149757385, "learning_rate": 0.0002, "epoch": 0.05066956207021354, "step": 70}, {"loss": 1.9447, "grad_norm": 0.7616953253746033, "learning_rate": 0.0002, "epoch": 0.057908070937386896, "step": 80}, {"loss": 2.0112, "grad_norm": 1.8809149265289307, "learning_rate": 0.0002, "epoch": 0.06514657980456026, "step": 90}, {"loss": 1.8337, "grad_norm": 0.9294016361236572, "learning_rate": 0.0002, "epoch": 0.07238508867173363, "step": 100}, {"loss": 1.8419, "grad_norm": 0.7145281434059143, "learning_rate": 0.0002, "epoch": 0.07962359753890698, "step": 110}, {"loss": 2.0036, "grad_norm": 0.7564446330070496, "learning_rate": 0.0002, "epoch": 0.08686210640608034, "step": 120}, {"loss": 1.9306, "grad_norm": 1.1681925058364868, "learning_rate": 0.0002, "epoch": 0.09410061527325371, "step": 130}, {"loss": 1.7875, "grad_norm": 0.6708641648292542, "learning_rate": 0.0002, "epoch": 0.10133912414042708, "step": 140}, {"loss": 1.786, "grad_norm": 0.7625647783279419, "learning_rate": 0.0002, "epoch": 0.10857763300760044, "step": 150}, {"loss": 1.6687, "grad_norm": 0.8463464975357056, "learning_rate": 0.0002, "epoch": 0.11581614187477379, "step": 160}, {"loss": 1.6214, "grad_norm": 0.7502335906028748, "learning_rate": 0.0002, "epoch": 0.12305465074194716, "step": 170}, {"loss": 1.7433, "grad_norm": 0.6929958462715149, "learning_rate": 0.0002, "epoch": 0.13029315960912052, "step": 180}, {"loss": 1.6009, "grad_norm": 0.6798707842826843, "learning_rate": 0.0002, "epoch": 0.1375316684762939, "step": 190}, {"loss": 1.6208, "grad_norm": 0.7566508650779724, "learning_rate": 0.0002, "epoch": 0.14477017734346725, "step": 200}, {"loss": 1.5823, "grad_norm": 0.7196869850158691, "learning_rate": 0.0002, "epoch": 0.15200868621064062, "step": 210}, {"loss": 1.738, "grad_norm": 0.8401045799255371, "learning_rate": 0.0002, "epoch": 0.15924719507781396, "step": 220}, {"loss": 1.7574, "grad_norm": 0.8503773212432861, "learning_rate": 0.0002, "epoch": 0.16648570394498732, "step": 230}, {"loss": 1.7861, "grad_norm": 0.7183733582496643, "learning_rate": 0.0002, "epoch": 0.1737242128121607, "step": 240}, {"loss": 1.6693, "grad_norm": 0.7082605957984924, "learning_rate": 0.0002, "epoch": 0.18096272167933405, "step": 250}, {"loss": 1.619, "grad_norm": 0.9386326670646667, "learning_rate": 0.0002, "epoch": 0.18820123054650742, "step": 260}, {"loss": 1.6511, "grad_norm": 0.7332451939582825, "learning_rate": 0.0002, "epoch": 0.19543973941368079, "step": 270}, {"loss": 1.6353, "grad_norm": 0.7092869877815247, "learning_rate": 0.0002, "epoch": 0.20267824828085415, "step": 280}, {"loss": 1.5996, "grad_norm": 0.7256413698196411, "learning_rate": 0.0002, "epoch": 0.20991675714802752, "step": 290}, {"loss": 1.6754, "grad_norm": 0.6398681402206421, "learning_rate": 0.0002, "epoch": 0.21715526601520088, "step": 300}, {"loss": 1.397, "grad_norm": 0.6273287534713745, "learning_rate": 0.0002, "epoch": 0.22439377488237422, "step": 310}, {"loss": 1.5115, "grad_norm": 0.511648416519165, "learning_rate": 0.0002, "epoch": 0.23163228374954759, "step": 320}, {"loss": 1.5424, "grad_norm": 0.8677352070808411, "learning_rate": 0.0002, "epoch": 0.23887079261672095, "step": 330}, {"loss": 1.6779, "grad_norm": 0.6270743012428284, "learning_rate": 0.0002, "epoch": 0.24610930148389432, "step": 340}, {"loss": 1.626, "grad_norm": 0.7980281114578247, "learning_rate": 0.0002, "epoch": 0.2533478103510677, "step": 350}, {"loss": 1.5238, "grad_norm": 0.632486879825592, "learning_rate": 0.0002, "epoch": 0.26058631921824105, "step": 360}, {"loss": 1.5175, "grad_norm": 0.6527034640312195, "learning_rate": 0.0002, "epoch": 0.2678248280854144, "step": 370}, {"loss": 1.627, "grad_norm": 0.7672118544578552, "learning_rate": 0.0002, "epoch": 0.2750633369525878, "step": 380}, {"loss": 1.5605, "grad_norm": 0.6035117506980896, "learning_rate": 0.0002, "epoch": 0.28230184581976114, "step": 390}, {"loss": 1.4603, "grad_norm": 0.5955103039741516, "learning_rate": 0.0002, "epoch": 0.2895403546869345, "step": 400}, {"loss": 1.558, "grad_norm": 0.6015191674232483, "learning_rate": 0.0002, "epoch": 0.2967788635541079, "step": 410}, {"loss": 1.6091, "grad_norm": 0.6380982398986816, "learning_rate": 0.0002, "epoch": 0.30401737242128124, "step": 420}, {"loss": 1.5292, "grad_norm": 0.6707863211631775, "learning_rate": 0.0002, "epoch": 0.3112558812884546, "step": 430}, {"loss": 1.4426, "grad_norm": 0.7010176777839661, "learning_rate": 0.0002, "epoch": 0.3184943901556279, "step": 440}, {"loss": 1.5572, "grad_norm": 0.8263739943504333, "learning_rate": 0.0002, "epoch": 0.3257328990228013, "step": 450}, {"loss": 1.5188, "grad_norm": 0.7253276109695435, "learning_rate": 0.0002, "epoch": 0.33297140788997465, "step": 460}, {"loss": 1.584, "grad_norm": 0.5238934755325317, "learning_rate": 0.0002, "epoch": 0.340209916757148, "step": 470}, {"loss": 1.7035, "grad_norm": 0.7869495749473572, "learning_rate": 0.0002, "epoch": 0.3474484256243214, "step": 480}, {"loss": 1.5776, "grad_norm": 0.7485215663909912, "learning_rate": 0.0002, "epoch": 0.35468693449149474, "step": 490}, {"loss": 1.6274, "grad_norm": 0.5413193106651306, "learning_rate": 0.0002, "epoch": 0.3619254433586681, "step": 500}, {"loss": 1.7323, "grad_norm": 0.7615048885345459, "learning_rate": 0.0002, "epoch": 0.3691639522258415, "step": 510}, {"loss": 1.532, "grad_norm": 0.7685340046882629, "learning_rate": 0.0002, "epoch": 0.37640246109301484, "step": 520}, {"loss": 1.6312, "grad_norm": 0.6379081010818481, "learning_rate": 0.0002, "epoch": 0.3836409699601882, "step": 530}, {"loss": 1.5645, "grad_norm": 0.7946939468383789, "learning_rate": 0.0002, "epoch": 0.39087947882736157, "step": 540}, {"loss": 1.4001, "grad_norm": 0.6287278532981873, "learning_rate": 0.0002, "epoch": 0.39811798769453494, "step": 550}, {"loss": 1.5982, "grad_norm": 0.6811642646789551, "learning_rate": 0.0002, "epoch": 0.4053564965617083, "step": 560}, {"loss": 1.4953, "grad_norm": 0.671073317527771, "learning_rate": 0.0002, "epoch": 0.41259500542888167, "step": 570}, {"loss": 1.6753, "grad_norm": 0.6313900351524353, "learning_rate": 0.0002, "epoch": 0.41983351429605503, "step": 580}, {"loss": 1.546, "grad_norm": 0.5291772484779358, "learning_rate": 0.0002, "epoch": 0.4270720231632284, "step": 590}, {"loss": 1.5441, "grad_norm": 0.62503582239151, "learning_rate": 0.0002, "epoch": 0.43431053203040176, "step": 600}, {"loss": 1.6276, "grad_norm": 0.5777305364608765, "learning_rate": 0.0002, "epoch": 0.4415490408975751, "step": 610}, {"loss": 1.4758, "grad_norm": 0.7013497352600098, "learning_rate": 0.0002, "epoch": 0.44878754976474844, "step": 620}, {"loss": 1.4029, "grad_norm": 0.8044822216033936, "learning_rate": 0.0002, "epoch": 0.4560260586319218, "step": 630}, {"loss": 1.7195, "grad_norm": 0.672531247138977, "learning_rate": 0.0002, "epoch": 0.46326456749909517, "step": 640}, {"loss": 1.614, "grad_norm": 0.6233910322189331, "learning_rate": 0.0002, "epoch": 0.47050307636626854, "step": 650}, {"loss": 1.6041, "grad_norm": 0.651524543762207, "learning_rate": 0.0002, "epoch": 0.4777415852334419, "step": 660}, {"loss": 1.5842, "grad_norm": 0.7213939428329468, "learning_rate": 0.0002, "epoch": 0.48498009410061527, "step": 670}, {"loss": 1.5453, "grad_norm": 0.6541454792022705, "learning_rate": 0.0002, "epoch": 0.49221860296778863, "step": 680}, {"loss": 1.662, "grad_norm": 0.6568936109542847, "learning_rate": 0.0002, "epoch": 0.499457111834962, "step": 690}, {"loss": 1.624, "grad_norm": 0.7176415324211121, "learning_rate": 0.0002, "epoch": 0.5066956207021354, "step": 700}, {"loss": 1.6099, "grad_norm": 0.6553855538368225, "learning_rate": 0.0002, "epoch": 0.5139341295693087, "step": 710}, {"loss": 1.5508, "grad_norm": 0.5654335618019104, "learning_rate": 0.0002, "epoch": 0.5211726384364821, "step": 720}, {"loss": 1.392, "grad_norm": 0.5671001672744751, "learning_rate": 0.0002, "epoch": 0.5284111473036555, "step": 730}, {"loss": 1.388, "grad_norm": 0.7914412021636963, "learning_rate": 0.0002, "epoch": 0.5356496561708288, "step": 740}, {"loss": 1.5931, "grad_norm": 0.6172138452529907, "learning_rate": 0.0002, "epoch": 0.5428881650380022, "step": 750}, {"loss": 1.4018, "grad_norm": 0.6132623553276062, "learning_rate": 0.0002, "epoch": 0.5501266739051756, "step": 760}, {"loss": 1.513, "grad_norm": 0.654000461101532, "learning_rate": 0.0002, "epoch": 0.5573651827723489, "step": 770}, {"loss": 1.5035, "grad_norm": 0.5691370964050293, "learning_rate": 0.0002, "epoch": 0.5646036916395223, "step": 780}, {"loss": 1.65, "grad_norm": 0.7922580242156982, "learning_rate": 0.0002, "epoch": 0.5718422005066957, "step": 790}, {"loss": 1.4521, "grad_norm": 0.6831880211830139, "learning_rate": 0.0002, "epoch": 0.579080709373869, "step": 800}, {"loss": 1.4734, "grad_norm": 0.6740124821662903, "learning_rate": 0.0002, "epoch": 0.5863192182410424, "step": 810}, {"loss": 1.6498, "grad_norm": 1.380016803741455, "learning_rate": 0.0002, "epoch": 0.5935577271082157, "step": 820}, {"loss": 1.4642, "grad_norm": 0.6552878022193909, "learning_rate": 0.0002, "epoch": 0.6007962359753891, "step": 830}, {"loss": 1.6271, "grad_norm": 0.6649535298347473, "learning_rate": 0.0002, "epoch": 0.6080347448425625, "step": 840}, {"loss": 1.5886, "grad_norm": 0.561738133430481, "learning_rate": 0.0002, "epoch": 0.6152732537097358, "step": 850}, {"loss": 1.5364, "grad_norm": 0.6133047938346863, "learning_rate": 0.0002, "epoch": 0.6225117625769092, "step": 860}, {"loss": 1.3489, "grad_norm": 0.559843122959137, "learning_rate": 0.0002, "epoch": 0.6297502714440825, "step": 870}, {"loss": 1.4878, "grad_norm": 0.6117811799049377, "learning_rate": 0.0002, "epoch": 0.6369887803112558, "step": 880}, {"loss": 1.56, "grad_norm": 0.6209776401519775, "learning_rate": 0.0002, "epoch": 0.6442272891784292, "step": 890}, {"loss": 1.6747, "grad_norm": 0.6234082579612732, "learning_rate": 0.0002, "epoch": 0.6514657980456026, "step": 900}, {"loss": 1.6963, "grad_norm": 0.7623258233070374, "learning_rate": 0.0002, "epoch": 0.6587043069127759, "step": 910}, {"loss": 1.2424, "grad_norm": 0.6148061752319336, "learning_rate": 0.0002, "epoch": 0.6659428157799493, "step": 920}, {"loss": 1.4319, "grad_norm": 0.6682973504066467, "learning_rate": 0.0002, "epoch": 0.6731813246471227, "step": 930}, {"loss": 1.5377, "grad_norm": 0.5513041615486145, "learning_rate": 0.0002, "epoch": 0.680419833514296, "step": 940}, {"loss": 1.3991, "grad_norm": 0.5197525024414062, "learning_rate": 0.0002, "epoch": 0.6876583423814694, "step": 950}, {"loss": 1.4398, "grad_norm": 0.6490758061408997, "learning_rate": 0.0002, "epoch": 0.6948968512486428, "step": 960}, {"loss": 1.5251, "grad_norm": 0.6450682878494263, "learning_rate": 0.0002, "epoch": 0.7021353601158161, "step": 970}, {"loss": 1.5417, "grad_norm": 0.6203766465187073, "learning_rate": 0.0002, "epoch": 0.7093738689829895, "step": 980}, {"loss": 1.4575, "grad_norm": 0.6023609638214111, "learning_rate": 0.0002, "epoch": 0.7166123778501629, "step": 990}, {"loss": 1.4973, "grad_norm": 0.5765255093574524, "learning_rate": 0.0002, "epoch": 0.7238508867173362, "step": 1000}, {"loss": 1.483, "grad_norm": 0.6650075316429138, "learning_rate": 0.0002, "epoch": 0.7310893955845096, "step": 1010}, {"loss": 1.5959, "grad_norm": 0.5610854029655457, "learning_rate": 0.0002, "epoch": 0.738327904451683, "step": 1020}, {"loss": 1.5248, "grad_norm": 0.7072813510894775, "learning_rate": 0.0002, "epoch": 0.7455664133188563, "step": 1030}, {"loss": 1.5776, "grad_norm": 0.6815407872200012, "learning_rate": 0.0002, "epoch": 0.7528049221860297, "step": 1040}, {"loss": 1.4577, "grad_norm": 0.7932390570640564, "learning_rate": 0.0002, "epoch": 0.760043431053203, "step": 1050}, {"loss": 1.4515, "grad_norm": 0.5798183083534241, "learning_rate": 0.0002, "epoch": 0.7672819399203764, "step": 1060}, {"loss": 1.5053, "grad_norm": 0.7898504137992859, "learning_rate": 0.0002, "epoch": 0.7745204487875498, "step": 1070}, {"loss": 1.4776, "grad_norm": 0.4983280301094055, "learning_rate": 0.0002, "epoch": 0.7817589576547231, "step": 1080}, {"loss": 1.5007, "grad_norm": 0.691403329372406, "learning_rate": 0.0002, "epoch": 0.7889974665218965, "step": 1090}, {"loss": 1.5153, "grad_norm": 0.5394481420516968, "learning_rate": 0.0002, "epoch": 0.7962359753890699, "step": 1100}, {"loss": 1.6892, "grad_norm": 0.5136822462081909, "learning_rate": 0.0002, "epoch": 0.8034744842562432, "step": 1110}, {"loss": 1.4902, "grad_norm": 0.6828126907348633, "learning_rate": 0.0002, "epoch": 0.8107129931234166, "step": 1120}, {"loss": 1.4346, "grad_norm": 0.6799656748771667, "learning_rate": 0.0002, "epoch": 0.81795150199059, "step": 1130}, {"loss": 1.2678, "grad_norm": 0.5428406000137329, "learning_rate": 0.0002, "epoch": 0.8251900108577633, "step": 1140}, {"loss": 1.4072, "grad_norm": 0.4811290502548218, "learning_rate": 0.0002, "epoch": 0.8324285197249367, "step": 1150}, {"loss": 1.4512, "grad_norm": 0.5519434809684753, "learning_rate": 0.0002, "epoch": 0.8396670285921101, "step": 1160}, {"loss": 1.4072, "grad_norm": 0.9748060703277588, "learning_rate": 0.0002, "epoch": 0.8469055374592834, "step": 1170}, {"loss": 1.4309, "grad_norm": 0.712609589099884, "learning_rate": 0.0002, "epoch": 0.8541440463264568, "step": 1180}, {"loss": 1.434, "grad_norm": 0.6866157054901123, "learning_rate": 0.0002, "epoch": 0.8613825551936302, "step": 1190}, {"loss": 1.3704, "grad_norm": 0.5068854093551636, "learning_rate": 0.0002, "epoch": 0.8686210640608035, "step": 1200}, {"loss": 1.5601, "grad_norm": 0.6333245038986206, "learning_rate": 0.0002, "epoch": 0.8758595729279768, "step": 1210}, {"loss": 1.4636, "grad_norm": 0.6424421072006226, "learning_rate": 0.0002, "epoch": 0.8830980817951501, "step": 1220}, {"loss": 1.4186, "grad_norm": 0.4771921932697296, "learning_rate": 0.0002, "epoch": 0.8903365906623235, "step": 1230}, {"loss": 1.6323, "grad_norm": 0.5191764235496521, "learning_rate": 0.0002, "epoch": 0.8975750995294969, "step": 1240}, {"loss": 1.6105, "grad_norm": 0.756222128868103, "learning_rate": 0.0002, "epoch": 0.9048136083966702, "step": 1250}, {"loss": 1.4396, "grad_norm": 0.623823881149292, "learning_rate": 0.0002, "epoch": 0.9120521172638436, "step": 1260}, {"loss": 1.3097, "grad_norm": 0.8166571259498596, "learning_rate": 0.0002, "epoch": 0.919290626131017, "step": 1270}, {"loss": 1.4625, "grad_norm": 0.6059346795082092, "learning_rate": 0.0002, "epoch": 0.9265291349981903, "step": 1280}, {"loss": 1.3555, "grad_norm": 0.5842690467834473, "learning_rate": 0.0002, "epoch": 0.9337676438653637, "step": 1290}, {"loss": 1.5859, "grad_norm": 0.7649800777435303, "learning_rate": 0.0002, "epoch": 0.9410061527325371, "step": 1300}, {"loss": 1.5915, "grad_norm": 0.6420919895172119, "learning_rate": 0.0002, "epoch": 0.9482446615997104, "step": 1310}, {"loss": 1.453, "grad_norm": 0.7011452913284302, "learning_rate": 0.0002, "epoch": 0.9554831704668838, "step": 1320}, {"loss": 1.6766, "grad_norm": 0.5783746242523193, "learning_rate": 0.0002, "epoch": 0.9627216793340572, "step": 1330}, {"loss": 1.6308, "grad_norm": 0.5973192453384399, "learning_rate": 0.0002, "epoch": 0.9699601882012305, "step": 1340}, {"loss": 1.5901, "grad_norm": 0.6181833744049072, "learning_rate": 0.0002, "epoch": 0.9771986970684039, "step": 1350}, {"loss": 1.5258, "grad_norm": 0.5563396215438843, "learning_rate": 0.0002, "epoch": 0.9844372059355773, "step": 1360}, {"loss": 1.4508, "grad_norm": 0.45723360776901245, "learning_rate": 0.0002, "epoch": 0.9916757148027506, "step": 1370}, {"loss": 1.3291, "grad_norm": 0.5947498679161072, "learning_rate": 0.0002, "epoch": 0.998914223669924, "step": 1380}, {"eval_loss": 1.480796456336975, "eval_runtime": 27.3103, "eval_samples_per_second": 15.965, "eval_steps_per_second": 2.014, "epoch": 0.9996380745566413, "step": 1381}, {"loss": 1.3057, "grad_norm": 0.5599952936172485, "learning_rate": 0.0002, "epoch": 1.0061527325370974, "step": 1390}, {"loss": 1.4991, "grad_norm": 0.5932008028030396, "learning_rate": 0.0002, "epoch": 1.0133912414042707, "step": 1400}, {"loss": 1.4506, "grad_norm": 0.6194121837615967, "learning_rate": 0.0002, "epoch": 1.020629750271444, "step": 1410}, {"loss": 1.5966, "grad_norm": 0.6995621919631958, "learning_rate": 0.0002, "epoch": 1.0278682591386175, "step": 1420}, {"loss": 1.4153, "grad_norm": 0.7905810475349426, "learning_rate": 0.0002, "epoch": 1.0351067680057908, "step": 1430}, {"loss": 1.4414, "grad_norm": 0.7221615314483643, "learning_rate": 0.0002, "epoch": 1.0423452768729642, "step": 1440}, {"loss": 1.3859, "grad_norm": 0.6170642375946045, "learning_rate": 0.0002, "epoch": 1.0495837857401376, "step": 1450}, {"loss": 1.3806, "grad_norm": 0.5844094753265381, "learning_rate": 0.0002, "epoch": 1.056822294607311, "step": 1460}, {"loss": 1.4871, "grad_norm": 0.7731822729110718, "learning_rate": 0.0002, "epoch": 1.0640608034744843, "step": 1470}, {"loss": 1.4286, "grad_norm": 0.4554748237133026, "learning_rate": 0.0002, "epoch": 1.0712993123416577, "step": 1480}, {"loss": 1.3977, "grad_norm": 0.6923259496688843, "learning_rate": 0.0002, "epoch": 1.078537821208831, "step": 1490}, {"loss": 1.3936, "grad_norm": 0.6008219122886658, "learning_rate": 0.0002, "epoch": 1.0857763300760044, "step": 1500}, {"loss": 1.4821, "grad_norm": 0.6450045704841614, "learning_rate": 0.0002, "epoch": 1.0930148389431777, "step": 1510}, {"loss": 1.3295, "grad_norm": 0.7833753824234009, "learning_rate": 0.0002, "epoch": 1.1002533478103511, "step": 1520}, {"loss": 1.3424, "grad_norm": 0.5076758861541748, "learning_rate": 0.0002, "epoch": 1.1074918566775245, "step": 1530}, {"loss": 1.4043, "grad_norm": 0.5661332011222839, "learning_rate": 0.0002, "epoch": 1.1147303655446978, "step": 1540}, {"loss": 1.4963, "grad_norm": 0.6526919603347778, "learning_rate": 0.0002, "epoch": 1.1219688744118712, "step": 1550}, {"loss": 1.3671, "grad_norm": 0.5613082647323608, "learning_rate": 0.0002, "epoch": 1.1292073832790446, "step": 1560}, {"loss": 1.4458, "grad_norm": 0.6113885641098022, "learning_rate": 0.0002, "epoch": 1.136445892146218, "step": 1570}, {"loss": 1.3552, "grad_norm": 0.6732510328292847, "learning_rate": 0.0002, "epoch": 1.1436844010133913, "step": 1580}, {"loss": 1.3114, "grad_norm": 0.6146392226219177, "learning_rate": 0.0002, "epoch": 1.1509229098805647, "step": 1590}, {"loss": 1.411, "grad_norm": 0.6766974329948425, "learning_rate": 0.0002, "epoch": 1.158161418747738, "step": 1600}, {"loss": 1.2401, "grad_norm": 0.7621957659721375, "learning_rate": 0.0002, "epoch": 1.1653999276149114, "step": 1610}, {"loss": 1.3758, "grad_norm": 0.6959581971168518, "learning_rate": 0.0002, "epoch": 1.1726384364820848, "step": 1620}, {"loss": 1.382, "grad_norm": 0.6691278219223022, "learning_rate": 0.0002, "epoch": 1.1798769453492581, "step": 1630}, {"loss": 1.4147, "grad_norm": 0.4927774965763092, "learning_rate": 0.0002, "epoch": 1.1871154542164315, "step": 1640}, {"loss": 1.449, "grad_norm": 0.7724234461784363, "learning_rate": 0.0002, "epoch": 1.1943539630836049, "step": 1650}, {"loss": 1.4778, "grad_norm": 0.6817787885665894, "learning_rate": 0.0002, "epoch": 1.2015924719507782, "step": 1660}, {"loss": 1.3776, "grad_norm": 0.6500699520111084, "learning_rate": 0.0002, "epoch": 1.2088309808179516, "step": 1670}, {"loss": 1.3875, "grad_norm": 0.5703568458557129, "learning_rate": 0.0002, "epoch": 1.216069489685125, "step": 1680}, {"loss": 1.4735, "grad_norm": 0.6261579990386963, "learning_rate": 0.0002, "epoch": 1.2233079985522983, "step": 1690}, {"loss": 1.3898, "grad_norm": 0.651713490486145, "learning_rate": 0.0002, "epoch": 1.2305465074194717, "step": 1700}, {"loss": 1.4002, "grad_norm": 0.684399425983429, "learning_rate": 0.0002, "epoch": 1.237785016286645, "step": 1710}, {"loss": 1.5027, "grad_norm": 0.6996857523918152, "learning_rate": 0.0002, "epoch": 1.2450235251538184, "step": 1720}, {"loss": 1.3326, "grad_norm": 0.7102537751197815, "learning_rate": 0.0002, "epoch": 1.2522620340209918, "step": 1730}, {"loss": 1.3675, "grad_norm": 0.45809897780418396, "learning_rate": 0.0002, "epoch": 1.2595005428881652, "step": 1740}, {"loss": 1.4175, "grad_norm": 0.6377046704292297, "learning_rate": 0.0002, "epoch": 1.2667390517553385, "step": 1750}, {"loss": 1.3479, "grad_norm": 0.6965704560279846, "learning_rate": 0.0002, "epoch": 1.2739775606225119, "step": 1760}, {"loss": 1.5647, "grad_norm": 0.5688214302062988, "learning_rate": 0.0002, "epoch": 1.2812160694896852, "step": 1770}, {"loss": 1.3967, "grad_norm": 0.6384190320968628, "learning_rate": 0.0002, "epoch": 1.2884545783568586, "step": 1780}, {"loss": 1.3671, "grad_norm": 0.5629363656044006, "learning_rate": 0.0002, "epoch": 1.295693087224032, "step": 1790}, {"loss": 1.2292, "grad_norm": 0.6148255467414856, "learning_rate": 0.0002, "epoch": 1.3029315960912053, "step": 1800}, {"loss": 1.5806, "grad_norm": 0.655580997467041, "learning_rate": 0.0002, "epoch": 1.3101701049583787, "step": 1810}, {"loss": 1.2398, "grad_norm": 0.5642657279968262, "learning_rate": 0.0002, "epoch": 1.3174086138255519, "step": 1820}, {"loss": 1.3246, "grad_norm": 0.59607994556427, "learning_rate": 0.0002, "epoch": 1.3246471226927252, "step": 1830}, {"loss": 1.3274, "grad_norm": 0.5564199090003967, "learning_rate": 0.0002, "epoch": 1.3318856315598986, "step": 1840}, {"loss": 1.5834, "grad_norm": 0.6949955821037292, "learning_rate": 0.0002, "epoch": 1.339124140427072, "step": 1850}, {"loss": 1.4722, "grad_norm": 0.7036856412887573, "learning_rate": 0.0002, "epoch": 1.3463626492942453, "step": 1860}, {"loss": 1.333, "grad_norm": 0.722062885761261, "learning_rate": 0.0002, "epoch": 1.3536011581614187, "step": 1870}, {"loss": 1.4044, "grad_norm": 0.6098677515983582, "learning_rate": 0.0002, "epoch": 1.360839667028592, "step": 1880}, {"loss": 1.6217, "grad_norm": 0.5376402735710144, "learning_rate": 0.0002, "epoch": 1.3680781758957654, "step": 1890}, {"loss": 1.5071, "grad_norm": 0.6974610090255737, "learning_rate": 0.0002, "epoch": 1.3753166847629388, "step": 1900}, {"loss": 1.5854, "grad_norm": 0.6520763635635376, "learning_rate": 0.0002, "epoch": 1.3825551936301121, "step": 1910}, {"loss": 1.4271, "grad_norm": 0.6604374647140503, "learning_rate": 0.0002, "epoch": 1.3897937024972855, "step": 1920}, {"loss": 1.419, "grad_norm": 0.7364398241043091, "learning_rate": 0.0002, "epoch": 1.3970322113644589, "step": 1930}, {"loss": 1.4585, "grad_norm": 0.6849475502967834, "learning_rate": 0.0002, "epoch": 1.4042707202316322, "step": 1940}, {"loss": 1.5577, "grad_norm": 0.6562670469284058, "learning_rate": 0.0002, "epoch": 1.4115092290988056, "step": 1950}, {"loss": 1.4725, "grad_norm": 0.5695616006851196, "learning_rate": 0.0002, "epoch": 1.418747737965979, "step": 1960}, {"loss": 1.3088, "grad_norm": 0.5244464874267578, "learning_rate": 0.0002, "epoch": 1.4259862468331523, "step": 1970}, {"loss": 1.5069, "grad_norm": 0.6347293257713318, "learning_rate": 0.0002, "epoch": 1.4332247557003257, "step": 1980}, {"loss": 1.3502, "grad_norm": 0.5528361201286316, "learning_rate": 0.0002, "epoch": 1.440463264567499, "step": 1990}, {"loss": 1.3978, "grad_norm": 0.6987585425376892, "learning_rate": 0.0002, "epoch": 1.4477017734346724, "step": 2000}, {"loss": 1.4262, "grad_norm": 0.6568987369537354, "learning_rate": 0.0002, "epoch": 1.4549402823018458, "step": 2010}, {"loss": 1.4175, "grad_norm": 0.7665994763374329, "learning_rate": 0.0002, "epoch": 1.4621787911690192, "step": 2020}, {"loss": 1.244, "grad_norm": 0.5127707123756409, "learning_rate": 0.0002, "epoch": 1.4694173000361925, "step": 2030}, {"loss": 1.3699, "grad_norm": 0.5406824946403503, "learning_rate": 0.0002, "epoch": 1.476655808903366, "step": 2040}, {"loss": 1.3353, "grad_norm": 0.5990166664123535, "learning_rate": 0.0002, "epoch": 1.4838943177705393, "step": 2050}, {"loss": 1.2454, "grad_norm": 0.6186193823814392, "learning_rate": 0.0002, "epoch": 1.4911328266377126, "step": 2060}, {"loss": 1.428, "grad_norm": 0.6154307126998901, "learning_rate": 0.0002, "epoch": 1.498371335504886, "step": 2070}, {"loss": 1.4528, "grad_norm": 0.5606056451797485, "learning_rate": 0.0002, "epoch": 1.5056098443720594, "step": 2080}, {"loss": 1.2405, "grad_norm": 0.5006417036056519, "learning_rate": 0.0002, "epoch": 1.5128483532392327, "step": 2090}, {"loss": 1.4258, "grad_norm": 0.5968486070632935, "learning_rate": 0.0002, "epoch": 1.520086862106406, "step": 2100}, {"loss": 1.2752, "grad_norm": 0.5835496187210083, "learning_rate": 0.0002, "epoch": 1.5273253709735795, "step": 2110}, {"loss": 1.5443, "grad_norm": 0.6753535270690918, "learning_rate": 0.0002, "epoch": 1.5345638798407528, "step": 2120}, {"loss": 1.2139, "grad_norm": 0.7299720644950867, "learning_rate": 0.0002, "epoch": 1.5418023887079262, "step": 2130}, {"loss": 1.2364, "grad_norm": 0.5105988383293152, "learning_rate": 0.0002, "epoch": 1.5490408975750996, "step": 2140}, {"loss": 1.4528, "grad_norm": 0.5675431489944458, "learning_rate": 0.0002, "epoch": 1.556279406442273, "step": 2150}, {"loss": 1.4563, "grad_norm": 0.6246723532676697, "learning_rate": 0.0002, "epoch": 1.5635179153094463, "step": 2160}, {"loss": 1.5255, "grad_norm": 0.7291720509529114, "learning_rate": 0.0002, "epoch": 1.5707564241766196, "step": 2170}, {"loss": 1.5432, "grad_norm": 0.678114116191864, "learning_rate": 0.0002, "epoch": 1.577994933043793, "step": 2180}, {"loss": 1.5212, "grad_norm": 0.5136260986328125, "learning_rate": 0.0002, "epoch": 1.5852334419109664, "step": 2190}, {"loss": 1.3271, "grad_norm": 0.6359935998916626, "learning_rate": 0.0002, "epoch": 1.5924719507781397, "step": 2200}, {"loss": 1.4038, "grad_norm": 0.7650278806686401, "learning_rate": 0.0002, "epoch": 1.599710459645313, "step": 2210}, {"loss": 1.5478, "grad_norm": 0.7256110906600952, "learning_rate": 0.0002, "epoch": 1.6069489685124865, "step": 2220}, {"loss": 1.4387, "grad_norm": 0.688689649105072, "learning_rate": 0.0002, "epoch": 1.6141874773796598, "step": 2230}, {"loss": 1.4096, "grad_norm": 0.6045311093330383, "learning_rate": 0.0002, "epoch": 1.6214259862468332, "step": 2240}, {"loss": 1.4097, "grad_norm": 0.7064604163169861, "learning_rate": 0.0002, "epoch": 1.6286644951140063, "step": 2250}, {"loss": 1.3477, "grad_norm": 0.5309562087059021, "learning_rate": 0.0002, "epoch": 1.6359030039811797, "step": 2260}, {"loss": 1.4022, "grad_norm": 0.5687053203582764, "learning_rate": 0.0002, "epoch": 1.643141512848353, "step": 2270}, {"loss": 1.2977, "grad_norm": 0.535872757434845, "learning_rate": 0.0002, "epoch": 1.6503800217155264, "step": 2280}, {"loss": 1.3844, "grad_norm": 0.5502381920814514, "learning_rate": 0.0002, "epoch": 1.6576185305826998, "step": 2290}, {"loss": 1.3764, "grad_norm": 0.6158602237701416, "learning_rate": 0.0002, "epoch": 1.6648570394498732, "step": 2300}, {"loss": 1.3515, "grad_norm": 0.5804675817489624, "learning_rate": 0.0002, "epoch": 1.6720955483170465, "step": 2310}, {"loss": 1.2532, "grad_norm": 0.600742757320404, "learning_rate": 0.0002, "epoch": 1.67933405718422, "step": 2320}, {"loss": 1.477, "grad_norm": 0.7101941108703613, "learning_rate": 0.0002, "epoch": 1.6865725660513933, "step": 2330}, {"loss": 1.4849, "grad_norm": 0.7507809996604919, "learning_rate": 0.0002, "epoch": 1.6938110749185666, "step": 2340}, {"loss": 1.2703, "grad_norm": 0.768502414226532, "learning_rate": 0.0002, "epoch": 1.70104958378574, "step": 2350}, {"loss": 1.3332, "grad_norm": 0.4801851212978363, "learning_rate": 0.0002, "epoch": 1.7082880926529134, "step": 2360}, {"loss": 1.4158, "grad_norm": 0.5322122573852539, "learning_rate": 0.0002, "epoch": 1.7155266015200867, "step": 2370}, {"loss": 1.4136, "grad_norm": 0.587661862373352, "learning_rate": 0.0002, "epoch": 1.72276511038726, "step": 2380}, {"loss": 1.3771, "grad_norm": 0.6073525547981262, "learning_rate": 0.0002, "epoch": 1.7300036192544335, "step": 2390}, {"loss": 1.2754, "grad_norm": 0.6950460076332092, "learning_rate": 0.0002, "epoch": 1.7372421281216068, "step": 2400}, {"loss": 1.3858, "grad_norm": 0.5981102585792542, "learning_rate": 0.0002, "epoch": 1.7444806369887802, "step": 2410}, {"loss": 1.4075, "grad_norm": 0.544570803642273, "learning_rate": 0.0002, "epoch": 1.7517191458559536, "step": 2420}, {"loss": 1.3861, "grad_norm": 0.5304399728775024, "learning_rate": 0.0002, "epoch": 1.758957654723127, "step": 2430}, {"loss": 1.4244, "grad_norm": 0.7921594977378845, "learning_rate": 0.0002, "epoch": 1.7661961635903003, "step": 2440}, {"loss": 1.3053, "grad_norm": 0.6084808707237244, "learning_rate": 0.0002, "epoch": 1.7734346724574737, "step": 2450}, {"loss": 1.3781, "grad_norm": 0.8844701051712036, "learning_rate": 0.0002, "epoch": 1.780673181324647, "step": 2460}, {"loss": 1.3227, "grad_norm": 0.5729258060455322, "learning_rate": 0.0002, "epoch": 1.7879116901918204, "step": 2470}, {"loss": 1.3422, "grad_norm": 0.6303611993789673, "learning_rate": 0.0002, "epoch": 1.7951501990589938, "step": 2480}, {"loss": 1.3926, "grad_norm": 0.5627942085266113, "learning_rate": 0.0002, "epoch": 1.8023887079261671, "step": 2490}, {"loss": 1.3816, "grad_norm": 0.6724274158477783, "learning_rate": 0.0002, "epoch": 1.8096272167933405, "step": 2500}, {"loss": 1.2951, "grad_norm": 0.5030826330184937, "learning_rate": 0.0002, "epoch": 1.8168657256605139, "step": 2510}, {"loss": 1.2839, "grad_norm": 0.5504099130630493, "learning_rate": 0.0002, "epoch": 1.8241042345276872, "step": 2520}, {"loss": 1.4264, "grad_norm": 0.6338945627212524, "learning_rate": 0.0002, "epoch": 1.8313427433948606, "step": 2530}, {"loss": 1.563, "grad_norm": 0.5902037620544434, "learning_rate": 0.0002, "epoch": 1.838581252262034, "step": 2540}, {"loss": 1.2961, "grad_norm": 0.48814457654953003, "learning_rate": 0.0002, "epoch": 1.8458197611292073, "step": 2550}, {"loss": 1.466, "grad_norm": 0.6216312646865845, "learning_rate": 0.0002, "epoch": 1.8530582699963807, "step": 2560}, {"loss": 1.5123, "grad_norm": 0.635603666305542, "learning_rate": 0.0002, "epoch": 1.860296778863554, "step": 2570}, {"loss": 1.372, "grad_norm": 0.6938216090202332, "learning_rate": 0.0002, "epoch": 1.8675352877307274, "step": 2580}, {"loss": 1.5011, "grad_norm": 0.599557638168335, "learning_rate": 0.0002, "epoch": 1.8747737965979008, "step": 2590}, {"loss": 1.2714, "grad_norm": 0.564424455165863, "learning_rate": 0.0002, "epoch": 1.8820123054650741, "step": 2600}, {"loss": 1.3403, "grad_norm": 0.5430700182914734, "learning_rate": 0.0002, "epoch": 1.8892508143322475, "step": 2610}, {"loss": 1.4347, "grad_norm": 0.6150169372558594, "learning_rate": 0.0002, "epoch": 1.8964893231994209, "step": 2620}, {"loss": 1.2474, "grad_norm": 0.48159119486808777, "learning_rate": 0.0002, "epoch": 1.9037278320665942, "step": 2630}, {"loss": 1.3716, "grad_norm": 0.5608997941017151, "learning_rate": 0.0002, "epoch": 1.9109663409337676, "step": 2640}, {"loss": 1.5787, "grad_norm": 0.6454501748085022, "learning_rate": 0.0002, "epoch": 1.918204849800941, "step": 2650}, {"loss": 1.3238, "grad_norm": 0.5458073616027832, "learning_rate": 0.0002, "epoch": 1.9254433586681143, "step": 2660}, {"loss": 1.3208, "grad_norm": 0.5328490734100342, "learning_rate": 0.0002, "epoch": 1.9326818675352877, "step": 2670}, {"loss": 1.4971, "grad_norm": 0.6444696187973022, "learning_rate": 0.0002, "epoch": 1.939920376402461, "step": 2680}, {"loss": 1.5387, "grad_norm": 0.7126023769378662, "learning_rate": 0.0002, "epoch": 1.9471588852696344, "step": 2690}, {"loss": 1.3637, "grad_norm": 0.5164045095443726, "learning_rate": 0.0002, "epoch": 1.9543973941368078, "step": 2700}, {"loss": 1.5303, "grad_norm": 0.5347061157226562, "learning_rate": 0.0002, "epoch": 1.9616359030039812, "step": 2710}, {"loss": 1.2815, "grad_norm": 0.5297950506210327, "learning_rate": 0.0002, "epoch": 1.9688744118711545, "step": 2720}, {"loss": 1.3566, "grad_norm": 0.6537790298461914, "learning_rate": 0.0002, "epoch": 1.976112920738328, "step": 2730}, {"loss": 1.332, "grad_norm": 0.5536222457885742, "learning_rate": 0.0002, "epoch": 1.9833514296055013, "step": 2740}, {"loss": 1.3333, "grad_norm": 0.4856105446815491, "learning_rate": 0.0002, "epoch": 1.9905899384726746, "step": 2750}, {"loss": 1.3521, "grad_norm": 0.6642730832099915, "learning_rate": 0.0002, "epoch": 1.997828447339848, "step": 2760}, {"eval_loss": 1.4366681575775146, "eval_runtime": 27.3729, "eval_samples_per_second": 15.928, "eval_steps_per_second": 2.009, "epoch": 2.0, "step": 2763}, {"loss": 1.4322, "grad_norm": 0.740253210067749, "learning_rate": 0.0002, "epoch": 2.0050669562070214, "step": 2770}, {"loss": 1.277, "grad_norm": 0.5826276540756226, "learning_rate": 0.0002, "epoch": 2.0123054650741947, "step": 2780}, {"loss": 1.2424, "grad_norm": 0.607356071472168, "learning_rate": 0.0002, "epoch": 2.019543973941368, "step": 2790}, {"loss": 1.2601, "grad_norm": 0.5918063521385193, "learning_rate": 0.0002, "epoch": 2.0267824828085415, "step": 2800}, {"loss": 1.3715, "grad_norm": 0.5610089898109436, "learning_rate": 0.0002, "epoch": 2.034020991675715, "step": 2810}, {"loss": 1.2092, "grad_norm": 0.5869926810264587, "learning_rate": 0.0002, "epoch": 2.041259500542888, "step": 2820}, {"loss": 1.1929, "grad_norm": 0.5753467679023743, "learning_rate": 0.0002, "epoch": 2.0484980094100615, "step": 2830}, {"loss": 1.333, "grad_norm": 0.7096508145332336, "learning_rate": 0.0002, "epoch": 2.055736518277235, "step": 2840}, {"loss": 1.1766, "grad_norm": 0.7653635144233704, "learning_rate": 0.0002, "epoch": 2.0629750271444083, "step": 2850}, {"loss": 1.2331, "grad_norm": 0.6202841997146606, "learning_rate": 0.0002, "epoch": 2.0702135360115816, "step": 2860}, {"loss": 1.3298, "grad_norm": 0.6810227632522583, "learning_rate": 0.0002, "epoch": 2.077452044878755, "step": 2870}, {"loss": 1.2505, "grad_norm": 0.7481493353843689, "learning_rate": 0.0002, "epoch": 2.0846905537459284, "step": 2880}, {"loss": 1.2484, "grad_norm": 0.7089637517929077, "learning_rate": 0.0002, "epoch": 2.0919290626131017, "step": 2890}, {"loss": 1.3095, "grad_norm": 0.7472923398017883, "learning_rate": 0.0002, "epoch": 2.099167571480275, "step": 2900}, {"loss": 1.304, "grad_norm": 0.8135465979576111, "learning_rate": 0.0002, "epoch": 2.1064060803474485, "step": 2910}, {"loss": 1.273, "grad_norm": 0.6097133159637451, "learning_rate": 0.0002, "epoch": 2.113644589214622, "step": 2920}, {"loss": 1.3384, "grad_norm": 0.5970117449760437, "learning_rate": 0.0002, "epoch": 2.120883098081795, "step": 2930}, {"loss": 1.3233, "grad_norm": 0.6169309616088867, "learning_rate": 0.0002, "epoch": 2.1281216069489686, "step": 2940}, {"loss": 1.4246, "grad_norm": 0.9428738355636597, "learning_rate": 0.0002, "epoch": 2.135360115816142, "step": 2950}, {"loss": 1.3527, "grad_norm": 0.5671679973602295, "learning_rate": 0.0002, "epoch": 2.1425986246833153, "step": 2960}, {"loss": 1.1375, "grad_norm": 0.7007262110710144, "learning_rate": 0.0002, "epoch": 2.1498371335504887, "step": 2970}, {"loss": 1.2015, "grad_norm": 0.6294044256210327, "learning_rate": 0.0002, "epoch": 2.157075642417662, "step": 2980}, {"loss": 1.2167, "grad_norm": 0.6105241775512695, "learning_rate": 0.0002, "epoch": 2.1643141512848354, "step": 2990}, {"loss": 1.2065, "grad_norm": 0.557124137878418, "learning_rate": 0.0002, "epoch": 2.1715526601520088, "step": 3000}, {"loss": 1.2515, "grad_norm": 0.6250392198562622, "learning_rate": 0.0002, "epoch": 2.178791169019182, "step": 3010}, {"loss": 1.385, "grad_norm": 0.645218551158905, "learning_rate": 0.0002, "epoch": 2.1860296778863555, "step": 3020}, {"loss": 1.3928, "grad_norm": 0.9033605456352234, "learning_rate": 0.0002, "epoch": 2.193268186753529, "step": 3030}, {"loss": 1.2458, "grad_norm": 0.5325747132301331, "learning_rate": 0.0002, "epoch": 2.2005066956207022, "step": 3040}, {"loss": 1.261, "grad_norm": 0.6334700584411621, "learning_rate": 0.0002, "epoch": 2.2077452044878756, "step": 3050}, {"loss": 1.2385, "grad_norm": 0.5206325054168701, "learning_rate": 0.0002, "epoch": 2.214983713355049, "step": 3060}, {"loss": 1.3103, "grad_norm": 0.5987200140953064, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3070}, {"loss": 1.1756, "grad_norm": 0.5893264412879944, "learning_rate": 0.0002, "epoch": 2.2294607310893957, "step": 3080}, {"loss": 1.235, "grad_norm": 0.6869237422943115, "learning_rate": 0.0002, "epoch": 2.236699239956569, "step": 3090}, {"loss": 1.3285, "grad_norm": 0.5040048360824585, "learning_rate": 0.0002, "epoch": 2.2439377488237424, "step": 3100}, {"loss": 1.3316, "grad_norm": 0.6660613417625427, "learning_rate": 0.0002, "epoch": 2.251176257690916, "step": 3110}, {"loss": 1.3108, "grad_norm": 0.5890918970108032, "learning_rate": 0.0002, "epoch": 2.258414766558089, "step": 3120}, {"loss": 1.248, "grad_norm": 0.6458896994590759, "learning_rate": 0.0002, "epoch": 2.2656532754252625, "step": 3130}, {"loss": 1.4151, "grad_norm": 0.6832690834999084, "learning_rate": 0.0002, "epoch": 2.272891784292436, "step": 3140}, {"loss": 1.4458, "grad_norm": 0.833908200263977, "learning_rate": 0.0002, "epoch": 2.2801302931596092, "step": 3150}, {"loss": 1.2931, "grad_norm": 0.4596034586429596, "learning_rate": 0.0002, "epoch": 2.2873688020267826, "step": 3160}, {"loss": 1.449, "grad_norm": 0.9130966067314148, "learning_rate": 0.0002, "epoch": 2.294607310893956, "step": 3170}, {"loss": 1.3806, "grad_norm": 0.7143292427062988, "learning_rate": 0.0002, "epoch": 2.3018458197611293, "step": 3180}, {"loss": 1.2692, "grad_norm": 0.5388900637626648, "learning_rate": 0.0002, "epoch": 2.3090843286283027, "step": 3190}, {"loss": 1.2402, "grad_norm": 0.5607513189315796, "learning_rate": 0.0002, "epoch": 2.316322837495476, "step": 3200}, {"loss": 1.3874, "grad_norm": 0.6795142292976379, "learning_rate": 0.0002, "epoch": 2.3235613463626494, "step": 3210}, {"loss": 1.3042, "grad_norm": 0.6561070680618286, "learning_rate": 0.0002, "epoch": 2.330799855229823, "step": 3220}, {"loss": 1.4636, "grad_norm": 0.8858118057250977, "learning_rate": 0.0002, "epoch": 2.338038364096996, "step": 3230}, {"loss": 1.3214, "grad_norm": 0.6604151725769043, "learning_rate": 0.0002, "epoch": 2.3452768729641695, "step": 3240}, {"loss": 1.4004, "grad_norm": 0.6755785346031189, "learning_rate": 0.0002, "epoch": 2.352515381831343, "step": 3250}, {"loss": 1.2503, "grad_norm": 0.6981677412986755, "learning_rate": 0.0002, "epoch": 2.3597538906985163, "step": 3260}, {"loss": 1.3078, "grad_norm": 0.6338568329811096, "learning_rate": 0.0002, "epoch": 2.3669923995656896, "step": 3270}, {"loss": 1.285, "grad_norm": 0.5754265785217285, "learning_rate": 0.0002, "epoch": 2.374230908432863, "step": 3280}, {"loss": 1.2924, "grad_norm": 0.7533153295516968, "learning_rate": 0.0002, "epoch": 2.3814694173000364, "step": 3290}, {"loss": 1.3711, "grad_norm": 0.675065279006958, "learning_rate": 0.0002, "epoch": 2.3887079261672097, "step": 3300}, {"loss": 1.3548, "grad_norm": 0.5686452984809875, "learning_rate": 0.0002, "epoch": 2.395946435034383, "step": 3310}, {"loss": 1.1998, "grad_norm": 0.8129481673240662, "learning_rate": 0.0002, "epoch": 2.4031849439015565, "step": 3320}, {"loss": 1.2584, "grad_norm": 0.6615934371948242, "learning_rate": 0.0002, "epoch": 2.41042345276873, "step": 3330}, {"loss": 1.3691, "grad_norm": 0.6678834557533264, "learning_rate": 0.0002, "epoch": 2.417661961635903, "step": 3340}, {"loss": 1.2381, "grad_norm": 0.5581308007240295, "learning_rate": 0.0002, "epoch": 2.4249004705030766, "step": 3350}, {"loss": 1.3853, "grad_norm": 0.6098920106887817, "learning_rate": 0.0002, "epoch": 2.43213897937025, "step": 3360}, {"loss": 1.3692, "grad_norm": 0.8101736903190613, "learning_rate": 0.0002, "epoch": 2.4393774882374233, "step": 3370}, {"loss": 1.4418, "grad_norm": 0.6621488928794861, "learning_rate": 0.0002, "epoch": 2.4466159971045967, "step": 3380}, {"loss": 1.4579, "grad_norm": 0.8693289160728455, "learning_rate": 0.0002, "epoch": 2.45385450597177, "step": 3390}, {"loss": 1.3644, "grad_norm": 0.6724580526351929, "learning_rate": 0.0002, "epoch": 2.4610930148389434, "step": 3400}, {"loss": 1.2006, "grad_norm": 0.6776891946792603, "learning_rate": 0.0002, "epoch": 2.4683315237061167, "step": 3410}, {"loss": 1.2937, "grad_norm": 0.7214453816413879, "learning_rate": 0.0002, "epoch": 2.47557003257329, "step": 3420}, {"loss": 1.4051, "grad_norm": 0.8390451073646545, "learning_rate": 0.0002, "epoch": 2.4828085414404635, "step": 3430}, {"loss": 1.25, "grad_norm": 0.7130982279777527, "learning_rate": 0.0002, "epoch": 2.490047050307637, "step": 3440}, {"loss": 1.2231, "grad_norm": 0.8873937129974365, "learning_rate": 0.0002, "epoch": 2.49728555917481, "step": 3450}, {"loss": 1.1429, "grad_norm": 0.725185751914978, "learning_rate": 0.0002, "epoch": 2.5045240680419836, "step": 3460}, {"loss": 1.2699, "grad_norm": 0.6120352149009705, "learning_rate": 0.0002, "epoch": 2.511762576909157, "step": 3470}, {"loss": 1.2552, "grad_norm": 0.7713613510131836, "learning_rate": 0.0002, "epoch": 2.5190010857763303, "step": 3480}, {"loss": 1.4648, "grad_norm": 0.895309567451477, "learning_rate": 0.0002, "epoch": 2.5262395946435037, "step": 3490}, {"loss": 1.3043, "grad_norm": 0.9631021022796631, "learning_rate": 0.0002, "epoch": 2.533478103510677, "step": 3500}, {"loss": 1.3492, "grad_norm": 0.7475683093070984, "learning_rate": 0.0002, "epoch": 2.5407166123778504, "step": 3510}, {"loss": 1.3637, "grad_norm": 0.7271341681480408, "learning_rate": 0.0002, "epoch": 2.5479551212450238, "step": 3520}, {"loss": 1.304, "grad_norm": 0.6979510188102722, "learning_rate": 0.0002, "epoch": 2.555193630112197, "step": 3530}, {"loss": 1.2353, "grad_norm": 0.6504196524620056, "learning_rate": 0.0002, "epoch": 2.5624321389793705, "step": 3540}, {"loss": 1.2699, "grad_norm": 0.7226675748825073, "learning_rate": 0.0002, "epoch": 2.569670647846544, "step": 3550}, {"loss": 1.3002, "grad_norm": 0.6143222451210022, "learning_rate": 0.0002, "epoch": 2.5769091567137172, "step": 3560}, {"loss": 1.1585, "grad_norm": 0.7245154976844788, "learning_rate": 0.0002, "epoch": 2.5841476655808906, "step": 3570}, {"loss": 1.3651, "grad_norm": 0.943540632724762, "learning_rate": 0.0002, "epoch": 2.591386174448064, "step": 3580}, {"loss": 1.3034, "grad_norm": 0.7707241773605347, "learning_rate": 0.0002, "epoch": 2.5986246833152373, "step": 3590}, {"loss": 1.3063, "grad_norm": 0.6705001592636108, "learning_rate": 0.0002, "epoch": 2.6058631921824107, "step": 3600}, {"loss": 1.2437, "grad_norm": 0.6360933780670166, "learning_rate": 0.0002, "epoch": 2.613101701049584, "step": 3610}, {"loss": 1.1844, "grad_norm": 0.5846424698829651, "learning_rate": 0.0002, "epoch": 2.6203402099167574, "step": 3620}, {"loss": 1.3674, "grad_norm": 0.5958625674247742, "learning_rate": 0.0002, "epoch": 2.6275787187839303, "step": 3630}, {"loss": 1.3599, "grad_norm": 0.6819243431091309, "learning_rate": 0.0002, "epoch": 2.6348172276511037, "step": 3640}, {"loss": 1.3884, "grad_norm": 0.7033445835113525, "learning_rate": 0.0002, "epoch": 2.642055736518277, "step": 3650}, {"loss": 1.3392, "grad_norm": 0.6134849786758423, "learning_rate": 0.0002, "epoch": 2.6492942453854504, "step": 3660}, {"loss": 1.2661, "grad_norm": 0.658009946346283, "learning_rate": 0.0002, "epoch": 2.656532754252624, "step": 3670}, {"loss": 1.3987, "grad_norm": 0.6280999779701233, "learning_rate": 0.0002, "epoch": 2.663771263119797, "step": 3680}, {"loss": 1.2995, "grad_norm": 0.5536085963249207, "learning_rate": 0.0002, "epoch": 2.6710097719869705, "step": 3690}, {"loss": 1.2044, "grad_norm": 0.8603981733322144, "learning_rate": 0.0002, "epoch": 2.678248280854144, "step": 3700}, {"loss": 1.3879, "grad_norm": 0.5509994626045227, "learning_rate": 0.0002, "epoch": 2.6854867897213173, "step": 3710}, {"loss": 1.3253, "grad_norm": 0.9093621969223022, "learning_rate": 0.0002, "epoch": 2.6927252985884906, "step": 3720}, {"loss": 1.2668, "grad_norm": 0.7525952458381653, "learning_rate": 0.0002, "epoch": 2.699963807455664, "step": 3730}, {"loss": 1.248, "grad_norm": 0.6737023591995239, "learning_rate": 0.0002, "epoch": 2.7072023163228374, "step": 3740}, {"loss": 1.2981, "grad_norm": 0.8656924962997437, "learning_rate": 0.0002, "epoch": 2.7144408251900107, "step": 3750}, {"loss": 1.2342, "grad_norm": 0.7494133114814758, "learning_rate": 0.0002, "epoch": 2.721679334057184, "step": 3760}, {"loss": 1.2417, "grad_norm": 0.5725520849227905, "learning_rate": 0.0002, "epoch": 2.7289178429243575, "step": 3770}, {"loss": 1.28, "grad_norm": 0.836412787437439, "learning_rate": 0.0002, "epoch": 2.736156351791531, "step": 3780}, {"loss": 1.3784, "grad_norm": 0.6893242597579956, "learning_rate": 0.0002, "epoch": 2.743394860658704, "step": 3790}, {"loss": 1.2929, "grad_norm": 0.6696223020553589, "learning_rate": 0.0002, "epoch": 2.7506333695258776, "step": 3800}, {"loss": 1.2449, "grad_norm": 0.6483015418052673, "learning_rate": 0.0002, "epoch": 2.757871878393051, "step": 3810}, {"loss": 1.3282, "grad_norm": 0.8084456920623779, "learning_rate": 0.0002, "epoch": 2.7651103872602243, "step": 3820}, {"loss": 1.3694, "grad_norm": 0.6601949334144592, "learning_rate": 0.0002, "epoch": 2.7723488961273977, "step": 3830}, {"loss": 1.3568, "grad_norm": 0.6905533671379089, "learning_rate": 0.0002, "epoch": 2.779587404994571, "step": 3840}, {"loss": 1.3854, "grad_norm": 0.619318425655365, "learning_rate": 0.0002, "epoch": 2.7868259138617444, "step": 3850}, {"loss": 1.2551, "grad_norm": 0.5994023084640503, "learning_rate": 0.0002, "epoch": 2.7940644227289178, "step": 3860}, {"loss": 1.2022, "grad_norm": 0.5627168416976929, "learning_rate": 0.0002, "epoch": 2.801302931596091, "step": 3870}, {"loss": 1.3921, "grad_norm": 0.6001605987548828, "learning_rate": 0.0002, "epoch": 2.8085414404632645, "step": 3880}, {"loss": 1.3026, "grad_norm": 0.6022412776947021, "learning_rate": 0.0002, "epoch": 2.815779949330438, "step": 3890}, {"loss": 1.2765, "grad_norm": 0.6832426190376282, "learning_rate": 0.0002, "epoch": 2.823018458197611, "step": 3900}, {"loss": 1.1363, "grad_norm": 0.5936811566352844, "learning_rate": 0.0002, "epoch": 2.8302569670647846, "step": 3910}, {"loss": 1.1707, "grad_norm": 0.6960572600364685, "learning_rate": 0.0002, "epoch": 2.837495475931958, "step": 3920}, {"loss": 1.4063, "grad_norm": 0.5913406610488892, "learning_rate": 0.0002, "epoch": 2.8447339847991313, "step": 3930}, {"loss": 1.3245, "grad_norm": 0.678154706954956, "learning_rate": 0.0002, "epoch": 2.8519724936663047, "step": 3940}, {"loss": 1.366, "grad_norm": 0.7898936867713928, "learning_rate": 0.0002, "epoch": 2.859211002533478, "step": 3950}, {"loss": 1.3948, "grad_norm": 0.9234195351600647, "learning_rate": 0.0002, "epoch": 2.8664495114006514, "step": 3960}, {"loss": 1.2773, "grad_norm": 0.5960825085639954, "learning_rate": 0.0002, "epoch": 2.8736880202678248, "step": 3970}, {"loss": 1.3127, "grad_norm": 0.677118182182312, "learning_rate": 0.0002, "epoch": 2.880926529134998, "step": 3980}, {"loss": 1.2652, "grad_norm": 0.6505142450332642, "learning_rate": 0.0002, "epoch": 2.8881650380021715, "step": 3990}, {"loss": 1.2078, "grad_norm": 0.550826907157898, "learning_rate": 0.0002, "epoch": 2.895403546869345, "step": 4000}, {"loss": 1.1811, "grad_norm": 0.6209215521812439, "learning_rate": 0.0002, "epoch": 2.9026420557365182, "step": 4010}, {"loss": 1.4001, "grad_norm": 0.6549018025398254, "learning_rate": 0.0002, "epoch": 2.9098805646036916, "step": 4020}, {"loss": 1.2285, "grad_norm": 0.570682168006897, "learning_rate": 0.0002, "epoch": 2.917119073470865, "step": 4030}, {"loss": 1.0832, "grad_norm": 1.1807632446289062, "learning_rate": 0.0002, "epoch": 2.9243575823380383, "step": 4040}, {"loss": 1.2693, "grad_norm": 0.7058857679367065, "learning_rate": 0.0002, "epoch": 2.9315960912052117, "step": 4050}, {"loss": 1.2905, "grad_norm": 0.5542812943458557, "learning_rate": 0.0002, "epoch": 2.938834600072385, "step": 4060}, {"loss": 1.33, "grad_norm": 0.63167804479599, "learning_rate": 0.0002, "epoch": 2.9460731089395584, "step": 4070}, {"loss": 1.3075, "grad_norm": 0.5702962279319763, "learning_rate": 0.0002, "epoch": 2.953311617806732, "step": 4080}, {"loss": 1.2007, "grad_norm": 0.620944082736969, "learning_rate": 0.0002, "epoch": 2.960550126673905, "step": 4090}, {"loss": 1.2864, "grad_norm": 0.5866289734840393, "learning_rate": 0.0002, "epoch": 2.9677886355410785, "step": 4100}, {"loss": 1.3293, "grad_norm": 0.560170590877533, "learning_rate": 0.0002, "epoch": 2.975027144408252, "step": 4110}, {"loss": 1.2071, "grad_norm": 0.675082802772522, "learning_rate": 0.0002, "epoch": 2.9822656532754253, "step": 4120}, {"loss": 1.2981, "grad_norm": 0.62708580493927, "learning_rate": 0.0002, "epoch": 2.9895041621425986, "step": 4130}, {"loss": 1.2758, "grad_norm": 0.7893929481506348, "learning_rate": 0.0002, "epoch": 2.996742671009772, "step": 4140}, {"eval_loss": 1.4217946529388428, "eval_runtime": 27.1596, "eval_samples_per_second": 16.053, "eval_steps_per_second": 2.025, "epoch": 2.9996380745566413, "step": 4144}, {"loss": 1.2152, "grad_norm": 0.7043836116790771, "learning_rate": 0.0002, "epoch": 3.0039811798769454, "step": 4150}, {"loss": 1.1664, "grad_norm": 0.6806283593177795, "learning_rate": 0.0002, "epoch": 3.0112196887441187, "step": 4160}, {"loss": 1.292, "grad_norm": 0.7684550285339355, "learning_rate": 0.0002, "epoch": 3.018458197611292, "step": 4170}, {"loss": 1.3467, "grad_norm": 0.7895237803459167, "learning_rate": 0.0002, "epoch": 3.0256967064784654, "step": 4180}, {"loss": 1.1324, "grad_norm": 0.7464531064033508, "learning_rate": 0.0002, "epoch": 3.032935215345639, "step": 4190}, {"loss": 1.1614, "grad_norm": 0.9358500838279724, "learning_rate": 0.0002, "epoch": 3.040173724212812, "step": 4200}, {"loss": 1.1834, "grad_norm": 1.1066628694534302, "learning_rate": 0.0002, "epoch": 3.0474122330799855, "step": 4210}, {"loss": 1.1557, "grad_norm": 0.6663267612457275, "learning_rate": 0.0002, "epoch": 3.054650741947159, "step": 4220}, {"loss": 1.1707, "grad_norm": 0.6669464707374573, "learning_rate": 0.0002, "epoch": 3.0618892508143323, "step": 4230}, {"loss": 1.1841, "grad_norm": 0.7052164077758789, "learning_rate": 0.0002, "epoch": 3.0691277596815056, "step": 4240}, {"loss": 1.2913, "grad_norm": 0.6118432879447937, "learning_rate": 0.0002, "epoch": 3.076366268548679, "step": 4250}, {"loss": 1.1526, "grad_norm": 0.6915903687477112, "learning_rate": 0.0002, "epoch": 3.0836047774158524, "step": 4260}, {"loss": 1.1348, "grad_norm": 0.7441644668579102, "learning_rate": 0.0002, "epoch": 3.0908432862830257, "step": 4270}, {"loss": 1.1672, "grad_norm": 0.823850691318512, "learning_rate": 0.0002, "epoch": 3.098081795150199, "step": 4280}, {"loss": 1.2655, "grad_norm": 0.9677883386611938, "learning_rate": 0.0002, "epoch": 3.1053203040173725, "step": 4290}, {"loss": 1.1794, "grad_norm": 0.7002579569816589, "learning_rate": 0.0002, "epoch": 3.112558812884546, "step": 4300}, {"loss": 1.135, "grad_norm": 0.778789758682251, "learning_rate": 0.0002, "epoch": 3.119797321751719, "step": 4310}, {"loss": 1.0818, "grad_norm": 0.7236007452011108, "learning_rate": 0.0002, "epoch": 3.1270358306188926, "step": 4320}, {"loss": 1.1803, "grad_norm": 0.8809133768081665, "learning_rate": 0.0002, "epoch": 3.134274339486066, "step": 4330}, {"loss": 1.2571, "grad_norm": 0.7924913167953491, "learning_rate": 0.0002, "epoch": 3.1415128483532393, "step": 4340}, {"loss": 1.1413, "grad_norm": 0.7437422275543213, "learning_rate": 0.0002, "epoch": 3.1487513572204127, "step": 4350}, {"loss": 1.2088, "grad_norm": 0.6428450345993042, "learning_rate": 0.0002, "epoch": 3.155989866087586, "step": 4360}, {"loss": 1.3032, "grad_norm": 0.7922873497009277, "learning_rate": 0.0002, "epoch": 3.1632283749547594, "step": 4370}, {"loss": 1.216, "grad_norm": 0.5252506732940674, "learning_rate": 0.0002, "epoch": 3.1704668838219328, "step": 4380}, {"loss": 1.1297, "grad_norm": 0.8570457696914673, "learning_rate": 0.0002, "epoch": 3.177705392689106, "step": 4390}, {"loss": 1.0994, "grad_norm": 0.7218987345695496, "learning_rate": 0.0002, "epoch": 3.1849439015562795, "step": 4400}, {"loss": 1.2891, "grad_norm": 0.6921393275260925, "learning_rate": 0.0002, "epoch": 3.192182410423453, "step": 4410}, {"loss": 1.2668, "grad_norm": 0.7386137843132019, "learning_rate": 0.0002, "epoch": 3.199420919290626, "step": 4420}, {"loss": 1.1654, "grad_norm": 0.6227759122848511, "learning_rate": 0.0002, "epoch": 3.2066594281577996, "step": 4430}, {"loss": 1.1752, "grad_norm": 0.7180278897285461, "learning_rate": 0.0002, "epoch": 3.213897937024973, "step": 4440}, {"loss": 1.1757, "grad_norm": 0.745830774307251, "learning_rate": 0.0002, "epoch": 3.2211364458921463, "step": 4450}, {"loss": 1.234, "grad_norm": 0.6766072511672974, "learning_rate": 0.0002, "epoch": 3.2283749547593197, "step": 4460}, {"loss": 1.1999, "grad_norm": 0.8325067162513733, "learning_rate": 0.0002, "epoch": 3.235613463626493, "step": 4470}, {"loss": 1.1606, "grad_norm": 0.7148305177688599, "learning_rate": 0.0002, "epoch": 3.2428519724936664, "step": 4480}, {"loss": 1.1383, "grad_norm": 0.7752676010131836, "learning_rate": 0.0002, "epoch": 3.25009048136084, "step": 4490}, {"loss": 1.3006, "grad_norm": 0.6776860952377319, "learning_rate": 0.0002, "epoch": 3.257328990228013, "step": 4500}, {"loss": 1.0796, "grad_norm": 0.704359769821167, "learning_rate": 0.0002, "epoch": 3.2645674990951865, "step": 4510}, {"loss": 1.2496, "grad_norm": 0.6880282163619995, "learning_rate": 0.0002, "epoch": 3.27180600796236, "step": 4520}, {"loss": 1.0947, "grad_norm": 0.8179270029067993, "learning_rate": 0.0002, "epoch": 3.2790445168295332, "step": 4530}, {"loss": 1.1909, "grad_norm": 0.6718448996543884, "learning_rate": 0.0002, "epoch": 3.2862830256967066, "step": 4540}, {"loss": 1.2708, "grad_norm": 0.8300657868385315, "learning_rate": 0.0002, "epoch": 3.29352153456388, "step": 4550}, {"loss": 1.2594, "grad_norm": 0.6433690786361694, "learning_rate": 0.0002, "epoch": 3.3007600434310533, "step": 4560}, {"loss": 1.2479, "grad_norm": 0.690262496471405, "learning_rate": 0.0002, "epoch": 3.3079985522982267, "step": 4570}, {"loss": 1.1342, "grad_norm": 0.7022852301597595, "learning_rate": 0.0002, "epoch": 3.3152370611654, "step": 4580}, {"loss": 1.0844, "grad_norm": 0.6438387632369995, "learning_rate": 0.0002, "epoch": 3.3224755700325734, "step": 4590}, {"loss": 1.17, "grad_norm": 0.6866899132728577, "learning_rate": 0.0002, "epoch": 3.329714078899747, "step": 4600}, {"loss": 1.1289, "grad_norm": 0.8233968019485474, "learning_rate": 0.0002, "epoch": 3.33695258776692, "step": 4610}, {"loss": 1.1855, "grad_norm": 0.7251574993133545, "learning_rate": 0.0002, "epoch": 3.3441910966340935, "step": 4620}, {"loss": 1.3403, "grad_norm": 0.7855110168457031, "learning_rate": 0.0002, "epoch": 3.351429605501267, "step": 4630}, {"loss": 1.2922, "grad_norm": 0.8487356305122375, "learning_rate": 0.0002, "epoch": 3.3586681143684403, "step": 4640}, {"loss": 1.2462, "grad_norm": 0.6429011225700378, "learning_rate": 0.0002, "epoch": 3.3659066232356136, "step": 4650}, {"loss": 1.129, "grad_norm": 0.7095270156860352, "learning_rate": 0.0002, "epoch": 3.373145132102787, "step": 4660}, {"loss": 1.262, "grad_norm": 0.6792303323745728, "learning_rate": 0.0002, "epoch": 3.3803836409699604, "step": 4670}, {"loss": 1.256, "grad_norm": 0.6784825921058655, "learning_rate": 0.0002, "epoch": 3.3876221498371337, "step": 4680}, {"loss": 1.0838, "grad_norm": 0.6362888216972351, "learning_rate": 0.0002, "epoch": 3.394860658704307, "step": 4690}, {"loss": 1.2165, "grad_norm": 0.7794778943061829, "learning_rate": 0.0002, "epoch": 3.4020991675714805, "step": 4700}, {"loss": 1.0644, "grad_norm": 0.7287485003471375, "learning_rate": 0.0002, "epoch": 3.409337676438654, "step": 4710}, {"loss": 1.2925, "grad_norm": 0.6481451392173767, "learning_rate": 0.0002, "epoch": 3.416576185305827, "step": 4720}, {"loss": 1.2121, "grad_norm": 0.9200371503829956, "learning_rate": 0.0002, "epoch": 3.4238146941730006, "step": 4730}, {"loss": 1.072, "grad_norm": 1.074180245399475, "learning_rate": 0.0002, "epoch": 3.431053203040174, "step": 4740}, {"loss": 1.0421, "grad_norm": 0.6722986698150635, "learning_rate": 0.0002, "epoch": 3.438291711907347, "step": 4750}, {"loss": 1.2258, "grad_norm": 0.7945933938026428, "learning_rate": 0.0002, "epoch": 3.44553022077452, "step": 4760}, {"loss": 1.0927, "grad_norm": 0.7624640464782715, "learning_rate": 0.0002, "epoch": 3.4527687296416936, "step": 4770}, {"loss": 1.2428, "grad_norm": 0.7763656377792358, "learning_rate": 0.0002, "epoch": 3.460007238508867, "step": 4780}, {"loss": 1.2584, "grad_norm": 0.7736947536468506, "learning_rate": 0.0002, "epoch": 3.4672457473760403, "step": 4790}, {"loss": 1.1953, "grad_norm": 0.8450354933738708, "learning_rate": 0.0002, "epoch": 3.4744842562432137, "step": 4800}, {"loss": 1.1362, "grad_norm": 0.6480133533477783, "learning_rate": 0.0002, "epoch": 3.481722765110387, "step": 4810}, {"loss": 1.1882, "grad_norm": 0.8437445759773254, "learning_rate": 0.0002, "epoch": 3.4889612739775604, "step": 4820}, {"loss": 1.1519, "grad_norm": 0.7781730890274048, "learning_rate": 0.0002, "epoch": 3.4961997828447338, "step": 4830}, {"loss": 1.1836, "grad_norm": 0.8523228168487549, "learning_rate": 0.0002, "epoch": 3.503438291711907, "step": 4840}, {"loss": 1.1672, "grad_norm": 0.6236732006072998, "learning_rate": 0.0002, "epoch": 3.5106768005790805, "step": 4850}, {"loss": 1.1926, "grad_norm": 0.7500787377357483, "learning_rate": 0.0002, "epoch": 3.517915309446254, "step": 4860}, {"loss": 1.1998, "grad_norm": 0.7665374875068665, "learning_rate": 0.0002, "epoch": 3.5251538183134272, "step": 4870}, {"loss": 1.1551, "grad_norm": 0.787857711315155, "learning_rate": 0.0002, "epoch": 3.5323923271806006, "step": 4880}, {"loss": 1.2758, "grad_norm": 0.970595121383667, "learning_rate": 0.0002, "epoch": 3.539630836047774, "step": 4890}, {"loss": 1.1274, "grad_norm": 0.6409347057342529, "learning_rate": 0.0002, "epoch": 3.5468693449149473, "step": 4900}, {"loss": 1.1596, "grad_norm": 0.888551652431488, "learning_rate": 0.0002, "epoch": 3.5541078537821207, "step": 4910}, {"loss": 1.1644, "grad_norm": 1.0808377265930176, "learning_rate": 0.0002, "epoch": 3.561346362649294, "step": 4920}, {"loss": 1.2564, "grad_norm": 0.7501053214073181, "learning_rate": 0.0002, "epoch": 3.5685848715164674, "step": 4930}, {"loss": 1.2351, "grad_norm": 0.7375240325927734, "learning_rate": 0.0002, "epoch": 3.575823380383641, "step": 4940}, {"loss": 1.3568, "grad_norm": 0.7075039744377136, "learning_rate": 0.0002, "epoch": 3.583061889250814, "step": 4950}, {"loss": 1.3355, "grad_norm": 0.939337432384491, "learning_rate": 0.0002, "epoch": 3.5903003981179875, "step": 4960}, {"loss": 1.1722, "grad_norm": 0.6717396974563599, "learning_rate": 0.0002, "epoch": 3.597538906985161, "step": 4970}, {"loss": 1.1186, "grad_norm": 0.7141643762588501, "learning_rate": 0.0002, "epoch": 3.6047774158523342, "step": 4980}, {"loss": 1.1011, "grad_norm": 0.7109216451644897, "learning_rate": 0.0002, "epoch": 3.6120159247195076, "step": 4990}, {"loss": 1.2178, "grad_norm": 0.7020776867866516, "learning_rate": 0.0002, "epoch": 3.619254433586681, "step": 5000}, {"loss": 1.1939, "grad_norm": 0.7158873677253723, "learning_rate": 0.0002, "epoch": 3.6264929424538543, "step": 5010}, {"loss": 1.2624, "grad_norm": 0.7062035202980042, "learning_rate": 0.0002, "epoch": 3.6337314513210277, "step": 5020}, {"loss": 1.0224, "grad_norm": 0.7081155776977539, "learning_rate": 0.0002, "epoch": 3.640969960188201, "step": 5030}, {"loss": 1.2195, "grad_norm": 1.2210607528686523, "learning_rate": 0.0002, "epoch": 3.6482084690553744, "step": 5040}, {"loss": 1.2596, "grad_norm": 0.6650236248970032, "learning_rate": 0.0002, "epoch": 3.655446977922548, "step": 5050}, {"loss": 1.1072, "grad_norm": 0.6884829998016357, "learning_rate": 0.0002, "epoch": 3.662685486789721, "step": 5060}, {"loss": 1.2292, "grad_norm": 0.7317819595336914, "learning_rate": 0.0002, "epoch": 3.6699239956568945, "step": 5070}, {"loss": 1.1917, "grad_norm": 0.7406691908836365, "learning_rate": 0.0002, "epoch": 3.677162504524068, "step": 5080}, {"loss": 1.2949, "grad_norm": 0.9009454250335693, "learning_rate": 0.0002, "epoch": 3.6844010133912413, "step": 5090}, {"loss": 1.1528, "grad_norm": 0.8189385533332825, "learning_rate": 0.0002, "epoch": 3.6916395222584146, "step": 5100}, {"loss": 1.3408, "grad_norm": 1.0793628692626953, "learning_rate": 0.0002, "epoch": 3.698878031125588, "step": 5110}, {"loss": 1.2417, "grad_norm": 0.8593027591705322, "learning_rate": 0.0002, "epoch": 3.7061165399927614, "step": 5120}, {"loss": 1.2141, "grad_norm": 0.8481812477111816, "learning_rate": 0.0002, "epoch": 3.7133550488599347, "step": 5130}, {"loss": 1.125, "grad_norm": 0.6527451276779175, "learning_rate": 0.0002, "epoch": 3.720593557727108, "step": 5140}, {"loss": 1.1584, "grad_norm": 0.9220114350318909, "learning_rate": 0.0002, "epoch": 3.7278320665942815, "step": 5150}, {"loss": 1.2267, "grad_norm": 1.0842019319534302, "learning_rate": 0.0002, "epoch": 3.735070575461455, "step": 5160}, {"loss": 1.3083, "grad_norm": 0.965453565120697, "learning_rate": 0.0002, "epoch": 3.742309084328628, "step": 5170}, {"loss": 1.1772, "grad_norm": 0.9903319478034973, "learning_rate": 0.0002, "epoch": 3.7495475931958016, "step": 5180}, {"loss": 1.2515, "grad_norm": 0.7434818148612976, "learning_rate": 0.0002, "epoch": 3.756786102062975, "step": 5190}, {"loss": 1.2631, "grad_norm": 0.6717280745506287, "learning_rate": 0.0002, "epoch": 3.7640246109301483, "step": 5200}, {"loss": 1.2012, "grad_norm": 0.7754665613174438, "learning_rate": 0.0002, "epoch": 3.7712631197973217, "step": 5210}, {"loss": 1.305, "grad_norm": 1.028374433517456, "learning_rate": 0.0002, "epoch": 3.778501628664495, "step": 5220}, {"loss": 1.1866, "grad_norm": 0.6026996374130249, "learning_rate": 0.0002, "epoch": 3.7857401375316684, "step": 5230}, {"loss": 1.1901, "grad_norm": 0.6978490948677063, "learning_rate": 0.0002, "epoch": 3.7929786463988417, "step": 5240}, {"loss": 1.2576, "grad_norm": 0.7303446531295776, "learning_rate": 0.0002, "epoch": 3.800217155266015, "step": 5250}, {"loss": 1.3173, "grad_norm": 1.0734210014343262, "learning_rate": 0.0002, "epoch": 3.8074556641331885, "step": 5260}, {"loss": 1.1137, "grad_norm": 0.6383201479911804, "learning_rate": 0.0002, "epoch": 3.814694173000362, "step": 5270}, {"loss": 1.0904, "grad_norm": 0.7742630243301392, "learning_rate": 0.0002, "epoch": 3.821932681867535, "step": 5280}, {"loss": 1.2232, "grad_norm": 0.8477074503898621, "learning_rate": 0.0002, "epoch": 3.8291711907347086, "step": 5290}, {"loss": 1.2047, "grad_norm": 0.6675317883491516, "learning_rate": 0.0002, "epoch": 3.836409699601882, "step": 5300}, {"loss": 1.2275, "grad_norm": 0.7515445351600647, "learning_rate": 0.0002, "epoch": 3.8436482084690553, "step": 5310}, {"loss": 1.2569, "grad_norm": 1.1441220045089722, "learning_rate": 0.0002, "epoch": 3.8508867173362287, "step": 5320}, {"loss": 1.1512, "grad_norm": 0.7968795895576477, "learning_rate": 0.0002, "epoch": 3.858125226203402, "step": 5330}, {"loss": 1.232, "grad_norm": 0.7842824459075928, "learning_rate": 0.0002, "epoch": 3.8653637350705754, "step": 5340}, {"loss": 1.1847, "grad_norm": 0.8272225260734558, "learning_rate": 0.0002, "epoch": 3.8726022439377488, "step": 5350}, {"loss": 1.1381, "grad_norm": 0.8413397669792175, "learning_rate": 0.0002, "epoch": 3.879840752804922, "step": 5360}, {"loss": 1.2349, "grad_norm": 1.141764760017395, "learning_rate": 0.0002, "epoch": 3.8870792616720955, "step": 5370}, {"loss": 1.212, "grad_norm": 0.9826975464820862, "learning_rate": 0.0002, "epoch": 3.894317770539269, "step": 5380}, {"loss": 1.1833, "grad_norm": 0.8598255515098572, "learning_rate": 0.0002, "epoch": 3.9015562794064422, "step": 5390}, {"loss": 1.1247, "grad_norm": 0.6271058320999146, "learning_rate": 0.0002, "epoch": 3.9087947882736156, "step": 5400}, {"loss": 1.2212, "grad_norm": 0.6379870772361755, "learning_rate": 0.0002, "epoch": 3.916033297140789, "step": 5410}, {"loss": 1.2481, "grad_norm": 1.0313376188278198, "learning_rate": 0.0002, "epoch": 3.9232718060079623, "step": 5420}, {"loss": 1.1872, "grad_norm": 0.8220619559288025, "learning_rate": 0.0002, "epoch": 3.9305103148751357, "step": 5430}, {"loss": 1.2006, "grad_norm": 0.7576116919517517, "learning_rate": 0.0002, "epoch": 3.937748823742309, "step": 5440}, {"loss": 1.1969, "grad_norm": 1.226235032081604, "learning_rate": 0.0002, "epoch": 3.9449873326094824, "step": 5450}, {"loss": 1.2945, "grad_norm": 0.7979229688644409, "learning_rate": 0.0002, "epoch": 3.952225841476656, "step": 5460}, {"loss": 1.1922, "grad_norm": 0.9911929965019226, "learning_rate": 0.0002, "epoch": 3.959464350343829, "step": 5470}, {"loss": 1.0924, "grad_norm": 0.643738865852356, "learning_rate": 0.0002, "epoch": 3.9667028592110025, "step": 5480}, {"loss": 1.0607, "grad_norm": 0.682305634021759, "learning_rate": 0.0002, "epoch": 3.973941368078176, "step": 5490}, {"loss": 1.2908, "grad_norm": 1.18373441696167, "learning_rate": 0.0002, "epoch": 3.9811798769453492, "step": 5500}, {"loss": 1.0889, "grad_norm": 0.7190203070640564, "learning_rate": 0.0002, "epoch": 3.9884183858125226, "step": 5510}, {"loss": 1.2745, "grad_norm": 0.7516948580741882, "learning_rate": 0.0002, "epoch": 3.995656894679696, "step": 5520}, {"eval_loss": 1.4252897500991821, "eval_runtime": 27.235, "eval_samples_per_second": 16.009, "eval_steps_per_second": 2.019, "epoch": 4.0, "step": 5526}, {"loss": 1.0088, "grad_norm": 0.6353074312210083, "learning_rate": 0.0002, "epoch": 4.002895403546869, "step": 5530}, {"loss": 1.0326, "grad_norm": 0.7424906492233276, "learning_rate": 0.0002, "epoch": 4.010133912414043, "step": 5540}, {"loss": 1.0667, "grad_norm": 0.8856638073921204, "learning_rate": 0.0002, "epoch": 4.017372421281216, "step": 5550}, {"loss": 1.0905, "grad_norm": 0.9627974033355713, "learning_rate": 0.0002, "epoch": 4.024610930148389, "step": 5560}, {"loss": 1.0965, "grad_norm": 0.9048978686332703, "learning_rate": 0.0002, "epoch": 4.031849439015563, "step": 5570}, {"loss": 1.1108, "grad_norm": 0.921119213104248, "learning_rate": 0.0002, "epoch": 4.039087947882736, "step": 5580}, {"loss": 1.1235, "grad_norm": 0.8654361963272095, "learning_rate": 0.0002, "epoch": 4.0463264567499095, "step": 5590}, {"loss": 1.0794, "grad_norm": 0.7947945594787598, "learning_rate": 0.0002, "epoch": 4.053564965617083, "step": 5600}, {"loss": 1.0674, "grad_norm": 0.8307326436042786, "learning_rate": 0.0002, "epoch": 4.060803474484256, "step": 5610}, {"loss": 1.0076, "grad_norm": 0.793273389339447, "learning_rate": 0.0002, "epoch": 4.06804198335143, "step": 5620}, {"loss": 1.0651, "grad_norm": 0.8748673796653748, "learning_rate": 0.0002, "epoch": 4.075280492218603, "step": 5630}, {"loss": 1.111, "grad_norm": 0.7926856279373169, "learning_rate": 0.0002, "epoch": 4.082519001085776, "step": 5640}, {"loss": 1.044, "grad_norm": 0.922645092010498, "learning_rate": 0.0002, "epoch": 4.08975750995295, "step": 5650}, {"loss": 1.109, "grad_norm": 0.9539641737937927, "learning_rate": 0.0002, "epoch": 4.096996018820123, "step": 5660}, {"loss": 1.0788, "grad_norm": 0.8674443364143372, "learning_rate": 0.0002, "epoch": 4.1042345276872965, "step": 5670}, {"loss": 0.9867, "grad_norm": 0.7097609043121338, "learning_rate": 0.0002, "epoch": 4.11147303655447, "step": 5680}, {"loss": 1.1154, "grad_norm": 0.8875522613525391, "learning_rate": 0.0002, "epoch": 4.118711545421643, "step": 5690}, {"loss": 1.1217, "grad_norm": 0.8583634495735168, "learning_rate": 0.0002, "epoch": 4.125950054288817, "step": 5700}, {"loss": 1.0973, "grad_norm": 0.6736377477645874, "learning_rate": 0.0002, "epoch": 4.13318856315599, "step": 5710}, {"loss": 1.1199, "grad_norm": 0.9349062442779541, "learning_rate": 0.0002, "epoch": 4.140427072023163, "step": 5720}, {"loss": 1.0508, "grad_norm": 1.0610365867614746, "learning_rate": 0.0002, "epoch": 4.147665580890337, "step": 5730}, {"loss": 1.1146, "grad_norm": 1.5838189125061035, "learning_rate": 0.0002, "epoch": 4.15490408975751, "step": 5740}, {"loss": 1.0222, "grad_norm": 0.747522234916687, "learning_rate": 0.0002, "epoch": 4.162142598624683, "step": 5750}, {"loss": 1.1328, "grad_norm": 1.3247915506362915, "learning_rate": 0.0002, "epoch": 4.169381107491857, "step": 5760}, {"loss": 1.1655, "grad_norm": 0.8750247955322266, "learning_rate": 0.0002, "epoch": 4.17661961635903, "step": 5770}, {"loss": 1.199, "grad_norm": 0.7914144992828369, "learning_rate": 0.0002, "epoch": 4.1838581252262035, "step": 5780}, {"loss": 1.1213, "grad_norm": 0.9493299126625061, "learning_rate": 0.0002, "epoch": 4.191096634093377, "step": 5790}, {"loss": 1.1515, "grad_norm": 0.7802295088768005, "learning_rate": 0.0002, "epoch": 4.19833514296055, "step": 5800}, {"loss": 1.0704, "grad_norm": 0.6987314820289612, "learning_rate": 0.0002, "epoch": 4.205573651827724, "step": 5810}, {"loss": 1.1699, "grad_norm": 0.9220341444015503, "learning_rate": 0.0002, "epoch": 4.212812160694897, "step": 5820}, {"loss": 1.1394, "grad_norm": 0.8932939767837524, "learning_rate": 0.0002, "epoch": 4.22005066956207, "step": 5830}, {"loss": 1.0048, "grad_norm": 0.920002818107605, "learning_rate": 0.0002, "epoch": 4.227289178429244, "step": 5840}, {"loss": 0.964, "grad_norm": 0.6662752032279968, "learning_rate": 0.0002, "epoch": 4.234527687296417, "step": 5850}, {"loss": 0.986, "grad_norm": 0.8679718971252441, "learning_rate": 0.0002, "epoch": 4.24176619616359, "step": 5860}, {"loss": 0.8991, "grad_norm": 0.7020887732505798, "learning_rate": 0.0002, "epoch": 4.249004705030764, "step": 5870}, {"loss": 1.1132, "grad_norm": 0.869611382484436, "learning_rate": 0.0002, "epoch": 4.256243213897937, "step": 5880}, {"loss": 1.1026, "grad_norm": 0.7796585559844971, "learning_rate": 0.0002, "epoch": 4.2634817227651105, "step": 5890}, {"loss": 1.0957, "grad_norm": 0.8978819251060486, "learning_rate": 0.0002, "epoch": 4.270720231632284, "step": 5900}, {"loss": 1.1325, "grad_norm": 1.0837205648422241, "learning_rate": 0.0002, "epoch": 4.277958740499457, "step": 5910}, {"loss": 1.1279, "grad_norm": 0.7584353089332581, "learning_rate": 0.0002, "epoch": 4.285197249366631, "step": 5920}, {"loss": 1.0513, "grad_norm": 0.7313185334205627, "learning_rate": 0.0002, "epoch": 4.292435758233804, "step": 5930}, {"loss": 1.1101, "grad_norm": 0.8004671335220337, "learning_rate": 0.0002, "epoch": 4.299674267100977, "step": 5940}, {"loss": 1.14, "grad_norm": 2.154958724975586, "learning_rate": 0.0002, "epoch": 4.306912775968151, "step": 5950}, {"loss": 1.1206, "grad_norm": 0.9163479804992676, "learning_rate": 0.0002, "epoch": 4.314151284835324, "step": 5960}, {"loss": 0.9941, "grad_norm": 0.9151589274406433, "learning_rate": 0.0002, "epoch": 4.321389793702497, "step": 5970}, {"loss": 1.0606, "grad_norm": 0.8624112010002136, "learning_rate": 0.0002, "epoch": 4.328628302569671, "step": 5980}, {"loss": 1.1625, "grad_norm": 0.9357741475105286, "learning_rate": 0.0002, "epoch": 4.335866811436844, "step": 5990}, {"loss": 1.0712, "grad_norm": 1.3482335805892944, "learning_rate": 0.0002, "epoch": 4.3431053203040175, "step": 6000}, {"loss": 1.1224, "grad_norm": 0.7156149744987488, "learning_rate": 0.0002, "epoch": 4.350343829171191, "step": 6010}, {"loss": 1.0753, "grad_norm": 0.8480049967765808, "learning_rate": 0.0002, "epoch": 4.357582338038364, "step": 6020}, {"loss": 1.051, "grad_norm": 0.8262244462966919, "learning_rate": 0.0002, "epoch": 4.364820846905538, "step": 6030}, {"loss": 0.9966, "grad_norm": 0.7733905911445618, "learning_rate": 0.0002, "epoch": 4.372059355772711, "step": 6040}, {"loss": 1.1008, "grad_norm": 0.8553919792175293, "learning_rate": 0.0002, "epoch": 4.379297864639884, "step": 6050}, {"loss": 1.1777, "grad_norm": 0.8666832447052002, "learning_rate": 0.0002, "epoch": 4.386536373507058, "step": 6060}, {"loss": 1.1934, "grad_norm": 0.9168295860290527, "learning_rate": 0.0002, "epoch": 4.393774882374231, "step": 6070}, {"loss": 1.0988, "grad_norm": 0.7315238118171692, "learning_rate": 0.0002, "epoch": 4.4010133912414044, "step": 6080}, {"loss": 1.1599, "grad_norm": 1.020263433456421, "learning_rate": 0.0002, "epoch": 4.408251900108578, "step": 6090}, {"loss": 1.133, "grad_norm": 0.9978243708610535, "learning_rate": 0.0002, "epoch": 4.415490408975751, "step": 6100}, {"loss": 1.1324, "grad_norm": 0.995453953742981, "learning_rate": 0.0002, "epoch": 4.4227289178429245, "step": 6110}, {"loss": 1.0957, "grad_norm": 0.9360884428024292, "learning_rate": 0.0002, "epoch": 4.429967426710098, "step": 6120}, {"loss": 0.9506, "grad_norm": 0.8099448084831238, "learning_rate": 0.0002, "epoch": 4.437205935577271, "step": 6130}, {"loss": 1.0887, "grad_norm": 0.8173841238021851, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 6140}, {"loss": 1.1219, "grad_norm": 0.7972666025161743, "learning_rate": 0.0002, "epoch": 4.451682953311618, "step": 6150}, {"loss": 1.0226, "grad_norm": 0.7685779333114624, "learning_rate": 0.0002, "epoch": 4.458921462178791, "step": 6160}, {"loss": 1.0732, "grad_norm": 0.7872623801231384, "learning_rate": 0.0002, "epoch": 4.466159971045965, "step": 6170}, {"loss": 0.9911, "grad_norm": 0.7677070498466492, "learning_rate": 0.0002, "epoch": 4.473398479913138, "step": 6180}, {"loss": 1.0919, "grad_norm": 0.7878316044807434, "learning_rate": 0.0002, "epoch": 4.4806369887803115, "step": 6190}, {"loss": 1.018, "grad_norm": 0.8178079724311829, "learning_rate": 0.0002, "epoch": 4.487875497647485, "step": 6200}, {"loss": 1.0517, "grad_norm": 1.2820082902908325, "learning_rate": 0.0002, "epoch": 4.495114006514658, "step": 6210}, {"loss": 1.3101, "grad_norm": 0.9380832314491272, "learning_rate": 0.0002, "epoch": 4.502352515381832, "step": 6220}, {"loss": 0.9818, "grad_norm": 0.7810422778129578, "learning_rate": 0.0002, "epoch": 4.509591024249005, "step": 6230}, {"loss": 1.1677, "grad_norm": 1.1022917032241821, "learning_rate": 0.0002, "epoch": 4.516829533116178, "step": 6240}, {"loss": 1.1579, "grad_norm": 1.4275553226470947, "learning_rate": 0.0002, "epoch": 4.524068041983352, "step": 6250}, {"loss": 1.3237, "grad_norm": 0.7597777247428894, "learning_rate": 0.0002, "epoch": 4.531306550850525, "step": 6260}, {"loss": 1.1529, "grad_norm": 1.10992431640625, "learning_rate": 0.0002, "epoch": 4.538545059717698, "step": 6270}, {"loss": 1.0732, "grad_norm": 0.8981178998947144, "learning_rate": 0.0002, "epoch": 4.545783568584872, "step": 6280}, {"loss": 1.086, "grad_norm": 0.7863979339599609, "learning_rate": 0.0002, "epoch": 4.553022077452045, "step": 6290}, {"loss": 1.2008, "grad_norm": 0.9071474671363831, "learning_rate": 0.0002, "epoch": 4.5602605863192185, "step": 6300}, {"loss": 1.0916, "grad_norm": 0.7429424524307251, "learning_rate": 0.0002, "epoch": 4.567499095186392, "step": 6310}, {"loss": 1.095, "grad_norm": 1.0767850875854492, "learning_rate": 0.0002, "epoch": 4.574737604053565, "step": 6320}, {"loss": 1.1023, "grad_norm": 0.7885915637016296, "learning_rate": 0.0002, "epoch": 4.581976112920739, "step": 6330}, {"loss": 1.1131, "grad_norm": 0.8350457549095154, "learning_rate": 0.0002, "epoch": 4.589214621787912, "step": 6340}, {"loss": 1.0743, "grad_norm": 0.7853530645370483, "learning_rate": 0.0002, "epoch": 4.596453130655085, "step": 6350}, {"loss": 1.1912, "grad_norm": 1.1220661401748657, "learning_rate": 0.0002, "epoch": 4.603691639522259, "step": 6360}, {"loss": 1.0927, "grad_norm": 0.7959423065185547, "learning_rate": 0.0002, "epoch": 4.610930148389432, "step": 6370}, {"loss": 1.1542, "grad_norm": 0.7782652378082275, "learning_rate": 0.0002, "epoch": 4.618168657256605, "step": 6380}, {"loss": 1.0753, "grad_norm": 0.7882203459739685, "learning_rate": 0.0002, "epoch": 4.625407166123779, "step": 6390}, {"loss": 1.0676, "grad_norm": 0.8841899037361145, "learning_rate": 0.0002, "epoch": 4.632645674990952, "step": 6400}, {"loss": 1.0815, "grad_norm": 0.7936127781867981, "learning_rate": 0.0002, "epoch": 4.6398841838581255, "step": 6410}, {"loss": 1.0198, "grad_norm": 0.9213966131210327, "learning_rate": 0.0002, "epoch": 4.647122692725299, "step": 6420}, {"loss": 0.9872, "grad_norm": 0.9246473908424377, "learning_rate": 0.0002, "epoch": 4.654361201592472, "step": 6430}, {"loss": 1.1309, "grad_norm": 0.766572892665863, "learning_rate": 0.0002, "epoch": 4.661599710459646, "step": 6440}, {"loss": 1.1095, "grad_norm": 0.8596171736717224, "learning_rate": 0.0002, "epoch": 4.668838219326819, "step": 6450}, {"loss": 1.1869, "grad_norm": 0.8482751846313477, "learning_rate": 0.0002, "epoch": 4.676076728193992, "step": 6460}, {"loss": 1.0622, "grad_norm": 1.0826905965805054, "learning_rate": 0.0002, "epoch": 4.683315237061166, "step": 6470}, {"loss": 1.0256, "grad_norm": 1.1048457622528076, "learning_rate": 0.0002, "epoch": 4.690553745928339, "step": 6480}, {"loss": 1.0514, "grad_norm": 0.9429134726524353, "learning_rate": 0.0002, "epoch": 4.697792254795512, "step": 6490}, {"loss": 1.1351, "grad_norm": 0.8587502837181091, "learning_rate": 0.0002, "epoch": 4.705030763662686, "step": 6500}, {"loss": 1.0969, "grad_norm": 1.0387083292007446, "learning_rate": 0.0002, "epoch": 4.712269272529859, "step": 6510}, {"loss": 1.0493, "grad_norm": 0.7471951842308044, "learning_rate": 0.0002, "epoch": 4.7195077813970325, "step": 6520}, {"loss": 1.2632, "grad_norm": 0.8800424933433533, "learning_rate": 0.0002, "epoch": 4.726746290264206, "step": 6530}, {"loss": 1.2126, "grad_norm": 0.8136811852455139, "learning_rate": 0.0002, "epoch": 4.733984799131379, "step": 6540}, {"loss": 1.195, "grad_norm": 0.9910339713096619, "learning_rate": 0.0002, "epoch": 4.741223307998553, "step": 6550}, {"loss": 1.1201, "grad_norm": 1.0679163932800293, "learning_rate": 0.0002, "epoch": 4.748461816865726, "step": 6560}, {"loss": 1.0297, "grad_norm": 0.8468248248100281, "learning_rate": 0.0002, "epoch": 4.755700325732899, "step": 6570}, {"loss": 1.0858, "grad_norm": 0.8771235942840576, "learning_rate": 0.0002, "epoch": 4.762938834600073, "step": 6580}, {"loss": 1.077, "grad_norm": 0.7024846076965332, "learning_rate": 0.0002, "epoch": 4.770177343467246, "step": 6590}, {"loss": 1.0876, "grad_norm": 0.7836683392524719, "learning_rate": 0.0002, "epoch": 4.7774158523344195, "step": 6600}, {"loss": 1.1006, "grad_norm": 0.7717288136482239, "learning_rate": 0.0002, "epoch": 4.784654361201593, "step": 6610}, {"loss": 1.0376, "grad_norm": 0.884183943271637, "learning_rate": 0.0002, "epoch": 4.791892870068766, "step": 6620}, {"loss": 1.1757, "grad_norm": 1.383867621421814, "learning_rate": 0.0002, "epoch": 4.7991313789359396, "step": 6630}, {"loss": 1.0861, "grad_norm": 0.9741523861885071, "learning_rate": 0.0002, "epoch": 4.806369887803113, "step": 6640}, {"loss": 1.0884, "grad_norm": 0.9723693132400513, "learning_rate": 0.0002, "epoch": 4.813608396670286, "step": 6650}, {"loss": 1.2203, "grad_norm": 1.8324809074401855, "learning_rate": 0.0002, "epoch": 4.82084690553746, "step": 6660}, {"loss": 1.0292, "grad_norm": 0.904909074306488, "learning_rate": 0.0002, "epoch": 4.828085414404633, "step": 6670}, {"loss": 1.0349, "grad_norm": 0.7355411648750305, "learning_rate": 0.0002, "epoch": 4.835323923271806, "step": 6680}, {"loss": 1.0793, "grad_norm": 0.8934960961341858, "learning_rate": 0.0002, "epoch": 4.84256243213898, "step": 6690}, {"loss": 1.0375, "grad_norm": 1.4596954584121704, "learning_rate": 0.0002, "epoch": 4.849800941006153, "step": 6700}, {"loss": 1.1065, "grad_norm": 0.8310341238975525, "learning_rate": 0.0002, "epoch": 4.8570394498733265, "step": 6710}, {"loss": 1.1089, "grad_norm": 0.9709894061088562, "learning_rate": 0.0002, "epoch": 4.8642779587405, "step": 6720}, {"loss": 1.0069, "grad_norm": 0.852142333984375, "learning_rate": 0.0002, "epoch": 4.871516467607673, "step": 6730}, {"loss": 1.0507, "grad_norm": 1.0643625259399414, "learning_rate": 0.0002, "epoch": 4.878754976474847, "step": 6740}, {"loss": 1.056, "grad_norm": 0.9419508576393127, "learning_rate": 0.0002, "epoch": 4.88599348534202, "step": 6750}, {"loss": 1.1995, "grad_norm": 1.1818498373031616, "learning_rate": 0.0002, "epoch": 4.893231994209193, "step": 6760}, {"loss": 1.0925, "grad_norm": 0.9369569420814514, "learning_rate": 0.0002, "epoch": 4.900470503076367, "step": 6770}, {"loss": 1.1648, "grad_norm": 0.7012579441070557, "learning_rate": 0.0002, "epoch": 4.90770901194354, "step": 6780}, {"loss": 1.0926, "grad_norm": 0.9109319448471069, "learning_rate": 0.0002, "epoch": 4.914947520810713, "step": 6790}, {"loss": 1.0358, "grad_norm": 0.8077534437179565, "learning_rate": 0.0002, "epoch": 4.922186029677887, "step": 6800}, {"loss": 1.2549, "grad_norm": 0.7571148872375488, "learning_rate": 0.0002, "epoch": 4.92942453854506, "step": 6810}, {"loss": 0.9638, "grad_norm": 0.7325633764266968, "learning_rate": 0.0002, "epoch": 4.9366630474122335, "step": 6820}, {"loss": 1.0128, "grad_norm": 0.8465084433555603, "learning_rate": 0.0002, "epoch": 4.943901556279407, "step": 6830}, {"loss": 1.153, "grad_norm": 0.8753737807273865, "learning_rate": 0.0002, "epoch": 4.95114006514658, "step": 6840}, {"loss": 1.0247, "grad_norm": 0.9421748518943787, "learning_rate": 0.0002, "epoch": 4.958378574013754, "step": 6850}, {"loss": 1.1483, "grad_norm": 0.8245896697044373, "learning_rate": 0.0002, "epoch": 4.965617082880927, "step": 6860}, {"loss": 0.9905, "grad_norm": 0.8823089599609375, "learning_rate": 0.0002, "epoch": 4.9728555917481, "step": 6870}, {"loss": 1.1664, "grad_norm": 0.8406389355659485, "learning_rate": 0.0002, "epoch": 4.980094100615274, "step": 6880}, {"loss": 1.0944, "grad_norm": 0.9732868075370789, "learning_rate": 0.0002, "epoch": 4.987332609482447, "step": 6890}, {"loss": 1.1776, "grad_norm": 2.125141143798828, "learning_rate": 0.0002, "epoch": 4.99457111834962, "step": 6900}, {"eval_loss": 1.445176601409912, "eval_runtime": 27.2351, "eval_samples_per_second": 16.009, "eval_steps_per_second": 2.019, "epoch": 4.999638074556641, "step": 6907}, {"loss": 1.1362, "grad_norm": 0.9465792775154114, "learning_rate": 0.0002, "epoch": 5.001809627216793, "step": 6910}, {"loss": 0.982, "grad_norm": 1.2834891080856323, "learning_rate": 0.0002, "epoch": 5.009048136083966, "step": 6920}, {"loss": 0.9803, "grad_norm": 1.0297378301620483, "learning_rate": 0.0002, "epoch": 5.01628664495114, "step": 6930}, {"loss": 1.0447, "grad_norm": 1.1705161333084106, "learning_rate": 0.0002, "epoch": 5.023525153818313, "step": 6940}, {"loss": 1.0113, "grad_norm": 0.8293961882591248, "learning_rate": 0.0002, "epoch": 5.030763662685486, "step": 6950}, {"loss": 0.9203, "grad_norm": 1.0422210693359375, "learning_rate": 0.0002, "epoch": 5.03800217155266, "step": 6960}, {"loss": 1.0553, "grad_norm": 1.116104245185852, "learning_rate": 0.0002, "epoch": 5.045240680419833, "step": 6970}, {"loss": 0.9011, "grad_norm": 1.5118416547775269, "learning_rate": 0.0002, "epoch": 5.0524791892870065, "step": 6980}, {"loss": 0.9969, "grad_norm": 0.8383979797363281, "learning_rate": 0.0002, "epoch": 5.05971769815418, "step": 6990}, {"loss": 0.9659, "grad_norm": 1.3378649950027466, "learning_rate": 0.0002, "epoch": 5.066956207021353, "step": 7000}, {"loss": 1.0212, "grad_norm": 1.1840510368347168, "learning_rate": 0.0002, "epoch": 5.0741947158885266, "step": 7010}, {"loss": 0.9939, "grad_norm": 1.2354751825332642, "learning_rate": 0.0002, "epoch": 5.0814332247557, "step": 7020}, {"loss": 0.9831, "grad_norm": 1.3830451965332031, "learning_rate": 0.0002, "epoch": 5.088671733622873, "step": 7030}, {"loss": 1.1827, "grad_norm": 0.8101674318313599, "learning_rate": 0.0002, "epoch": 5.095910242490047, "step": 7040}, {"loss": 0.9255, "grad_norm": 0.897982656955719, "learning_rate": 0.0002, "epoch": 5.10314875135722, "step": 7050}, {"loss": 0.8784, "grad_norm": 1.2049678564071655, "learning_rate": 0.0002, "epoch": 5.110387260224393, "step": 7060}, {"loss": 1.0182, "grad_norm": 1.5912116765975952, "learning_rate": 0.0002, "epoch": 5.117625769091567, "step": 7070}, {"loss": 1.0909, "grad_norm": 0.9261530041694641, "learning_rate": 0.0002, "epoch": 5.12486427795874, "step": 7080}, {"loss": 0.9603, "grad_norm": 1.1454812288284302, "learning_rate": 0.0002, "epoch": 5.1321027868259135, "step": 7090}, {"loss": 0.9149, "grad_norm": 1.0049978494644165, "learning_rate": 0.0002, "epoch": 5.139341295693087, "step": 7100}, {"loss": 0.9463, "grad_norm": 1.4513251781463623, "learning_rate": 0.0002, "epoch": 5.14657980456026, "step": 7110}, {"loss": 0.8995, "grad_norm": 0.9800849556922913, "learning_rate": 0.0002, "epoch": 5.153818313427434, "step": 7120}, {"loss": 0.9835, "grad_norm": 0.9698708653450012, "learning_rate": 0.0002, "epoch": 5.161056822294607, "step": 7130}, {"loss": 0.9672, "grad_norm": 1.1126646995544434, "learning_rate": 0.0002, "epoch": 5.16829533116178, "step": 7140}, {"loss": 0.9384, "grad_norm": 0.9248330593109131, "learning_rate": 0.0002, "epoch": 5.175533840028954, "step": 7150}, {"loss": 0.826, "grad_norm": 0.7967255711555481, "learning_rate": 0.0002, "epoch": 5.182772348896127, "step": 7160}, {"loss": 1.0078, "grad_norm": 0.9933333992958069, "learning_rate": 0.0002, "epoch": 5.1900108577633, "step": 7170}, {"loss": 1.0276, "grad_norm": 1.0080649852752686, "learning_rate": 0.0002, "epoch": 5.197249366630474, "step": 7180}, {"loss": 1.0201, "grad_norm": 1.3954921960830688, "learning_rate": 0.0002, "epoch": 5.204487875497647, "step": 7190}, {"loss": 1.0863, "grad_norm": 1.2386271953582764, "learning_rate": 0.0002, "epoch": 5.2117263843648205, "step": 7200}, {"loss": 0.8863, "grad_norm": 1.2379488945007324, "learning_rate": 0.0002, "epoch": 5.218964893231994, "step": 7210}, {"loss": 1.0518, "grad_norm": 0.9882503747940063, "learning_rate": 0.0002, "epoch": 5.226203402099167, "step": 7220}, {"loss": 0.9834, "grad_norm": 1.1728729009628296, "learning_rate": 0.0002, "epoch": 5.233441910966341, "step": 7230}, {"loss": 0.9269, "grad_norm": 0.9849673509597778, "learning_rate": 0.0002, "epoch": 5.240680419833514, "step": 7240}, {"loss": 0.9935, "grad_norm": 1.177639365196228, "learning_rate": 0.0002, "epoch": 5.247918928700687, "step": 7250}, {"loss": 1.0639, "grad_norm": 1.2395055294036865, "learning_rate": 0.0002, "epoch": 5.255157437567861, "step": 7260}, {"loss": 1.0138, "grad_norm": 1.3999171257019043, "learning_rate": 0.0002, "epoch": 5.262395946435034, "step": 7270}, {"loss": 0.9745, "grad_norm": 0.7698732018470764, "learning_rate": 0.0002, "epoch": 5.269634455302207, "step": 7280}, {"loss": 1.0389, "grad_norm": 0.9167453646659851, "learning_rate": 0.0002, "epoch": 5.276872964169381, "step": 7290}, {"loss": 0.9858, "grad_norm": 1.113830804824829, "learning_rate": 0.0002, "epoch": 5.284111473036554, "step": 7300}, {"loss": 0.9577, "grad_norm": 0.9644396901130676, "learning_rate": 0.0002, "epoch": 5.2913499819037275, "step": 7310}, {"loss": 1.0556, "grad_norm": 1.462435007095337, "learning_rate": 0.0002, "epoch": 5.298588490770901, "step": 7320}, {"loss": 0.871, "grad_norm": 0.9406287670135498, "learning_rate": 0.0002, "epoch": 5.305826999638074, "step": 7330}, {"loss": 1.0022, "grad_norm": 0.9698247909545898, "learning_rate": 0.0002, "epoch": 5.313065508505248, "step": 7340}, {"loss": 0.915, "grad_norm": 1.12003755569458, "learning_rate": 0.0002, "epoch": 5.320304017372421, "step": 7350}, {"loss": 0.9838, "grad_norm": 1.598681926727295, "learning_rate": 0.0002, "epoch": 5.327542526239594, "step": 7360}, {"loss": 1.0, "grad_norm": 1.0450010299682617, "learning_rate": 0.0002, "epoch": 5.334781035106768, "step": 7370}, {"loss": 0.9983, "grad_norm": 0.8680008053779602, "learning_rate": 0.0002, "epoch": 5.342019543973941, "step": 7380}, {"loss": 0.9851, "grad_norm": 1.0115476846694946, "learning_rate": 0.0002, "epoch": 5.349258052841114, "step": 7390}, {"loss": 1.0702, "grad_norm": 0.9589748382568359, "learning_rate": 0.0002, "epoch": 5.356496561708288, "step": 7400}, {"loss": 0.9366, "grad_norm": 0.6729998588562012, "learning_rate": 0.0002, "epoch": 5.363735070575461, "step": 7410}, {"loss": 1.0126, "grad_norm": 0.9246699213981628, "learning_rate": 0.0002, "epoch": 5.3709735794426345, "step": 7420}, {"loss": 0.9815, "grad_norm": 1.1266791820526123, "learning_rate": 0.0002, "epoch": 5.378212088309808, "step": 7430}, {"loss": 1.1166, "grad_norm": 1.8056942224502563, "learning_rate": 0.0002, "epoch": 5.385450597176981, "step": 7440}, {"loss": 0.9604, "grad_norm": 0.9802932739257812, "learning_rate": 0.0002, "epoch": 5.392689106044155, "step": 7450}, {"loss": 0.9656, "grad_norm": 1.0504707098007202, "learning_rate": 0.0002, "epoch": 5.399927614911328, "step": 7460}, {"loss": 1.0132, "grad_norm": 1.1915022134780884, "learning_rate": 0.0002, "epoch": 5.407166123778501, "step": 7470}, {"loss": 1.0041, "grad_norm": 1.1856611967086792, "learning_rate": 0.0002, "epoch": 5.414404632645675, "step": 7480}, {"loss": 0.9747, "grad_norm": 1.292152762413025, "learning_rate": 0.0002, "epoch": 5.421643141512848, "step": 7490}, {"loss": 0.9659, "grad_norm": 1.2675740718841553, "learning_rate": 0.0002, "epoch": 5.4288816503800215, "step": 7500}, {"loss": 1.0271, "grad_norm": 1.4034695625305176, "learning_rate": 0.0002, "epoch": 5.436120159247195, "step": 7510}, {"loss": 1.0318, "grad_norm": 0.984588623046875, "learning_rate": 0.0002, "epoch": 5.443358668114368, "step": 7520}, {"loss": 1.0726, "grad_norm": 0.8419108390808105, "learning_rate": 0.0002, "epoch": 5.450597176981542, "step": 7530}, {"loss": 1.0499, "grad_norm": 1.0270143747329712, "learning_rate": 0.0002, "epoch": 5.457835685848715, "step": 7540}, {"loss": 0.9804, "grad_norm": 2.2158689498901367, "learning_rate": 0.0002, "epoch": 5.465074194715888, "step": 7550}, {"loss": 0.9856, "grad_norm": 1.0740524530410767, "learning_rate": 0.0002, "epoch": 5.472312703583062, "step": 7560}, {"loss": 1.0522, "grad_norm": 1.3804482221603394, "learning_rate": 0.0002, "epoch": 5.479551212450235, "step": 7570}, {"loss": 1.0297, "grad_norm": 0.9428979754447937, "learning_rate": 0.0002, "epoch": 5.486789721317408, "step": 7580}, {"loss": 1.0906, "grad_norm": 0.9548295736312866, "learning_rate": 0.0002, "epoch": 5.494028230184582, "step": 7590}, {"loss": 0.8853, "grad_norm": 1.0691065788269043, "learning_rate": 0.0002, "epoch": 5.501266739051755, "step": 7600}, {"loss": 1.0375, "grad_norm": 1.0987380743026733, "learning_rate": 0.0002, "epoch": 5.5085052479189285, "step": 7610}, {"loss": 1.0162, "grad_norm": 0.9483979344367981, "learning_rate": 0.0002, "epoch": 5.515743756786102, "step": 7620}, {"loss": 1.105, "grad_norm": 1.16624915599823, "learning_rate": 0.0002, "epoch": 5.522982265653275, "step": 7630}, {"loss": 0.8695, "grad_norm": 0.8563777208328247, "learning_rate": 0.0002, "epoch": 5.530220774520449, "step": 7640}, {"loss": 0.9297, "grad_norm": 1.268186092376709, "learning_rate": 0.0002, "epoch": 5.537459283387622, "step": 7650}, {"loss": 1.1152, "grad_norm": 1.0752092599868774, "learning_rate": 0.0002, "epoch": 5.544697792254795, "step": 7660}, {"loss": 0.9344, "grad_norm": 1.210389256477356, "learning_rate": 0.0002, "epoch": 5.551936301121969, "step": 7670}, {"loss": 1.0349, "grad_norm": 1.669063925743103, "learning_rate": 0.0002, "epoch": 5.559174809989142, "step": 7680}, {"loss": 0.9833, "grad_norm": 1.038020133972168, "learning_rate": 0.0002, "epoch": 5.566413318856315, "step": 7690}, {"loss": 0.8907, "grad_norm": 1.316673994064331, "learning_rate": 0.0002, "epoch": 5.573651827723489, "step": 7700}, {"loss": 0.9614, "grad_norm": 1.029935359954834, "learning_rate": 0.0002, "epoch": 5.580890336590662, "step": 7710}, {"loss": 1.0409, "grad_norm": 0.9401940703392029, "learning_rate": 0.0002, "epoch": 5.5881288454578355, "step": 7720}, {"loss": 0.9272, "grad_norm": 2.4811816215515137, "learning_rate": 0.0002, "epoch": 5.595367354325009, "step": 7730}, {"loss": 0.992, "grad_norm": 1.0329105854034424, "learning_rate": 0.0002, "epoch": 5.602605863192182, "step": 7740}, {"loss": 0.9493, "grad_norm": 1.479629635810852, "learning_rate": 0.0002, "epoch": 5.609844372059356, "step": 7750}, {"loss": 1.0727, "grad_norm": 1.9232319593429565, "learning_rate": 0.0002, "epoch": 5.617082880926529, "step": 7760}, {"loss": 1.0741, "grad_norm": 1.0055509805679321, "learning_rate": 0.0002, "epoch": 5.624321389793702, "step": 7770}, {"loss": 1.0731, "grad_norm": 1.0037437677383423, "learning_rate": 0.0002, "epoch": 5.631559898660876, "step": 7780}, {"loss": 1.0913, "grad_norm": 1.4245030879974365, "learning_rate": 0.0002, "epoch": 5.638798407528049, "step": 7790}, {"loss": 0.9711, "grad_norm": 1.080687403678894, "learning_rate": 0.0002, "epoch": 5.646036916395222, "step": 7800}, {"loss": 1.0276, "grad_norm": 1.354953408241272, "learning_rate": 0.0002, "epoch": 5.653275425262396, "step": 7810}, {"loss": 1.0534, "grad_norm": 0.8966761231422424, "learning_rate": 0.0002, "epoch": 5.660513934129569, "step": 7820}, {"loss": 1.0662, "grad_norm": 1.0675480365753174, "learning_rate": 0.0002, "epoch": 5.6677524429967425, "step": 7830}, {"loss": 1.1077, "grad_norm": 1.2104216814041138, "learning_rate": 0.0002, "epoch": 5.674990951863916, "step": 7840}, {"loss": 0.9627, "grad_norm": 1.105790376663208, "learning_rate": 0.0002, "epoch": 5.682229460731089, "step": 7850}, {"loss": 1.0483, "grad_norm": 1.0915391445159912, "learning_rate": 0.0002, "epoch": 5.689467969598263, "step": 7860}, {"loss": 1.0291, "grad_norm": 0.8957812786102295, "learning_rate": 0.0002, "epoch": 5.696706478465436, "step": 7870}, {"loss": 0.9785, "grad_norm": 1.9189311265945435, "learning_rate": 0.0002, "epoch": 5.703944987332609, "step": 7880}, {"loss": 1.0076, "grad_norm": 1.0867321491241455, "learning_rate": 0.0002, "epoch": 5.711183496199783, "step": 7890}, {"loss": 1.0236, "grad_norm": 1.0233147144317627, "learning_rate": 0.0002, "epoch": 5.718422005066956, "step": 7900}, {"loss": 0.9872, "grad_norm": 1.16460382938385, "learning_rate": 0.0002, "epoch": 5.7256605139341294, "step": 7910}, {"loss": 1.0762, "grad_norm": 1.1098358631134033, "learning_rate": 0.0002, "epoch": 5.732899022801303, "step": 7920}, {"loss": 0.9937, "grad_norm": 0.8555701375007629, "learning_rate": 0.0002, "epoch": 5.740137531668476, "step": 7930}, {"loss": 1.0081, "grad_norm": 0.9885705709457397, "learning_rate": 0.0002, "epoch": 5.7473760405356495, "step": 7940}, {"loss": 0.9909, "grad_norm": 0.9184203147888184, "learning_rate": 0.0002, "epoch": 5.754614549402823, "step": 7950}, {"loss": 1.0767, "grad_norm": 0.9653698205947876, "learning_rate": 0.0002, "epoch": 5.761853058269996, "step": 7960}, {"loss": 0.9317, "grad_norm": 1.0014251470565796, "learning_rate": 0.0002, "epoch": 5.76909156713717, "step": 7970}, {"loss": 1.0271, "grad_norm": 1.004701018333435, "learning_rate": 0.0002, "epoch": 5.776330076004343, "step": 7980}, {"loss": 1.0397, "grad_norm": 0.950577974319458, "learning_rate": 0.0002, "epoch": 5.783568584871516, "step": 7990}, {"loss": 0.9725, "grad_norm": 1.2986834049224854, "learning_rate": 0.0002, "epoch": 5.79080709373869, "step": 8000}, {"loss": 1.039, "grad_norm": 1.3353424072265625, "learning_rate": 0.0002, "epoch": 5.798045602605863, "step": 8010}, {"loss": 1.0626, "grad_norm": 0.7650562524795532, "learning_rate": 0.0002, "epoch": 5.8052841114730365, "step": 8020}, {"loss": 1.0802, "grad_norm": 1.0156235694885254, "learning_rate": 0.0002, "epoch": 5.81252262034021, "step": 8030}, {"loss": 1.0185, "grad_norm": 1.3092900514602661, "learning_rate": 0.0002, "epoch": 5.819761129207383, "step": 8040}, {"loss": 0.9905, "grad_norm": 1.184428095817566, "learning_rate": 0.0002, "epoch": 5.826999638074557, "step": 8050}, {"loss": 1.0548, "grad_norm": 0.979401707649231, "learning_rate": 0.0002, "epoch": 5.83423814694173, "step": 8060}, {"loss": 0.9721, "grad_norm": 1.3557400703430176, "learning_rate": 0.0002, "epoch": 5.841476655808903, "step": 8070}, {"loss": 1.0235, "grad_norm": 0.8429333567619324, "learning_rate": 0.0002, "epoch": 5.848715164676077, "step": 8080}, {"loss": 0.952, "grad_norm": 1.3167692422866821, "learning_rate": 0.0002, "epoch": 5.85595367354325, "step": 8090}, {"loss": 0.9609, "grad_norm": 0.9750998020172119, "learning_rate": 0.0002, "epoch": 5.863192182410423, "step": 8100}, {"loss": 1.0789, "grad_norm": 1.1869813203811646, "learning_rate": 0.0002, "epoch": 5.870430691277597, "step": 8110}, {"loss": 1.0331, "grad_norm": 1.508615255355835, "learning_rate": 0.0002, "epoch": 5.87766920014477, "step": 8120}, {"loss": 1.0171, "grad_norm": 0.9439908266067505, "learning_rate": 0.0002, "epoch": 5.8849077090119435, "step": 8130}, {"loss": 0.9682, "grad_norm": 0.910508930683136, "learning_rate": 0.0002, "epoch": 5.892146217879117, "step": 8140}, {"loss": 1.0032, "grad_norm": 1.111501932144165, "learning_rate": 0.0002, "epoch": 5.89938472674629, "step": 8150}, {"loss": 1.0266, "grad_norm": 0.726554274559021, "learning_rate": 0.0002, "epoch": 5.906623235613464, "step": 8160}, {"loss": 1.0681, "grad_norm": 1.1084556579589844, "learning_rate": 0.0002, "epoch": 5.913861744480637, "step": 8170}, {"loss": 0.969, "grad_norm": 0.9695167541503906, "learning_rate": 0.0002, "epoch": 5.92110025334781, "step": 8180}, {"loss": 0.9858, "grad_norm": 1.1169592142105103, "learning_rate": 0.0002, "epoch": 5.928338762214984, "step": 8190}, {"loss": 1.0924, "grad_norm": 1.5116780996322632, "learning_rate": 0.0002, "epoch": 5.935577271082157, "step": 8200}, {"loss": 0.878, "grad_norm": 1.0073388814926147, "learning_rate": 0.0002, "epoch": 5.94281577994933, "step": 8210}, {"loss": 1.0462, "grad_norm": 0.9323263168334961, "learning_rate": 0.0002, "epoch": 5.950054288816504, "step": 8220}, {"loss": 1.0291, "grad_norm": 0.9422887563705444, "learning_rate": 0.0002, "epoch": 5.957292797683677, "step": 8230}, {"loss": 0.953, "grad_norm": 0.9691047668457031, "learning_rate": 0.0002, "epoch": 5.9645313065508505, "step": 8240}, {"loss": 0.9842, "grad_norm": 0.9650622606277466, "learning_rate": 0.0002, "epoch": 5.971769815418024, "step": 8250}, {"loss": 0.907, "grad_norm": 1.077958345413208, "learning_rate": 0.0002, "epoch": 5.979008324285197, "step": 8260}, {"loss": 0.9162, "grad_norm": 0.8946306109428406, "learning_rate": 0.0002, "epoch": 5.986246833152371, "step": 8270}, {"loss": 1.0439, "grad_norm": 1.34098219871521, "learning_rate": 0.0002, "epoch": 5.993485342019544, "step": 8280}, {"eval_loss": 1.4714229106903076, "eval_runtime": 26.301, "eval_samples_per_second": 16.577, "eval_steps_per_second": 2.091, "epoch": 6.0, "step": 8289}, {"loss": 1.1403, "grad_norm": 0.9737564325332642, "learning_rate": 0.0002, "epoch": 6.000723850886717, "step": 8290}, {"loss": 0.8875, "grad_norm": 1.2205945253372192, "learning_rate": 0.0002, "epoch": 6.007962359753891, "step": 8300}, {"loss": 0.8623, "grad_norm": 1.3529434204101562, "learning_rate": 0.0002, "epoch": 6.015200868621064, "step": 8310}, {"loss": 0.9427, "grad_norm": 1.2300174236297607, "learning_rate": 0.0002, "epoch": 6.022439377488237, "step": 8320}, {"loss": 0.9322, "grad_norm": 0.9248194098472595, "learning_rate": 0.0002, "epoch": 6.029677886355411, "step": 8330}, {"loss": 0.9302, "grad_norm": 1.1140035390853882, "learning_rate": 0.0002, "epoch": 6.036916395222584, "step": 8340}, {"loss": 0.8255, "grad_norm": 1.2097352743148804, "learning_rate": 0.0002, "epoch": 6.0441549040897575, "step": 8350}, {"loss": 0.8792, "grad_norm": 0.9472483396530151, "learning_rate": 0.0002, "epoch": 6.051393412956931, "step": 8360}, {"loss": 0.8865, "grad_norm": 1.0195368528366089, "learning_rate": 0.0002, "epoch": 6.058631921824104, "step": 8370}, {"loss": 0.8858, "grad_norm": 1.182735562324524, "learning_rate": 0.0002, "epoch": 6.065870430691278, "step": 8380}, {"loss": 0.9455, "grad_norm": 1.1042858362197876, "learning_rate": 0.0002, "epoch": 6.073108939558451, "step": 8390}, {"loss": 0.9723, "grad_norm": 0.8606401085853577, "learning_rate": 0.0002, "epoch": 6.080347448425624, "step": 8400}, {"loss": 0.8436, "grad_norm": 1.1015676259994507, "learning_rate": 0.0002, "epoch": 6.087585957292798, "step": 8410}, {"loss": 0.8845, "grad_norm": 1.690224289894104, "learning_rate": 0.0002, "epoch": 6.094824466159971, "step": 8420}, {"loss": 0.8484, "grad_norm": 1.1928749084472656, "learning_rate": 0.0002, "epoch": 6.1020629750271445, "step": 8430}, {"loss": 0.9546, "grad_norm": 1.0816864967346191, "learning_rate": 0.0002, "epoch": 6.109301483894318, "step": 8440}, {"loss": 0.8286, "grad_norm": 1.1638226509094238, "learning_rate": 0.0002, "epoch": 6.116539992761491, "step": 8450}, {"loss": 0.8749, "grad_norm": 1.3782968521118164, "learning_rate": 0.0002, "epoch": 6.1237785016286646, "step": 8460}, {"loss": 0.7956, "grad_norm": 1.2030094861984253, "learning_rate": 0.0002, "epoch": 6.131017010495838, "step": 8470}, {"loss": 0.8393, "grad_norm": 1.3227659463882446, "learning_rate": 0.0002, "epoch": 6.138255519363011, "step": 8480}, {"loss": 0.9175, "grad_norm": 1.104384422302246, "learning_rate": 0.0002, "epoch": 6.145494028230185, "step": 8490}, {"loss": 0.861, "grad_norm": 1.518805980682373, "learning_rate": 0.0002, "epoch": 6.152732537097358, "step": 8500}, {"loss": 0.9169, "grad_norm": 1.2029093503952026, "learning_rate": 0.0002, "epoch": 6.159971045964531, "step": 8510}, {"loss": 0.8701, "grad_norm": 1.2991217374801636, "learning_rate": 0.0002, "epoch": 6.167209554831705, "step": 8520}, {"loss": 0.9748, "grad_norm": 1.7002956867218018, "learning_rate": 0.0002, "epoch": 6.174448063698878, "step": 8530}, {"loss": 0.8881, "grad_norm": 1.6653581857681274, "learning_rate": 0.0002, "epoch": 6.1816865725660515, "step": 8540}, {"loss": 0.817, "grad_norm": 1.0493303537368774, "learning_rate": 0.0002, "epoch": 6.188925081433225, "step": 8550}, {"loss": 0.8726, "grad_norm": 1.539345622062683, "learning_rate": 0.0002, "epoch": 6.196163590300398, "step": 8560}, {"loss": 0.9452, "grad_norm": 1.2757070064544678, "learning_rate": 0.0002, "epoch": 6.203402099167572, "step": 8570}, {"loss": 0.8773, "grad_norm": 1.2416890859603882, "learning_rate": 0.0002, "epoch": 6.210640608034745, "step": 8580}, {"loss": 0.815, "grad_norm": 1.617621898651123, "learning_rate": 0.0002, "epoch": 6.217879116901918, "step": 8590}, {"loss": 0.9137, "grad_norm": 1.058962106704712, "learning_rate": 0.0002, "epoch": 6.225117625769092, "step": 8600}, {"loss": 0.8164, "grad_norm": 1.1489088535308838, "learning_rate": 0.0002, "epoch": 6.232356134636265, "step": 8610}, {"loss": 0.9476, "grad_norm": 0.9391577243804932, "learning_rate": 0.0002, "epoch": 6.239594643503438, "step": 8620}, {"loss": 0.932, "grad_norm": 1.363706111907959, "learning_rate": 0.0002, "epoch": 6.246833152370612, "step": 8630}, {"loss": 0.8917, "grad_norm": 0.779502809047699, "learning_rate": 0.0002, "epoch": 6.254071661237785, "step": 8640}, {"loss": 0.9196, "grad_norm": 2.000821590423584, "learning_rate": 0.0002, "epoch": 6.2613101701049585, "step": 8650}, {"loss": 0.9794, "grad_norm": 1.1521023511886597, "learning_rate": 0.0002, "epoch": 6.268548678972132, "step": 8660}, {"loss": 0.9147, "grad_norm": 1.3734570741653442, "learning_rate": 0.0002, "epoch": 6.275787187839305, "step": 8670}, {"loss": 0.795, "grad_norm": 0.9550670385360718, "learning_rate": 0.0002, "epoch": 6.283025696706479, "step": 8680}, {"loss": 0.9049, "grad_norm": 0.8937032222747803, "learning_rate": 0.0002, "epoch": 6.290264205573652, "step": 8690}, {"loss": 0.8526, "grad_norm": 1.3352779150009155, "learning_rate": 0.0002, "epoch": 6.297502714440825, "step": 8700}, {"loss": 0.8572, "grad_norm": 1.3057222366333008, "learning_rate": 0.0002, "epoch": 6.304741223307999, "step": 8710}, {"loss": 0.8825, "grad_norm": 0.9078314304351807, "learning_rate": 0.0002, "epoch": 6.311979732175172, "step": 8720}, {"loss": 0.8666, "grad_norm": 1.6663457155227661, "learning_rate": 0.0002, "epoch": 6.319218241042345, "step": 8730}, {"loss": 0.927, "grad_norm": 1.2043739557266235, "learning_rate": 0.0002, "epoch": 6.326456749909519, "step": 8740}, {"loss": 0.8014, "grad_norm": 0.9165967702865601, "learning_rate": 0.0002, "epoch": 6.333695258776692, "step": 8750}, {"loss": 0.9761, "grad_norm": 1.016452670097351, "learning_rate": 0.0002, "epoch": 6.3409337676438655, "step": 8760}, {"loss": 1.022, "grad_norm": 1.2209261655807495, "learning_rate": 0.0002, "epoch": 6.348172276511039, "step": 8770}, {"loss": 0.8012, "grad_norm": 1.3380663394927979, "learning_rate": 0.0002, "epoch": 6.355410785378212, "step": 8780}, {"loss": 0.9553, "grad_norm": 2.3311562538146973, "learning_rate": 0.0002, "epoch": 6.362649294245386, "step": 8790}, {"loss": 0.8676, "grad_norm": 1.0330604314804077, "learning_rate": 0.0002, "epoch": 6.369887803112559, "step": 8800}, {"loss": 0.98, "grad_norm": 0.9655511975288391, "learning_rate": 0.0002, "epoch": 6.377126311979732, "step": 8810}, {"loss": 1.0324, "grad_norm": 1.1065765619277954, "learning_rate": 0.0002, "epoch": 6.384364820846906, "step": 8820}, {"loss": 1.0078, "grad_norm": 1.2631285190582275, "learning_rate": 0.0002, "epoch": 6.391603329714079, "step": 8830}, {"loss": 0.8989, "grad_norm": 0.92459636926651, "learning_rate": 0.0002, "epoch": 6.398841838581252, "step": 8840}, {"loss": 0.8536, "grad_norm": 0.9982633590698242, "learning_rate": 0.0002, "epoch": 6.406080347448426, "step": 8850}, {"loss": 0.8949, "grad_norm": 1.0746768712997437, "learning_rate": 0.0002, "epoch": 6.413318856315599, "step": 8860}, {"loss": 0.8547, "grad_norm": 1.3024073839187622, "learning_rate": 0.0002, "epoch": 6.4205573651827725, "step": 8870}, {"loss": 0.9618, "grad_norm": 1.2764527797698975, "learning_rate": 0.0002, "epoch": 6.427795874049946, "step": 8880}, {"loss": 0.8905, "grad_norm": 0.8318809270858765, "learning_rate": 0.0002, "epoch": 6.435034382917119, "step": 8890}, {"loss": 0.917, "grad_norm": 1.7350783348083496, "learning_rate": 0.0002, "epoch": 6.442272891784293, "step": 8900}, {"loss": 1.0229, "grad_norm": 1.3430488109588623, "learning_rate": 0.0002, "epoch": 6.449511400651466, "step": 8910}, {"loss": 0.9678, "grad_norm": 1.5907495021820068, "learning_rate": 0.0002, "epoch": 6.456749909518639, "step": 8920}, {"loss": 0.9639, "grad_norm": 1.8579202890396118, "learning_rate": 0.0002, "epoch": 6.463988418385813, "step": 8930}, {"loss": 0.9302, "grad_norm": 1.2233413457870483, "learning_rate": 0.0002, "epoch": 6.471226927252986, "step": 8940}, {"loss": 0.9169, "grad_norm": 1.009103775024414, "learning_rate": 0.0002, "epoch": 6.4784654361201595, "step": 8950}, {"loss": 0.8969, "grad_norm": 1.1265181303024292, "learning_rate": 0.0002, "epoch": 6.485703944987333, "step": 8960}, {"loss": 0.8374, "grad_norm": 1.1733338832855225, "learning_rate": 0.0002, "epoch": 6.492942453854506, "step": 8970}, {"loss": 0.8764, "grad_norm": 1.0444518327713013, "learning_rate": 0.0002, "epoch": 6.50018096272168, "step": 8980}, {"loss": 0.9582, "grad_norm": 1.2296479940414429, "learning_rate": 0.0002, "epoch": 6.507419471588853, "step": 8990}, {"loss": 0.8557, "grad_norm": 1.370417833328247, "learning_rate": 0.0002, "epoch": 6.514657980456026, "step": 9000}, {"loss": 0.9787, "grad_norm": 1.4787620306015015, "learning_rate": 0.0002, "epoch": 6.5218964893232, "step": 9010}, {"loss": 0.967, "grad_norm": 0.8550514578819275, "learning_rate": 0.0002, "epoch": 6.529134998190373, "step": 9020}, {"loss": 0.9755, "grad_norm": 1.2327991724014282, "learning_rate": 0.0002, "epoch": 6.536373507057546, "step": 9030}, {"loss": 0.9248, "grad_norm": 1.0915621519088745, "learning_rate": 0.0002, "epoch": 6.54361201592472, "step": 9040}, {"loss": 1.0024, "grad_norm": 1.7243309020996094, "learning_rate": 0.0002, "epoch": 6.550850524791893, "step": 9050}, {"loss": 1.0123, "grad_norm": 0.954359769821167, "learning_rate": 0.0002, "epoch": 6.5580890336590665, "step": 9060}, {"loss": 0.8261, "grad_norm": 1.066051959991455, "learning_rate": 0.0002, "epoch": 6.56532754252624, "step": 9070}, {"loss": 0.944, "grad_norm": 1.200271487236023, "learning_rate": 0.0002, "epoch": 6.572566051393413, "step": 9080}, {"loss": 0.9788, "grad_norm": 1.4331457614898682, "learning_rate": 0.0002, "epoch": 6.579804560260587, "step": 9090}, {"loss": 1.0216, "grad_norm": 1.0892444849014282, "learning_rate": 0.0002, "epoch": 6.58704306912776, "step": 9100}, {"loss": 0.8557, "grad_norm": 1.849726915359497, "learning_rate": 0.0002, "epoch": 6.594281577994933, "step": 9110}, {"loss": 0.9495, "grad_norm": 1.1228708028793335, "learning_rate": 0.0002, "epoch": 6.601520086862107, "step": 9120}, {"loss": 1.0169, "grad_norm": 1.0928595066070557, "learning_rate": 0.0002, "epoch": 6.60875859572928, "step": 9130}, {"loss": 0.9342, "grad_norm": 1.2138155698776245, "learning_rate": 0.0002, "epoch": 6.615997104596453, "step": 9140}, {"loss": 0.8715, "grad_norm": 1.5155235528945923, "learning_rate": 0.0002, "epoch": 6.623235613463627, "step": 9150}, {"loss": 0.9806, "grad_norm": 1.3194212913513184, "learning_rate": 0.0002, "epoch": 6.6304741223308, "step": 9160}, {"loss": 0.8958, "grad_norm": 1.045623779296875, "learning_rate": 0.0002, "epoch": 6.6377126311979735, "step": 9170}, {"loss": 0.8698, "grad_norm": 0.9647570252418518, "learning_rate": 0.0002, "epoch": 6.644951140065147, "step": 9180}, {"loss": 0.8829, "grad_norm": 1.0818220376968384, "learning_rate": 0.0002, "epoch": 6.65218964893232, "step": 9190}, {"loss": 0.9745, "grad_norm": 1.2792822122573853, "learning_rate": 0.0002, "epoch": 6.659428157799494, "step": 9200}, {"loss": 0.8854, "grad_norm": 1.2764191627502441, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 9210}, {"loss": 0.9709, "grad_norm": 1.0552066564559937, "learning_rate": 0.0002, "epoch": 6.67390517553384, "step": 9220}, {"loss": 0.8855, "grad_norm": 1.082476019859314, "learning_rate": 0.0002, "epoch": 6.681143684401014, "step": 9230}, {"loss": 0.9779, "grad_norm": 1.3313323259353638, "learning_rate": 0.0002, "epoch": 6.688382193268187, "step": 9240}, {"loss": 1.005, "grad_norm": 1.130048394203186, "learning_rate": 0.0002, "epoch": 6.69562070213536, "step": 9250}, {"loss": 0.9969, "grad_norm": 1.1997296810150146, "learning_rate": 0.0002, "epoch": 6.702859211002534, "step": 9260}, {"loss": 0.8691, "grad_norm": 1.0591834783554077, "learning_rate": 0.0002, "epoch": 6.710097719869707, "step": 9270}, {"loss": 0.9603, "grad_norm": 1.2722901105880737, "learning_rate": 0.0002, "epoch": 6.7173362287368805, "step": 9280}, {"loss": 0.9227, "grad_norm": 1.1150950193405151, "learning_rate": 0.0002, "epoch": 6.724574737604054, "step": 9290}, {"loss": 0.95, "grad_norm": 1.1575992107391357, "learning_rate": 0.0002, "epoch": 6.731813246471227, "step": 9300}, {"loss": 0.9822, "grad_norm": 0.9371691346168518, "learning_rate": 0.0002, "epoch": 6.739051755338401, "step": 9310}, {"loss": 0.9773, "grad_norm": 1.4924226999282837, "learning_rate": 0.0002, "epoch": 6.746290264205574, "step": 9320}, {"loss": 0.969, "grad_norm": 1.1524218320846558, "learning_rate": 0.0002, "epoch": 6.753528773072747, "step": 9330}, {"loss": 0.9271, "grad_norm": 0.9500471949577332, "learning_rate": 0.0002, "epoch": 6.760767281939921, "step": 9340}, {"loss": 0.9029, "grad_norm": 1.2062290906906128, "learning_rate": 0.0002, "epoch": 6.768005790807094, "step": 9350}, {"loss": 0.9121, "grad_norm": 1.212631106376648, "learning_rate": 0.0002, "epoch": 6.7752442996742674, "step": 9360}, {"loss": 0.8486, "grad_norm": 1.9135472774505615, "learning_rate": 0.0002, "epoch": 6.782482808541441, "step": 9370}, {"loss": 0.9332, "grad_norm": 0.9682775139808655, "learning_rate": 0.0002, "epoch": 6.789721317408614, "step": 9380}, {"loss": 0.8548, "grad_norm": 1.1405237913131714, "learning_rate": 0.0002, "epoch": 6.7969598262757875, "step": 9390}, {"loss": 0.8922, "grad_norm": 1.6855751276016235, "learning_rate": 0.0002, "epoch": 6.804198335142961, "step": 9400}, {"loss": 0.9417, "grad_norm": 1.6590169668197632, "learning_rate": 0.0002, "epoch": 6.811436844010134, "step": 9410}, {"loss": 0.868, "grad_norm": 1.8795170783996582, "learning_rate": 0.0002, "epoch": 6.818675352877308, "step": 9420}, {"loss": 0.9142, "grad_norm": 1.1087183952331543, "learning_rate": 0.0002, "epoch": 6.825913861744481, "step": 9430}, {"loss": 1.1427, "grad_norm": 1.4178446531295776, "learning_rate": 0.0002, "epoch": 6.833152370611654, "step": 9440}, {"loss": 0.8325, "grad_norm": 1.0792350769042969, "learning_rate": 0.0002, "epoch": 6.840390879478828, "step": 9450}, {"loss": 1.0078, "grad_norm": 1.2159196138381958, "learning_rate": 0.0002, "epoch": 6.847629388346001, "step": 9460}, {"loss": 0.9536, "grad_norm": 0.9998821020126343, "learning_rate": 0.0002, "epoch": 6.8548678972131745, "step": 9470}, {"loss": 0.9277, "grad_norm": 0.7940687537193298, "learning_rate": 0.0002, "epoch": 6.862106406080348, "step": 9480}, {"loss": 0.8612, "grad_norm": 0.9572826027870178, "learning_rate": 0.0002, "epoch": 6.869344914947521, "step": 9490}, {"loss": 0.9611, "grad_norm": 1.1086537837982178, "learning_rate": 0.0002, "epoch": 6.876583423814694, "step": 9500}, {"loss": 0.9276, "grad_norm": 1.1934887170791626, "learning_rate": 0.0002, "epoch": 6.883821932681867, "step": 9510}, {"loss": 0.8416, "grad_norm": 1.207324504852295, "learning_rate": 0.0002, "epoch": 6.89106044154904, "step": 9520}, {"loss": 0.9378, "grad_norm": 1.1303677558898926, "learning_rate": 0.0002, "epoch": 6.898298950416214, "step": 9530}, {"loss": 0.9599, "grad_norm": 1.4958926439285278, "learning_rate": 0.0002, "epoch": 6.905537459283387, "step": 9540}, {"loss": 0.9365, "grad_norm": 1.2141553163528442, "learning_rate": 0.0002, "epoch": 6.9127759681505605, "step": 9550}, {"loss": 1.0291, "grad_norm": 1.6544346809387207, "learning_rate": 0.0002, "epoch": 6.920014477017734, "step": 9560}, {"loss": 0.8439, "grad_norm": 1.0540320873260498, "learning_rate": 0.0002, "epoch": 6.927252985884907, "step": 9570}, {"loss": 0.9831, "grad_norm": 1.3095581531524658, "learning_rate": 0.0002, "epoch": 6.934491494752081, "step": 9580}, {"loss": 0.8694, "grad_norm": 1.4509341716766357, "learning_rate": 0.0002, "epoch": 6.941730003619254, "step": 9590}, {"loss": 0.983, "grad_norm": 1.1091740131378174, "learning_rate": 0.0002, "epoch": 6.948968512486427, "step": 9600}, {"loss": 0.9126, "grad_norm": 1.102929949760437, "learning_rate": 0.0002, "epoch": 6.956207021353601, "step": 9610}, {"loss": 0.9622, "grad_norm": 1.1377743482589722, "learning_rate": 0.0002, "epoch": 6.963445530220774, "step": 9620}, {"loss": 0.9045, "grad_norm": 1.2070361375808716, "learning_rate": 0.0002, "epoch": 6.970684039087947, "step": 9630}, {"loss": 0.9714, "grad_norm": 1.30153489112854, "learning_rate": 0.0002, "epoch": 6.977922547955121, "step": 9640}, {"loss": 0.9555, "grad_norm": 1.4641543626785278, "learning_rate": 0.0002, "epoch": 6.985161056822294, "step": 9650}, {"loss": 0.9177, "grad_norm": 1.0497819185256958, "learning_rate": 0.0002, "epoch": 6.9923995656894675, "step": 9660}, {"loss": 0.8369, "grad_norm": 1.2500354051589966, "learning_rate": 0.0002, "epoch": 6.999638074556641, "step": 9670}, {"eval_loss": 1.518465518951416, "eval_runtime": 26.4525, "eval_samples_per_second": 16.482, "eval_steps_per_second": 2.079, "epoch": 6.999638074556641, "step": 9670}, {"loss": 0.7792, "grad_norm": 1.0240943431854248, "learning_rate": 0.0002, "epoch": 7.006876583423814, "step": 9680}, {"loss": 0.7812, "grad_norm": 1.2250308990478516, "learning_rate": 0.0002, "epoch": 7.014115092290988, "step": 9690}, {"loss": 0.8463, "grad_norm": 1.397510290145874, "learning_rate": 0.0002, "epoch": 7.021353601158161, "step": 9700}, {"loss": 0.738, "grad_norm": 1.9754822254180908, "learning_rate": 0.0002, "epoch": 7.028592110025334, "step": 9710}, {"loss": 0.8017, "grad_norm": 1.7932360172271729, "learning_rate": 0.0002, "epoch": 7.035830618892508, "step": 9720}, {"loss": 0.7427, "grad_norm": 0.8552590608596802, "learning_rate": 0.0002, "epoch": 7.043069127759681, "step": 9730}, {"loss": 0.8204, "grad_norm": 1.758886694908142, "learning_rate": 0.0002, "epoch": 7.0503076366268544, "step": 9740}, {"loss": 0.8444, "grad_norm": 1.5239284038543701, "learning_rate": 0.0002, "epoch": 7.057546145494028, "step": 9750}, {"loss": 0.9078, "grad_norm": 1.2506821155548096, "learning_rate": 0.0002, "epoch": 7.064784654361201, "step": 9760}, {"loss": 0.7512, "grad_norm": 1.274202823638916, "learning_rate": 0.0002, "epoch": 7.0720231632283745, "step": 9770}, {"loss": 0.7693, "grad_norm": 1.296419620513916, "learning_rate": 0.0002, "epoch": 7.079261672095548, "step": 9780}, {"loss": 0.8311, "grad_norm": 1.3418468236923218, "learning_rate": 0.0002, "epoch": 7.086500180962721, "step": 9790}, {"loss": 0.8302, "grad_norm": 1.1935685873031616, "learning_rate": 0.0002, "epoch": 7.093738689829895, "step": 9800}, {"loss": 0.8037, "grad_norm": 1.2341830730438232, "learning_rate": 0.0002, "epoch": 7.100977198697068, "step": 9810}, {"loss": 0.7887, "grad_norm": 1.3398581743240356, "learning_rate": 0.0002, "epoch": 7.108215707564241, "step": 9820}, {"loss": 0.824, "grad_norm": 1.1919665336608887, "learning_rate": 0.0002, "epoch": 7.115454216431415, "step": 9830}, {"loss": 0.7296, "grad_norm": 0.9331274032592773, "learning_rate": 0.0002, "epoch": 7.122692725298588, "step": 9840}, {"loss": 0.8208, "grad_norm": 1.0933221578598022, "learning_rate": 0.0002, "epoch": 7.1299312341657615, "step": 9850}, {"loss": 0.7717, "grad_norm": 1.0350896120071411, "learning_rate": 0.0002, "epoch": 7.137169743032935, "step": 9860}, {"loss": 0.903, "grad_norm": 1.334342360496521, "learning_rate": 0.0002, "epoch": 7.144408251900108, "step": 9870}, {"loss": 0.7759, "grad_norm": 1.5271754264831543, "learning_rate": 0.0002, "epoch": 7.151646760767282, "step": 9880}, {"loss": 0.8223, "grad_norm": 1.0453001260757446, "learning_rate": 0.0002, "epoch": 7.158885269634455, "step": 9890}, {"loss": 0.8334, "grad_norm": 1.204174518585205, "learning_rate": 0.0002, "epoch": 7.166123778501628, "step": 9900}, {"loss": 0.82, "grad_norm": 1.0774344205856323, "learning_rate": 0.0002, "epoch": 7.173362287368802, "step": 9910}, {"loss": 0.7772, "grad_norm": 1.282188892364502, "learning_rate": 0.0002, "epoch": 7.180600796235975, "step": 9920}, {"loss": 0.7619, "grad_norm": 1.1413695812225342, "learning_rate": 0.0002, "epoch": 7.187839305103148, "step": 9930}, {"loss": 0.7194, "grad_norm": 1.2970763444900513, "learning_rate": 0.0002, "epoch": 7.195077813970322, "step": 9940}, {"loss": 0.857, "grad_norm": 1.2535417079925537, "learning_rate": 0.0002, "epoch": 7.202316322837495, "step": 9950}, {"loss": 0.839, "grad_norm": 1.3520581722259521, "learning_rate": 0.0002, "epoch": 7.2095548317046685, "step": 9960}, {"loss": 0.8308, "grad_norm": 1.288572072982788, "learning_rate": 0.0002, "epoch": 7.216793340571842, "step": 9970}, {"loss": 0.8102, "grad_norm": 1.4298021793365479, "learning_rate": 0.0002, "epoch": 7.224031849439015, "step": 9980}, {"loss": 0.9437, "grad_norm": 1.4797194004058838, "learning_rate": 0.0002, "epoch": 7.231270358306189, "step": 9990}, {"loss": 0.8486, "grad_norm": 1.5020978450775146, "learning_rate": 0.0002, "epoch": 7.238508867173362, "step": 10000}, {"loss": 0.8584, "grad_norm": 1.1417840719223022, "learning_rate": 0.0002, "epoch": 7.245747376040535, "step": 10010}, {"loss": 0.8547, "grad_norm": 1.746782898902893, "learning_rate": 0.0002, "epoch": 7.252985884907709, "step": 10020}, {"loss": 0.8721, "grad_norm": 1.019222617149353, "learning_rate": 0.0002, "epoch": 7.260224393774882, "step": 10030}, {"loss": 0.836, "grad_norm": 1.3712849617004395, "learning_rate": 0.0002, "epoch": 7.267462902642055, "step": 10040}, {"loss": 0.9228, "grad_norm": 1.5264288187026978, "learning_rate": 0.0002, "epoch": 7.274701411509229, "step": 10050}, {"loss": 0.8803, "grad_norm": 1.2784953117370605, "learning_rate": 0.0002, "epoch": 7.281939920376402, "step": 10060}, {"loss": 0.9144, "grad_norm": 1.0246731042861938, "learning_rate": 0.0002, "epoch": 7.2891784292435755, "step": 10070}, {"loss": 0.7846, "grad_norm": 1.2060108184814453, "learning_rate": 0.0002, "epoch": 7.296416938110749, "step": 10080}, {"loss": 0.8715, "grad_norm": 1.0908410549163818, "learning_rate": 0.0002, "epoch": 7.303655446977922, "step": 10090}, {"loss": 0.7516, "grad_norm": 1.2308661937713623, "learning_rate": 0.0002, "epoch": 7.310893955845096, "step": 10100}, {"loss": 0.7477, "grad_norm": 1.185610055923462, "learning_rate": 0.0002, "epoch": 7.318132464712269, "step": 10110}, {"loss": 0.8495, "grad_norm": 1.0026527643203735, "learning_rate": 0.0002, "epoch": 7.325370973579442, "step": 10120}, {"loss": 0.9155, "grad_norm": 1.3346470594406128, "learning_rate": 0.0002, "epoch": 7.332609482446616, "step": 10130}, {"loss": 0.975, "grad_norm": 1.5946321487426758, "learning_rate": 0.0002, "epoch": 7.339847991313789, "step": 10140}, {"loss": 0.7774, "grad_norm": 1.3622175455093384, "learning_rate": 0.0002, "epoch": 7.347086500180962, "step": 10150}, {"loss": 0.8455, "grad_norm": 1.0937085151672363, "learning_rate": 0.0002, "epoch": 7.354325009048136, "step": 10160}, {"loss": 0.8548, "grad_norm": 1.6057474613189697, "learning_rate": 0.0002, "epoch": 7.361563517915309, "step": 10170}, {"loss": 0.8345, "grad_norm": 1.234887719154358, "learning_rate": 0.0002, "epoch": 7.3688020267824825, "step": 10180}, {"loss": 0.9041, "grad_norm": 1.2238616943359375, "learning_rate": 0.0002, "epoch": 7.376040535649656, "step": 10190}, {"loss": 0.8568, "grad_norm": 1.2640055418014526, "learning_rate": 0.0002, "epoch": 7.383279044516829, "step": 10200}, {"loss": 0.8233, "grad_norm": 1.2917805910110474, "learning_rate": 0.0002, "epoch": 7.390517553384003, "step": 10210}, {"loss": 0.8748, "grad_norm": 1.096583366394043, "learning_rate": 0.0002, "epoch": 7.397756062251176, "step": 10220}, {"loss": 0.7882, "grad_norm": 1.1854201555252075, "learning_rate": 0.0002, "epoch": 7.404994571118349, "step": 10230}, {"loss": 0.7746, "grad_norm": 1.2318766117095947, "learning_rate": 0.0002, "epoch": 7.412233079985523, "step": 10240}, {"loss": 0.9144, "grad_norm": 1.395302414894104, "learning_rate": 0.0002, "epoch": 7.419471588852696, "step": 10250}, {"loss": 0.7673, "grad_norm": 1.118148922920227, "learning_rate": 0.0002, "epoch": 7.4267100977198695, "step": 10260}, {"loss": 0.8743, "grad_norm": 1.1969468593597412, "learning_rate": 0.0002, "epoch": 7.433948606587043, "step": 10270}, {"loss": 0.8955, "grad_norm": 1.434050440788269, "learning_rate": 0.0002, "epoch": 7.441187115454216, "step": 10280}, {"loss": 0.7716, "grad_norm": 1.2344770431518555, "learning_rate": 0.0002, "epoch": 7.4484256243213895, "step": 10290}, {"loss": 0.801, "grad_norm": 1.2186434268951416, "learning_rate": 0.0002, "epoch": 7.455664133188563, "step": 10300}, {"loss": 0.8937, "grad_norm": 1.482475757598877, "learning_rate": 0.0002, "epoch": 7.462902642055736, "step": 10310}, {"loss": 0.7954, "grad_norm": 1.8391777276992798, "learning_rate": 0.0002, "epoch": 7.47014115092291, "step": 10320}, {"loss": 0.8711, "grad_norm": 1.9489128589630127, "learning_rate": 0.0002, "epoch": 7.477379659790083, "step": 10330}, {"loss": 0.8455, "grad_norm": 1.369743824005127, "learning_rate": 0.0002, "epoch": 7.484618168657256, "step": 10340}, {"loss": 0.9501, "grad_norm": 1.3188602924346924, "learning_rate": 0.0002, "epoch": 7.49185667752443, "step": 10350}, {"loss": 0.8293, "grad_norm": 1.1885292530059814, "learning_rate": 0.0002, "epoch": 7.499095186391603, "step": 10360}, {"loss": 0.8832, "grad_norm": 1.4873403310775757, "learning_rate": 0.0002, "epoch": 7.5063336952587765, "step": 10370}, {"loss": 0.8531, "grad_norm": 1.8681598901748657, "learning_rate": 0.0002, "epoch": 7.51357220412595, "step": 10380}, {"loss": 0.8172, "grad_norm": 1.398186445236206, "learning_rate": 0.0002, "epoch": 7.520810712993123, "step": 10390}, {"loss": 0.9239, "grad_norm": 1.272192358970642, "learning_rate": 0.0002, "epoch": 7.528049221860297, "step": 10400}, {"loss": 0.818, "grad_norm": 0.9671797752380371, "learning_rate": 0.0002, "epoch": 7.53528773072747, "step": 10410}, {"loss": 0.8393, "grad_norm": 0.9752382040023804, "learning_rate": 0.0002, "epoch": 7.542526239594643, "step": 10420}, {"loss": 0.8011, "grad_norm": 1.2241966724395752, "learning_rate": 0.0002, "epoch": 7.549764748461817, "step": 10430}, {"loss": 0.8497, "grad_norm": 1.4615166187286377, "learning_rate": 0.0002, "epoch": 7.55700325732899, "step": 10440}, {"loss": 0.8475, "grad_norm": 1.123205542564392, "learning_rate": 0.0002, "epoch": 7.564241766196163, "step": 10450}, {"loss": 0.893, "grad_norm": 1.3798918724060059, "learning_rate": 0.0002, "epoch": 7.571480275063337, "step": 10460}, {"loss": 0.8109, "grad_norm": 1.3772553205490112, "learning_rate": 0.0002, "epoch": 7.57871878393051, "step": 10470}, {"loss": 0.7907, "grad_norm": 1.4591912031173706, "learning_rate": 0.0002, "epoch": 7.5859572927976835, "step": 10480}, {"loss": 0.878, "grad_norm": 1.4248491525650024, "learning_rate": 0.0002, "epoch": 7.593195801664857, "step": 10490}, {"loss": 0.7818, "grad_norm": 1.2663065195083618, "learning_rate": 0.0002, "epoch": 7.60043431053203, "step": 10500}, {"loss": 0.8349, "grad_norm": 1.1095938682556152, "learning_rate": 0.0002, "epoch": 7.607672819399204, "step": 10510}, {"loss": 0.7793, "grad_norm": 1.8462796211242676, "learning_rate": 0.0002, "epoch": 7.614911328266377, "step": 10520}, {"loss": 0.7695, "grad_norm": 1.1936118602752686, "learning_rate": 0.0002, "epoch": 7.62214983713355, "step": 10530}, {"loss": 0.8911, "grad_norm": 1.3520885705947876, "learning_rate": 0.0002, "epoch": 7.629388346000724, "step": 10540}, {"loss": 0.8589, "grad_norm": 1.2915338277816772, "learning_rate": 0.0002, "epoch": 7.636626854867897, "step": 10550}, {"loss": 0.8932, "grad_norm": 1.125656008720398, "learning_rate": 0.0002, "epoch": 7.64386536373507, "step": 10560}, {"loss": 0.8689, "grad_norm": 1.419791579246521, "learning_rate": 0.0002, "epoch": 7.651103872602244, "step": 10570}, {"loss": 0.8113, "grad_norm": 1.2106866836547852, "learning_rate": 0.0002, "epoch": 7.658342381469417, "step": 10580}, {"loss": 0.7383, "grad_norm": 1.359818458557129, "learning_rate": 0.0002, "epoch": 7.6655808903365905, "step": 10590}, {"loss": 0.8986, "grad_norm": 1.3971713781356812, "learning_rate": 0.0002, "epoch": 7.672819399203764, "step": 10600}, {"loss": 0.942, "grad_norm": 1.287888765335083, "learning_rate": 0.0002, "epoch": 7.680057908070937, "step": 10610}, {"loss": 0.8835, "grad_norm": 0.9856569766998291, "learning_rate": 0.0002, "epoch": 7.687296416938111, "step": 10620}, {"loss": 0.8463, "grad_norm": 1.5403797626495361, "learning_rate": 0.0002, "epoch": 7.694534925805284, "step": 10630}, {"loss": 0.7888, "grad_norm": 1.204551339149475, "learning_rate": 0.0002, "epoch": 7.701773434672457, "step": 10640}, {"loss": 0.7582, "grad_norm": 1.3801014423370361, "learning_rate": 0.0002, "epoch": 7.709011943539631, "step": 10650}, {"loss": 0.8715, "grad_norm": 1.3335949182510376, "learning_rate": 0.0002, "epoch": 7.716250452406804, "step": 10660}, {"loss": 0.9412, "grad_norm": 1.1740102767944336, "learning_rate": 0.0002, "epoch": 7.723488961273977, "step": 10670}, {"loss": 0.832, "grad_norm": 1.1663082838058472, "learning_rate": 0.0002, "epoch": 7.730727470141151, "step": 10680}, {"loss": 0.9191, "grad_norm": 1.3149393796920776, "learning_rate": 0.0002, "epoch": 7.737965979008324, "step": 10690}, {"loss": 0.8697, "grad_norm": 1.3169108629226685, "learning_rate": 0.0002, "epoch": 7.7452044878754975, "step": 10700}, {"loss": 0.8623, "grad_norm": 1.4583594799041748, "learning_rate": 0.0002, "epoch": 7.752442996742671, "step": 10710}, {"loss": 0.75, "grad_norm": 1.1077126264572144, "learning_rate": 0.0002, "epoch": 7.759681505609844, "step": 10720}, {"loss": 0.9109, "grad_norm": 1.5475820302963257, "learning_rate": 0.0002, "epoch": 7.766920014477018, "step": 10730}, {"loss": 0.8153, "grad_norm": 1.2319282293319702, "learning_rate": 0.0002, "epoch": 7.774158523344191, "step": 10740}, {"loss": 0.8324, "grad_norm": 0.9938047528266907, "learning_rate": 0.0002, "epoch": 7.781397032211364, "step": 10750}, {"loss": 0.9356, "grad_norm": 1.2498962879180908, "learning_rate": 0.0002, "epoch": 7.788635541078538, "step": 10760}, {"loss": 0.8744, "grad_norm": 2.192695379257202, "learning_rate": 0.0002, "epoch": 7.795874049945711, "step": 10770}, {"loss": 0.9072, "grad_norm": 1.1851826906204224, "learning_rate": 0.0002, "epoch": 7.8031125588128845, "step": 10780}, {"loss": 0.8933, "grad_norm": 1.0591034889221191, "learning_rate": 0.0002, "epoch": 7.810351067680058, "step": 10790}, {"loss": 0.8298, "grad_norm": 0.9350354671478271, "learning_rate": 0.0002, "epoch": 7.817589576547231, "step": 10800}, {"loss": 0.8701, "grad_norm": 1.5080015659332275, "learning_rate": 0.0002, "epoch": 7.8248280854144046, "step": 10810}, {"loss": 0.9315, "grad_norm": 2.136425495147705, "learning_rate": 0.0002, "epoch": 7.832066594281578, "step": 10820}, {"loss": 0.8204, "grad_norm": 1.5646673440933228, "learning_rate": 0.0002, "epoch": 7.839305103148751, "step": 10830}, {"loss": 0.7794, "grad_norm": 1.381301999092102, "learning_rate": 0.0002, "epoch": 7.846543612015925, "step": 10840}, {"loss": 0.9088, "grad_norm": 1.9323210716247559, "learning_rate": 0.0002, "epoch": 7.853782120883098, "step": 10850}, {"loss": 0.8854, "grad_norm": 1.020809531211853, "learning_rate": 0.0002, "epoch": 7.861020629750271, "step": 10860}, {"loss": 0.8144, "grad_norm": 1.1488909721374512, "learning_rate": 0.0002, "epoch": 7.868259138617445, "step": 10870}, {"loss": 0.883, "grad_norm": 1.4068763256072998, "learning_rate": 0.0002, "epoch": 7.875497647484618, "step": 10880}, {"loss": 0.7431, "grad_norm": 0.9201020002365112, "learning_rate": 0.0002, "epoch": 7.8827361563517915, "step": 10890}, {"loss": 0.8358, "grad_norm": 1.3163132667541504, "learning_rate": 0.0002, "epoch": 7.889974665218965, "step": 10900}, {"loss": 0.9908, "grad_norm": 1.65055513381958, "learning_rate": 0.0002, "epoch": 7.897213174086138, "step": 10910}, {"loss": 0.9883, "grad_norm": 1.1068748235702515, "learning_rate": 0.0002, "epoch": 7.904451682953312, "step": 10920}, {"loss": 0.8553, "grad_norm": 1.8744254112243652, "learning_rate": 0.0002, "epoch": 7.911690191820485, "step": 10930}, {"loss": 0.8969, "grad_norm": 1.3279157876968384, "learning_rate": 0.0002, "epoch": 7.918928700687658, "step": 10940}, {"loss": 0.8642, "grad_norm": 1.0890769958496094, "learning_rate": 0.0002, "epoch": 7.926167209554832, "step": 10950}, {"loss": 0.8742, "grad_norm": 1.3951836824417114, "learning_rate": 0.0002, "epoch": 7.933405718422005, "step": 10960}, {"loss": 0.895, "grad_norm": 1.2761356830596924, "learning_rate": 0.0002, "epoch": 7.940644227289178, "step": 10970}, {"loss": 0.8303, "grad_norm": 1.2073882818222046, "learning_rate": 0.0002, "epoch": 7.947882736156352, "step": 10980}, {"loss": 0.8675, "grad_norm": 1.1899374723434448, "learning_rate": 0.0002, "epoch": 7.955121245023525, "step": 10990}, {"loss": 0.8373, "grad_norm": 1.3041194677352905, "learning_rate": 0.0002, "epoch": 7.9623597538906985, "step": 11000}, {"loss": 0.8207, "grad_norm": 1.3564491271972656, "learning_rate": 0.0002, "epoch": 7.969598262757872, "step": 11010}, {"loss": 0.895, "grad_norm": 1.1411082744598389, "learning_rate": 0.0002, "epoch": 7.976836771625045, "step": 11020}, {"loss": 0.811, "grad_norm": 1.1378493309020996, "learning_rate": 0.0002, "epoch": 7.984075280492219, "step": 11030}, {"loss": 0.8257, "grad_norm": 1.5072855949401855, "learning_rate": 0.0002, "epoch": 7.991313789359392, "step": 11040}]}