diff --git a/.gitattributes b/.gitattributes index b35593da2da780c0c3ac827a66630d73015a0af8..0c46ef5d938b208667e5cbe7a3abd1b269b94f2c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -884,3 +884,12 @@ gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora- gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-8289/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/checkpoint-9670/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b9eca8e044620be3ce058a1783f5a421a1736fd6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e67bfcb11314afe23d52fe35c156ccfcb48ab2e93bd10c7abf7c1376d9b958d +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..04756492035a96f74ab026f558aac7c7c1c3752d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a766b5a8cb310431a6373ef78d5c7816e3b99ec1bf6134964a443595e21c12 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa865bfb191dd0da150fdcd2e9df1dfb7c750bff --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cdbc159bd3df1d681732bb87a4f18efebbbc6ee634b52296157df759110522a +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1c3441bb7d6925db65a31929aecd1a111b60f5d8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8b9f092d336f37a81d1ef105078f4a0c705a9b85b218a3c1b473f24caa0e8e0 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6d6ca61a8d538d11e10a20ed03a220172d4bf5f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:894f660531cb1239606766f6559fac4e84c1220ce4349a19c8858d608371b1b7 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d1ca1fe0503185b38b78622e51fce93e6bf730e5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/trainer_state.json @@ -0,0 +1,867 @@ +{ + "best_metric": 1.8077726364135742, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188", + "epoch": 0.9995793016407236, + "eval_steps": 10, + "global_step": 1188, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008413967185527976, + "grad_norm": 0.5458821654319763, + "learning_rate": 0.0002, + "loss": 2.56, + "step": 10 + }, + { + "epoch": 0.016827934371055953, + "grad_norm": 0.7293308973312378, + "learning_rate": 0.0002, + "loss": 2.3235, + "step": 20 + }, + { + "epoch": 0.02524190155658393, + "grad_norm": 0.47792306542396545, + "learning_rate": 0.0002, + "loss": 2.0815, + "step": 30 + }, + { + "epoch": 0.033655868742111905, + "grad_norm": 0.5944402813911438, + "learning_rate": 0.0002, + "loss": 1.9718, + "step": 40 + }, + { + "epoch": 0.04206983592763988, + "grad_norm": 0.5415359735488892, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 50 + }, + { + "epoch": 0.05048380311316786, + "grad_norm": 0.535713791847229, + "learning_rate": 0.0002, + "loss": 1.8953, + "step": 60 + }, + { + "epoch": 0.058897770298695834, + "grad_norm": 0.5184146761894226, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 70 + }, + { + "epoch": 0.06731173748422381, + "grad_norm": 0.458926796913147, + "learning_rate": 0.0002, + "loss": 1.8396, + "step": 80 + }, + { + "epoch": 0.07572570466975179, + "grad_norm": 0.4780142307281494, + "learning_rate": 0.0002, + "loss": 1.8677, + "step": 90 + }, + { + "epoch": 0.08413967185527976, + "grad_norm": 0.79965740442276, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 100 + }, + { + "epoch": 0.09255363904080774, + "grad_norm": 0.4498862028121948, + "learning_rate": 0.0002, + "loss": 1.9081, + "step": 110 + }, + { + "epoch": 0.10096760622633572, + "grad_norm": 0.39338430762290955, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 120 + }, + { + "epoch": 0.10938157341186369, + "grad_norm": 0.9588953852653503, + "learning_rate": 0.0002, + "loss": 1.8637, + "step": 130 + }, + { + "epoch": 0.11779554059739167, + "grad_norm": 0.41675639152526855, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 140 + }, + { + "epoch": 0.12620950778291964, + "grad_norm": 0.44519832730293274, + "learning_rate": 0.0002, + "loss": 1.8904, + "step": 150 + }, + { + "epoch": 0.13462347496844762, + "grad_norm": 0.4176260530948639, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 160 + }, + { + "epoch": 0.1430374421539756, + "grad_norm": 0.35840365290641785, + "learning_rate": 0.0002, + "loss": 1.8398, + "step": 170 + }, + { + "epoch": 0.15145140933950357, + "grad_norm": 0.3794495463371277, + "learning_rate": 0.0002, + "loss": 1.8666, + "step": 180 + }, + { + "epoch": 0.15986537652503155, + "grad_norm": 0.4563522934913635, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 190 + }, + { + "epoch": 0.16827934371055953, + "grad_norm": 0.37057486176490784, + "learning_rate": 0.0002, + "loss": 1.8893, + "step": 200 + }, + { + "epoch": 0.1766933108960875, + "grad_norm": 0.44081518054008484, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 210 + }, + { + "epoch": 0.18510727808161548, + "grad_norm": 0.46078577637672424, + "learning_rate": 0.0002, + "loss": 1.9048, + "step": 220 + }, + { + "epoch": 0.19352124526714345, + "grad_norm": 0.36132094264030457, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 230 + }, + { + "epoch": 0.20193521245267143, + "grad_norm": 0.3747289180755615, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 240 + }, + { + "epoch": 0.2103491796381994, + "grad_norm": 0.3540179133415222, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 250 + }, + { + "epoch": 0.21876314682372738, + "grad_norm": 0.3461375832557678, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 260 + }, + { + "epoch": 0.22717711400925536, + "grad_norm": 0.3436960279941559, + "learning_rate": 0.0002, + "loss": 1.8509, + "step": 270 + }, + { + "epoch": 0.23559108119478334, + "grad_norm": 0.35403719544410706, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 280 + }, + { + "epoch": 0.2440050483803113, + "grad_norm": 0.37142616510391235, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 290 + }, + { + "epoch": 0.2524190155658393, + "grad_norm": 0.3307955861091614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 300 + }, + { + "epoch": 0.2608329827513673, + "grad_norm": 0.32855314016342163, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 310 + }, + { + "epoch": 0.26924694993689524, + "grad_norm": 0.3299003839492798, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 320 + }, + { + "epoch": 0.27766091712242325, + "grad_norm": 0.44311287999153137, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 330 + }, + { + "epoch": 0.2860748843079512, + "grad_norm": 0.32989758253097534, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 340 + }, + { + "epoch": 0.2944888514934792, + "grad_norm": 0.34400200843811035, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 350 + }, + { + "epoch": 0.30290281867900715, + "grad_norm": 0.36286211013793945, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 360 + }, + { + "epoch": 0.31131678586453515, + "grad_norm": 0.406827837228775, + "learning_rate": 0.0002, + "loss": 1.8025, + "step": 370 + }, + { + "epoch": 0.3197307530500631, + "grad_norm": 0.36299195885658264, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 380 + }, + { + "epoch": 0.3281447202355911, + "grad_norm": 0.3477257192134857, + "learning_rate": 0.0002, + "loss": 1.837, + "step": 390 + }, + { + "epoch": 0.33655868742111905, + "grad_norm": 0.3730369210243225, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 400 + }, + { + "epoch": 0.34497265460664706, + "grad_norm": 0.4644559919834137, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 410 + }, + { + "epoch": 0.353386621792175, + "grad_norm": 0.406576544046402, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 420 + }, + { + "epoch": 0.361800588977703, + "grad_norm": 0.3612699508666992, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 430 + }, + { + "epoch": 0.37021455616323096, + "grad_norm": 0.3243742287158966, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 440 + }, + { + "epoch": 0.37862852334875896, + "grad_norm": 0.36671221256256104, + "learning_rate": 0.0002, + "loss": 1.8851, + "step": 450 + }, + { + "epoch": 0.3870424905342869, + "grad_norm": 0.3565002381801605, + "learning_rate": 0.0002, + "loss": 1.8853, + "step": 460 + }, + { + "epoch": 0.3954564577198149, + "grad_norm": 0.34630221128463745, + "learning_rate": 0.0002, + "loss": 1.8923, + "step": 470 + }, + { + "epoch": 0.40387042490534286, + "grad_norm": 0.3353537321090698, + "learning_rate": 0.0002, + "loss": 1.8234, + "step": 480 + }, + { + "epoch": 0.41228439209087087, + "grad_norm": 0.4015921950340271, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 490 + }, + { + "epoch": 0.4206983592763988, + "grad_norm": 0.5489419102668762, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 500 + }, + { + "epoch": 0.4291123264619268, + "grad_norm": 0.4193589985370636, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 510 + }, + { + "epoch": 0.43752629364745477, + "grad_norm": 0.3418922424316406, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 520 + }, + { + "epoch": 0.44594026083298277, + "grad_norm": 0.32668185234069824, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 530 + }, + { + "epoch": 0.4543542280185107, + "grad_norm": 0.3094325661659241, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 540 + }, + { + "epoch": 0.4627681952040387, + "grad_norm": 0.3743017315864563, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 550 + }, + { + "epoch": 0.47118216238956667, + "grad_norm": 0.3295630216598511, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 560 + }, + { + "epoch": 0.4795961295750947, + "grad_norm": 1.6124513149261475, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 570 + }, + { + "epoch": 0.4880100967606226, + "grad_norm": 0.3245585858821869, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 580 + }, + { + "epoch": 0.49642406394615063, + "grad_norm": 0.3332934081554413, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 590 + }, + { + "epoch": 0.5048380311316786, + "grad_norm": 0.3836138844490051, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 600 + }, + { + "epoch": 0.5132519983172066, + "grad_norm": 0.32953888177871704, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 610 + }, + { + "epoch": 0.5216659655027346, + "grad_norm": 0.36291512846946716, + "learning_rate": 0.0002, + "loss": 1.7729, + "step": 620 + }, + { + "epoch": 0.5300799326882625, + "grad_norm": 0.3237783908843994, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 630 + }, + { + "epoch": 0.5384938998737905, + "grad_norm": 0.38882696628570557, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 640 + }, + { + "epoch": 0.5469078670593185, + "grad_norm": 0.37821972370147705, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 650 + }, + { + "epoch": 0.5553218342448465, + "grad_norm": 0.3556285500526428, + "learning_rate": 0.0002, + "loss": 1.8075, + "step": 660 + }, + { + "epoch": 0.5637358014303744, + "grad_norm": 0.347499281167984, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 670 + }, + { + "epoch": 0.5721497686159024, + "grad_norm": 0.3176489472389221, + "learning_rate": 0.0002, + "loss": 1.8066, + "step": 680 + }, + { + "epoch": 0.5805637358014304, + "grad_norm": 0.30220088362693787, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 690 + }, + { + "epoch": 0.5889777029869584, + "grad_norm": 0.3711601793766022, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 700 + }, + { + "epoch": 0.5973916701724863, + "grad_norm": 0.3311759829521179, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 710 + }, + { + "epoch": 0.6058056373580143, + "grad_norm": 0.34824270009994507, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 720 + }, + { + "epoch": 0.6142196045435423, + "grad_norm": 0.29668381810188293, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 730 + }, + { + "epoch": 0.6226335717290703, + "grad_norm": 0.36087489128112793, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 740 + }, + { + "epoch": 0.6310475389145982, + "grad_norm": 0.31590089201927185, + "learning_rate": 0.0002, + "loss": 1.7956, + "step": 750 + }, + { + "epoch": 0.6394615061001262, + "grad_norm": 0.37632957100868225, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 760 + }, + { + "epoch": 0.6478754732856542, + "grad_norm": 0.3360748589038849, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 770 + }, + { + "epoch": 0.6562894404711822, + "grad_norm": 0.3420640528202057, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 780 + }, + { + "epoch": 0.6647034076567101, + "grad_norm": 0.5734959244728088, + "learning_rate": 0.0002, + "loss": 1.8353, + "step": 790 + }, + { + "epoch": 0.6731173748422381, + "grad_norm": 0.36440837383270264, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 800 + }, + { + "epoch": 0.6815313420277661, + "grad_norm": 0.3179708421230316, + "learning_rate": 0.0002, + "loss": 1.7532, + "step": 810 + }, + { + "epoch": 0.6899453092132941, + "grad_norm": 0.34122881293296814, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 820 + }, + { + "epoch": 0.698359276398822, + "grad_norm": 0.31886112689971924, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 830 + }, + { + "epoch": 0.70677324358435, + "grad_norm": 0.31782326102256775, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 840 + }, + { + "epoch": 0.715187210769878, + "grad_norm": 0.36052989959716797, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 850 + }, + { + "epoch": 0.723601177955406, + "grad_norm": 0.28946155309677124, + "learning_rate": 0.0002, + "loss": 1.7891, + "step": 860 + }, + { + "epoch": 0.7320151451409339, + "grad_norm": 0.3095663785934448, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 870 + }, + { + "epoch": 0.7404291123264619, + "grad_norm": 0.3317491412162781, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 880 + }, + { + "epoch": 0.7488430795119899, + "grad_norm": 0.31324660778045654, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 890 + }, + { + "epoch": 0.7572570466975179, + "grad_norm": 0.3290475606918335, + "learning_rate": 0.0002, + "loss": 1.8753, + "step": 900 + }, + { + "epoch": 0.7656710138830458, + "grad_norm": 0.35690343379974365, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 910 + }, + { + "epoch": 0.7740849810685738, + "grad_norm": 0.39558273553848267, + "learning_rate": 0.0002, + "loss": 1.826, + "step": 920 + }, + { + "epoch": 0.7824989482541018, + "grad_norm": 0.34254348278045654, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 930 + }, + { + "epoch": 0.7909129154396298, + "grad_norm": 0.3560165464878082, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 940 + }, + { + "epoch": 0.7993268826251577, + "grad_norm": 0.30693164467811584, + "learning_rate": 0.0002, + "loss": 1.7992, + "step": 950 + }, + { + "epoch": 0.8077408498106857, + "grad_norm": 0.3394823372364044, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 960 + }, + { + "epoch": 0.8161548169962137, + "grad_norm": 0.3741514980792999, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 970 + }, + { + "epoch": 0.8245687841817417, + "grad_norm": 0.3655228316783905, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 980 + }, + { + "epoch": 0.8329827513672696, + "grad_norm": 0.3586033880710602, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 990 + }, + { + "epoch": 0.8413967185527976, + "grad_norm": 0.3459678888320923, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1000 + }, + { + "epoch": 0.8498106857383256, + "grad_norm": 0.3184349834918976, + "learning_rate": 0.0002, + "loss": 1.8498, + "step": 1010 + }, + { + "epoch": 0.8582246529238536, + "grad_norm": 0.3099786043167114, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 1020 + }, + { + "epoch": 0.8666386201093815, + "grad_norm": 0.30300915241241455, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 1030 + }, + { + "epoch": 0.8750525872949095, + "grad_norm": 0.3128705620765686, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 1040 + }, + { + "epoch": 0.8834665544804375, + "grad_norm": 0.3336263597011566, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1050 + }, + { + "epoch": 0.8918805216659655, + "grad_norm": 0.3801328241825104, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 1060 + }, + { + "epoch": 0.9002944888514934, + "grad_norm": 0.3122096359729767, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 1070 + }, + { + "epoch": 0.9087084560370214, + "grad_norm": 0.35990869998931885, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 1080 + }, + { + "epoch": 0.9171224232225494, + "grad_norm": 0.3321819305419922, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1090 + }, + { + "epoch": 0.9255363904080774, + "grad_norm": 0.4202139377593994, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1100 + }, + { + "epoch": 0.9339503575936053, + "grad_norm": 0.32559722661972046, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 1110 + }, + { + "epoch": 0.9423643247791333, + "grad_norm": 0.3098459839820862, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 1120 + }, + { + "epoch": 0.9507782919646613, + "grad_norm": 0.33917108178138733, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1130 + }, + { + "epoch": 0.9591922591501894, + "grad_norm": 0.4055837094783783, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1140 + }, + { + "epoch": 0.9676062263357172, + "grad_norm": 0.32508623600006104, + "learning_rate": 0.0002, + "loss": 1.8259, + "step": 1150 + }, + { + "epoch": 0.9760201935212452, + "grad_norm": 0.30150601267814636, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9844341607067733, + "grad_norm": 0.3042563199996948, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 1170 + }, + { + "epoch": 0.9928481278923013, + "grad_norm": 0.33254584670066833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1180 + }, + { + "epoch": 0.9995793016407236, + "eval_loss": 1.8077726364135742, + "eval_runtime": 38.4359, + "eval_samples_per_second": 13.399, + "eval_steps_per_second": 1.691, + "step": 1188 + } + ], + "logging_steps": 10, + "max_steps": 9504, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.500111884071731e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1502d478cfbb1424f707352d007b740bde5e373 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df2b79d3acefeedef5a0229881de39ec68ef9b40046a60d7976a49f7e6b3b936 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b9eca8e044620be3ce058a1783f5a421a1736fd6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e67bfcb11314afe23d52fe35c156ccfcb48ab2e93bd10c7abf7c1376d9b958d +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b692e373cc096a7cdb6ea39962d7ac1d446bfdb1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0af334a78bf7194c5014e3070ce3111162e24cd28d906ba20333107b38b9932 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f95d736f5723087a478fbcbb62288c39adc449a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72c34d90a4f85cd0c9aa48d0b81fd5edad3e1427338f0942568ff10c9f0e75f8 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d6f390c6996e4c0fd28834642189b9c340eb0b1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38384347c3fbdf548f3429703d7a725557ee363e767816165662ae294c9e82ec +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0d106069a7038427e703fd55a559f0be01bf4f2b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/trainer_state.json @@ -0,0 +1,1708 @@ +{ + "best_metric": 1.8055059909820557, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 2377, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008413967185527976, + "grad_norm": 0.5458821654319763, + "learning_rate": 0.0002, + "loss": 2.56, + "step": 10 + }, + { + "epoch": 0.016827934371055953, + "grad_norm": 0.7293308973312378, + "learning_rate": 0.0002, + "loss": 2.3235, + "step": 20 + }, + { + "epoch": 0.02524190155658393, + "grad_norm": 0.47792306542396545, + "learning_rate": 0.0002, + "loss": 2.0815, + "step": 30 + }, + { + "epoch": 0.033655868742111905, + "grad_norm": 0.5944402813911438, + "learning_rate": 0.0002, + "loss": 1.9718, + "step": 40 + }, + { + "epoch": 0.04206983592763988, + "grad_norm": 0.5415359735488892, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 50 + }, + { + "epoch": 0.05048380311316786, + "grad_norm": 0.535713791847229, + "learning_rate": 0.0002, + "loss": 1.8953, + "step": 60 + }, + { + "epoch": 0.058897770298695834, + "grad_norm": 0.5184146761894226, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 70 + }, + { + "epoch": 0.06731173748422381, + "grad_norm": 0.458926796913147, + "learning_rate": 0.0002, + "loss": 1.8396, + "step": 80 + }, + { + "epoch": 0.07572570466975179, + "grad_norm": 0.4780142307281494, + "learning_rate": 0.0002, + "loss": 1.8677, + "step": 90 + }, + { + "epoch": 0.08413967185527976, + "grad_norm": 0.79965740442276, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 100 + }, + { + "epoch": 0.09255363904080774, + "grad_norm": 0.4498862028121948, + "learning_rate": 0.0002, + "loss": 1.9081, + "step": 110 + }, + { + "epoch": 0.10096760622633572, + "grad_norm": 0.39338430762290955, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 120 + }, + { + "epoch": 0.10938157341186369, + "grad_norm": 0.9588953852653503, + "learning_rate": 0.0002, + "loss": 1.8637, + "step": 130 + }, + { + "epoch": 0.11779554059739167, + "grad_norm": 0.41675639152526855, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 140 + }, + { + "epoch": 0.12620950778291964, + "grad_norm": 0.44519832730293274, + "learning_rate": 0.0002, + "loss": 1.8904, + "step": 150 + }, + { + "epoch": 0.13462347496844762, + "grad_norm": 0.4176260530948639, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 160 + }, + { + "epoch": 0.1430374421539756, + "grad_norm": 0.35840365290641785, + "learning_rate": 0.0002, + "loss": 1.8398, + "step": 170 + }, + { + "epoch": 0.15145140933950357, + "grad_norm": 0.3794495463371277, + "learning_rate": 0.0002, + "loss": 1.8666, + "step": 180 + }, + { + "epoch": 0.15986537652503155, + "grad_norm": 0.4563522934913635, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 190 + }, + { + "epoch": 0.16827934371055953, + "grad_norm": 0.37057486176490784, + "learning_rate": 0.0002, + "loss": 1.8893, + "step": 200 + }, + { + "epoch": 0.1766933108960875, + "grad_norm": 0.44081518054008484, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 210 + }, + { + "epoch": 0.18510727808161548, + "grad_norm": 0.46078577637672424, + "learning_rate": 0.0002, + "loss": 1.9048, + "step": 220 + }, + { + "epoch": 0.19352124526714345, + "grad_norm": 0.36132094264030457, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 230 + }, + { + "epoch": 0.20193521245267143, + "grad_norm": 0.3747289180755615, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 240 + }, + { + "epoch": 0.2103491796381994, + "grad_norm": 0.3540179133415222, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 250 + }, + { + "epoch": 0.21876314682372738, + "grad_norm": 0.3461375832557678, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 260 + }, + { + "epoch": 0.22717711400925536, + "grad_norm": 0.3436960279941559, + "learning_rate": 0.0002, + "loss": 1.8509, + "step": 270 + }, + { + "epoch": 0.23559108119478334, + "grad_norm": 0.35403719544410706, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 280 + }, + { + "epoch": 0.2440050483803113, + "grad_norm": 0.37142616510391235, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 290 + }, + { + "epoch": 0.2524190155658393, + "grad_norm": 0.3307955861091614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 300 + }, + { + "epoch": 0.2608329827513673, + "grad_norm": 0.32855314016342163, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 310 + }, + { + "epoch": 0.26924694993689524, + "grad_norm": 0.3299003839492798, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 320 + }, + { + "epoch": 0.27766091712242325, + "grad_norm": 0.44311287999153137, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 330 + }, + { + "epoch": 0.2860748843079512, + "grad_norm": 0.32989758253097534, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 340 + }, + { + "epoch": 0.2944888514934792, + "grad_norm": 0.34400200843811035, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 350 + }, + { + "epoch": 0.30290281867900715, + "grad_norm": 0.36286211013793945, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 360 + }, + { + "epoch": 0.31131678586453515, + "grad_norm": 0.406827837228775, + "learning_rate": 0.0002, + "loss": 1.8025, + "step": 370 + }, + { + "epoch": 0.3197307530500631, + "grad_norm": 0.36299195885658264, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 380 + }, + { + "epoch": 0.3281447202355911, + "grad_norm": 0.3477257192134857, + "learning_rate": 0.0002, + "loss": 1.837, + "step": 390 + }, + { + "epoch": 0.33655868742111905, + "grad_norm": 0.3730369210243225, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 400 + }, + { + "epoch": 0.34497265460664706, + "grad_norm": 0.4644559919834137, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 410 + }, + { + "epoch": 0.353386621792175, + "grad_norm": 0.406576544046402, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 420 + }, + { + "epoch": 0.361800588977703, + "grad_norm": 0.3612699508666992, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 430 + }, + { + "epoch": 0.37021455616323096, + "grad_norm": 0.3243742287158966, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 440 + }, + { + "epoch": 0.37862852334875896, + "grad_norm": 0.36671221256256104, + "learning_rate": 0.0002, + "loss": 1.8851, + "step": 450 + }, + { + "epoch": 0.3870424905342869, + "grad_norm": 0.3565002381801605, + "learning_rate": 0.0002, + "loss": 1.8853, + "step": 460 + }, + { + "epoch": 0.3954564577198149, + "grad_norm": 0.34630221128463745, + "learning_rate": 0.0002, + "loss": 1.8923, + "step": 470 + }, + { + "epoch": 0.40387042490534286, + "grad_norm": 0.3353537321090698, + "learning_rate": 0.0002, + "loss": 1.8234, + "step": 480 + }, + { + "epoch": 0.41228439209087087, + "grad_norm": 0.4015921950340271, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 490 + }, + { + "epoch": 0.4206983592763988, + "grad_norm": 0.5489419102668762, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 500 + }, + { + "epoch": 0.4291123264619268, + "grad_norm": 0.4193589985370636, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 510 + }, + { + "epoch": 0.43752629364745477, + "grad_norm": 0.3418922424316406, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 520 + }, + { + "epoch": 0.44594026083298277, + "grad_norm": 0.32668185234069824, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 530 + }, + { + "epoch": 0.4543542280185107, + "grad_norm": 0.3094325661659241, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 540 + }, + { + "epoch": 0.4627681952040387, + "grad_norm": 0.3743017315864563, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 550 + }, + { + "epoch": 0.47118216238956667, + "grad_norm": 0.3295630216598511, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 560 + }, + { + "epoch": 0.4795961295750947, + "grad_norm": 1.6124513149261475, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 570 + }, + { + "epoch": 0.4880100967606226, + "grad_norm": 0.3245585858821869, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 580 + }, + { + "epoch": 0.49642406394615063, + "grad_norm": 0.3332934081554413, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 590 + }, + { + "epoch": 0.5048380311316786, + "grad_norm": 0.3836138844490051, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 600 + }, + { + "epoch": 0.5132519983172066, + "grad_norm": 0.32953888177871704, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 610 + }, + { + "epoch": 0.5216659655027346, + "grad_norm": 0.36291512846946716, + "learning_rate": 0.0002, + "loss": 1.7729, + "step": 620 + }, + { + "epoch": 0.5300799326882625, + "grad_norm": 0.3237783908843994, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 630 + }, + { + "epoch": 0.5384938998737905, + "grad_norm": 0.38882696628570557, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 640 + }, + { + "epoch": 0.5469078670593185, + "grad_norm": 0.37821972370147705, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 650 + }, + { + "epoch": 0.5553218342448465, + "grad_norm": 0.3556285500526428, + "learning_rate": 0.0002, + "loss": 1.8075, + "step": 660 + }, + { + "epoch": 0.5637358014303744, + "grad_norm": 0.347499281167984, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 670 + }, + { + "epoch": 0.5721497686159024, + "grad_norm": 0.3176489472389221, + "learning_rate": 0.0002, + "loss": 1.8066, + "step": 680 + }, + { + "epoch": 0.5805637358014304, + "grad_norm": 0.30220088362693787, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 690 + }, + { + "epoch": 0.5889777029869584, + "grad_norm": 0.3711601793766022, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 700 + }, + { + "epoch": 0.5973916701724863, + "grad_norm": 0.3311759829521179, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 710 + }, + { + "epoch": 0.6058056373580143, + "grad_norm": 0.34824270009994507, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 720 + }, + { + "epoch": 0.6142196045435423, + "grad_norm": 0.29668381810188293, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 730 + }, + { + "epoch": 0.6226335717290703, + "grad_norm": 0.36087489128112793, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 740 + }, + { + "epoch": 0.6310475389145982, + "grad_norm": 0.31590089201927185, + "learning_rate": 0.0002, + "loss": 1.7956, + "step": 750 + }, + { + "epoch": 0.6394615061001262, + "grad_norm": 0.37632957100868225, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 760 + }, + { + "epoch": 0.6478754732856542, + "grad_norm": 0.3360748589038849, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 770 + }, + { + "epoch": 0.6562894404711822, + "grad_norm": 0.3420640528202057, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 780 + }, + { + "epoch": 0.6647034076567101, + "grad_norm": 0.5734959244728088, + "learning_rate": 0.0002, + "loss": 1.8353, + "step": 790 + }, + { + "epoch": 0.6731173748422381, + "grad_norm": 0.36440837383270264, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 800 + }, + { + "epoch": 0.6815313420277661, + "grad_norm": 0.3179708421230316, + "learning_rate": 0.0002, + "loss": 1.7532, + "step": 810 + }, + { + "epoch": 0.6899453092132941, + "grad_norm": 0.34122881293296814, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 820 + }, + { + "epoch": 0.698359276398822, + "grad_norm": 0.31886112689971924, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 830 + }, + { + "epoch": 0.70677324358435, + "grad_norm": 0.31782326102256775, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 840 + }, + { + "epoch": 0.715187210769878, + "grad_norm": 0.36052989959716797, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 850 + }, + { + "epoch": 0.723601177955406, + "grad_norm": 0.28946155309677124, + "learning_rate": 0.0002, + "loss": 1.7891, + "step": 860 + }, + { + "epoch": 0.7320151451409339, + "grad_norm": 0.3095663785934448, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 870 + }, + { + "epoch": 0.7404291123264619, + "grad_norm": 0.3317491412162781, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 880 + }, + { + "epoch": 0.7488430795119899, + "grad_norm": 0.31324660778045654, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 890 + }, + { + "epoch": 0.7572570466975179, + "grad_norm": 0.3290475606918335, + "learning_rate": 0.0002, + "loss": 1.8753, + "step": 900 + }, + { + "epoch": 0.7656710138830458, + "grad_norm": 0.35690343379974365, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 910 + }, + { + "epoch": 0.7740849810685738, + "grad_norm": 0.39558273553848267, + "learning_rate": 0.0002, + "loss": 1.826, + "step": 920 + }, + { + "epoch": 0.7824989482541018, + "grad_norm": 0.34254348278045654, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 930 + }, + { + "epoch": 0.7909129154396298, + "grad_norm": 0.3560165464878082, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 940 + }, + { + "epoch": 0.7993268826251577, + "grad_norm": 0.30693164467811584, + "learning_rate": 0.0002, + "loss": 1.7992, + "step": 950 + }, + { + "epoch": 0.8077408498106857, + "grad_norm": 0.3394823372364044, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 960 + }, + { + "epoch": 0.8161548169962137, + "grad_norm": 0.3741514980792999, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 970 + }, + { + "epoch": 0.8245687841817417, + "grad_norm": 0.3655228316783905, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 980 + }, + { + "epoch": 0.8329827513672696, + "grad_norm": 0.3586033880710602, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 990 + }, + { + "epoch": 0.8413967185527976, + "grad_norm": 0.3459678888320923, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1000 + }, + { + "epoch": 0.8498106857383256, + "grad_norm": 0.3184349834918976, + "learning_rate": 0.0002, + "loss": 1.8498, + "step": 1010 + }, + { + "epoch": 0.8582246529238536, + "grad_norm": 0.3099786043167114, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 1020 + }, + { + "epoch": 0.8666386201093815, + "grad_norm": 0.30300915241241455, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 1030 + }, + { + "epoch": 0.8750525872949095, + "grad_norm": 0.3128705620765686, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 1040 + }, + { + "epoch": 0.8834665544804375, + "grad_norm": 0.3336263597011566, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1050 + }, + { + "epoch": 0.8918805216659655, + "grad_norm": 0.3801328241825104, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 1060 + }, + { + "epoch": 0.9002944888514934, + "grad_norm": 0.3122096359729767, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 1070 + }, + { + "epoch": 0.9087084560370214, + "grad_norm": 0.35990869998931885, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 1080 + }, + { + "epoch": 0.9171224232225494, + "grad_norm": 0.3321819305419922, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1090 + }, + { + "epoch": 0.9255363904080774, + "grad_norm": 0.4202139377593994, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1100 + }, + { + "epoch": 0.9339503575936053, + "grad_norm": 0.32559722661972046, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 1110 + }, + { + "epoch": 0.9423643247791333, + "grad_norm": 0.3098459839820862, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 1120 + }, + { + "epoch": 0.9507782919646613, + "grad_norm": 0.33917108178138733, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1130 + }, + { + "epoch": 0.9591922591501894, + "grad_norm": 0.4055837094783783, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1140 + }, + { + "epoch": 0.9676062263357172, + "grad_norm": 0.32508623600006104, + "learning_rate": 0.0002, + "loss": 1.8259, + "step": 1150 + }, + { + "epoch": 0.9760201935212452, + "grad_norm": 0.30150601267814636, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9844341607067733, + "grad_norm": 0.3042563199996948, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 1170 + }, + { + "epoch": 0.9928481278923013, + "grad_norm": 0.33254584670066833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1180 + }, + { + "epoch": 0.9995793016407236, + "eval_loss": 1.8077726364135742, + "eval_runtime": 38.4359, + "eval_samples_per_second": 13.399, + "eval_steps_per_second": 1.691, + "step": 1188 + }, + { + "epoch": 1.0012620950778293, + "grad_norm": 0.35073035955429077, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 1190 + }, + { + "epoch": 1.0096760622633572, + "grad_norm": 0.3217269778251648, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1200 + }, + { + "epoch": 1.018090029448885, + "grad_norm": 0.3635033369064331, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1210 + }, + { + "epoch": 1.0265039966344132, + "grad_norm": 0.32468414306640625, + "learning_rate": 0.0002, + "loss": 1.6949, + "step": 1220 + }, + { + "epoch": 1.034917963819941, + "grad_norm": 0.3307163417339325, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 1230 + }, + { + "epoch": 1.0433319310054692, + "grad_norm": 0.34381359815597534, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 1240 + }, + { + "epoch": 1.051745898190997, + "grad_norm": 0.35874804854393005, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 1250 + }, + { + "epoch": 1.060159865376525, + "grad_norm": 0.3615919351577759, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1260 + }, + { + "epoch": 1.068573832562053, + "grad_norm": 0.32835808396339417, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1270 + }, + { + "epoch": 1.076987799747581, + "grad_norm": 0.3876388370990753, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1280 + }, + { + "epoch": 1.0854017669331089, + "grad_norm": 0.39895930886268616, + "learning_rate": 0.0002, + "loss": 1.7442, + "step": 1290 + }, + { + "epoch": 1.093815734118637, + "grad_norm": 0.39081698656082153, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 1300 + }, + { + "epoch": 1.1022297013041649, + "grad_norm": 0.39974215626716614, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1310 + }, + { + "epoch": 1.110643668489693, + "grad_norm": 0.3887332081794739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1320 + }, + { + "epoch": 1.1190576356752209, + "grad_norm": 0.36216408014297485, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1330 + }, + { + "epoch": 1.1274716028607488, + "grad_norm": 0.36979028582572937, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1340 + }, + { + "epoch": 1.1358855700462769, + "grad_norm": 0.34052133560180664, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 1350 + }, + { + "epoch": 1.1442995372318048, + "grad_norm": 0.3467716574668884, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1360 + }, + { + "epoch": 1.1527135044173327, + "grad_norm": 0.35528799891471863, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1370 + }, + { + "epoch": 1.1611274716028608, + "grad_norm": 0.36282262206077576, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 1380 + }, + { + "epoch": 1.1695414387883887, + "grad_norm": 0.37355899810791016, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1390 + }, + { + "epoch": 1.1779554059739168, + "grad_norm": 0.37292736768722534, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1400 + }, + { + "epoch": 1.1863693731594447, + "grad_norm": 0.5892812013626099, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1410 + }, + { + "epoch": 1.1947833403449726, + "grad_norm": 0.3712292015552521, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 1420 + }, + { + "epoch": 1.2031973075305007, + "grad_norm": 0.3349577486515045, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1430 + }, + { + "epoch": 1.2116112747160286, + "grad_norm": 0.32591062784194946, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1440 + }, + { + "epoch": 1.2200252419015567, + "grad_norm": 0.3840635418891907, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 1450 + }, + { + "epoch": 1.2284392090870846, + "grad_norm": 0.37238365411758423, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1460 + }, + { + "epoch": 1.2368531762726125, + "grad_norm": 0.3731217682361603, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 1470 + }, + { + "epoch": 1.2452671434581406, + "grad_norm": 0.3318967819213867, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 1480 + }, + { + "epoch": 1.2536811106436685, + "grad_norm": 0.3784034848213196, + "learning_rate": 0.0002, + "loss": 1.7463, + "step": 1490 + }, + { + "epoch": 1.2620950778291964, + "grad_norm": 0.3541383147239685, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 1500 + }, + { + "epoch": 1.2705090450147245, + "grad_norm": 0.35312485694885254, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1510 + }, + { + "epoch": 1.2789230122002524, + "grad_norm": 0.35272929072380066, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1520 + }, + { + "epoch": 1.2873369793857803, + "grad_norm": 0.40988272428512573, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1530 + }, + { + "epoch": 1.2957509465713084, + "grad_norm": 0.3543946146965027, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1540 + }, + { + "epoch": 1.3041649137568363, + "grad_norm": 0.35639145970344543, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1550 + }, + { + "epoch": 1.3125788809423642, + "grad_norm": 0.3290826678276062, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1560 + }, + { + "epoch": 1.3209928481278923, + "grad_norm": 0.39264336228370667, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 1570 + }, + { + "epoch": 1.3294068153134202, + "grad_norm": 0.5390415191650391, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1580 + }, + { + "epoch": 1.3378207824989483, + "grad_norm": 0.5188116431236267, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1590 + }, + { + "epoch": 1.3462347496844762, + "grad_norm": 0.37445148825645447, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 1600 + }, + { + "epoch": 1.3546487168700043, + "grad_norm": 0.3296085298061371, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1610 + }, + { + "epoch": 1.3630626840555322, + "grad_norm": 0.39879581332206726, + "learning_rate": 0.0002, + "loss": 1.8107, + "step": 1620 + }, + { + "epoch": 1.37147665124106, + "grad_norm": 0.36092764139175415, + "learning_rate": 0.0002, + "loss": 1.6744, + "step": 1630 + }, + { + "epoch": 1.3798906184265882, + "grad_norm": 0.37011823058128357, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 1640 + }, + { + "epoch": 1.3883045856121161, + "grad_norm": 0.40863534808158875, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1650 + }, + { + "epoch": 1.396718552797644, + "grad_norm": 0.337001770734787, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 1660 + }, + { + "epoch": 1.4051325199831721, + "grad_norm": 0.35596707463264465, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1670 + }, + { + "epoch": 1.4135464871687, + "grad_norm": 0.3857671916484833, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 1680 + }, + { + "epoch": 1.421960454354228, + "grad_norm": 0.419502317905426, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 1690 + }, + { + "epoch": 1.430374421539756, + "grad_norm": 0.35459452867507935, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 1700 + }, + { + "epoch": 1.438788388725284, + "grad_norm": 0.37246978282928467, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 1710 + }, + { + "epoch": 1.4472023559108118, + "grad_norm": 0.33091893792152405, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 1720 + }, + { + "epoch": 1.45561632309634, + "grad_norm": 0.37029674649238586, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1730 + }, + { + "epoch": 1.4640302902818678, + "grad_norm": 0.374025821685791, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1740 + }, + { + "epoch": 1.472444257467396, + "grad_norm": 0.3416315019130707, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1750 + }, + { + "epoch": 1.4808582246529238, + "grad_norm": 0.36502841114997864, + "learning_rate": 0.0002, + "loss": 1.7093, + "step": 1760 + }, + { + "epoch": 1.489272191838452, + "grad_norm": 0.35458803176879883, + "learning_rate": 0.0002, + "loss": 1.6597, + "step": 1770 + }, + { + "epoch": 1.4976861590239798, + "grad_norm": 0.4462839663028717, + "learning_rate": 0.0002, + "loss": 1.675, + "step": 1780 + }, + { + "epoch": 1.5061001262095077, + "grad_norm": 0.34836092591285706, + "learning_rate": 0.0002, + "loss": 1.7267, + "step": 1790 + }, + { + "epoch": 1.5145140933950358, + "grad_norm": 0.3445749282836914, + "learning_rate": 0.0002, + "loss": 1.7295, + "step": 1800 + }, + { + "epoch": 1.5229280605805637, + "grad_norm": 0.36012160778045654, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1810 + }, + { + "epoch": 1.5313420277660916, + "grad_norm": 0.4052616059780121, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 1820 + }, + { + "epoch": 1.5397559949516197, + "grad_norm": 0.3966905474662781, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1830 + }, + { + "epoch": 1.5481699621371476, + "grad_norm": 0.35028719902038574, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1840 + }, + { + "epoch": 1.5565839293226755, + "grad_norm": 0.3936742842197418, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1850 + }, + { + "epoch": 1.5649978965082036, + "grad_norm": 0.34473296999931335, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 1860 + }, + { + "epoch": 1.5734118636937318, + "grad_norm": 0.4328365623950958, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1870 + }, + { + "epoch": 1.5818258308792594, + "grad_norm": 0.3566315472126007, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1880 + }, + { + "epoch": 1.5902397980647875, + "grad_norm": 0.3301256597042084, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 1890 + }, + { + "epoch": 1.5986537652503157, + "grad_norm": 0.3743041455745697, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 1900 + }, + { + "epoch": 1.6070677324358436, + "grad_norm": 0.3735344707965851, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 1910 + }, + { + "epoch": 1.6154816996213714, + "grad_norm": 0.42191144824028015, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1920 + }, + { + "epoch": 1.6238956668068996, + "grad_norm": 0.3787207305431366, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1930 + }, + { + "epoch": 1.6323096339924275, + "grad_norm": 0.35647350549697876, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1940 + }, + { + "epoch": 1.6407236011779553, + "grad_norm": 0.39791446924209595, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 1950 + }, + { + "epoch": 1.6491375683634835, + "grad_norm": 0.37341275811195374, + "learning_rate": 0.0002, + "loss": 1.7293, + "step": 1960 + }, + { + "epoch": 1.6575515355490114, + "grad_norm": 0.3722686469554901, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1970 + }, + { + "epoch": 1.6659655027345392, + "grad_norm": 0.37467387318611145, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 1980 + }, + { + "epoch": 1.6743794699200674, + "grad_norm": 0.37109461426734924, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 1990 + }, + { + "epoch": 1.6827934371055953, + "grad_norm": 0.4008837044239044, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 2000 + }, + { + "epoch": 1.6912074042911232, + "grad_norm": 0.3316999673843384, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2010 + }, + { + "epoch": 1.6996213714766513, + "grad_norm": 0.3683805465698242, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 2020 + }, + { + "epoch": 1.7080353386621794, + "grad_norm": 0.4163658320903778, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2030 + }, + { + "epoch": 1.716449305847707, + "grad_norm": 0.4245431125164032, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 2040 + }, + { + "epoch": 1.7248632730332352, + "grad_norm": 0.36732038855552673, + "learning_rate": 0.0002, + "loss": 1.7184, + "step": 2050 + }, + { + "epoch": 1.7332772402187633, + "grad_norm": 0.34981656074523926, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 2060 + }, + { + "epoch": 1.7416912074042912, + "grad_norm": 0.38588812947273254, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 2070 + }, + { + "epoch": 1.750105174589819, + "grad_norm": 0.39914557337760925, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 2080 + }, + { + "epoch": 1.7585191417753472, + "grad_norm": 0.36068692803382874, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 2090 + }, + { + "epoch": 1.766933108960875, + "grad_norm": 0.3983287215232849, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 2100 + }, + { + "epoch": 1.775347076146403, + "grad_norm": 0.45008400082588196, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2110 + }, + { + "epoch": 1.783761043331931, + "grad_norm": 0.3618052303791046, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2120 + }, + { + "epoch": 1.792175010517459, + "grad_norm": 0.38745400309562683, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 2130 + }, + { + "epoch": 1.8005889777029869, + "grad_norm": 0.3413826525211334, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 2140 + }, + { + "epoch": 1.809002944888515, + "grad_norm": 0.35983747243881226, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 2150 + }, + { + "epoch": 1.8174169120740429, + "grad_norm": 0.40926849842071533, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 2160 + }, + { + "epoch": 1.8258308792595708, + "grad_norm": 0.3543093800544739, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2170 + }, + { + "epoch": 1.8342448464450989, + "grad_norm": 0.42690935730934143, + "learning_rate": 0.0002, + "loss": 1.7812, + "step": 2180 + }, + { + "epoch": 1.842658813630627, + "grad_norm": 0.40282756090164185, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 2190 + }, + { + "epoch": 1.8510727808161547, + "grad_norm": 0.36568400263786316, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 2200 + }, + { + "epoch": 1.8594867480016828, + "grad_norm": 0.43159013986587524, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 2210 + }, + { + "epoch": 1.867900715187211, + "grad_norm": 0.3554118573665619, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 2220 + }, + { + "epoch": 1.8763146823727388, + "grad_norm": 0.43349072337150574, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 2230 + }, + { + "epoch": 1.8847286495582667, + "grad_norm": 0.36486536264419556, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 2240 + }, + { + "epoch": 1.8931426167437948, + "grad_norm": 0.39260047674179077, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 2250 + }, + { + "epoch": 1.9015565839293227, + "grad_norm": 0.3741776943206787, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 2260 + }, + { + "epoch": 1.9099705511148506, + "grad_norm": 0.3961946964263916, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2270 + }, + { + "epoch": 1.9183845183003787, + "grad_norm": 0.3659731149673462, + "learning_rate": 0.0002, + "loss": 1.737, + "step": 2280 + }, + { + "epoch": 1.9267984854859066, + "grad_norm": 0.34744107723236084, + "learning_rate": 0.0002, + "loss": 1.7342, + "step": 2290 + }, + { + "epoch": 1.9352124526714345, + "grad_norm": 0.3607442378997803, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2300 + }, + { + "epoch": 1.9436264198569626, + "grad_norm": 0.331464558839798, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 2310 + }, + { + "epoch": 1.9520403870424905, + "grad_norm": 0.3904414474964142, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2320 + }, + { + "epoch": 1.9604543542280184, + "grad_norm": 0.37584832310676575, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 2330 + }, + { + "epoch": 1.9688683214135465, + "grad_norm": 0.3698684275150299, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 2340 + }, + { + "epoch": 1.9772822885990746, + "grad_norm": 0.40571412444114685, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2350 + }, + { + "epoch": 1.9856962557846023, + "grad_norm": 0.40059587359428406, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 2360 + }, + { + "epoch": 1.9941102229701304, + "grad_norm": 0.4168248474597931, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2370 + }, + { + "epoch": 2.0, + "eval_loss": 1.8055059909820557, + "eval_runtime": 38.422, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.692, + "step": 2377 + } + ], + "logging_steps": 10, + "max_steps": 9504, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1000223768143462e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1502d478cfbb1424f707352d007b740bde5e373 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df2b79d3acefeedef5a0229881de39ec68ef9b40046a60d7976a49f7e6b3b936 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f4f410418768a7b413f16514274c7bad022e7c28 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3da09c9b7c982009a6041e24fa2892e5e37d74df4f04aaeb442193dbdfdda14 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ca02432285bc8788092b94c9075679d552e1e09 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7fdfc4dca404e4173ddda2b81c3a0777828734ce1f8a4fb95728878ddd8e176 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5a9c8e92ef5359d5c5fe05b86784ee2d1aa70e08 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7d31c1acccbc5ba723c396072530ed85853ea77c94cb42addb003ff299ebdd9 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddb4edffd36393473f879801e11c72eab1e7bcc7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0a1123053227ea54b69507b378d69d72b7ad22b2d8e1e92614975b757ee988f +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8ad93920afd45593d9d92e25c6612506f4212e9e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/trainer_state.json @@ -0,0 +1,2549 @@ +{ + "best_metric": 1.8055059909820557, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377", + "epoch": 2.9995793016407237, + "eval_steps": 10, + "global_step": 3565, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008413967185527976, + "grad_norm": 0.5458821654319763, + "learning_rate": 0.0002, + "loss": 2.56, + "step": 10 + }, + { + "epoch": 0.016827934371055953, + "grad_norm": 0.7293308973312378, + "learning_rate": 0.0002, + "loss": 2.3235, + "step": 20 + }, + { + "epoch": 0.02524190155658393, + "grad_norm": 0.47792306542396545, + "learning_rate": 0.0002, + "loss": 2.0815, + "step": 30 + }, + { + "epoch": 0.033655868742111905, + "grad_norm": 0.5944402813911438, + "learning_rate": 0.0002, + "loss": 1.9718, + "step": 40 + }, + { + "epoch": 0.04206983592763988, + "grad_norm": 0.5415359735488892, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 50 + }, + { + "epoch": 0.05048380311316786, + "grad_norm": 0.535713791847229, + "learning_rate": 0.0002, + "loss": 1.8953, + "step": 60 + }, + { + "epoch": 0.058897770298695834, + "grad_norm": 0.5184146761894226, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 70 + }, + { + "epoch": 0.06731173748422381, + "grad_norm": 0.458926796913147, + "learning_rate": 0.0002, + "loss": 1.8396, + "step": 80 + }, + { + "epoch": 0.07572570466975179, + "grad_norm": 0.4780142307281494, + "learning_rate": 0.0002, + "loss": 1.8677, + "step": 90 + }, + { + "epoch": 0.08413967185527976, + "grad_norm": 0.79965740442276, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 100 + }, + { + "epoch": 0.09255363904080774, + "grad_norm": 0.4498862028121948, + "learning_rate": 0.0002, + "loss": 1.9081, + "step": 110 + }, + { + "epoch": 0.10096760622633572, + "grad_norm": 0.39338430762290955, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 120 + }, + { + "epoch": 0.10938157341186369, + "grad_norm": 0.9588953852653503, + "learning_rate": 0.0002, + "loss": 1.8637, + "step": 130 + }, + { + "epoch": 0.11779554059739167, + "grad_norm": 0.41675639152526855, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 140 + }, + { + "epoch": 0.12620950778291964, + "grad_norm": 0.44519832730293274, + "learning_rate": 0.0002, + "loss": 1.8904, + "step": 150 + }, + { + "epoch": 0.13462347496844762, + "grad_norm": 0.4176260530948639, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 160 + }, + { + "epoch": 0.1430374421539756, + "grad_norm": 0.35840365290641785, + "learning_rate": 0.0002, + "loss": 1.8398, + "step": 170 + }, + { + "epoch": 0.15145140933950357, + "grad_norm": 0.3794495463371277, + "learning_rate": 0.0002, + "loss": 1.8666, + "step": 180 + }, + { + "epoch": 0.15986537652503155, + "grad_norm": 0.4563522934913635, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 190 + }, + { + "epoch": 0.16827934371055953, + "grad_norm": 0.37057486176490784, + "learning_rate": 0.0002, + "loss": 1.8893, + "step": 200 + }, + { + "epoch": 0.1766933108960875, + "grad_norm": 0.44081518054008484, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 210 + }, + { + "epoch": 0.18510727808161548, + "grad_norm": 0.46078577637672424, + "learning_rate": 0.0002, + "loss": 1.9048, + "step": 220 + }, + { + "epoch": 0.19352124526714345, + "grad_norm": 0.36132094264030457, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 230 + }, + { + "epoch": 0.20193521245267143, + "grad_norm": 0.3747289180755615, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 240 + }, + { + "epoch": 0.2103491796381994, + "grad_norm": 0.3540179133415222, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 250 + }, + { + "epoch": 0.21876314682372738, + "grad_norm": 0.3461375832557678, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 260 + }, + { + "epoch": 0.22717711400925536, + "grad_norm": 0.3436960279941559, + "learning_rate": 0.0002, + "loss": 1.8509, + "step": 270 + }, + { + "epoch": 0.23559108119478334, + "grad_norm": 0.35403719544410706, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 280 + }, + { + "epoch": 0.2440050483803113, + "grad_norm": 0.37142616510391235, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 290 + }, + { + "epoch": 0.2524190155658393, + "grad_norm": 0.3307955861091614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 300 + }, + { + "epoch": 0.2608329827513673, + "grad_norm": 0.32855314016342163, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 310 + }, + { + "epoch": 0.26924694993689524, + "grad_norm": 0.3299003839492798, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 320 + }, + { + "epoch": 0.27766091712242325, + "grad_norm": 0.44311287999153137, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 330 + }, + { + "epoch": 0.2860748843079512, + "grad_norm": 0.32989758253097534, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 340 + }, + { + "epoch": 0.2944888514934792, + "grad_norm": 0.34400200843811035, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 350 + }, + { + "epoch": 0.30290281867900715, + "grad_norm": 0.36286211013793945, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 360 + }, + { + "epoch": 0.31131678586453515, + "grad_norm": 0.406827837228775, + "learning_rate": 0.0002, + "loss": 1.8025, + "step": 370 + }, + { + "epoch": 0.3197307530500631, + "grad_norm": 0.36299195885658264, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 380 + }, + { + "epoch": 0.3281447202355911, + "grad_norm": 0.3477257192134857, + "learning_rate": 0.0002, + "loss": 1.837, + "step": 390 + }, + { + "epoch": 0.33655868742111905, + "grad_norm": 0.3730369210243225, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 400 + }, + { + "epoch": 0.34497265460664706, + "grad_norm": 0.4644559919834137, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 410 + }, + { + "epoch": 0.353386621792175, + "grad_norm": 0.406576544046402, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 420 + }, + { + "epoch": 0.361800588977703, + "grad_norm": 0.3612699508666992, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 430 + }, + { + "epoch": 0.37021455616323096, + "grad_norm": 0.3243742287158966, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 440 + }, + { + "epoch": 0.37862852334875896, + "grad_norm": 0.36671221256256104, + "learning_rate": 0.0002, + "loss": 1.8851, + "step": 450 + }, + { + "epoch": 0.3870424905342869, + "grad_norm": 0.3565002381801605, + "learning_rate": 0.0002, + "loss": 1.8853, + "step": 460 + }, + { + "epoch": 0.3954564577198149, + "grad_norm": 0.34630221128463745, + "learning_rate": 0.0002, + "loss": 1.8923, + "step": 470 + }, + { + "epoch": 0.40387042490534286, + "grad_norm": 0.3353537321090698, + "learning_rate": 0.0002, + "loss": 1.8234, + "step": 480 + }, + { + "epoch": 0.41228439209087087, + "grad_norm": 0.4015921950340271, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 490 + }, + { + "epoch": 0.4206983592763988, + "grad_norm": 0.5489419102668762, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 500 + }, + { + "epoch": 0.4291123264619268, + "grad_norm": 0.4193589985370636, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 510 + }, + { + "epoch": 0.43752629364745477, + "grad_norm": 0.3418922424316406, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 520 + }, + { + "epoch": 0.44594026083298277, + "grad_norm": 0.32668185234069824, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 530 + }, + { + "epoch": 0.4543542280185107, + "grad_norm": 0.3094325661659241, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 540 + }, + { + "epoch": 0.4627681952040387, + "grad_norm": 0.3743017315864563, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 550 + }, + { + "epoch": 0.47118216238956667, + "grad_norm": 0.3295630216598511, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 560 + }, + { + "epoch": 0.4795961295750947, + "grad_norm": 1.6124513149261475, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 570 + }, + { + "epoch": 0.4880100967606226, + "grad_norm": 0.3245585858821869, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 580 + }, + { + "epoch": 0.49642406394615063, + "grad_norm": 0.3332934081554413, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 590 + }, + { + "epoch": 0.5048380311316786, + "grad_norm": 0.3836138844490051, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 600 + }, + { + "epoch": 0.5132519983172066, + "grad_norm": 0.32953888177871704, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 610 + }, + { + "epoch": 0.5216659655027346, + "grad_norm": 0.36291512846946716, + "learning_rate": 0.0002, + "loss": 1.7729, + "step": 620 + }, + { + "epoch": 0.5300799326882625, + "grad_norm": 0.3237783908843994, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 630 + }, + { + "epoch": 0.5384938998737905, + "grad_norm": 0.38882696628570557, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 640 + }, + { + "epoch": 0.5469078670593185, + "grad_norm": 0.37821972370147705, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 650 + }, + { + "epoch": 0.5553218342448465, + "grad_norm": 0.3556285500526428, + "learning_rate": 0.0002, + "loss": 1.8075, + "step": 660 + }, + { + "epoch": 0.5637358014303744, + "grad_norm": 0.347499281167984, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 670 + }, + { + "epoch": 0.5721497686159024, + "grad_norm": 0.3176489472389221, + "learning_rate": 0.0002, + "loss": 1.8066, + "step": 680 + }, + { + "epoch": 0.5805637358014304, + "grad_norm": 0.30220088362693787, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 690 + }, + { + "epoch": 0.5889777029869584, + "grad_norm": 0.3711601793766022, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 700 + }, + { + "epoch": 0.5973916701724863, + "grad_norm": 0.3311759829521179, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 710 + }, + { + "epoch": 0.6058056373580143, + "grad_norm": 0.34824270009994507, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 720 + }, + { + "epoch": 0.6142196045435423, + "grad_norm": 0.29668381810188293, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 730 + }, + { + "epoch": 0.6226335717290703, + "grad_norm": 0.36087489128112793, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 740 + }, + { + "epoch": 0.6310475389145982, + "grad_norm": 0.31590089201927185, + "learning_rate": 0.0002, + "loss": 1.7956, + "step": 750 + }, + { + "epoch": 0.6394615061001262, + "grad_norm": 0.37632957100868225, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 760 + }, + { + "epoch": 0.6478754732856542, + "grad_norm": 0.3360748589038849, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 770 + }, + { + "epoch": 0.6562894404711822, + "grad_norm": 0.3420640528202057, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 780 + }, + { + "epoch": 0.6647034076567101, + "grad_norm": 0.5734959244728088, + "learning_rate": 0.0002, + "loss": 1.8353, + "step": 790 + }, + { + "epoch": 0.6731173748422381, + "grad_norm": 0.36440837383270264, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 800 + }, + { + "epoch": 0.6815313420277661, + "grad_norm": 0.3179708421230316, + "learning_rate": 0.0002, + "loss": 1.7532, + "step": 810 + }, + { + "epoch": 0.6899453092132941, + "grad_norm": 0.34122881293296814, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 820 + }, + { + "epoch": 0.698359276398822, + "grad_norm": 0.31886112689971924, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 830 + }, + { + "epoch": 0.70677324358435, + "grad_norm": 0.31782326102256775, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 840 + }, + { + "epoch": 0.715187210769878, + "grad_norm": 0.36052989959716797, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 850 + }, + { + "epoch": 0.723601177955406, + "grad_norm": 0.28946155309677124, + "learning_rate": 0.0002, + "loss": 1.7891, + "step": 860 + }, + { + "epoch": 0.7320151451409339, + "grad_norm": 0.3095663785934448, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 870 + }, + { + "epoch": 0.7404291123264619, + "grad_norm": 0.3317491412162781, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 880 + }, + { + "epoch": 0.7488430795119899, + "grad_norm": 0.31324660778045654, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 890 + }, + { + "epoch": 0.7572570466975179, + "grad_norm": 0.3290475606918335, + "learning_rate": 0.0002, + "loss": 1.8753, + "step": 900 + }, + { + "epoch": 0.7656710138830458, + "grad_norm": 0.35690343379974365, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 910 + }, + { + "epoch": 0.7740849810685738, + "grad_norm": 0.39558273553848267, + "learning_rate": 0.0002, + "loss": 1.826, + "step": 920 + }, + { + "epoch": 0.7824989482541018, + "grad_norm": 0.34254348278045654, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 930 + }, + { + "epoch": 0.7909129154396298, + "grad_norm": 0.3560165464878082, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 940 + }, + { + "epoch": 0.7993268826251577, + "grad_norm": 0.30693164467811584, + "learning_rate": 0.0002, + "loss": 1.7992, + "step": 950 + }, + { + "epoch": 0.8077408498106857, + "grad_norm": 0.3394823372364044, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 960 + }, + { + "epoch": 0.8161548169962137, + "grad_norm": 0.3741514980792999, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 970 + }, + { + "epoch": 0.8245687841817417, + "grad_norm": 0.3655228316783905, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 980 + }, + { + "epoch": 0.8329827513672696, + "grad_norm": 0.3586033880710602, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 990 + }, + { + "epoch": 0.8413967185527976, + "grad_norm": 0.3459678888320923, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1000 + }, + { + "epoch": 0.8498106857383256, + "grad_norm": 0.3184349834918976, + "learning_rate": 0.0002, + "loss": 1.8498, + "step": 1010 + }, + { + "epoch": 0.8582246529238536, + "grad_norm": 0.3099786043167114, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 1020 + }, + { + "epoch": 0.8666386201093815, + "grad_norm": 0.30300915241241455, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 1030 + }, + { + "epoch": 0.8750525872949095, + "grad_norm": 0.3128705620765686, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 1040 + }, + { + "epoch": 0.8834665544804375, + "grad_norm": 0.3336263597011566, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1050 + }, + { + "epoch": 0.8918805216659655, + "grad_norm": 0.3801328241825104, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 1060 + }, + { + "epoch": 0.9002944888514934, + "grad_norm": 0.3122096359729767, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 1070 + }, + { + "epoch": 0.9087084560370214, + "grad_norm": 0.35990869998931885, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 1080 + }, + { + "epoch": 0.9171224232225494, + "grad_norm": 0.3321819305419922, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1090 + }, + { + "epoch": 0.9255363904080774, + "grad_norm": 0.4202139377593994, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1100 + }, + { + "epoch": 0.9339503575936053, + "grad_norm": 0.32559722661972046, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 1110 + }, + { + "epoch": 0.9423643247791333, + "grad_norm": 0.3098459839820862, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 1120 + }, + { + "epoch": 0.9507782919646613, + "grad_norm": 0.33917108178138733, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1130 + }, + { + "epoch": 0.9591922591501894, + "grad_norm": 0.4055837094783783, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1140 + }, + { + "epoch": 0.9676062263357172, + "grad_norm": 0.32508623600006104, + "learning_rate": 0.0002, + "loss": 1.8259, + "step": 1150 + }, + { + "epoch": 0.9760201935212452, + "grad_norm": 0.30150601267814636, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9844341607067733, + "grad_norm": 0.3042563199996948, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 1170 + }, + { + "epoch": 0.9928481278923013, + "grad_norm": 0.33254584670066833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1180 + }, + { + "epoch": 0.9995793016407236, + "eval_loss": 1.8077726364135742, + "eval_runtime": 38.4359, + "eval_samples_per_second": 13.399, + "eval_steps_per_second": 1.691, + "step": 1188 + }, + { + "epoch": 1.0012620950778293, + "grad_norm": 0.35073035955429077, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 1190 + }, + { + "epoch": 1.0096760622633572, + "grad_norm": 0.3217269778251648, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1200 + }, + { + "epoch": 1.018090029448885, + "grad_norm": 0.3635033369064331, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1210 + }, + { + "epoch": 1.0265039966344132, + "grad_norm": 0.32468414306640625, + "learning_rate": 0.0002, + "loss": 1.6949, + "step": 1220 + }, + { + "epoch": 1.034917963819941, + "grad_norm": 0.3307163417339325, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 1230 + }, + { + "epoch": 1.0433319310054692, + "grad_norm": 0.34381359815597534, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 1240 + }, + { + "epoch": 1.051745898190997, + "grad_norm": 0.35874804854393005, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 1250 + }, + { + "epoch": 1.060159865376525, + "grad_norm": 0.3615919351577759, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1260 + }, + { + "epoch": 1.068573832562053, + "grad_norm": 0.32835808396339417, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1270 + }, + { + "epoch": 1.076987799747581, + "grad_norm": 0.3876388370990753, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1280 + }, + { + "epoch": 1.0854017669331089, + "grad_norm": 0.39895930886268616, + "learning_rate": 0.0002, + "loss": 1.7442, + "step": 1290 + }, + { + "epoch": 1.093815734118637, + "grad_norm": 0.39081698656082153, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 1300 + }, + { + "epoch": 1.1022297013041649, + "grad_norm": 0.39974215626716614, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1310 + }, + { + "epoch": 1.110643668489693, + "grad_norm": 0.3887332081794739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1320 + }, + { + "epoch": 1.1190576356752209, + "grad_norm": 0.36216408014297485, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1330 + }, + { + "epoch": 1.1274716028607488, + "grad_norm": 0.36979028582572937, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1340 + }, + { + "epoch": 1.1358855700462769, + "grad_norm": 0.34052133560180664, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 1350 + }, + { + "epoch": 1.1442995372318048, + "grad_norm": 0.3467716574668884, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1360 + }, + { + "epoch": 1.1527135044173327, + "grad_norm": 0.35528799891471863, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1370 + }, + { + "epoch": 1.1611274716028608, + "grad_norm": 0.36282262206077576, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 1380 + }, + { + "epoch": 1.1695414387883887, + "grad_norm": 0.37355899810791016, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1390 + }, + { + "epoch": 1.1779554059739168, + "grad_norm": 0.37292736768722534, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1400 + }, + { + "epoch": 1.1863693731594447, + "grad_norm": 0.5892812013626099, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1410 + }, + { + "epoch": 1.1947833403449726, + "grad_norm": 0.3712292015552521, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 1420 + }, + { + "epoch": 1.2031973075305007, + "grad_norm": 0.3349577486515045, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1430 + }, + { + "epoch": 1.2116112747160286, + "grad_norm": 0.32591062784194946, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1440 + }, + { + "epoch": 1.2200252419015567, + "grad_norm": 0.3840635418891907, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 1450 + }, + { + "epoch": 1.2284392090870846, + "grad_norm": 0.37238365411758423, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1460 + }, + { + "epoch": 1.2368531762726125, + "grad_norm": 0.3731217682361603, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 1470 + }, + { + "epoch": 1.2452671434581406, + "grad_norm": 0.3318967819213867, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 1480 + }, + { + "epoch": 1.2536811106436685, + "grad_norm": 0.3784034848213196, + "learning_rate": 0.0002, + "loss": 1.7463, + "step": 1490 + }, + { + "epoch": 1.2620950778291964, + "grad_norm": 0.3541383147239685, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 1500 + }, + { + "epoch": 1.2705090450147245, + "grad_norm": 0.35312485694885254, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1510 + }, + { + "epoch": 1.2789230122002524, + "grad_norm": 0.35272929072380066, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1520 + }, + { + "epoch": 1.2873369793857803, + "grad_norm": 0.40988272428512573, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1530 + }, + { + "epoch": 1.2957509465713084, + "grad_norm": 0.3543946146965027, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1540 + }, + { + "epoch": 1.3041649137568363, + "grad_norm": 0.35639145970344543, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1550 + }, + { + "epoch": 1.3125788809423642, + "grad_norm": 0.3290826678276062, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1560 + }, + { + "epoch": 1.3209928481278923, + "grad_norm": 0.39264336228370667, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 1570 + }, + { + "epoch": 1.3294068153134202, + "grad_norm": 0.5390415191650391, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1580 + }, + { + "epoch": 1.3378207824989483, + "grad_norm": 0.5188116431236267, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1590 + }, + { + "epoch": 1.3462347496844762, + "grad_norm": 0.37445148825645447, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 1600 + }, + { + "epoch": 1.3546487168700043, + "grad_norm": 0.3296085298061371, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1610 + }, + { + "epoch": 1.3630626840555322, + "grad_norm": 0.39879581332206726, + "learning_rate": 0.0002, + "loss": 1.8107, + "step": 1620 + }, + { + "epoch": 1.37147665124106, + "grad_norm": 0.36092764139175415, + "learning_rate": 0.0002, + "loss": 1.6744, + "step": 1630 + }, + { + "epoch": 1.3798906184265882, + "grad_norm": 0.37011823058128357, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 1640 + }, + { + "epoch": 1.3883045856121161, + "grad_norm": 0.40863534808158875, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1650 + }, + { + "epoch": 1.396718552797644, + "grad_norm": 0.337001770734787, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 1660 + }, + { + "epoch": 1.4051325199831721, + "grad_norm": 0.35596707463264465, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1670 + }, + { + "epoch": 1.4135464871687, + "grad_norm": 0.3857671916484833, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 1680 + }, + { + "epoch": 1.421960454354228, + "grad_norm": 0.419502317905426, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 1690 + }, + { + "epoch": 1.430374421539756, + "grad_norm": 0.35459452867507935, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 1700 + }, + { + "epoch": 1.438788388725284, + "grad_norm": 0.37246978282928467, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 1710 + }, + { + "epoch": 1.4472023559108118, + "grad_norm": 0.33091893792152405, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 1720 + }, + { + "epoch": 1.45561632309634, + "grad_norm": 0.37029674649238586, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1730 + }, + { + "epoch": 1.4640302902818678, + "grad_norm": 0.374025821685791, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1740 + }, + { + "epoch": 1.472444257467396, + "grad_norm": 0.3416315019130707, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1750 + }, + { + "epoch": 1.4808582246529238, + "grad_norm": 0.36502841114997864, + "learning_rate": 0.0002, + "loss": 1.7093, + "step": 1760 + }, + { + "epoch": 1.489272191838452, + "grad_norm": 0.35458803176879883, + "learning_rate": 0.0002, + "loss": 1.6597, + "step": 1770 + }, + { + "epoch": 1.4976861590239798, + "grad_norm": 0.4462839663028717, + "learning_rate": 0.0002, + "loss": 1.675, + "step": 1780 + }, + { + "epoch": 1.5061001262095077, + "grad_norm": 0.34836092591285706, + "learning_rate": 0.0002, + "loss": 1.7267, + "step": 1790 + }, + { + "epoch": 1.5145140933950358, + "grad_norm": 0.3445749282836914, + "learning_rate": 0.0002, + "loss": 1.7295, + "step": 1800 + }, + { + "epoch": 1.5229280605805637, + "grad_norm": 0.36012160778045654, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1810 + }, + { + "epoch": 1.5313420277660916, + "grad_norm": 0.4052616059780121, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 1820 + }, + { + "epoch": 1.5397559949516197, + "grad_norm": 0.3966905474662781, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1830 + }, + { + "epoch": 1.5481699621371476, + "grad_norm": 0.35028719902038574, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1840 + }, + { + "epoch": 1.5565839293226755, + "grad_norm": 0.3936742842197418, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1850 + }, + { + "epoch": 1.5649978965082036, + "grad_norm": 0.34473296999931335, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 1860 + }, + { + "epoch": 1.5734118636937318, + "grad_norm": 0.4328365623950958, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1870 + }, + { + "epoch": 1.5818258308792594, + "grad_norm": 0.3566315472126007, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1880 + }, + { + "epoch": 1.5902397980647875, + "grad_norm": 0.3301256597042084, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 1890 + }, + { + "epoch": 1.5986537652503157, + "grad_norm": 0.3743041455745697, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 1900 + }, + { + "epoch": 1.6070677324358436, + "grad_norm": 0.3735344707965851, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 1910 + }, + { + "epoch": 1.6154816996213714, + "grad_norm": 0.42191144824028015, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1920 + }, + { + "epoch": 1.6238956668068996, + "grad_norm": 0.3787207305431366, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1930 + }, + { + "epoch": 1.6323096339924275, + "grad_norm": 0.35647350549697876, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1940 + }, + { + "epoch": 1.6407236011779553, + "grad_norm": 0.39791446924209595, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 1950 + }, + { + "epoch": 1.6491375683634835, + "grad_norm": 0.37341275811195374, + "learning_rate": 0.0002, + "loss": 1.7293, + "step": 1960 + }, + { + "epoch": 1.6575515355490114, + "grad_norm": 0.3722686469554901, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1970 + }, + { + "epoch": 1.6659655027345392, + "grad_norm": 0.37467387318611145, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 1980 + }, + { + "epoch": 1.6743794699200674, + "grad_norm": 0.37109461426734924, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 1990 + }, + { + "epoch": 1.6827934371055953, + "grad_norm": 0.4008837044239044, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 2000 + }, + { + "epoch": 1.6912074042911232, + "grad_norm": 0.3316999673843384, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2010 + }, + { + "epoch": 1.6996213714766513, + "grad_norm": 0.3683805465698242, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 2020 + }, + { + "epoch": 1.7080353386621794, + "grad_norm": 0.4163658320903778, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2030 + }, + { + "epoch": 1.716449305847707, + "grad_norm": 0.4245431125164032, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 2040 + }, + { + "epoch": 1.7248632730332352, + "grad_norm": 0.36732038855552673, + "learning_rate": 0.0002, + "loss": 1.7184, + "step": 2050 + }, + { + "epoch": 1.7332772402187633, + "grad_norm": 0.34981656074523926, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 2060 + }, + { + "epoch": 1.7416912074042912, + "grad_norm": 0.38588812947273254, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 2070 + }, + { + "epoch": 1.750105174589819, + "grad_norm": 0.39914557337760925, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 2080 + }, + { + "epoch": 1.7585191417753472, + "grad_norm": 0.36068692803382874, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 2090 + }, + { + "epoch": 1.766933108960875, + "grad_norm": 0.3983287215232849, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 2100 + }, + { + "epoch": 1.775347076146403, + "grad_norm": 0.45008400082588196, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2110 + }, + { + "epoch": 1.783761043331931, + "grad_norm": 0.3618052303791046, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2120 + }, + { + "epoch": 1.792175010517459, + "grad_norm": 0.38745400309562683, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 2130 + }, + { + "epoch": 1.8005889777029869, + "grad_norm": 0.3413826525211334, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 2140 + }, + { + "epoch": 1.809002944888515, + "grad_norm": 0.35983747243881226, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 2150 + }, + { + "epoch": 1.8174169120740429, + "grad_norm": 0.40926849842071533, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 2160 + }, + { + "epoch": 1.8258308792595708, + "grad_norm": 0.3543093800544739, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2170 + }, + { + "epoch": 1.8342448464450989, + "grad_norm": 0.42690935730934143, + "learning_rate": 0.0002, + "loss": 1.7812, + "step": 2180 + }, + { + "epoch": 1.842658813630627, + "grad_norm": 0.40282756090164185, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 2190 + }, + { + "epoch": 1.8510727808161547, + "grad_norm": 0.36568400263786316, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 2200 + }, + { + "epoch": 1.8594867480016828, + "grad_norm": 0.43159013986587524, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 2210 + }, + { + "epoch": 1.867900715187211, + "grad_norm": 0.3554118573665619, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 2220 + }, + { + "epoch": 1.8763146823727388, + "grad_norm": 0.43349072337150574, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 2230 + }, + { + "epoch": 1.8847286495582667, + "grad_norm": 0.36486536264419556, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 2240 + }, + { + "epoch": 1.8931426167437948, + "grad_norm": 0.39260047674179077, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 2250 + }, + { + "epoch": 1.9015565839293227, + "grad_norm": 0.3741776943206787, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 2260 + }, + { + "epoch": 1.9099705511148506, + "grad_norm": 0.3961946964263916, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2270 + }, + { + "epoch": 1.9183845183003787, + "grad_norm": 0.3659731149673462, + "learning_rate": 0.0002, + "loss": 1.737, + "step": 2280 + }, + { + "epoch": 1.9267984854859066, + "grad_norm": 0.34744107723236084, + "learning_rate": 0.0002, + "loss": 1.7342, + "step": 2290 + }, + { + "epoch": 1.9352124526714345, + "grad_norm": 0.3607442378997803, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2300 + }, + { + "epoch": 1.9436264198569626, + "grad_norm": 0.331464558839798, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 2310 + }, + { + "epoch": 1.9520403870424905, + "grad_norm": 0.3904414474964142, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2320 + }, + { + "epoch": 1.9604543542280184, + "grad_norm": 0.37584832310676575, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 2330 + }, + { + "epoch": 1.9688683214135465, + "grad_norm": 0.3698684275150299, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 2340 + }, + { + "epoch": 1.9772822885990746, + "grad_norm": 0.40571412444114685, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2350 + }, + { + "epoch": 1.9856962557846023, + "grad_norm": 0.40059587359428406, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 2360 + }, + { + "epoch": 1.9941102229701304, + "grad_norm": 0.4168248474597931, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2370 + }, + { + "epoch": 2.0, + "eval_loss": 1.8055059909820557, + "eval_runtime": 38.422, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.692, + "step": 2377 + }, + { + "epoch": 2.0025241901556585, + "grad_norm": 0.35205352306365967, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 2380 + }, + { + "epoch": 2.010938157341186, + "grad_norm": 0.3979377746582031, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2390 + }, + { + "epoch": 2.0193521245267143, + "grad_norm": 0.396491676568985, + "learning_rate": 0.0002, + "loss": 1.6421, + "step": 2400 + }, + { + "epoch": 2.0277660917122424, + "grad_norm": 0.44712209701538086, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 2410 + }, + { + "epoch": 2.03618005889777, + "grad_norm": 0.4454420208930969, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 2420 + }, + { + "epoch": 2.044594026083298, + "grad_norm": 0.4170038402080536, + "learning_rate": 0.0002, + "loss": 1.6635, + "step": 2430 + }, + { + "epoch": 2.0530079932688263, + "grad_norm": 0.4309595227241516, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 2440 + }, + { + "epoch": 2.0614219604543544, + "grad_norm": 0.4241602122783661, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 2450 + }, + { + "epoch": 2.069835927639882, + "grad_norm": 0.4370540678501129, + "learning_rate": 0.0002, + "loss": 1.6162, + "step": 2460 + }, + { + "epoch": 2.0782498948254102, + "grad_norm": 0.43985554575920105, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 2470 + }, + { + "epoch": 2.0866638620109383, + "grad_norm": 0.4158105254173279, + "learning_rate": 0.0002, + "loss": 1.6954, + "step": 2480 + }, + { + "epoch": 2.095077829196466, + "grad_norm": 0.441549152135849, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 2490 + }, + { + "epoch": 2.103491796381994, + "grad_norm": 0.385718435049057, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 2500 + }, + { + "epoch": 2.1119057635675222, + "grad_norm": 0.43146514892578125, + "learning_rate": 0.0002, + "loss": 1.5894, + "step": 2510 + }, + { + "epoch": 2.12031973075305, + "grad_norm": 0.41663315892219543, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 2520 + }, + { + "epoch": 2.128733697938578, + "grad_norm": 0.4410698115825653, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2530 + }, + { + "epoch": 2.137147665124106, + "grad_norm": 0.4472278952598572, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 2540 + }, + { + "epoch": 2.145561632309634, + "grad_norm": 0.3879167437553406, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 2550 + }, + { + "epoch": 2.153975599495162, + "grad_norm": 0.4212203025817871, + "learning_rate": 0.0002, + "loss": 1.6682, + "step": 2560 + }, + { + "epoch": 2.16238956668069, + "grad_norm": 0.42841723561286926, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2570 + }, + { + "epoch": 2.1708035338662177, + "grad_norm": 0.39272481203079224, + "learning_rate": 0.0002, + "loss": 1.5962, + "step": 2580 + }, + { + "epoch": 2.179217501051746, + "grad_norm": 0.4075261354446411, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 2590 + }, + { + "epoch": 2.187631468237274, + "grad_norm": 0.5358437895774841, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 2600 + }, + { + "epoch": 2.1960454354228016, + "grad_norm": 0.4738350212574005, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 2610 + }, + { + "epoch": 2.2044594026083297, + "grad_norm": 0.446789026260376, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2620 + }, + { + "epoch": 2.212873369793858, + "grad_norm": 0.4615374505519867, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 2630 + }, + { + "epoch": 2.221287336979386, + "grad_norm": 0.46901994943618774, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 2640 + }, + { + "epoch": 2.2297013041649136, + "grad_norm": 0.46267789602279663, + "learning_rate": 0.0002, + "loss": 1.6774, + "step": 2650 + }, + { + "epoch": 2.2381152713504417, + "grad_norm": 0.4383080005645752, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 2660 + }, + { + "epoch": 2.24652923853597, + "grad_norm": 0.4070609509944916, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 2670 + }, + { + "epoch": 2.2549432057214975, + "grad_norm": 0.4572339951992035, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 2680 + }, + { + "epoch": 2.2633571729070256, + "grad_norm": 0.393265038728714, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 2690 + }, + { + "epoch": 2.2717711400925538, + "grad_norm": 0.46144717931747437, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 2700 + }, + { + "epoch": 2.2801851072780814, + "grad_norm": 0.45077767968177795, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 2710 + }, + { + "epoch": 2.2885990744636096, + "grad_norm": 0.5697639584541321, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 2720 + }, + { + "epoch": 2.2970130416491377, + "grad_norm": 0.4855510890483856, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 2730 + }, + { + "epoch": 2.3054270088346653, + "grad_norm": 0.4440622627735138, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 2740 + }, + { + "epoch": 2.3138409760201935, + "grad_norm": 0.3904096782207489, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 2750 + }, + { + "epoch": 2.3222549432057216, + "grad_norm": 0.5225510597229004, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2760 + }, + { + "epoch": 2.3306689103912497, + "grad_norm": 0.44866397976875305, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 2770 + }, + { + "epoch": 2.3390828775767774, + "grad_norm": 0.5167056322097778, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 2780 + }, + { + "epoch": 2.3474968447623055, + "grad_norm": 0.45913267135620117, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 2790 + }, + { + "epoch": 2.3559108119478336, + "grad_norm": 0.45787590742111206, + "learning_rate": 0.0002, + "loss": 1.6564, + "step": 2800 + }, + { + "epoch": 2.3643247791333613, + "grad_norm": 0.4633352756500244, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 2810 + }, + { + "epoch": 2.3727387463188894, + "grad_norm": 0.46390071511268616, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 2820 + }, + { + "epoch": 2.3811527135044175, + "grad_norm": 0.4261005222797394, + "learning_rate": 0.0002, + "loss": 1.6039, + "step": 2830 + }, + { + "epoch": 2.389566680689945, + "grad_norm": 0.4283634424209595, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 2840 + }, + { + "epoch": 2.3979806478754733, + "grad_norm": 0.4955291450023651, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 2850 + }, + { + "epoch": 2.4063946150610014, + "grad_norm": 0.4740189015865326, + "learning_rate": 0.0002, + "loss": 1.6173, + "step": 2860 + }, + { + "epoch": 2.414808582246529, + "grad_norm": 0.4222276508808136, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2870 + }, + { + "epoch": 2.423222549432057, + "grad_norm": 0.4982149004936218, + "learning_rate": 0.0002, + "loss": 1.5602, + "step": 2880 + }, + { + "epoch": 2.4316365166175853, + "grad_norm": 0.5217409133911133, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2890 + }, + { + "epoch": 2.4400504838031134, + "grad_norm": 0.4555884897708893, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 2900 + }, + { + "epoch": 2.448464450988641, + "grad_norm": 0.43178579211235046, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2910 + }, + { + "epoch": 2.456878418174169, + "grad_norm": 0.4788478910923004, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2920 + }, + { + "epoch": 2.465292385359697, + "grad_norm": 0.43689873814582825, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2930 + }, + { + "epoch": 2.473706352545225, + "grad_norm": 0.5115197896957397, + "learning_rate": 0.0002, + "loss": 1.6196, + "step": 2940 + }, + { + "epoch": 2.482120319730753, + "grad_norm": 0.5290159583091736, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 2950 + }, + { + "epoch": 2.490534286916281, + "grad_norm": 0.46042463183403015, + "learning_rate": 0.0002, + "loss": 1.6499, + "step": 2960 + }, + { + "epoch": 2.498948254101809, + "grad_norm": 0.4359915852546692, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 2970 + }, + { + "epoch": 2.507362221287337, + "grad_norm": 0.46352964639663696, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 2980 + }, + { + "epoch": 2.515776188472865, + "grad_norm": 0.5324268341064453, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2990 + }, + { + "epoch": 2.5241901556583928, + "grad_norm": 0.5929607152938843, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 3000 + }, + { + "epoch": 2.532604122843921, + "grad_norm": 0.4811333417892456, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 3010 + }, + { + "epoch": 2.541018090029449, + "grad_norm": 0.4662701487541199, + "learning_rate": 0.0002, + "loss": 1.7023, + "step": 3020 + }, + { + "epoch": 2.549432057214977, + "grad_norm": 0.4582270681858063, + "learning_rate": 0.0002, + "loss": 1.5426, + "step": 3030 + }, + { + "epoch": 2.557846024400505, + "grad_norm": 0.4679982662200928, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 3040 + }, + { + "epoch": 2.566259991586033, + "grad_norm": 0.4380294680595398, + "learning_rate": 0.0002, + "loss": 1.5442, + "step": 3050 + }, + { + "epoch": 2.5746739587715606, + "grad_norm": 0.44295763969421387, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 3060 + }, + { + "epoch": 2.5830879259570887, + "grad_norm": 0.5131027698516846, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3070 + }, + { + "epoch": 2.591501893142617, + "grad_norm": 0.47567516565322876, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 3080 + }, + { + "epoch": 2.599915860328145, + "grad_norm": 0.49002596735954285, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 3090 + }, + { + "epoch": 2.6083298275136726, + "grad_norm": 0.44856327772140503, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 3100 + }, + { + "epoch": 2.6167437946992007, + "grad_norm": 0.4480142593383789, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 3110 + }, + { + "epoch": 2.6251577618847284, + "grad_norm": 0.4317494034767151, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 3120 + }, + { + "epoch": 2.6335717290702565, + "grad_norm": 0.42580848932266235, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 3130 + }, + { + "epoch": 2.6419856962557846, + "grad_norm": 0.4516814947128296, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 3140 + }, + { + "epoch": 2.6503996634413127, + "grad_norm": 0.4438435733318329, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 3150 + }, + { + "epoch": 2.6588136306268404, + "grad_norm": 0.4385356307029724, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 3160 + }, + { + "epoch": 2.6672275978123685, + "grad_norm": 0.5064112544059753, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 3170 + }, + { + "epoch": 2.6756415649978966, + "grad_norm": 0.49163177609443665, + "learning_rate": 0.0002, + "loss": 1.7189, + "step": 3180 + }, + { + "epoch": 2.6840555321834243, + "grad_norm": 0.49339258670806885, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 3190 + }, + { + "epoch": 2.6924694993689524, + "grad_norm": 0.440950870513916, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 3200 + }, + { + "epoch": 2.7008834665544805, + "grad_norm": 0.4283970594406128, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 3210 + }, + { + "epoch": 2.7092974337400086, + "grad_norm": 0.43875712156295776, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 3220 + }, + { + "epoch": 2.7177114009255363, + "grad_norm": 0.49332964420318604, + "learning_rate": 0.0002, + "loss": 1.6129, + "step": 3230 + }, + { + "epoch": 2.7261253681110644, + "grad_norm": 0.5225692391395569, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 3240 + }, + { + "epoch": 2.734539335296592, + "grad_norm": 0.4856489300727844, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 3250 + }, + { + "epoch": 2.74295330248212, + "grad_norm": 0.46918296813964844, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 3260 + }, + { + "epoch": 2.7513672696676483, + "grad_norm": 0.4802931249141693, + "learning_rate": 0.0002, + "loss": 1.6819, + "step": 3270 + }, + { + "epoch": 2.7597812368531764, + "grad_norm": 0.4485355615615845, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 3280 + }, + { + "epoch": 2.768195204038704, + "grad_norm": 0.43944594264030457, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 3290 + }, + { + "epoch": 2.7766091712242322, + "grad_norm": 0.46847742795944214, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 3300 + }, + { + "epoch": 2.7850231384097603, + "grad_norm": 0.4816027879714966, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 3310 + }, + { + "epoch": 2.793437105595288, + "grad_norm": 0.453960120677948, + "learning_rate": 0.0002, + "loss": 1.6293, + "step": 3320 + }, + { + "epoch": 2.801851072780816, + "grad_norm": 0.4816017150878906, + "learning_rate": 0.0002, + "loss": 1.6429, + "step": 3330 + }, + { + "epoch": 2.8102650399663442, + "grad_norm": 0.4461034834384918, + "learning_rate": 0.0002, + "loss": 1.6683, + "step": 3340 + }, + { + "epoch": 2.8186790071518724, + "grad_norm": 0.48821821808815, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 3350 + }, + { + "epoch": 2.8270929743374, + "grad_norm": 0.4574853777885437, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3360 + }, + { + "epoch": 2.835506941522928, + "grad_norm": 0.42062026262283325, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 3370 + }, + { + "epoch": 2.843920908708456, + "grad_norm": 0.4499834477901459, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 3380 + }, + { + "epoch": 2.852334875893984, + "grad_norm": 0.4780360758304596, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 3390 + }, + { + "epoch": 2.860748843079512, + "grad_norm": 0.45422887802124023, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 3400 + }, + { + "epoch": 2.86916281026504, + "grad_norm": 0.4590015709400177, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 3410 + }, + { + "epoch": 2.877576777450568, + "grad_norm": 0.45689624547958374, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 3420 + }, + { + "epoch": 2.885990744636096, + "grad_norm": 0.46953922510147095, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3430 + }, + { + "epoch": 2.8944047118216236, + "grad_norm": 0.4791966378688812, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 3440 + }, + { + "epoch": 2.9028186790071517, + "grad_norm": 0.4842296242713928, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 3450 + }, + { + "epoch": 2.91123264619268, + "grad_norm": 0.47219768166542053, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3460 + }, + { + "epoch": 2.919646613378208, + "grad_norm": 0.4622127115726471, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 3470 + }, + { + "epoch": 2.9280605805637356, + "grad_norm": 0.46832820773124695, + "learning_rate": 0.0002, + "loss": 1.6485, + "step": 3480 + }, + { + "epoch": 2.9364745477492638, + "grad_norm": 0.44582483172416687, + "learning_rate": 0.0002, + "loss": 1.6366, + "step": 3490 + }, + { + "epoch": 2.944888514934792, + "grad_norm": 0.4987219274044037, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 3500 + }, + { + "epoch": 2.9533024821203195, + "grad_norm": 0.43750956654548645, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 3510 + }, + { + "epoch": 2.9617164493058477, + "grad_norm": 0.49962925910949707, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 3520 + }, + { + "epoch": 2.9701304164913758, + "grad_norm": 0.5189590454101562, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 3530 + }, + { + "epoch": 2.978544383676904, + "grad_norm": 0.391317754983902, + "learning_rate": 0.0002, + "loss": 1.6688, + "step": 3540 + }, + { + "epoch": 2.9869583508624316, + "grad_norm": 0.44934695959091187, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 3550 + }, + { + "epoch": 2.9953723180479597, + "grad_norm": 0.4740142226219177, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 3560 + }, + { + "epoch": 2.9995793016407237, + "eval_loss": 1.8266887664794922, + "eval_runtime": 37.9445, + "eval_samples_per_second": 13.572, + "eval_steps_per_second": 1.713, + "step": 3565 + } + ], + "logging_steps": 10, + "max_steps": 9504, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6500335652215194e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1502d478cfbb1424f707352d007b740bde5e373 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-3565/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df2b79d3acefeedef5a0229881de39ec68ef9b40046a60d7976a49f7e6b3b936 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bcc66ea61bc15269026188d40da0c63fac1d2464 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34bb60dd6a14b909b3a4b1b2e39bc03628c58c8097adcf5996e5b1e86852650a +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..913c4afa58476db9fb005acc4db228143df926fa --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b0b5b8677319130cb0db8505e3bf2423e75a6e4db8c1ed002e2f856367d3c04 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..86a5ff8ddb03f16119c5416749514cd1bc4a9c5d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae11d62103c7b8451a989acb28df2f459fd59d22bb21b6a1b17d66ae0d502f6 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d54a6cd5fade505ebe9ce4e9ee0babc5788f107 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efe349c3c5c33e28a25a0a294d49d09b87b69bc75373770eaa85bbc11de7321c +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2736d78056fd97b7353eb82cf68e7122741f214a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/trainer_state.json @@ -0,0 +1,3390 @@ +{ + "best_metric": 1.8055059909820557, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 4754, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008413967185527976, + "grad_norm": 0.5458821654319763, + "learning_rate": 0.0002, + "loss": 2.56, + "step": 10 + }, + { + "epoch": 0.016827934371055953, + "grad_norm": 0.7293308973312378, + "learning_rate": 0.0002, + "loss": 2.3235, + "step": 20 + }, + { + "epoch": 0.02524190155658393, + "grad_norm": 0.47792306542396545, + "learning_rate": 0.0002, + "loss": 2.0815, + "step": 30 + }, + { + "epoch": 0.033655868742111905, + "grad_norm": 0.5944402813911438, + "learning_rate": 0.0002, + "loss": 1.9718, + "step": 40 + }, + { + "epoch": 0.04206983592763988, + "grad_norm": 0.5415359735488892, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 50 + }, + { + "epoch": 0.05048380311316786, + "grad_norm": 0.535713791847229, + "learning_rate": 0.0002, + "loss": 1.8953, + "step": 60 + }, + { + "epoch": 0.058897770298695834, + "grad_norm": 0.5184146761894226, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 70 + }, + { + "epoch": 0.06731173748422381, + "grad_norm": 0.458926796913147, + "learning_rate": 0.0002, + "loss": 1.8396, + "step": 80 + }, + { + "epoch": 0.07572570466975179, + "grad_norm": 0.4780142307281494, + "learning_rate": 0.0002, + "loss": 1.8677, + "step": 90 + }, + { + "epoch": 0.08413967185527976, + "grad_norm": 0.79965740442276, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 100 + }, + { + "epoch": 0.09255363904080774, + "grad_norm": 0.4498862028121948, + "learning_rate": 0.0002, + "loss": 1.9081, + "step": 110 + }, + { + "epoch": 0.10096760622633572, + "grad_norm": 0.39338430762290955, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 120 + }, + { + "epoch": 0.10938157341186369, + "grad_norm": 0.9588953852653503, + "learning_rate": 0.0002, + "loss": 1.8637, + "step": 130 + }, + { + "epoch": 0.11779554059739167, + "grad_norm": 0.41675639152526855, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 140 + }, + { + "epoch": 0.12620950778291964, + "grad_norm": 0.44519832730293274, + "learning_rate": 0.0002, + "loss": 1.8904, + "step": 150 + }, + { + "epoch": 0.13462347496844762, + "grad_norm": 0.4176260530948639, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 160 + }, + { + "epoch": 0.1430374421539756, + "grad_norm": 0.35840365290641785, + "learning_rate": 0.0002, + "loss": 1.8398, + "step": 170 + }, + { + "epoch": 0.15145140933950357, + "grad_norm": 0.3794495463371277, + "learning_rate": 0.0002, + "loss": 1.8666, + "step": 180 + }, + { + "epoch": 0.15986537652503155, + "grad_norm": 0.4563522934913635, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 190 + }, + { + "epoch": 0.16827934371055953, + "grad_norm": 0.37057486176490784, + "learning_rate": 0.0002, + "loss": 1.8893, + "step": 200 + }, + { + "epoch": 0.1766933108960875, + "grad_norm": 0.44081518054008484, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 210 + }, + { + "epoch": 0.18510727808161548, + "grad_norm": 0.46078577637672424, + "learning_rate": 0.0002, + "loss": 1.9048, + "step": 220 + }, + { + "epoch": 0.19352124526714345, + "grad_norm": 0.36132094264030457, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 230 + }, + { + "epoch": 0.20193521245267143, + "grad_norm": 0.3747289180755615, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 240 + }, + { + "epoch": 0.2103491796381994, + "grad_norm": 0.3540179133415222, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 250 + }, + { + "epoch": 0.21876314682372738, + "grad_norm": 0.3461375832557678, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 260 + }, + { + "epoch": 0.22717711400925536, + "grad_norm": 0.3436960279941559, + "learning_rate": 0.0002, + "loss": 1.8509, + "step": 270 + }, + { + "epoch": 0.23559108119478334, + "grad_norm": 0.35403719544410706, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 280 + }, + { + "epoch": 0.2440050483803113, + "grad_norm": 0.37142616510391235, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 290 + }, + { + "epoch": 0.2524190155658393, + "grad_norm": 0.3307955861091614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 300 + }, + { + "epoch": 0.2608329827513673, + "grad_norm": 0.32855314016342163, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 310 + }, + { + "epoch": 0.26924694993689524, + "grad_norm": 0.3299003839492798, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 320 + }, + { + "epoch": 0.27766091712242325, + "grad_norm": 0.44311287999153137, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 330 + }, + { + "epoch": 0.2860748843079512, + "grad_norm": 0.32989758253097534, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 340 + }, + { + "epoch": 0.2944888514934792, + "grad_norm": 0.34400200843811035, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 350 + }, + { + "epoch": 0.30290281867900715, + "grad_norm": 0.36286211013793945, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 360 + }, + { + "epoch": 0.31131678586453515, + "grad_norm": 0.406827837228775, + "learning_rate": 0.0002, + "loss": 1.8025, + "step": 370 + }, + { + "epoch": 0.3197307530500631, + "grad_norm": 0.36299195885658264, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 380 + }, + { + "epoch": 0.3281447202355911, + "grad_norm": 0.3477257192134857, + "learning_rate": 0.0002, + "loss": 1.837, + "step": 390 + }, + { + "epoch": 0.33655868742111905, + "grad_norm": 0.3730369210243225, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 400 + }, + { + "epoch": 0.34497265460664706, + "grad_norm": 0.4644559919834137, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 410 + }, + { + "epoch": 0.353386621792175, + "grad_norm": 0.406576544046402, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 420 + }, + { + "epoch": 0.361800588977703, + "grad_norm": 0.3612699508666992, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 430 + }, + { + "epoch": 0.37021455616323096, + "grad_norm": 0.3243742287158966, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 440 + }, + { + "epoch": 0.37862852334875896, + "grad_norm": 0.36671221256256104, + "learning_rate": 0.0002, + "loss": 1.8851, + "step": 450 + }, + { + "epoch": 0.3870424905342869, + "grad_norm": 0.3565002381801605, + "learning_rate": 0.0002, + "loss": 1.8853, + "step": 460 + }, + { + "epoch": 0.3954564577198149, + "grad_norm": 0.34630221128463745, + "learning_rate": 0.0002, + "loss": 1.8923, + "step": 470 + }, + { + "epoch": 0.40387042490534286, + "grad_norm": 0.3353537321090698, + "learning_rate": 0.0002, + "loss": 1.8234, + "step": 480 + }, + { + "epoch": 0.41228439209087087, + "grad_norm": 0.4015921950340271, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 490 + }, + { + "epoch": 0.4206983592763988, + "grad_norm": 0.5489419102668762, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 500 + }, + { + "epoch": 0.4291123264619268, + "grad_norm": 0.4193589985370636, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 510 + }, + { + "epoch": 0.43752629364745477, + "grad_norm": 0.3418922424316406, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 520 + }, + { + "epoch": 0.44594026083298277, + "grad_norm": 0.32668185234069824, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 530 + }, + { + "epoch": 0.4543542280185107, + "grad_norm": 0.3094325661659241, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 540 + }, + { + "epoch": 0.4627681952040387, + "grad_norm": 0.3743017315864563, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 550 + }, + { + "epoch": 0.47118216238956667, + "grad_norm": 0.3295630216598511, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 560 + }, + { + "epoch": 0.4795961295750947, + "grad_norm": 1.6124513149261475, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 570 + }, + { + "epoch": 0.4880100967606226, + "grad_norm": 0.3245585858821869, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 580 + }, + { + "epoch": 0.49642406394615063, + "grad_norm": 0.3332934081554413, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 590 + }, + { + "epoch": 0.5048380311316786, + "grad_norm": 0.3836138844490051, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 600 + }, + { + "epoch": 0.5132519983172066, + "grad_norm": 0.32953888177871704, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 610 + }, + { + "epoch": 0.5216659655027346, + "grad_norm": 0.36291512846946716, + "learning_rate": 0.0002, + "loss": 1.7729, + "step": 620 + }, + { + "epoch": 0.5300799326882625, + "grad_norm": 0.3237783908843994, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 630 + }, + { + "epoch": 0.5384938998737905, + "grad_norm": 0.38882696628570557, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 640 + }, + { + "epoch": 0.5469078670593185, + "grad_norm": 0.37821972370147705, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 650 + }, + { + "epoch": 0.5553218342448465, + "grad_norm": 0.3556285500526428, + "learning_rate": 0.0002, + "loss": 1.8075, + "step": 660 + }, + { + "epoch": 0.5637358014303744, + "grad_norm": 0.347499281167984, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 670 + }, + { + "epoch": 0.5721497686159024, + "grad_norm": 0.3176489472389221, + "learning_rate": 0.0002, + "loss": 1.8066, + "step": 680 + }, + { + "epoch": 0.5805637358014304, + "grad_norm": 0.30220088362693787, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 690 + }, + { + "epoch": 0.5889777029869584, + "grad_norm": 0.3711601793766022, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 700 + }, + { + "epoch": 0.5973916701724863, + "grad_norm": 0.3311759829521179, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 710 + }, + { + "epoch": 0.6058056373580143, + "grad_norm": 0.34824270009994507, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 720 + }, + { + "epoch": 0.6142196045435423, + "grad_norm": 0.29668381810188293, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 730 + }, + { + "epoch": 0.6226335717290703, + "grad_norm": 0.36087489128112793, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 740 + }, + { + "epoch": 0.6310475389145982, + "grad_norm": 0.31590089201927185, + "learning_rate": 0.0002, + "loss": 1.7956, + "step": 750 + }, + { + "epoch": 0.6394615061001262, + "grad_norm": 0.37632957100868225, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 760 + }, + { + "epoch": 0.6478754732856542, + "grad_norm": 0.3360748589038849, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 770 + }, + { + "epoch": 0.6562894404711822, + "grad_norm": 0.3420640528202057, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 780 + }, + { + "epoch": 0.6647034076567101, + "grad_norm": 0.5734959244728088, + "learning_rate": 0.0002, + "loss": 1.8353, + "step": 790 + }, + { + "epoch": 0.6731173748422381, + "grad_norm": 0.36440837383270264, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 800 + }, + { + "epoch": 0.6815313420277661, + "grad_norm": 0.3179708421230316, + "learning_rate": 0.0002, + "loss": 1.7532, + "step": 810 + }, + { + "epoch": 0.6899453092132941, + "grad_norm": 0.34122881293296814, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 820 + }, + { + "epoch": 0.698359276398822, + "grad_norm": 0.31886112689971924, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 830 + }, + { + "epoch": 0.70677324358435, + "grad_norm": 0.31782326102256775, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 840 + }, + { + "epoch": 0.715187210769878, + "grad_norm": 0.36052989959716797, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 850 + }, + { + "epoch": 0.723601177955406, + "grad_norm": 0.28946155309677124, + "learning_rate": 0.0002, + "loss": 1.7891, + "step": 860 + }, + { + "epoch": 0.7320151451409339, + "grad_norm": 0.3095663785934448, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 870 + }, + { + "epoch": 0.7404291123264619, + "grad_norm": 0.3317491412162781, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 880 + }, + { + "epoch": 0.7488430795119899, + "grad_norm": 0.31324660778045654, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 890 + }, + { + "epoch": 0.7572570466975179, + "grad_norm": 0.3290475606918335, + "learning_rate": 0.0002, + "loss": 1.8753, + "step": 900 + }, + { + "epoch": 0.7656710138830458, + "grad_norm": 0.35690343379974365, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 910 + }, + { + "epoch": 0.7740849810685738, + "grad_norm": 0.39558273553848267, + "learning_rate": 0.0002, + "loss": 1.826, + "step": 920 + }, + { + "epoch": 0.7824989482541018, + "grad_norm": 0.34254348278045654, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 930 + }, + { + "epoch": 0.7909129154396298, + "grad_norm": 0.3560165464878082, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 940 + }, + { + "epoch": 0.7993268826251577, + "grad_norm": 0.30693164467811584, + "learning_rate": 0.0002, + "loss": 1.7992, + "step": 950 + }, + { + "epoch": 0.8077408498106857, + "grad_norm": 0.3394823372364044, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 960 + }, + { + "epoch": 0.8161548169962137, + "grad_norm": 0.3741514980792999, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 970 + }, + { + "epoch": 0.8245687841817417, + "grad_norm": 0.3655228316783905, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 980 + }, + { + "epoch": 0.8329827513672696, + "grad_norm": 0.3586033880710602, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 990 + }, + { + "epoch": 0.8413967185527976, + "grad_norm": 0.3459678888320923, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1000 + }, + { + "epoch": 0.8498106857383256, + "grad_norm": 0.3184349834918976, + "learning_rate": 0.0002, + "loss": 1.8498, + "step": 1010 + }, + { + "epoch": 0.8582246529238536, + "grad_norm": 0.3099786043167114, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 1020 + }, + { + "epoch": 0.8666386201093815, + "grad_norm": 0.30300915241241455, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 1030 + }, + { + "epoch": 0.8750525872949095, + "grad_norm": 0.3128705620765686, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 1040 + }, + { + "epoch": 0.8834665544804375, + "grad_norm": 0.3336263597011566, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1050 + }, + { + "epoch": 0.8918805216659655, + "grad_norm": 0.3801328241825104, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 1060 + }, + { + "epoch": 0.9002944888514934, + "grad_norm": 0.3122096359729767, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 1070 + }, + { + "epoch": 0.9087084560370214, + "grad_norm": 0.35990869998931885, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 1080 + }, + { + "epoch": 0.9171224232225494, + "grad_norm": 0.3321819305419922, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1090 + }, + { + "epoch": 0.9255363904080774, + "grad_norm": 0.4202139377593994, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1100 + }, + { + "epoch": 0.9339503575936053, + "grad_norm": 0.32559722661972046, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 1110 + }, + { + "epoch": 0.9423643247791333, + "grad_norm": 0.3098459839820862, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 1120 + }, + { + "epoch": 0.9507782919646613, + "grad_norm": 0.33917108178138733, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1130 + }, + { + "epoch": 0.9591922591501894, + "grad_norm": 0.4055837094783783, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1140 + }, + { + "epoch": 0.9676062263357172, + "grad_norm": 0.32508623600006104, + "learning_rate": 0.0002, + "loss": 1.8259, + "step": 1150 + }, + { + "epoch": 0.9760201935212452, + "grad_norm": 0.30150601267814636, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9844341607067733, + "grad_norm": 0.3042563199996948, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 1170 + }, + { + "epoch": 0.9928481278923013, + "grad_norm": 0.33254584670066833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1180 + }, + { + "epoch": 0.9995793016407236, + "eval_loss": 1.8077726364135742, + "eval_runtime": 38.4359, + "eval_samples_per_second": 13.399, + "eval_steps_per_second": 1.691, + "step": 1188 + }, + { + "epoch": 1.0012620950778293, + "grad_norm": 0.35073035955429077, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 1190 + }, + { + "epoch": 1.0096760622633572, + "grad_norm": 0.3217269778251648, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1200 + }, + { + "epoch": 1.018090029448885, + "grad_norm": 0.3635033369064331, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1210 + }, + { + "epoch": 1.0265039966344132, + "grad_norm": 0.32468414306640625, + "learning_rate": 0.0002, + "loss": 1.6949, + "step": 1220 + }, + { + "epoch": 1.034917963819941, + "grad_norm": 0.3307163417339325, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 1230 + }, + { + "epoch": 1.0433319310054692, + "grad_norm": 0.34381359815597534, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 1240 + }, + { + "epoch": 1.051745898190997, + "grad_norm": 0.35874804854393005, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 1250 + }, + { + "epoch": 1.060159865376525, + "grad_norm": 0.3615919351577759, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1260 + }, + { + "epoch": 1.068573832562053, + "grad_norm": 0.32835808396339417, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1270 + }, + { + "epoch": 1.076987799747581, + "grad_norm": 0.3876388370990753, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1280 + }, + { + "epoch": 1.0854017669331089, + "grad_norm": 0.39895930886268616, + "learning_rate": 0.0002, + "loss": 1.7442, + "step": 1290 + }, + { + "epoch": 1.093815734118637, + "grad_norm": 0.39081698656082153, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 1300 + }, + { + "epoch": 1.1022297013041649, + "grad_norm": 0.39974215626716614, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1310 + }, + { + "epoch": 1.110643668489693, + "grad_norm": 0.3887332081794739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1320 + }, + { + "epoch": 1.1190576356752209, + "grad_norm": 0.36216408014297485, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1330 + }, + { + "epoch": 1.1274716028607488, + "grad_norm": 0.36979028582572937, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1340 + }, + { + "epoch": 1.1358855700462769, + "grad_norm": 0.34052133560180664, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 1350 + }, + { + "epoch": 1.1442995372318048, + "grad_norm": 0.3467716574668884, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1360 + }, + { + "epoch": 1.1527135044173327, + "grad_norm": 0.35528799891471863, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1370 + }, + { + "epoch": 1.1611274716028608, + "grad_norm": 0.36282262206077576, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 1380 + }, + { + "epoch": 1.1695414387883887, + "grad_norm": 0.37355899810791016, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1390 + }, + { + "epoch": 1.1779554059739168, + "grad_norm": 0.37292736768722534, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1400 + }, + { + "epoch": 1.1863693731594447, + "grad_norm": 0.5892812013626099, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1410 + }, + { + "epoch": 1.1947833403449726, + "grad_norm": 0.3712292015552521, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 1420 + }, + { + "epoch": 1.2031973075305007, + "grad_norm": 0.3349577486515045, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1430 + }, + { + "epoch": 1.2116112747160286, + "grad_norm": 0.32591062784194946, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1440 + }, + { + "epoch": 1.2200252419015567, + "grad_norm": 0.3840635418891907, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 1450 + }, + { + "epoch": 1.2284392090870846, + "grad_norm": 0.37238365411758423, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1460 + }, + { + "epoch": 1.2368531762726125, + "grad_norm": 0.3731217682361603, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 1470 + }, + { + "epoch": 1.2452671434581406, + "grad_norm": 0.3318967819213867, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 1480 + }, + { + "epoch": 1.2536811106436685, + "grad_norm": 0.3784034848213196, + "learning_rate": 0.0002, + "loss": 1.7463, + "step": 1490 + }, + { + "epoch": 1.2620950778291964, + "grad_norm": 0.3541383147239685, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 1500 + }, + { + "epoch": 1.2705090450147245, + "grad_norm": 0.35312485694885254, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1510 + }, + { + "epoch": 1.2789230122002524, + "grad_norm": 0.35272929072380066, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1520 + }, + { + "epoch": 1.2873369793857803, + "grad_norm": 0.40988272428512573, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1530 + }, + { + "epoch": 1.2957509465713084, + "grad_norm": 0.3543946146965027, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1540 + }, + { + "epoch": 1.3041649137568363, + "grad_norm": 0.35639145970344543, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1550 + }, + { + "epoch": 1.3125788809423642, + "grad_norm": 0.3290826678276062, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1560 + }, + { + "epoch": 1.3209928481278923, + "grad_norm": 0.39264336228370667, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 1570 + }, + { + "epoch": 1.3294068153134202, + "grad_norm": 0.5390415191650391, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1580 + }, + { + "epoch": 1.3378207824989483, + "grad_norm": 0.5188116431236267, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1590 + }, + { + "epoch": 1.3462347496844762, + "grad_norm": 0.37445148825645447, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 1600 + }, + { + "epoch": 1.3546487168700043, + "grad_norm": 0.3296085298061371, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1610 + }, + { + "epoch": 1.3630626840555322, + "grad_norm": 0.39879581332206726, + "learning_rate": 0.0002, + "loss": 1.8107, + "step": 1620 + }, + { + "epoch": 1.37147665124106, + "grad_norm": 0.36092764139175415, + "learning_rate": 0.0002, + "loss": 1.6744, + "step": 1630 + }, + { + "epoch": 1.3798906184265882, + "grad_norm": 0.37011823058128357, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 1640 + }, + { + "epoch": 1.3883045856121161, + "grad_norm": 0.40863534808158875, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1650 + }, + { + "epoch": 1.396718552797644, + "grad_norm": 0.337001770734787, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 1660 + }, + { + "epoch": 1.4051325199831721, + "grad_norm": 0.35596707463264465, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1670 + }, + { + "epoch": 1.4135464871687, + "grad_norm": 0.3857671916484833, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 1680 + }, + { + "epoch": 1.421960454354228, + "grad_norm": 0.419502317905426, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 1690 + }, + { + "epoch": 1.430374421539756, + "grad_norm": 0.35459452867507935, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 1700 + }, + { + "epoch": 1.438788388725284, + "grad_norm": 0.37246978282928467, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 1710 + }, + { + "epoch": 1.4472023559108118, + "grad_norm": 0.33091893792152405, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 1720 + }, + { + "epoch": 1.45561632309634, + "grad_norm": 0.37029674649238586, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1730 + }, + { + "epoch": 1.4640302902818678, + "grad_norm": 0.374025821685791, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1740 + }, + { + "epoch": 1.472444257467396, + "grad_norm": 0.3416315019130707, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1750 + }, + { + "epoch": 1.4808582246529238, + "grad_norm": 0.36502841114997864, + "learning_rate": 0.0002, + "loss": 1.7093, + "step": 1760 + }, + { + "epoch": 1.489272191838452, + "grad_norm": 0.35458803176879883, + "learning_rate": 0.0002, + "loss": 1.6597, + "step": 1770 + }, + { + "epoch": 1.4976861590239798, + "grad_norm": 0.4462839663028717, + "learning_rate": 0.0002, + "loss": 1.675, + "step": 1780 + }, + { + "epoch": 1.5061001262095077, + "grad_norm": 0.34836092591285706, + "learning_rate": 0.0002, + "loss": 1.7267, + "step": 1790 + }, + { + "epoch": 1.5145140933950358, + "grad_norm": 0.3445749282836914, + "learning_rate": 0.0002, + "loss": 1.7295, + "step": 1800 + }, + { + "epoch": 1.5229280605805637, + "grad_norm": 0.36012160778045654, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1810 + }, + { + "epoch": 1.5313420277660916, + "grad_norm": 0.4052616059780121, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 1820 + }, + { + "epoch": 1.5397559949516197, + "grad_norm": 0.3966905474662781, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1830 + }, + { + "epoch": 1.5481699621371476, + "grad_norm": 0.35028719902038574, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1840 + }, + { + "epoch": 1.5565839293226755, + "grad_norm": 0.3936742842197418, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1850 + }, + { + "epoch": 1.5649978965082036, + "grad_norm": 0.34473296999931335, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 1860 + }, + { + "epoch": 1.5734118636937318, + "grad_norm": 0.4328365623950958, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1870 + }, + { + "epoch": 1.5818258308792594, + "grad_norm": 0.3566315472126007, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1880 + }, + { + "epoch": 1.5902397980647875, + "grad_norm": 0.3301256597042084, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 1890 + }, + { + "epoch": 1.5986537652503157, + "grad_norm": 0.3743041455745697, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 1900 + }, + { + "epoch": 1.6070677324358436, + "grad_norm": 0.3735344707965851, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 1910 + }, + { + "epoch": 1.6154816996213714, + "grad_norm": 0.42191144824028015, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1920 + }, + { + "epoch": 1.6238956668068996, + "grad_norm": 0.3787207305431366, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1930 + }, + { + "epoch": 1.6323096339924275, + "grad_norm": 0.35647350549697876, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1940 + }, + { + "epoch": 1.6407236011779553, + "grad_norm": 0.39791446924209595, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 1950 + }, + { + "epoch": 1.6491375683634835, + "grad_norm": 0.37341275811195374, + "learning_rate": 0.0002, + "loss": 1.7293, + "step": 1960 + }, + { + "epoch": 1.6575515355490114, + "grad_norm": 0.3722686469554901, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1970 + }, + { + "epoch": 1.6659655027345392, + "grad_norm": 0.37467387318611145, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 1980 + }, + { + "epoch": 1.6743794699200674, + "grad_norm": 0.37109461426734924, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 1990 + }, + { + "epoch": 1.6827934371055953, + "grad_norm": 0.4008837044239044, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 2000 + }, + { + "epoch": 1.6912074042911232, + "grad_norm": 0.3316999673843384, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2010 + }, + { + "epoch": 1.6996213714766513, + "grad_norm": 0.3683805465698242, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 2020 + }, + { + "epoch": 1.7080353386621794, + "grad_norm": 0.4163658320903778, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2030 + }, + { + "epoch": 1.716449305847707, + "grad_norm": 0.4245431125164032, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 2040 + }, + { + "epoch": 1.7248632730332352, + "grad_norm": 0.36732038855552673, + "learning_rate": 0.0002, + "loss": 1.7184, + "step": 2050 + }, + { + "epoch": 1.7332772402187633, + "grad_norm": 0.34981656074523926, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 2060 + }, + { + "epoch": 1.7416912074042912, + "grad_norm": 0.38588812947273254, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 2070 + }, + { + "epoch": 1.750105174589819, + "grad_norm": 0.39914557337760925, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 2080 + }, + { + "epoch": 1.7585191417753472, + "grad_norm": 0.36068692803382874, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 2090 + }, + { + "epoch": 1.766933108960875, + "grad_norm": 0.3983287215232849, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 2100 + }, + { + "epoch": 1.775347076146403, + "grad_norm": 0.45008400082588196, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2110 + }, + { + "epoch": 1.783761043331931, + "grad_norm": 0.3618052303791046, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2120 + }, + { + "epoch": 1.792175010517459, + "grad_norm": 0.38745400309562683, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 2130 + }, + { + "epoch": 1.8005889777029869, + "grad_norm": 0.3413826525211334, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 2140 + }, + { + "epoch": 1.809002944888515, + "grad_norm": 0.35983747243881226, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 2150 + }, + { + "epoch": 1.8174169120740429, + "grad_norm": 0.40926849842071533, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 2160 + }, + { + "epoch": 1.8258308792595708, + "grad_norm": 0.3543093800544739, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2170 + }, + { + "epoch": 1.8342448464450989, + "grad_norm": 0.42690935730934143, + "learning_rate": 0.0002, + "loss": 1.7812, + "step": 2180 + }, + { + "epoch": 1.842658813630627, + "grad_norm": 0.40282756090164185, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 2190 + }, + { + "epoch": 1.8510727808161547, + "grad_norm": 0.36568400263786316, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 2200 + }, + { + "epoch": 1.8594867480016828, + "grad_norm": 0.43159013986587524, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 2210 + }, + { + "epoch": 1.867900715187211, + "grad_norm": 0.3554118573665619, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 2220 + }, + { + "epoch": 1.8763146823727388, + "grad_norm": 0.43349072337150574, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 2230 + }, + { + "epoch": 1.8847286495582667, + "grad_norm": 0.36486536264419556, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 2240 + }, + { + "epoch": 1.8931426167437948, + "grad_norm": 0.39260047674179077, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 2250 + }, + { + "epoch": 1.9015565839293227, + "grad_norm": 0.3741776943206787, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 2260 + }, + { + "epoch": 1.9099705511148506, + "grad_norm": 0.3961946964263916, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2270 + }, + { + "epoch": 1.9183845183003787, + "grad_norm": 0.3659731149673462, + "learning_rate": 0.0002, + "loss": 1.737, + "step": 2280 + }, + { + "epoch": 1.9267984854859066, + "grad_norm": 0.34744107723236084, + "learning_rate": 0.0002, + "loss": 1.7342, + "step": 2290 + }, + { + "epoch": 1.9352124526714345, + "grad_norm": 0.3607442378997803, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2300 + }, + { + "epoch": 1.9436264198569626, + "grad_norm": 0.331464558839798, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 2310 + }, + { + "epoch": 1.9520403870424905, + "grad_norm": 0.3904414474964142, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2320 + }, + { + "epoch": 1.9604543542280184, + "grad_norm": 0.37584832310676575, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 2330 + }, + { + "epoch": 1.9688683214135465, + "grad_norm": 0.3698684275150299, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 2340 + }, + { + "epoch": 1.9772822885990746, + "grad_norm": 0.40571412444114685, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2350 + }, + { + "epoch": 1.9856962557846023, + "grad_norm": 0.40059587359428406, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 2360 + }, + { + "epoch": 1.9941102229701304, + "grad_norm": 0.4168248474597931, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2370 + }, + { + "epoch": 2.0, + "eval_loss": 1.8055059909820557, + "eval_runtime": 38.422, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.692, + "step": 2377 + }, + { + "epoch": 2.0025241901556585, + "grad_norm": 0.35205352306365967, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 2380 + }, + { + "epoch": 2.010938157341186, + "grad_norm": 0.3979377746582031, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2390 + }, + { + "epoch": 2.0193521245267143, + "grad_norm": 0.396491676568985, + "learning_rate": 0.0002, + "loss": 1.6421, + "step": 2400 + }, + { + "epoch": 2.0277660917122424, + "grad_norm": 0.44712209701538086, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 2410 + }, + { + "epoch": 2.03618005889777, + "grad_norm": 0.4454420208930969, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 2420 + }, + { + "epoch": 2.044594026083298, + "grad_norm": 0.4170038402080536, + "learning_rate": 0.0002, + "loss": 1.6635, + "step": 2430 + }, + { + "epoch": 2.0530079932688263, + "grad_norm": 0.4309595227241516, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 2440 + }, + { + "epoch": 2.0614219604543544, + "grad_norm": 0.4241602122783661, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 2450 + }, + { + "epoch": 2.069835927639882, + "grad_norm": 0.4370540678501129, + "learning_rate": 0.0002, + "loss": 1.6162, + "step": 2460 + }, + { + "epoch": 2.0782498948254102, + "grad_norm": 0.43985554575920105, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 2470 + }, + { + "epoch": 2.0866638620109383, + "grad_norm": 0.4158105254173279, + "learning_rate": 0.0002, + "loss": 1.6954, + "step": 2480 + }, + { + "epoch": 2.095077829196466, + "grad_norm": 0.441549152135849, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 2490 + }, + { + "epoch": 2.103491796381994, + "grad_norm": 0.385718435049057, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 2500 + }, + { + "epoch": 2.1119057635675222, + "grad_norm": 0.43146514892578125, + "learning_rate": 0.0002, + "loss": 1.5894, + "step": 2510 + }, + { + "epoch": 2.12031973075305, + "grad_norm": 0.41663315892219543, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 2520 + }, + { + "epoch": 2.128733697938578, + "grad_norm": 0.4410698115825653, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2530 + }, + { + "epoch": 2.137147665124106, + "grad_norm": 0.4472278952598572, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 2540 + }, + { + "epoch": 2.145561632309634, + "grad_norm": 0.3879167437553406, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 2550 + }, + { + "epoch": 2.153975599495162, + "grad_norm": 0.4212203025817871, + "learning_rate": 0.0002, + "loss": 1.6682, + "step": 2560 + }, + { + "epoch": 2.16238956668069, + "grad_norm": 0.42841723561286926, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2570 + }, + { + "epoch": 2.1708035338662177, + "grad_norm": 0.39272481203079224, + "learning_rate": 0.0002, + "loss": 1.5962, + "step": 2580 + }, + { + "epoch": 2.179217501051746, + "grad_norm": 0.4075261354446411, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 2590 + }, + { + "epoch": 2.187631468237274, + "grad_norm": 0.5358437895774841, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 2600 + }, + { + "epoch": 2.1960454354228016, + "grad_norm": 0.4738350212574005, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 2610 + }, + { + "epoch": 2.2044594026083297, + "grad_norm": 0.446789026260376, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2620 + }, + { + "epoch": 2.212873369793858, + "grad_norm": 0.4615374505519867, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 2630 + }, + { + "epoch": 2.221287336979386, + "grad_norm": 0.46901994943618774, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 2640 + }, + { + "epoch": 2.2297013041649136, + "grad_norm": 0.46267789602279663, + "learning_rate": 0.0002, + "loss": 1.6774, + "step": 2650 + }, + { + "epoch": 2.2381152713504417, + "grad_norm": 0.4383080005645752, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 2660 + }, + { + "epoch": 2.24652923853597, + "grad_norm": 0.4070609509944916, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 2670 + }, + { + "epoch": 2.2549432057214975, + "grad_norm": 0.4572339951992035, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 2680 + }, + { + "epoch": 2.2633571729070256, + "grad_norm": 0.393265038728714, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 2690 + }, + { + "epoch": 2.2717711400925538, + "grad_norm": 0.46144717931747437, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 2700 + }, + { + "epoch": 2.2801851072780814, + "grad_norm": 0.45077767968177795, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 2710 + }, + { + "epoch": 2.2885990744636096, + "grad_norm": 0.5697639584541321, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 2720 + }, + { + "epoch": 2.2970130416491377, + "grad_norm": 0.4855510890483856, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 2730 + }, + { + "epoch": 2.3054270088346653, + "grad_norm": 0.4440622627735138, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 2740 + }, + { + "epoch": 2.3138409760201935, + "grad_norm": 0.3904096782207489, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 2750 + }, + { + "epoch": 2.3222549432057216, + "grad_norm": 0.5225510597229004, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2760 + }, + { + "epoch": 2.3306689103912497, + "grad_norm": 0.44866397976875305, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 2770 + }, + { + "epoch": 2.3390828775767774, + "grad_norm": 0.5167056322097778, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 2780 + }, + { + "epoch": 2.3474968447623055, + "grad_norm": 0.45913267135620117, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 2790 + }, + { + "epoch": 2.3559108119478336, + "grad_norm": 0.45787590742111206, + "learning_rate": 0.0002, + "loss": 1.6564, + "step": 2800 + }, + { + "epoch": 2.3643247791333613, + "grad_norm": 0.4633352756500244, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 2810 + }, + { + "epoch": 2.3727387463188894, + "grad_norm": 0.46390071511268616, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 2820 + }, + { + "epoch": 2.3811527135044175, + "grad_norm": 0.4261005222797394, + "learning_rate": 0.0002, + "loss": 1.6039, + "step": 2830 + }, + { + "epoch": 2.389566680689945, + "grad_norm": 0.4283634424209595, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 2840 + }, + { + "epoch": 2.3979806478754733, + "grad_norm": 0.4955291450023651, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 2850 + }, + { + "epoch": 2.4063946150610014, + "grad_norm": 0.4740189015865326, + "learning_rate": 0.0002, + "loss": 1.6173, + "step": 2860 + }, + { + "epoch": 2.414808582246529, + "grad_norm": 0.4222276508808136, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2870 + }, + { + "epoch": 2.423222549432057, + "grad_norm": 0.4982149004936218, + "learning_rate": 0.0002, + "loss": 1.5602, + "step": 2880 + }, + { + "epoch": 2.4316365166175853, + "grad_norm": 0.5217409133911133, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2890 + }, + { + "epoch": 2.4400504838031134, + "grad_norm": 0.4555884897708893, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 2900 + }, + { + "epoch": 2.448464450988641, + "grad_norm": 0.43178579211235046, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2910 + }, + { + "epoch": 2.456878418174169, + "grad_norm": 0.4788478910923004, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2920 + }, + { + "epoch": 2.465292385359697, + "grad_norm": 0.43689873814582825, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2930 + }, + { + "epoch": 2.473706352545225, + "grad_norm": 0.5115197896957397, + "learning_rate": 0.0002, + "loss": 1.6196, + "step": 2940 + }, + { + "epoch": 2.482120319730753, + "grad_norm": 0.5290159583091736, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 2950 + }, + { + "epoch": 2.490534286916281, + "grad_norm": 0.46042463183403015, + "learning_rate": 0.0002, + "loss": 1.6499, + "step": 2960 + }, + { + "epoch": 2.498948254101809, + "grad_norm": 0.4359915852546692, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 2970 + }, + { + "epoch": 2.507362221287337, + "grad_norm": 0.46352964639663696, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 2980 + }, + { + "epoch": 2.515776188472865, + "grad_norm": 0.5324268341064453, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2990 + }, + { + "epoch": 2.5241901556583928, + "grad_norm": 0.5929607152938843, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 3000 + }, + { + "epoch": 2.532604122843921, + "grad_norm": 0.4811333417892456, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 3010 + }, + { + "epoch": 2.541018090029449, + "grad_norm": 0.4662701487541199, + "learning_rate": 0.0002, + "loss": 1.7023, + "step": 3020 + }, + { + "epoch": 2.549432057214977, + "grad_norm": 0.4582270681858063, + "learning_rate": 0.0002, + "loss": 1.5426, + "step": 3030 + }, + { + "epoch": 2.557846024400505, + "grad_norm": 0.4679982662200928, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 3040 + }, + { + "epoch": 2.566259991586033, + "grad_norm": 0.4380294680595398, + "learning_rate": 0.0002, + "loss": 1.5442, + "step": 3050 + }, + { + "epoch": 2.5746739587715606, + "grad_norm": 0.44295763969421387, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 3060 + }, + { + "epoch": 2.5830879259570887, + "grad_norm": 0.5131027698516846, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3070 + }, + { + "epoch": 2.591501893142617, + "grad_norm": 0.47567516565322876, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 3080 + }, + { + "epoch": 2.599915860328145, + "grad_norm": 0.49002596735954285, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 3090 + }, + { + "epoch": 2.6083298275136726, + "grad_norm": 0.44856327772140503, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 3100 + }, + { + "epoch": 2.6167437946992007, + "grad_norm": 0.4480142593383789, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 3110 + }, + { + "epoch": 2.6251577618847284, + "grad_norm": 0.4317494034767151, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 3120 + }, + { + "epoch": 2.6335717290702565, + "grad_norm": 0.42580848932266235, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 3130 + }, + { + "epoch": 2.6419856962557846, + "grad_norm": 0.4516814947128296, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 3140 + }, + { + "epoch": 2.6503996634413127, + "grad_norm": 0.4438435733318329, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 3150 + }, + { + "epoch": 2.6588136306268404, + "grad_norm": 0.4385356307029724, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 3160 + }, + { + "epoch": 2.6672275978123685, + "grad_norm": 0.5064112544059753, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 3170 + }, + { + "epoch": 2.6756415649978966, + "grad_norm": 0.49163177609443665, + "learning_rate": 0.0002, + "loss": 1.7189, + "step": 3180 + }, + { + "epoch": 2.6840555321834243, + "grad_norm": 0.49339258670806885, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 3190 + }, + { + "epoch": 2.6924694993689524, + "grad_norm": 0.440950870513916, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 3200 + }, + { + "epoch": 2.7008834665544805, + "grad_norm": 0.4283970594406128, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 3210 + }, + { + "epoch": 2.7092974337400086, + "grad_norm": 0.43875712156295776, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 3220 + }, + { + "epoch": 2.7177114009255363, + "grad_norm": 0.49332964420318604, + "learning_rate": 0.0002, + "loss": 1.6129, + "step": 3230 + }, + { + "epoch": 2.7261253681110644, + "grad_norm": 0.5225692391395569, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 3240 + }, + { + "epoch": 2.734539335296592, + "grad_norm": 0.4856489300727844, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 3250 + }, + { + "epoch": 2.74295330248212, + "grad_norm": 0.46918296813964844, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 3260 + }, + { + "epoch": 2.7513672696676483, + "grad_norm": 0.4802931249141693, + "learning_rate": 0.0002, + "loss": 1.6819, + "step": 3270 + }, + { + "epoch": 2.7597812368531764, + "grad_norm": 0.4485355615615845, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 3280 + }, + { + "epoch": 2.768195204038704, + "grad_norm": 0.43944594264030457, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 3290 + }, + { + "epoch": 2.7766091712242322, + "grad_norm": 0.46847742795944214, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 3300 + }, + { + "epoch": 2.7850231384097603, + "grad_norm": 0.4816027879714966, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 3310 + }, + { + "epoch": 2.793437105595288, + "grad_norm": 0.453960120677948, + "learning_rate": 0.0002, + "loss": 1.6293, + "step": 3320 + }, + { + "epoch": 2.801851072780816, + "grad_norm": 0.4816017150878906, + "learning_rate": 0.0002, + "loss": 1.6429, + "step": 3330 + }, + { + "epoch": 2.8102650399663442, + "grad_norm": 0.4461034834384918, + "learning_rate": 0.0002, + "loss": 1.6683, + "step": 3340 + }, + { + "epoch": 2.8186790071518724, + "grad_norm": 0.48821821808815, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 3350 + }, + { + "epoch": 2.8270929743374, + "grad_norm": 0.4574853777885437, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3360 + }, + { + "epoch": 2.835506941522928, + "grad_norm": 0.42062026262283325, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 3370 + }, + { + "epoch": 2.843920908708456, + "grad_norm": 0.4499834477901459, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 3380 + }, + { + "epoch": 2.852334875893984, + "grad_norm": 0.4780360758304596, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 3390 + }, + { + "epoch": 2.860748843079512, + "grad_norm": 0.45422887802124023, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 3400 + }, + { + "epoch": 2.86916281026504, + "grad_norm": 0.4590015709400177, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 3410 + }, + { + "epoch": 2.877576777450568, + "grad_norm": 0.45689624547958374, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 3420 + }, + { + "epoch": 2.885990744636096, + "grad_norm": 0.46953922510147095, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3430 + }, + { + "epoch": 2.8944047118216236, + "grad_norm": 0.4791966378688812, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 3440 + }, + { + "epoch": 2.9028186790071517, + "grad_norm": 0.4842296242713928, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 3450 + }, + { + "epoch": 2.91123264619268, + "grad_norm": 0.47219768166542053, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3460 + }, + { + "epoch": 2.919646613378208, + "grad_norm": 0.4622127115726471, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 3470 + }, + { + "epoch": 2.9280605805637356, + "grad_norm": 0.46832820773124695, + "learning_rate": 0.0002, + "loss": 1.6485, + "step": 3480 + }, + { + "epoch": 2.9364745477492638, + "grad_norm": 0.44582483172416687, + "learning_rate": 0.0002, + "loss": 1.6366, + "step": 3490 + }, + { + "epoch": 2.944888514934792, + "grad_norm": 0.4987219274044037, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 3500 + }, + { + "epoch": 2.9533024821203195, + "grad_norm": 0.43750956654548645, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 3510 + }, + { + "epoch": 2.9617164493058477, + "grad_norm": 0.49962925910949707, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 3520 + }, + { + "epoch": 2.9701304164913758, + "grad_norm": 0.5189590454101562, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 3530 + }, + { + "epoch": 2.978544383676904, + "grad_norm": 0.391317754983902, + "learning_rate": 0.0002, + "loss": 1.6688, + "step": 3540 + }, + { + "epoch": 2.9869583508624316, + "grad_norm": 0.44934695959091187, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 3550 + }, + { + "epoch": 2.9953723180479597, + "grad_norm": 0.4740142226219177, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 3560 + }, + { + "epoch": 2.9995793016407237, + "eval_loss": 1.8266887664794922, + "eval_runtime": 37.9445, + "eval_samples_per_second": 13.572, + "eval_steps_per_second": 1.713, + "step": 3565 + }, + { + "epoch": 3.003786285233488, + "grad_norm": 0.4523724615573883, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 3570 + }, + { + "epoch": 3.0122002524190155, + "grad_norm": 0.5261380076408386, + "learning_rate": 0.0002, + "loss": 1.526, + "step": 3580 + }, + { + "epoch": 3.0206142196045436, + "grad_norm": 0.48664888739585876, + "learning_rate": 0.0002, + "loss": 1.4946, + "step": 3590 + }, + { + "epoch": 3.0290281867900717, + "grad_norm": 0.5070882439613342, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 3600 + }, + { + "epoch": 3.0374421539755994, + "grad_norm": 0.5816011428833008, + "learning_rate": 0.0002, + "loss": 1.5316, + "step": 3610 + }, + { + "epoch": 3.0458561211611275, + "grad_norm": 0.6610211730003357, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 3620 + }, + { + "epoch": 3.0542700883466556, + "grad_norm": 0.5257703065872192, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 3630 + }, + { + "epoch": 3.0626840555321833, + "grad_norm": 0.5574390888214111, + "learning_rate": 0.0002, + "loss": 1.4438, + "step": 3640 + }, + { + "epoch": 3.0710980227177114, + "grad_norm": 0.5682297348976135, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3650 + }, + { + "epoch": 3.0795119899032395, + "grad_norm": 0.5798383355140686, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 3660 + }, + { + "epoch": 3.087925957088767, + "grad_norm": 0.5458289980888367, + "learning_rate": 0.0002, + "loss": 1.4339, + "step": 3670 + }, + { + "epoch": 3.0963399242742953, + "grad_norm": 0.5599102973937988, + "learning_rate": 0.0002, + "loss": 1.46, + "step": 3680 + }, + { + "epoch": 3.1047538914598234, + "grad_norm": 0.5023021697998047, + "learning_rate": 0.0002, + "loss": 1.4589, + "step": 3690 + }, + { + "epoch": 3.113167858645351, + "grad_norm": 0.5448206067085266, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 3700 + }, + { + "epoch": 3.121581825830879, + "grad_norm": 0.5760458707809448, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 3710 + }, + { + "epoch": 3.1299957930164073, + "grad_norm": 0.6018968224525452, + "learning_rate": 0.0002, + "loss": 1.4789, + "step": 3720 + }, + { + "epoch": 3.1384097602019354, + "grad_norm": 0.5767101049423218, + "learning_rate": 0.0002, + "loss": 1.5518, + "step": 3730 + }, + { + "epoch": 3.146823727387463, + "grad_norm": 0.5333963632583618, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 3740 + }, + { + "epoch": 3.155237694572991, + "grad_norm": 0.5918396711349487, + "learning_rate": 0.0002, + "loss": 1.4812, + "step": 3750 + }, + { + "epoch": 3.1636516617585193, + "grad_norm": 0.5931203365325928, + "learning_rate": 0.0002, + "loss": 1.4618, + "step": 3760 + }, + { + "epoch": 3.172065628944047, + "grad_norm": 0.6562168598175049, + "learning_rate": 0.0002, + "loss": 1.5592, + "step": 3770 + }, + { + "epoch": 3.180479596129575, + "grad_norm": 0.5820156335830688, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 3780 + }, + { + "epoch": 3.188893563315103, + "grad_norm": 0.5784737467765808, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 3790 + }, + { + "epoch": 3.197307530500631, + "grad_norm": 0.5506529808044434, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 3800 + }, + { + "epoch": 3.205721497686159, + "grad_norm": 0.6101595163345337, + "learning_rate": 0.0002, + "loss": 1.4819, + "step": 3810 + }, + { + "epoch": 3.214135464871687, + "grad_norm": 0.5597806572914124, + "learning_rate": 0.0002, + "loss": 1.5185, + "step": 3820 + }, + { + "epoch": 3.222549432057215, + "grad_norm": 0.5641011595726013, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 3830 + }, + { + "epoch": 3.230963399242743, + "grad_norm": 0.5892080068588257, + "learning_rate": 0.0002, + "loss": 1.4702, + "step": 3840 + }, + { + "epoch": 3.239377366428271, + "grad_norm": 0.6034760475158691, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 3850 + }, + { + "epoch": 3.247791333613799, + "grad_norm": 0.5112439393997192, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 3860 + }, + { + "epoch": 3.256205300799327, + "grad_norm": 0.56565922498703, + "learning_rate": 0.0002, + "loss": 1.5132, + "step": 3870 + }, + { + "epoch": 3.264619267984855, + "grad_norm": 0.6155247092247009, + "learning_rate": 0.0002, + "loss": 1.4892, + "step": 3880 + }, + { + "epoch": 3.273033235170383, + "grad_norm": 0.6064623594284058, + "learning_rate": 0.0002, + "loss": 1.5118, + "step": 3890 + }, + { + "epoch": 3.2814472023559107, + "grad_norm": 0.6313768029212952, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 3900 + }, + { + "epoch": 3.289861169541439, + "grad_norm": 0.5903939008712769, + "learning_rate": 0.0002, + "loss": 1.5551, + "step": 3910 + }, + { + "epoch": 3.298275136726967, + "grad_norm": 0.5770667195320129, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 3920 + }, + { + "epoch": 3.3066891039124946, + "grad_norm": 0.5785196423530579, + "learning_rate": 0.0002, + "loss": 1.5159, + "step": 3930 + }, + { + "epoch": 3.3151030710980227, + "grad_norm": 0.6468310356140137, + "learning_rate": 0.0002, + "loss": 1.5277, + "step": 3940 + }, + { + "epoch": 3.323517038283551, + "grad_norm": 0.6200279593467712, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 3950 + }, + { + "epoch": 3.3319310054690785, + "grad_norm": 0.5779302716255188, + "learning_rate": 0.0002, + "loss": 1.5264, + "step": 3960 + }, + { + "epoch": 3.3403449726546066, + "grad_norm": 0.5463796854019165, + "learning_rate": 0.0002, + "loss": 1.4861, + "step": 3970 + }, + { + "epoch": 3.3487589398401347, + "grad_norm": 0.6117855906486511, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 3980 + }, + { + "epoch": 3.357172907025663, + "grad_norm": 0.5554766058921814, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 3990 + }, + { + "epoch": 3.3655868742111905, + "grad_norm": 0.6012870073318481, + "learning_rate": 0.0002, + "loss": 1.5004, + "step": 4000 + }, + { + "epoch": 3.3740008413967186, + "grad_norm": 0.5443974137306213, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 4010 + }, + { + "epoch": 3.3824148085822463, + "grad_norm": 0.6636057496070862, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 4020 + }, + { + "epoch": 3.3908287757677744, + "grad_norm": 0.5801246166229248, + "learning_rate": 0.0002, + "loss": 1.5141, + "step": 4030 + }, + { + "epoch": 3.3992427429533025, + "grad_norm": 0.5668839812278748, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4040 + }, + { + "epoch": 3.4076567101388306, + "grad_norm": 0.7763481736183167, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 4050 + }, + { + "epoch": 3.4160706773243583, + "grad_norm": 0.6675992608070374, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 4060 + }, + { + "epoch": 3.4244846445098864, + "grad_norm": 0.6290077567100525, + "learning_rate": 0.0002, + "loss": 1.4959, + "step": 4070 + }, + { + "epoch": 3.4328986116954145, + "grad_norm": 0.6040239930152893, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 4080 + }, + { + "epoch": 3.441312578880942, + "grad_norm": 0.6237877607345581, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 4090 + }, + { + "epoch": 3.4497265460664703, + "grad_norm": 0.5343508124351501, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4100 + }, + { + "epoch": 3.4581405132519984, + "grad_norm": 0.6817412972450256, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 4110 + }, + { + "epoch": 3.466554480437526, + "grad_norm": 0.7115170359611511, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 4120 + }, + { + "epoch": 3.4749684476230542, + "grad_norm": 0.6127332448959351, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 4130 + }, + { + "epoch": 3.4833824148085824, + "grad_norm": 0.5745994448661804, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 4140 + }, + { + "epoch": 3.49179638199411, + "grad_norm": 0.6248795390129089, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4150 + }, + { + "epoch": 3.500210349179638, + "grad_norm": 0.5821124911308289, + "learning_rate": 0.0002, + "loss": 1.4885, + "step": 4160 + }, + { + "epoch": 3.5086243163651663, + "grad_norm": 0.561416506767273, + "learning_rate": 0.0002, + "loss": 1.4937, + "step": 4170 + }, + { + "epoch": 3.5170382835506944, + "grad_norm": 0.5848962664604187, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 4180 + }, + { + "epoch": 3.525452250736222, + "grad_norm": 0.5335569977760315, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 4190 + }, + { + "epoch": 3.53386621792175, + "grad_norm": 0.547964870929718, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 4200 + }, + { + "epoch": 3.542280185107278, + "grad_norm": 0.6157727241516113, + "learning_rate": 0.0002, + "loss": 1.4887, + "step": 4210 + }, + { + "epoch": 3.550694152292806, + "grad_norm": 0.6163121461868286, + "learning_rate": 0.0002, + "loss": 1.5484, + "step": 4220 + }, + { + "epoch": 3.559108119478334, + "grad_norm": 0.5844616293907166, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 4230 + }, + { + "epoch": 3.567522086663862, + "grad_norm": 0.7104926109313965, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 4240 + }, + { + "epoch": 3.57593605384939, + "grad_norm": 0.5055213570594788, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4250 + }, + { + "epoch": 3.584350021034918, + "grad_norm": 0.611676812171936, + "learning_rate": 0.0002, + "loss": 1.482, + "step": 4260 + }, + { + "epoch": 3.592763988220446, + "grad_norm": 0.6326440572738647, + "learning_rate": 0.0002, + "loss": 1.5048, + "step": 4270 + }, + { + "epoch": 3.6011779554059737, + "grad_norm": 0.6290925741195679, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 4280 + }, + { + "epoch": 3.609591922591502, + "grad_norm": 0.5691978931427002, + "learning_rate": 0.0002, + "loss": 1.5654, + "step": 4290 + }, + { + "epoch": 3.61800588977703, + "grad_norm": 0.6071329116821289, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 4300 + }, + { + "epoch": 3.626419856962558, + "grad_norm": 0.606573224067688, + "learning_rate": 0.0002, + "loss": 1.5336, + "step": 4310 + }, + { + "epoch": 3.6348338241480858, + "grad_norm": 0.5515419244766235, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 4320 + }, + { + "epoch": 3.643247791333614, + "grad_norm": 0.5964660048484802, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 4330 + }, + { + "epoch": 3.6516617585191415, + "grad_norm": 0.5774146914482117, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 4340 + }, + { + "epoch": 3.6600757257046697, + "grad_norm": 0.5732731223106384, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 4350 + }, + { + "epoch": 3.6684896928901978, + "grad_norm": 0.7354163527488708, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 4360 + }, + { + "epoch": 3.676903660075726, + "grad_norm": 0.6220902800559998, + "learning_rate": 0.0002, + "loss": 1.5225, + "step": 4370 + }, + { + "epoch": 3.6853176272612536, + "grad_norm": 0.6053991317749023, + "learning_rate": 0.0002, + "loss": 1.4838, + "step": 4380 + }, + { + "epoch": 3.6937315944467817, + "grad_norm": 0.67010897397995, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4390 + }, + { + "epoch": 3.70214556163231, + "grad_norm": 0.6139186024665833, + "learning_rate": 0.0002, + "loss": 1.5381, + "step": 4400 + }, + { + "epoch": 3.7105595288178375, + "grad_norm": 0.5433071851730347, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 4410 + }, + { + "epoch": 3.7189734960033656, + "grad_norm": 0.5453870296478271, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 4420 + }, + { + "epoch": 3.7273874631888937, + "grad_norm": 0.6401727199554443, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 4430 + }, + { + "epoch": 3.735801430374422, + "grad_norm": 0.6049367189407349, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 4440 + }, + { + "epoch": 3.7442153975599495, + "grad_norm": 0.5740529298782349, + "learning_rate": 0.0002, + "loss": 1.5268, + "step": 4450 + }, + { + "epoch": 3.7526293647454776, + "grad_norm": 0.6521880626678467, + "learning_rate": 0.0002, + "loss": 1.5183, + "step": 4460 + }, + { + "epoch": 3.7610433319310053, + "grad_norm": 0.7096368074417114, + "learning_rate": 0.0002, + "loss": 1.5741, + "step": 4470 + }, + { + "epoch": 3.7694572991165334, + "grad_norm": 0.5886474251747131, + "learning_rate": 0.0002, + "loss": 1.5786, + "step": 4480 + }, + { + "epoch": 3.7778712663020615, + "grad_norm": 0.5821043252944946, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 4490 + }, + { + "epoch": 3.7862852334875896, + "grad_norm": 0.628892183303833, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 4500 + }, + { + "epoch": 3.7946992006731173, + "grad_norm": 0.5962669849395752, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 4510 + }, + { + "epoch": 3.8031131678586454, + "grad_norm": 0.6635549068450928, + "learning_rate": 0.0002, + "loss": 1.5267, + "step": 4520 + }, + { + "epoch": 3.811527135044173, + "grad_norm": 0.6010760068893433, + "learning_rate": 0.0002, + "loss": 1.5058, + "step": 4530 + }, + { + "epoch": 3.819941102229701, + "grad_norm": 0.6322658658027649, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 4540 + }, + { + "epoch": 3.8283550694152293, + "grad_norm": 0.5893137454986572, + "learning_rate": 0.0002, + "loss": 1.5029, + "step": 4550 + }, + { + "epoch": 3.8367690366007574, + "grad_norm": 0.7829602360725403, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 4560 + }, + { + "epoch": 3.845183003786285, + "grad_norm": 0.6190396547317505, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 4570 + }, + { + "epoch": 3.853596970971813, + "grad_norm": 0.6662813425064087, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 4580 + }, + { + "epoch": 3.8620109381573413, + "grad_norm": 0.5809855461120605, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 4590 + }, + { + "epoch": 3.870424905342869, + "grad_norm": 0.5779069662094116, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4600 + }, + { + "epoch": 3.878838872528397, + "grad_norm": 0.5603038668632507, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 4610 + }, + { + "epoch": 3.887252839713925, + "grad_norm": 0.6274181008338928, + "learning_rate": 0.0002, + "loss": 1.5372, + "step": 4620 + }, + { + "epoch": 3.8956668068994533, + "grad_norm": 0.6810959577560425, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 4630 + }, + { + "epoch": 3.904080774084981, + "grad_norm": 0.5647315979003906, + "learning_rate": 0.0002, + "loss": 1.4956, + "step": 4640 + }, + { + "epoch": 3.912494741270509, + "grad_norm": 0.6830295324325562, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 4650 + }, + { + "epoch": 3.920908708456037, + "grad_norm": 0.652565598487854, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 4660 + }, + { + "epoch": 3.929322675641565, + "grad_norm": 0.5806284546852112, + "learning_rate": 0.0002, + "loss": 1.4772, + "step": 4670 + }, + { + "epoch": 3.937736642827093, + "grad_norm": 0.6825073957443237, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 4680 + }, + { + "epoch": 3.946150610012621, + "grad_norm": 0.6149451732635498, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 4690 + }, + { + "epoch": 3.954564577198149, + "grad_norm": 0.6152557134628296, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 4700 + }, + { + "epoch": 3.962978544383677, + "grad_norm": 0.6239011883735657, + "learning_rate": 0.0002, + "loss": 1.4897, + "step": 4710 + }, + { + "epoch": 3.971392511569205, + "grad_norm": 0.6485443115234375, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 4720 + }, + { + "epoch": 3.9798064787547327, + "grad_norm": 0.6449228525161743, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 4730 + }, + { + "epoch": 3.988220445940261, + "grad_norm": 0.6526407599449158, + "learning_rate": 0.0002, + "loss": 1.5087, + "step": 4740 + }, + { + "epoch": 3.996634413125789, + "grad_norm": 0.6277706027030945, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4750 + }, + { + "epoch": 4.0, + "eval_loss": 1.871641755104065, + "eval_runtime": 37.9637, + "eval_samples_per_second": 13.566, + "eval_steps_per_second": 1.712, + "step": 4754 + } + ], + "logging_steps": 10, + "max_steps": 9504, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.2000447536286925e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1502d478cfbb1424f707352d007b740bde5e373 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-4754/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df2b79d3acefeedef5a0229881de39ec68ef9b40046a60d7976a49f7e6b3b936 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2703957f8d94a723eab1fe0b0ddbf31cb25ed6ec --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6018665777bb34b616cd34fa662867cfff157e84dc6d90c38e4e02be569b37a5 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..04de002f07da9be69e1c8ba2e5f8e331e02f98b4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0441013b17afbf1cf92a2b1afa32eb7d2412ef831ed1b830a1dcee519ff7a238 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..55f0cc2def4211824a3a8e06f82b1a470ad9dafe --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a36eb29aabc5fe6416332a206ea0c4dbeb34353f5b7eaf5a21c684c91e17bf40 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..55843a93ad559b4be7fa0c6d8fa39bfe1db40228 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a95a8659297b5b75e90aa56fc1ddf4df2316d927d2ed84dbcf2a6a29fc7ad9c1 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..77ea3a9b7f5d93e668e7c59a41c648dae57eda5f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/trainer_state.json @@ -0,0 +1,4231 @@ +{ + "best_metric": 1.8055059909820557, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377", + "epoch": 4.999579301640724, + "eval_steps": 10, + "global_step": 5942, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008413967185527976, + "grad_norm": 0.5458821654319763, + "learning_rate": 0.0002, + "loss": 2.56, + "step": 10 + }, + { + "epoch": 0.016827934371055953, + "grad_norm": 0.7293308973312378, + "learning_rate": 0.0002, + "loss": 2.3235, + "step": 20 + }, + { + "epoch": 0.02524190155658393, + "grad_norm": 0.47792306542396545, + "learning_rate": 0.0002, + "loss": 2.0815, + "step": 30 + }, + { + "epoch": 0.033655868742111905, + "grad_norm": 0.5944402813911438, + "learning_rate": 0.0002, + "loss": 1.9718, + "step": 40 + }, + { + "epoch": 0.04206983592763988, + "grad_norm": 0.5415359735488892, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 50 + }, + { + "epoch": 0.05048380311316786, + "grad_norm": 0.535713791847229, + "learning_rate": 0.0002, + "loss": 1.8953, + "step": 60 + }, + { + "epoch": 0.058897770298695834, + "grad_norm": 0.5184146761894226, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 70 + }, + { + "epoch": 0.06731173748422381, + "grad_norm": 0.458926796913147, + "learning_rate": 0.0002, + "loss": 1.8396, + "step": 80 + }, + { + "epoch": 0.07572570466975179, + "grad_norm": 0.4780142307281494, + "learning_rate": 0.0002, + "loss": 1.8677, + "step": 90 + }, + { + "epoch": 0.08413967185527976, + "grad_norm": 0.79965740442276, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 100 + }, + { + "epoch": 0.09255363904080774, + "grad_norm": 0.4498862028121948, + "learning_rate": 0.0002, + "loss": 1.9081, + "step": 110 + }, + { + "epoch": 0.10096760622633572, + "grad_norm": 0.39338430762290955, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 120 + }, + { + "epoch": 0.10938157341186369, + "grad_norm": 0.9588953852653503, + "learning_rate": 0.0002, + "loss": 1.8637, + "step": 130 + }, + { + "epoch": 0.11779554059739167, + "grad_norm": 0.41675639152526855, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 140 + }, + { + "epoch": 0.12620950778291964, + "grad_norm": 0.44519832730293274, + "learning_rate": 0.0002, + "loss": 1.8904, + "step": 150 + }, + { + "epoch": 0.13462347496844762, + "grad_norm": 0.4176260530948639, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 160 + }, + { + "epoch": 0.1430374421539756, + "grad_norm": 0.35840365290641785, + "learning_rate": 0.0002, + "loss": 1.8398, + "step": 170 + }, + { + "epoch": 0.15145140933950357, + "grad_norm": 0.3794495463371277, + "learning_rate": 0.0002, + "loss": 1.8666, + "step": 180 + }, + { + "epoch": 0.15986537652503155, + "grad_norm": 0.4563522934913635, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 190 + }, + { + "epoch": 0.16827934371055953, + "grad_norm": 0.37057486176490784, + "learning_rate": 0.0002, + "loss": 1.8893, + "step": 200 + }, + { + "epoch": 0.1766933108960875, + "grad_norm": 0.44081518054008484, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 210 + }, + { + "epoch": 0.18510727808161548, + "grad_norm": 0.46078577637672424, + "learning_rate": 0.0002, + "loss": 1.9048, + "step": 220 + }, + { + "epoch": 0.19352124526714345, + "grad_norm": 0.36132094264030457, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 230 + }, + { + "epoch": 0.20193521245267143, + "grad_norm": 0.3747289180755615, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 240 + }, + { + "epoch": 0.2103491796381994, + "grad_norm": 0.3540179133415222, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 250 + }, + { + "epoch": 0.21876314682372738, + "grad_norm": 0.3461375832557678, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 260 + }, + { + "epoch": 0.22717711400925536, + "grad_norm": 0.3436960279941559, + "learning_rate": 0.0002, + "loss": 1.8509, + "step": 270 + }, + { + "epoch": 0.23559108119478334, + "grad_norm": 0.35403719544410706, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 280 + }, + { + "epoch": 0.2440050483803113, + "grad_norm": 0.37142616510391235, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 290 + }, + { + "epoch": 0.2524190155658393, + "grad_norm": 0.3307955861091614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 300 + }, + { + "epoch": 0.2608329827513673, + "grad_norm": 0.32855314016342163, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 310 + }, + { + "epoch": 0.26924694993689524, + "grad_norm": 0.3299003839492798, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 320 + }, + { + "epoch": 0.27766091712242325, + "grad_norm": 0.44311287999153137, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 330 + }, + { + "epoch": 0.2860748843079512, + "grad_norm": 0.32989758253097534, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 340 + }, + { + "epoch": 0.2944888514934792, + "grad_norm": 0.34400200843811035, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 350 + }, + { + "epoch": 0.30290281867900715, + "grad_norm": 0.36286211013793945, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 360 + }, + { + "epoch": 0.31131678586453515, + "grad_norm": 0.406827837228775, + "learning_rate": 0.0002, + "loss": 1.8025, + "step": 370 + }, + { + "epoch": 0.3197307530500631, + "grad_norm": 0.36299195885658264, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 380 + }, + { + "epoch": 0.3281447202355911, + "grad_norm": 0.3477257192134857, + "learning_rate": 0.0002, + "loss": 1.837, + "step": 390 + }, + { + "epoch": 0.33655868742111905, + "grad_norm": 0.3730369210243225, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 400 + }, + { + "epoch": 0.34497265460664706, + "grad_norm": 0.4644559919834137, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 410 + }, + { + "epoch": 0.353386621792175, + "grad_norm": 0.406576544046402, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 420 + }, + { + "epoch": 0.361800588977703, + "grad_norm": 0.3612699508666992, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 430 + }, + { + "epoch": 0.37021455616323096, + "grad_norm": 0.3243742287158966, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 440 + }, + { + "epoch": 0.37862852334875896, + "grad_norm": 0.36671221256256104, + "learning_rate": 0.0002, + "loss": 1.8851, + "step": 450 + }, + { + "epoch": 0.3870424905342869, + "grad_norm": 0.3565002381801605, + "learning_rate": 0.0002, + "loss": 1.8853, + "step": 460 + }, + { + "epoch": 0.3954564577198149, + "grad_norm": 0.34630221128463745, + "learning_rate": 0.0002, + "loss": 1.8923, + "step": 470 + }, + { + "epoch": 0.40387042490534286, + "grad_norm": 0.3353537321090698, + "learning_rate": 0.0002, + "loss": 1.8234, + "step": 480 + }, + { + "epoch": 0.41228439209087087, + "grad_norm": 0.4015921950340271, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 490 + }, + { + "epoch": 0.4206983592763988, + "grad_norm": 0.5489419102668762, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 500 + }, + { + "epoch": 0.4291123264619268, + "grad_norm": 0.4193589985370636, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 510 + }, + { + "epoch": 0.43752629364745477, + "grad_norm": 0.3418922424316406, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 520 + }, + { + "epoch": 0.44594026083298277, + "grad_norm": 0.32668185234069824, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 530 + }, + { + "epoch": 0.4543542280185107, + "grad_norm": 0.3094325661659241, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 540 + }, + { + "epoch": 0.4627681952040387, + "grad_norm": 0.3743017315864563, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 550 + }, + { + "epoch": 0.47118216238956667, + "grad_norm": 0.3295630216598511, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 560 + }, + { + "epoch": 0.4795961295750947, + "grad_norm": 1.6124513149261475, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 570 + }, + { + "epoch": 0.4880100967606226, + "grad_norm": 0.3245585858821869, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 580 + }, + { + "epoch": 0.49642406394615063, + "grad_norm": 0.3332934081554413, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 590 + }, + { + "epoch": 0.5048380311316786, + "grad_norm": 0.3836138844490051, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 600 + }, + { + "epoch": 0.5132519983172066, + "grad_norm": 0.32953888177871704, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 610 + }, + { + "epoch": 0.5216659655027346, + "grad_norm": 0.36291512846946716, + "learning_rate": 0.0002, + "loss": 1.7729, + "step": 620 + }, + { + "epoch": 0.5300799326882625, + "grad_norm": 0.3237783908843994, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 630 + }, + { + "epoch": 0.5384938998737905, + "grad_norm": 0.38882696628570557, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 640 + }, + { + "epoch": 0.5469078670593185, + "grad_norm": 0.37821972370147705, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 650 + }, + { + "epoch": 0.5553218342448465, + "grad_norm": 0.3556285500526428, + "learning_rate": 0.0002, + "loss": 1.8075, + "step": 660 + }, + { + "epoch": 0.5637358014303744, + "grad_norm": 0.347499281167984, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 670 + }, + { + "epoch": 0.5721497686159024, + "grad_norm": 0.3176489472389221, + "learning_rate": 0.0002, + "loss": 1.8066, + "step": 680 + }, + { + "epoch": 0.5805637358014304, + "grad_norm": 0.30220088362693787, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 690 + }, + { + "epoch": 0.5889777029869584, + "grad_norm": 0.3711601793766022, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 700 + }, + { + "epoch": 0.5973916701724863, + "grad_norm": 0.3311759829521179, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 710 + }, + { + "epoch": 0.6058056373580143, + "grad_norm": 0.34824270009994507, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 720 + }, + { + "epoch": 0.6142196045435423, + "grad_norm": 0.29668381810188293, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 730 + }, + { + "epoch": 0.6226335717290703, + "grad_norm": 0.36087489128112793, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 740 + }, + { + "epoch": 0.6310475389145982, + "grad_norm": 0.31590089201927185, + "learning_rate": 0.0002, + "loss": 1.7956, + "step": 750 + }, + { + "epoch": 0.6394615061001262, + "grad_norm": 0.37632957100868225, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 760 + }, + { + "epoch": 0.6478754732856542, + "grad_norm": 0.3360748589038849, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 770 + }, + { + "epoch": 0.6562894404711822, + "grad_norm": 0.3420640528202057, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 780 + }, + { + "epoch": 0.6647034076567101, + "grad_norm": 0.5734959244728088, + "learning_rate": 0.0002, + "loss": 1.8353, + "step": 790 + }, + { + "epoch": 0.6731173748422381, + "grad_norm": 0.36440837383270264, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 800 + }, + { + "epoch": 0.6815313420277661, + "grad_norm": 0.3179708421230316, + "learning_rate": 0.0002, + "loss": 1.7532, + "step": 810 + }, + { + "epoch": 0.6899453092132941, + "grad_norm": 0.34122881293296814, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 820 + }, + { + "epoch": 0.698359276398822, + "grad_norm": 0.31886112689971924, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 830 + }, + { + "epoch": 0.70677324358435, + "grad_norm": 0.31782326102256775, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 840 + }, + { + "epoch": 0.715187210769878, + "grad_norm": 0.36052989959716797, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 850 + }, + { + "epoch": 0.723601177955406, + "grad_norm": 0.28946155309677124, + "learning_rate": 0.0002, + "loss": 1.7891, + "step": 860 + }, + { + "epoch": 0.7320151451409339, + "grad_norm": 0.3095663785934448, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 870 + }, + { + "epoch": 0.7404291123264619, + "grad_norm": 0.3317491412162781, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 880 + }, + { + "epoch": 0.7488430795119899, + "grad_norm": 0.31324660778045654, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 890 + }, + { + "epoch": 0.7572570466975179, + "grad_norm": 0.3290475606918335, + "learning_rate": 0.0002, + "loss": 1.8753, + "step": 900 + }, + { + "epoch": 0.7656710138830458, + "grad_norm": 0.35690343379974365, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 910 + }, + { + "epoch": 0.7740849810685738, + "grad_norm": 0.39558273553848267, + "learning_rate": 0.0002, + "loss": 1.826, + "step": 920 + }, + { + "epoch": 0.7824989482541018, + "grad_norm": 0.34254348278045654, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 930 + }, + { + "epoch": 0.7909129154396298, + "grad_norm": 0.3560165464878082, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 940 + }, + { + "epoch": 0.7993268826251577, + "grad_norm": 0.30693164467811584, + "learning_rate": 0.0002, + "loss": 1.7992, + "step": 950 + }, + { + "epoch": 0.8077408498106857, + "grad_norm": 0.3394823372364044, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 960 + }, + { + "epoch": 0.8161548169962137, + "grad_norm": 0.3741514980792999, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 970 + }, + { + "epoch": 0.8245687841817417, + "grad_norm": 0.3655228316783905, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 980 + }, + { + "epoch": 0.8329827513672696, + "grad_norm": 0.3586033880710602, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 990 + }, + { + "epoch": 0.8413967185527976, + "grad_norm": 0.3459678888320923, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1000 + }, + { + "epoch": 0.8498106857383256, + "grad_norm": 0.3184349834918976, + "learning_rate": 0.0002, + "loss": 1.8498, + "step": 1010 + }, + { + "epoch": 0.8582246529238536, + "grad_norm": 0.3099786043167114, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 1020 + }, + { + "epoch": 0.8666386201093815, + "grad_norm": 0.30300915241241455, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 1030 + }, + { + "epoch": 0.8750525872949095, + "grad_norm": 0.3128705620765686, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 1040 + }, + { + "epoch": 0.8834665544804375, + "grad_norm": 0.3336263597011566, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1050 + }, + { + "epoch": 0.8918805216659655, + "grad_norm": 0.3801328241825104, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 1060 + }, + { + "epoch": 0.9002944888514934, + "grad_norm": 0.3122096359729767, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 1070 + }, + { + "epoch": 0.9087084560370214, + "grad_norm": 0.35990869998931885, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 1080 + }, + { + "epoch": 0.9171224232225494, + "grad_norm": 0.3321819305419922, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1090 + }, + { + "epoch": 0.9255363904080774, + "grad_norm": 0.4202139377593994, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1100 + }, + { + "epoch": 0.9339503575936053, + "grad_norm": 0.32559722661972046, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 1110 + }, + { + "epoch": 0.9423643247791333, + "grad_norm": 0.3098459839820862, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 1120 + }, + { + "epoch": 0.9507782919646613, + "grad_norm": 0.33917108178138733, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1130 + }, + { + "epoch": 0.9591922591501894, + "grad_norm": 0.4055837094783783, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1140 + }, + { + "epoch": 0.9676062263357172, + "grad_norm": 0.32508623600006104, + "learning_rate": 0.0002, + "loss": 1.8259, + "step": 1150 + }, + { + "epoch": 0.9760201935212452, + "grad_norm": 0.30150601267814636, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9844341607067733, + "grad_norm": 0.3042563199996948, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 1170 + }, + { + "epoch": 0.9928481278923013, + "grad_norm": 0.33254584670066833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1180 + }, + { + "epoch": 0.9995793016407236, + "eval_loss": 1.8077726364135742, + "eval_runtime": 38.4359, + "eval_samples_per_second": 13.399, + "eval_steps_per_second": 1.691, + "step": 1188 + }, + { + "epoch": 1.0012620950778293, + "grad_norm": 0.35073035955429077, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 1190 + }, + { + "epoch": 1.0096760622633572, + "grad_norm": 0.3217269778251648, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1200 + }, + { + "epoch": 1.018090029448885, + "grad_norm": 0.3635033369064331, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1210 + }, + { + "epoch": 1.0265039966344132, + "grad_norm": 0.32468414306640625, + "learning_rate": 0.0002, + "loss": 1.6949, + "step": 1220 + }, + { + "epoch": 1.034917963819941, + "grad_norm": 0.3307163417339325, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 1230 + }, + { + "epoch": 1.0433319310054692, + "grad_norm": 0.34381359815597534, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 1240 + }, + { + "epoch": 1.051745898190997, + "grad_norm": 0.35874804854393005, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 1250 + }, + { + "epoch": 1.060159865376525, + "grad_norm": 0.3615919351577759, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1260 + }, + { + "epoch": 1.068573832562053, + "grad_norm": 0.32835808396339417, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1270 + }, + { + "epoch": 1.076987799747581, + "grad_norm": 0.3876388370990753, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1280 + }, + { + "epoch": 1.0854017669331089, + "grad_norm": 0.39895930886268616, + "learning_rate": 0.0002, + "loss": 1.7442, + "step": 1290 + }, + { + "epoch": 1.093815734118637, + "grad_norm": 0.39081698656082153, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 1300 + }, + { + "epoch": 1.1022297013041649, + "grad_norm": 0.39974215626716614, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1310 + }, + { + "epoch": 1.110643668489693, + "grad_norm": 0.3887332081794739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1320 + }, + { + "epoch": 1.1190576356752209, + "grad_norm": 0.36216408014297485, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1330 + }, + { + "epoch": 1.1274716028607488, + "grad_norm": 0.36979028582572937, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1340 + }, + { + "epoch": 1.1358855700462769, + "grad_norm": 0.34052133560180664, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 1350 + }, + { + "epoch": 1.1442995372318048, + "grad_norm": 0.3467716574668884, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1360 + }, + { + "epoch": 1.1527135044173327, + "grad_norm": 0.35528799891471863, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1370 + }, + { + "epoch": 1.1611274716028608, + "grad_norm": 0.36282262206077576, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 1380 + }, + { + "epoch": 1.1695414387883887, + "grad_norm": 0.37355899810791016, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1390 + }, + { + "epoch": 1.1779554059739168, + "grad_norm": 0.37292736768722534, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1400 + }, + { + "epoch": 1.1863693731594447, + "grad_norm": 0.5892812013626099, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1410 + }, + { + "epoch": 1.1947833403449726, + "grad_norm": 0.3712292015552521, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 1420 + }, + { + "epoch": 1.2031973075305007, + "grad_norm": 0.3349577486515045, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1430 + }, + { + "epoch": 1.2116112747160286, + "grad_norm": 0.32591062784194946, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1440 + }, + { + "epoch": 1.2200252419015567, + "grad_norm": 0.3840635418891907, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 1450 + }, + { + "epoch": 1.2284392090870846, + "grad_norm": 0.37238365411758423, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1460 + }, + { + "epoch": 1.2368531762726125, + "grad_norm": 0.3731217682361603, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 1470 + }, + { + "epoch": 1.2452671434581406, + "grad_norm": 0.3318967819213867, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 1480 + }, + { + "epoch": 1.2536811106436685, + "grad_norm": 0.3784034848213196, + "learning_rate": 0.0002, + "loss": 1.7463, + "step": 1490 + }, + { + "epoch": 1.2620950778291964, + "grad_norm": 0.3541383147239685, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 1500 + }, + { + "epoch": 1.2705090450147245, + "grad_norm": 0.35312485694885254, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1510 + }, + { + "epoch": 1.2789230122002524, + "grad_norm": 0.35272929072380066, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1520 + }, + { + "epoch": 1.2873369793857803, + "grad_norm": 0.40988272428512573, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1530 + }, + { + "epoch": 1.2957509465713084, + "grad_norm": 0.3543946146965027, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1540 + }, + { + "epoch": 1.3041649137568363, + "grad_norm": 0.35639145970344543, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1550 + }, + { + "epoch": 1.3125788809423642, + "grad_norm": 0.3290826678276062, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1560 + }, + { + "epoch": 1.3209928481278923, + "grad_norm": 0.39264336228370667, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 1570 + }, + { + "epoch": 1.3294068153134202, + "grad_norm": 0.5390415191650391, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1580 + }, + { + "epoch": 1.3378207824989483, + "grad_norm": 0.5188116431236267, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1590 + }, + { + "epoch": 1.3462347496844762, + "grad_norm": 0.37445148825645447, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 1600 + }, + { + "epoch": 1.3546487168700043, + "grad_norm": 0.3296085298061371, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1610 + }, + { + "epoch": 1.3630626840555322, + "grad_norm": 0.39879581332206726, + "learning_rate": 0.0002, + "loss": 1.8107, + "step": 1620 + }, + { + "epoch": 1.37147665124106, + "grad_norm": 0.36092764139175415, + "learning_rate": 0.0002, + "loss": 1.6744, + "step": 1630 + }, + { + "epoch": 1.3798906184265882, + "grad_norm": 0.37011823058128357, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 1640 + }, + { + "epoch": 1.3883045856121161, + "grad_norm": 0.40863534808158875, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1650 + }, + { + "epoch": 1.396718552797644, + "grad_norm": 0.337001770734787, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 1660 + }, + { + "epoch": 1.4051325199831721, + "grad_norm": 0.35596707463264465, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1670 + }, + { + "epoch": 1.4135464871687, + "grad_norm": 0.3857671916484833, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 1680 + }, + { + "epoch": 1.421960454354228, + "grad_norm": 0.419502317905426, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 1690 + }, + { + "epoch": 1.430374421539756, + "grad_norm": 0.35459452867507935, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 1700 + }, + { + "epoch": 1.438788388725284, + "grad_norm": 0.37246978282928467, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 1710 + }, + { + "epoch": 1.4472023559108118, + "grad_norm": 0.33091893792152405, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 1720 + }, + { + "epoch": 1.45561632309634, + "grad_norm": 0.37029674649238586, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1730 + }, + { + "epoch": 1.4640302902818678, + "grad_norm": 0.374025821685791, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1740 + }, + { + "epoch": 1.472444257467396, + "grad_norm": 0.3416315019130707, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1750 + }, + { + "epoch": 1.4808582246529238, + "grad_norm": 0.36502841114997864, + "learning_rate": 0.0002, + "loss": 1.7093, + "step": 1760 + }, + { + "epoch": 1.489272191838452, + "grad_norm": 0.35458803176879883, + "learning_rate": 0.0002, + "loss": 1.6597, + "step": 1770 + }, + { + "epoch": 1.4976861590239798, + "grad_norm": 0.4462839663028717, + "learning_rate": 0.0002, + "loss": 1.675, + "step": 1780 + }, + { + "epoch": 1.5061001262095077, + "grad_norm": 0.34836092591285706, + "learning_rate": 0.0002, + "loss": 1.7267, + "step": 1790 + }, + { + "epoch": 1.5145140933950358, + "grad_norm": 0.3445749282836914, + "learning_rate": 0.0002, + "loss": 1.7295, + "step": 1800 + }, + { + "epoch": 1.5229280605805637, + "grad_norm": 0.36012160778045654, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1810 + }, + { + "epoch": 1.5313420277660916, + "grad_norm": 0.4052616059780121, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 1820 + }, + { + "epoch": 1.5397559949516197, + "grad_norm": 0.3966905474662781, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1830 + }, + { + "epoch": 1.5481699621371476, + "grad_norm": 0.35028719902038574, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1840 + }, + { + "epoch": 1.5565839293226755, + "grad_norm": 0.3936742842197418, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1850 + }, + { + "epoch": 1.5649978965082036, + "grad_norm": 0.34473296999931335, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 1860 + }, + { + "epoch": 1.5734118636937318, + "grad_norm": 0.4328365623950958, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1870 + }, + { + "epoch": 1.5818258308792594, + "grad_norm": 0.3566315472126007, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1880 + }, + { + "epoch": 1.5902397980647875, + "grad_norm": 0.3301256597042084, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 1890 + }, + { + "epoch": 1.5986537652503157, + "grad_norm": 0.3743041455745697, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 1900 + }, + { + "epoch": 1.6070677324358436, + "grad_norm": 0.3735344707965851, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 1910 + }, + { + "epoch": 1.6154816996213714, + "grad_norm": 0.42191144824028015, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1920 + }, + { + "epoch": 1.6238956668068996, + "grad_norm": 0.3787207305431366, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1930 + }, + { + "epoch": 1.6323096339924275, + "grad_norm": 0.35647350549697876, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1940 + }, + { + "epoch": 1.6407236011779553, + "grad_norm": 0.39791446924209595, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 1950 + }, + { + "epoch": 1.6491375683634835, + "grad_norm": 0.37341275811195374, + "learning_rate": 0.0002, + "loss": 1.7293, + "step": 1960 + }, + { + "epoch": 1.6575515355490114, + "grad_norm": 0.3722686469554901, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1970 + }, + { + "epoch": 1.6659655027345392, + "grad_norm": 0.37467387318611145, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 1980 + }, + { + "epoch": 1.6743794699200674, + "grad_norm": 0.37109461426734924, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 1990 + }, + { + "epoch": 1.6827934371055953, + "grad_norm": 0.4008837044239044, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 2000 + }, + { + "epoch": 1.6912074042911232, + "grad_norm": 0.3316999673843384, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2010 + }, + { + "epoch": 1.6996213714766513, + "grad_norm": 0.3683805465698242, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 2020 + }, + { + "epoch": 1.7080353386621794, + "grad_norm": 0.4163658320903778, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2030 + }, + { + "epoch": 1.716449305847707, + "grad_norm": 0.4245431125164032, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 2040 + }, + { + "epoch": 1.7248632730332352, + "grad_norm": 0.36732038855552673, + "learning_rate": 0.0002, + "loss": 1.7184, + "step": 2050 + }, + { + "epoch": 1.7332772402187633, + "grad_norm": 0.34981656074523926, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 2060 + }, + { + "epoch": 1.7416912074042912, + "grad_norm": 0.38588812947273254, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 2070 + }, + { + "epoch": 1.750105174589819, + "grad_norm": 0.39914557337760925, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 2080 + }, + { + "epoch": 1.7585191417753472, + "grad_norm": 0.36068692803382874, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 2090 + }, + { + "epoch": 1.766933108960875, + "grad_norm": 0.3983287215232849, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 2100 + }, + { + "epoch": 1.775347076146403, + "grad_norm": 0.45008400082588196, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2110 + }, + { + "epoch": 1.783761043331931, + "grad_norm": 0.3618052303791046, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2120 + }, + { + "epoch": 1.792175010517459, + "grad_norm": 0.38745400309562683, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 2130 + }, + { + "epoch": 1.8005889777029869, + "grad_norm": 0.3413826525211334, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 2140 + }, + { + "epoch": 1.809002944888515, + "grad_norm": 0.35983747243881226, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 2150 + }, + { + "epoch": 1.8174169120740429, + "grad_norm": 0.40926849842071533, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 2160 + }, + { + "epoch": 1.8258308792595708, + "grad_norm": 0.3543093800544739, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2170 + }, + { + "epoch": 1.8342448464450989, + "grad_norm": 0.42690935730934143, + "learning_rate": 0.0002, + "loss": 1.7812, + "step": 2180 + }, + { + "epoch": 1.842658813630627, + "grad_norm": 0.40282756090164185, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 2190 + }, + { + "epoch": 1.8510727808161547, + "grad_norm": 0.36568400263786316, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 2200 + }, + { + "epoch": 1.8594867480016828, + "grad_norm": 0.43159013986587524, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 2210 + }, + { + "epoch": 1.867900715187211, + "grad_norm": 0.3554118573665619, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 2220 + }, + { + "epoch": 1.8763146823727388, + "grad_norm": 0.43349072337150574, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 2230 + }, + { + "epoch": 1.8847286495582667, + "grad_norm": 0.36486536264419556, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 2240 + }, + { + "epoch": 1.8931426167437948, + "grad_norm": 0.39260047674179077, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 2250 + }, + { + "epoch": 1.9015565839293227, + "grad_norm": 0.3741776943206787, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 2260 + }, + { + "epoch": 1.9099705511148506, + "grad_norm": 0.3961946964263916, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2270 + }, + { + "epoch": 1.9183845183003787, + "grad_norm": 0.3659731149673462, + "learning_rate": 0.0002, + "loss": 1.737, + "step": 2280 + }, + { + "epoch": 1.9267984854859066, + "grad_norm": 0.34744107723236084, + "learning_rate": 0.0002, + "loss": 1.7342, + "step": 2290 + }, + { + "epoch": 1.9352124526714345, + "grad_norm": 0.3607442378997803, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2300 + }, + { + "epoch": 1.9436264198569626, + "grad_norm": 0.331464558839798, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 2310 + }, + { + "epoch": 1.9520403870424905, + "grad_norm": 0.3904414474964142, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2320 + }, + { + "epoch": 1.9604543542280184, + "grad_norm": 0.37584832310676575, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 2330 + }, + { + "epoch": 1.9688683214135465, + "grad_norm": 0.3698684275150299, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 2340 + }, + { + "epoch": 1.9772822885990746, + "grad_norm": 0.40571412444114685, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2350 + }, + { + "epoch": 1.9856962557846023, + "grad_norm": 0.40059587359428406, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 2360 + }, + { + "epoch": 1.9941102229701304, + "grad_norm": 0.4168248474597931, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2370 + }, + { + "epoch": 2.0, + "eval_loss": 1.8055059909820557, + "eval_runtime": 38.422, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.692, + "step": 2377 + }, + { + "epoch": 2.0025241901556585, + "grad_norm": 0.35205352306365967, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 2380 + }, + { + "epoch": 2.010938157341186, + "grad_norm": 0.3979377746582031, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2390 + }, + { + "epoch": 2.0193521245267143, + "grad_norm": 0.396491676568985, + "learning_rate": 0.0002, + "loss": 1.6421, + "step": 2400 + }, + { + "epoch": 2.0277660917122424, + "grad_norm": 0.44712209701538086, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 2410 + }, + { + "epoch": 2.03618005889777, + "grad_norm": 0.4454420208930969, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 2420 + }, + { + "epoch": 2.044594026083298, + "grad_norm": 0.4170038402080536, + "learning_rate": 0.0002, + "loss": 1.6635, + "step": 2430 + }, + { + "epoch": 2.0530079932688263, + "grad_norm": 0.4309595227241516, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 2440 + }, + { + "epoch": 2.0614219604543544, + "grad_norm": 0.4241602122783661, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 2450 + }, + { + "epoch": 2.069835927639882, + "grad_norm": 0.4370540678501129, + "learning_rate": 0.0002, + "loss": 1.6162, + "step": 2460 + }, + { + "epoch": 2.0782498948254102, + "grad_norm": 0.43985554575920105, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 2470 + }, + { + "epoch": 2.0866638620109383, + "grad_norm": 0.4158105254173279, + "learning_rate": 0.0002, + "loss": 1.6954, + "step": 2480 + }, + { + "epoch": 2.095077829196466, + "grad_norm": 0.441549152135849, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 2490 + }, + { + "epoch": 2.103491796381994, + "grad_norm": 0.385718435049057, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 2500 + }, + { + "epoch": 2.1119057635675222, + "grad_norm": 0.43146514892578125, + "learning_rate": 0.0002, + "loss": 1.5894, + "step": 2510 + }, + { + "epoch": 2.12031973075305, + "grad_norm": 0.41663315892219543, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 2520 + }, + { + "epoch": 2.128733697938578, + "grad_norm": 0.4410698115825653, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2530 + }, + { + "epoch": 2.137147665124106, + "grad_norm": 0.4472278952598572, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 2540 + }, + { + "epoch": 2.145561632309634, + "grad_norm": 0.3879167437553406, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 2550 + }, + { + "epoch": 2.153975599495162, + "grad_norm": 0.4212203025817871, + "learning_rate": 0.0002, + "loss": 1.6682, + "step": 2560 + }, + { + "epoch": 2.16238956668069, + "grad_norm": 0.42841723561286926, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2570 + }, + { + "epoch": 2.1708035338662177, + "grad_norm": 0.39272481203079224, + "learning_rate": 0.0002, + "loss": 1.5962, + "step": 2580 + }, + { + "epoch": 2.179217501051746, + "grad_norm": 0.4075261354446411, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 2590 + }, + { + "epoch": 2.187631468237274, + "grad_norm": 0.5358437895774841, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 2600 + }, + { + "epoch": 2.1960454354228016, + "grad_norm": 0.4738350212574005, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 2610 + }, + { + "epoch": 2.2044594026083297, + "grad_norm": 0.446789026260376, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2620 + }, + { + "epoch": 2.212873369793858, + "grad_norm": 0.4615374505519867, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 2630 + }, + { + "epoch": 2.221287336979386, + "grad_norm": 0.46901994943618774, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 2640 + }, + { + "epoch": 2.2297013041649136, + "grad_norm": 0.46267789602279663, + "learning_rate": 0.0002, + "loss": 1.6774, + "step": 2650 + }, + { + "epoch": 2.2381152713504417, + "grad_norm": 0.4383080005645752, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 2660 + }, + { + "epoch": 2.24652923853597, + "grad_norm": 0.4070609509944916, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 2670 + }, + { + "epoch": 2.2549432057214975, + "grad_norm": 0.4572339951992035, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 2680 + }, + { + "epoch": 2.2633571729070256, + "grad_norm": 0.393265038728714, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 2690 + }, + { + "epoch": 2.2717711400925538, + "grad_norm": 0.46144717931747437, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 2700 + }, + { + "epoch": 2.2801851072780814, + "grad_norm": 0.45077767968177795, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 2710 + }, + { + "epoch": 2.2885990744636096, + "grad_norm": 0.5697639584541321, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 2720 + }, + { + "epoch": 2.2970130416491377, + "grad_norm": 0.4855510890483856, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 2730 + }, + { + "epoch": 2.3054270088346653, + "grad_norm": 0.4440622627735138, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 2740 + }, + { + "epoch": 2.3138409760201935, + "grad_norm": 0.3904096782207489, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 2750 + }, + { + "epoch": 2.3222549432057216, + "grad_norm": 0.5225510597229004, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2760 + }, + { + "epoch": 2.3306689103912497, + "grad_norm": 0.44866397976875305, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 2770 + }, + { + "epoch": 2.3390828775767774, + "grad_norm": 0.5167056322097778, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 2780 + }, + { + "epoch": 2.3474968447623055, + "grad_norm": 0.45913267135620117, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 2790 + }, + { + "epoch": 2.3559108119478336, + "grad_norm": 0.45787590742111206, + "learning_rate": 0.0002, + "loss": 1.6564, + "step": 2800 + }, + { + "epoch": 2.3643247791333613, + "grad_norm": 0.4633352756500244, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 2810 + }, + { + "epoch": 2.3727387463188894, + "grad_norm": 0.46390071511268616, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 2820 + }, + { + "epoch": 2.3811527135044175, + "grad_norm": 0.4261005222797394, + "learning_rate": 0.0002, + "loss": 1.6039, + "step": 2830 + }, + { + "epoch": 2.389566680689945, + "grad_norm": 0.4283634424209595, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 2840 + }, + { + "epoch": 2.3979806478754733, + "grad_norm": 0.4955291450023651, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 2850 + }, + { + "epoch": 2.4063946150610014, + "grad_norm": 0.4740189015865326, + "learning_rate": 0.0002, + "loss": 1.6173, + "step": 2860 + }, + { + "epoch": 2.414808582246529, + "grad_norm": 0.4222276508808136, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2870 + }, + { + "epoch": 2.423222549432057, + "grad_norm": 0.4982149004936218, + "learning_rate": 0.0002, + "loss": 1.5602, + "step": 2880 + }, + { + "epoch": 2.4316365166175853, + "grad_norm": 0.5217409133911133, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2890 + }, + { + "epoch": 2.4400504838031134, + "grad_norm": 0.4555884897708893, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 2900 + }, + { + "epoch": 2.448464450988641, + "grad_norm": 0.43178579211235046, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2910 + }, + { + "epoch": 2.456878418174169, + "grad_norm": 0.4788478910923004, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2920 + }, + { + "epoch": 2.465292385359697, + "grad_norm": 0.43689873814582825, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2930 + }, + { + "epoch": 2.473706352545225, + "grad_norm": 0.5115197896957397, + "learning_rate": 0.0002, + "loss": 1.6196, + "step": 2940 + }, + { + "epoch": 2.482120319730753, + "grad_norm": 0.5290159583091736, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 2950 + }, + { + "epoch": 2.490534286916281, + "grad_norm": 0.46042463183403015, + "learning_rate": 0.0002, + "loss": 1.6499, + "step": 2960 + }, + { + "epoch": 2.498948254101809, + "grad_norm": 0.4359915852546692, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 2970 + }, + { + "epoch": 2.507362221287337, + "grad_norm": 0.46352964639663696, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 2980 + }, + { + "epoch": 2.515776188472865, + "grad_norm": 0.5324268341064453, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2990 + }, + { + "epoch": 2.5241901556583928, + "grad_norm": 0.5929607152938843, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 3000 + }, + { + "epoch": 2.532604122843921, + "grad_norm": 0.4811333417892456, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 3010 + }, + { + "epoch": 2.541018090029449, + "grad_norm": 0.4662701487541199, + "learning_rate": 0.0002, + "loss": 1.7023, + "step": 3020 + }, + { + "epoch": 2.549432057214977, + "grad_norm": 0.4582270681858063, + "learning_rate": 0.0002, + "loss": 1.5426, + "step": 3030 + }, + { + "epoch": 2.557846024400505, + "grad_norm": 0.4679982662200928, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 3040 + }, + { + "epoch": 2.566259991586033, + "grad_norm": 0.4380294680595398, + "learning_rate": 0.0002, + "loss": 1.5442, + "step": 3050 + }, + { + "epoch": 2.5746739587715606, + "grad_norm": 0.44295763969421387, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 3060 + }, + { + "epoch": 2.5830879259570887, + "grad_norm": 0.5131027698516846, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3070 + }, + { + "epoch": 2.591501893142617, + "grad_norm": 0.47567516565322876, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 3080 + }, + { + "epoch": 2.599915860328145, + "grad_norm": 0.49002596735954285, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 3090 + }, + { + "epoch": 2.6083298275136726, + "grad_norm": 0.44856327772140503, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 3100 + }, + { + "epoch": 2.6167437946992007, + "grad_norm": 0.4480142593383789, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 3110 + }, + { + "epoch": 2.6251577618847284, + "grad_norm": 0.4317494034767151, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 3120 + }, + { + "epoch": 2.6335717290702565, + "grad_norm": 0.42580848932266235, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 3130 + }, + { + "epoch": 2.6419856962557846, + "grad_norm": 0.4516814947128296, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 3140 + }, + { + "epoch": 2.6503996634413127, + "grad_norm": 0.4438435733318329, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 3150 + }, + { + "epoch": 2.6588136306268404, + "grad_norm": 0.4385356307029724, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 3160 + }, + { + "epoch": 2.6672275978123685, + "grad_norm": 0.5064112544059753, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 3170 + }, + { + "epoch": 2.6756415649978966, + "grad_norm": 0.49163177609443665, + "learning_rate": 0.0002, + "loss": 1.7189, + "step": 3180 + }, + { + "epoch": 2.6840555321834243, + "grad_norm": 0.49339258670806885, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 3190 + }, + { + "epoch": 2.6924694993689524, + "grad_norm": 0.440950870513916, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 3200 + }, + { + "epoch": 2.7008834665544805, + "grad_norm": 0.4283970594406128, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 3210 + }, + { + "epoch": 2.7092974337400086, + "grad_norm": 0.43875712156295776, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 3220 + }, + { + "epoch": 2.7177114009255363, + "grad_norm": 0.49332964420318604, + "learning_rate": 0.0002, + "loss": 1.6129, + "step": 3230 + }, + { + "epoch": 2.7261253681110644, + "grad_norm": 0.5225692391395569, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 3240 + }, + { + "epoch": 2.734539335296592, + "grad_norm": 0.4856489300727844, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 3250 + }, + { + "epoch": 2.74295330248212, + "grad_norm": 0.46918296813964844, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 3260 + }, + { + "epoch": 2.7513672696676483, + "grad_norm": 0.4802931249141693, + "learning_rate": 0.0002, + "loss": 1.6819, + "step": 3270 + }, + { + "epoch": 2.7597812368531764, + "grad_norm": 0.4485355615615845, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 3280 + }, + { + "epoch": 2.768195204038704, + "grad_norm": 0.43944594264030457, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 3290 + }, + { + "epoch": 2.7766091712242322, + "grad_norm": 0.46847742795944214, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 3300 + }, + { + "epoch": 2.7850231384097603, + "grad_norm": 0.4816027879714966, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 3310 + }, + { + "epoch": 2.793437105595288, + "grad_norm": 0.453960120677948, + "learning_rate": 0.0002, + "loss": 1.6293, + "step": 3320 + }, + { + "epoch": 2.801851072780816, + "grad_norm": 0.4816017150878906, + "learning_rate": 0.0002, + "loss": 1.6429, + "step": 3330 + }, + { + "epoch": 2.8102650399663442, + "grad_norm": 0.4461034834384918, + "learning_rate": 0.0002, + "loss": 1.6683, + "step": 3340 + }, + { + "epoch": 2.8186790071518724, + "grad_norm": 0.48821821808815, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 3350 + }, + { + "epoch": 2.8270929743374, + "grad_norm": 0.4574853777885437, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3360 + }, + { + "epoch": 2.835506941522928, + "grad_norm": 0.42062026262283325, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 3370 + }, + { + "epoch": 2.843920908708456, + "grad_norm": 0.4499834477901459, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 3380 + }, + { + "epoch": 2.852334875893984, + "grad_norm": 0.4780360758304596, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 3390 + }, + { + "epoch": 2.860748843079512, + "grad_norm": 0.45422887802124023, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 3400 + }, + { + "epoch": 2.86916281026504, + "grad_norm": 0.4590015709400177, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 3410 + }, + { + "epoch": 2.877576777450568, + "grad_norm": 0.45689624547958374, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 3420 + }, + { + "epoch": 2.885990744636096, + "grad_norm": 0.46953922510147095, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3430 + }, + { + "epoch": 2.8944047118216236, + "grad_norm": 0.4791966378688812, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 3440 + }, + { + "epoch": 2.9028186790071517, + "grad_norm": 0.4842296242713928, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 3450 + }, + { + "epoch": 2.91123264619268, + "grad_norm": 0.47219768166542053, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3460 + }, + { + "epoch": 2.919646613378208, + "grad_norm": 0.4622127115726471, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 3470 + }, + { + "epoch": 2.9280605805637356, + "grad_norm": 0.46832820773124695, + "learning_rate": 0.0002, + "loss": 1.6485, + "step": 3480 + }, + { + "epoch": 2.9364745477492638, + "grad_norm": 0.44582483172416687, + "learning_rate": 0.0002, + "loss": 1.6366, + "step": 3490 + }, + { + "epoch": 2.944888514934792, + "grad_norm": 0.4987219274044037, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 3500 + }, + { + "epoch": 2.9533024821203195, + "grad_norm": 0.43750956654548645, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 3510 + }, + { + "epoch": 2.9617164493058477, + "grad_norm": 0.49962925910949707, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 3520 + }, + { + "epoch": 2.9701304164913758, + "grad_norm": 0.5189590454101562, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 3530 + }, + { + "epoch": 2.978544383676904, + "grad_norm": 0.391317754983902, + "learning_rate": 0.0002, + "loss": 1.6688, + "step": 3540 + }, + { + "epoch": 2.9869583508624316, + "grad_norm": 0.44934695959091187, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 3550 + }, + { + "epoch": 2.9953723180479597, + "grad_norm": 0.4740142226219177, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 3560 + }, + { + "epoch": 2.9995793016407237, + "eval_loss": 1.8266887664794922, + "eval_runtime": 37.9445, + "eval_samples_per_second": 13.572, + "eval_steps_per_second": 1.713, + "step": 3565 + }, + { + "epoch": 3.003786285233488, + "grad_norm": 0.4523724615573883, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 3570 + }, + { + "epoch": 3.0122002524190155, + "grad_norm": 0.5261380076408386, + "learning_rate": 0.0002, + "loss": 1.526, + "step": 3580 + }, + { + "epoch": 3.0206142196045436, + "grad_norm": 0.48664888739585876, + "learning_rate": 0.0002, + "loss": 1.4946, + "step": 3590 + }, + { + "epoch": 3.0290281867900717, + "grad_norm": 0.5070882439613342, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 3600 + }, + { + "epoch": 3.0374421539755994, + "grad_norm": 0.5816011428833008, + "learning_rate": 0.0002, + "loss": 1.5316, + "step": 3610 + }, + { + "epoch": 3.0458561211611275, + "grad_norm": 0.6610211730003357, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 3620 + }, + { + "epoch": 3.0542700883466556, + "grad_norm": 0.5257703065872192, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 3630 + }, + { + "epoch": 3.0626840555321833, + "grad_norm": 0.5574390888214111, + "learning_rate": 0.0002, + "loss": 1.4438, + "step": 3640 + }, + { + "epoch": 3.0710980227177114, + "grad_norm": 0.5682297348976135, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3650 + }, + { + "epoch": 3.0795119899032395, + "grad_norm": 0.5798383355140686, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 3660 + }, + { + "epoch": 3.087925957088767, + "grad_norm": 0.5458289980888367, + "learning_rate": 0.0002, + "loss": 1.4339, + "step": 3670 + }, + { + "epoch": 3.0963399242742953, + "grad_norm": 0.5599102973937988, + "learning_rate": 0.0002, + "loss": 1.46, + "step": 3680 + }, + { + "epoch": 3.1047538914598234, + "grad_norm": 0.5023021697998047, + "learning_rate": 0.0002, + "loss": 1.4589, + "step": 3690 + }, + { + "epoch": 3.113167858645351, + "grad_norm": 0.5448206067085266, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 3700 + }, + { + "epoch": 3.121581825830879, + "grad_norm": 0.5760458707809448, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 3710 + }, + { + "epoch": 3.1299957930164073, + "grad_norm": 0.6018968224525452, + "learning_rate": 0.0002, + "loss": 1.4789, + "step": 3720 + }, + { + "epoch": 3.1384097602019354, + "grad_norm": 0.5767101049423218, + "learning_rate": 0.0002, + "loss": 1.5518, + "step": 3730 + }, + { + "epoch": 3.146823727387463, + "grad_norm": 0.5333963632583618, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 3740 + }, + { + "epoch": 3.155237694572991, + "grad_norm": 0.5918396711349487, + "learning_rate": 0.0002, + "loss": 1.4812, + "step": 3750 + }, + { + "epoch": 3.1636516617585193, + "grad_norm": 0.5931203365325928, + "learning_rate": 0.0002, + "loss": 1.4618, + "step": 3760 + }, + { + "epoch": 3.172065628944047, + "grad_norm": 0.6562168598175049, + "learning_rate": 0.0002, + "loss": 1.5592, + "step": 3770 + }, + { + "epoch": 3.180479596129575, + "grad_norm": 0.5820156335830688, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 3780 + }, + { + "epoch": 3.188893563315103, + "grad_norm": 0.5784737467765808, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 3790 + }, + { + "epoch": 3.197307530500631, + "grad_norm": 0.5506529808044434, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 3800 + }, + { + "epoch": 3.205721497686159, + "grad_norm": 0.6101595163345337, + "learning_rate": 0.0002, + "loss": 1.4819, + "step": 3810 + }, + { + "epoch": 3.214135464871687, + "grad_norm": 0.5597806572914124, + "learning_rate": 0.0002, + "loss": 1.5185, + "step": 3820 + }, + { + "epoch": 3.222549432057215, + "grad_norm": 0.5641011595726013, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 3830 + }, + { + "epoch": 3.230963399242743, + "grad_norm": 0.5892080068588257, + "learning_rate": 0.0002, + "loss": 1.4702, + "step": 3840 + }, + { + "epoch": 3.239377366428271, + "grad_norm": 0.6034760475158691, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 3850 + }, + { + "epoch": 3.247791333613799, + "grad_norm": 0.5112439393997192, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 3860 + }, + { + "epoch": 3.256205300799327, + "grad_norm": 0.56565922498703, + "learning_rate": 0.0002, + "loss": 1.5132, + "step": 3870 + }, + { + "epoch": 3.264619267984855, + "grad_norm": 0.6155247092247009, + "learning_rate": 0.0002, + "loss": 1.4892, + "step": 3880 + }, + { + "epoch": 3.273033235170383, + "grad_norm": 0.6064623594284058, + "learning_rate": 0.0002, + "loss": 1.5118, + "step": 3890 + }, + { + "epoch": 3.2814472023559107, + "grad_norm": 0.6313768029212952, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 3900 + }, + { + "epoch": 3.289861169541439, + "grad_norm": 0.5903939008712769, + "learning_rate": 0.0002, + "loss": 1.5551, + "step": 3910 + }, + { + "epoch": 3.298275136726967, + "grad_norm": 0.5770667195320129, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 3920 + }, + { + "epoch": 3.3066891039124946, + "grad_norm": 0.5785196423530579, + "learning_rate": 0.0002, + "loss": 1.5159, + "step": 3930 + }, + { + "epoch": 3.3151030710980227, + "grad_norm": 0.6468310356140137, + "learning_rate": 0.0002, + "loss": 1.5277, + "step": 3940 + }, + { + "epoch": 3.323517038283551, + "grad_norm": 0.6200279593467712, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 3950 + }, + { + "epoch": 3.3319310054690785, + "grad_norm": 0.5779302716255188, + "learning_rate": 0.0002, + "loss": 1.5264, + "step": 3960 + }, + { + "epoch": 3.3403449726546066, + "grad_norm": 0.5463796854019165, + "learning_rate": 0.0002, + "loss": 1.4861, + "step": 3970 + }, + { + "epoch": 3.3487589398401347, + "grad_norm": 0.6117855906486511, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 3980 + }, + { + "epoch": 3.357172907025663, + "grad_norm": 0.5554766058921814, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 3990 + }, + { + "epoch": 3.3655868742111905, + "grad_norm": 0.6012870073318481, + "learning_rate": 0.0002, + "loss": 1.5004, + "step": 4000 + }, + { + "epoch": 3.3740008413967186, + "grad_norm": 0.5443974137306213, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 4010 + }, + { + "epoch": 3.3824148085822463, + "grad_norm": 0.6636057496070862, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 4020 + }, + { + "epoch": 3.3908287757677744, + "grad_norm": 0.5801246166229248, + "learning_rate": 0.0002, + "loss": 1.5141, + "step": 4030 + }, + { + "epoch": 3.3992427429533025, + "grad_norm": 0.5668839812278748, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4040 + }, + { + "epoch": 3.4076567101388306, + "grad_norm": 0.7763481736183167, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 4050 + }, + { + "epoch": 3.4160706773243583, + "grad_norm": 0.6675992608070374, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 4060 + }, + { + "epoch": 3.4244846445098864, + "grad_norm": 0.6290077567100525, + "learning_rate": 0.0002, + "loss": 1.4959, + "step": 4070 + }, + { + "epoch": 3.4328986116954145, + "grad_norm": 0.6040239930152893, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 4080 + }, + { + "epoch": 3.441312578880942, + "grad_norm": 0.6237877607345581, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 4090 + }, + { + "epoch": 3.4497265460664703, + "grad_norm": 0.5343508124351501, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4100 + }, + { + "epoch": 3.4581405132519984, + "grad_norm": 0.6817412972450256, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 4110 + }, + { + "epoch": 3.466554480437526, + "grad_norm": 0.7115170359611511, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 4120 + }, + { + "epoch": 3.4749684476230542, + "grad_norm": 0.6127332448959351, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 4130 + }, + { + "epoch": 3.4833824148085824, + "grad_norm": 0.5745994448661804, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 4140 + }, + { + "epoch": 3.49179638199411, + "grad_norm": 0.6248795390129089, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4150 + }, + { + "epoch": 3.500210349179638, + "grad_norm": 0.5821124911308289, + "learning_rate": 0.0002, + "loss": 1.4885, + "step": 4160 + }, + { + "epoch": 3.5086243163651663, + "grad_norm": 0.561416506767273, + "learning_rate": 0.0002, + "loss": 1.4937, + "step": 4170 + }, + { + "epoch": 3.5170382835506944, + "grad_norm": 0.5848962664604187, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 4180 + }, + { + "epoch": 3.525452250736222, + "grad_norm": 0.5335569977760315, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 4190 + }, + { + "epoch": 3.53386621792175, + "grad_norm": 0.547964870929718, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 4200 + }, + { + "epoch": 3.542280185107278, + "grad_norm": 0.6157727241516113, + "learning_rate": 0.0002, + "loss": 1.4887, + "step": 4210 + }, + { + "epoch": 3.550694152292806, + "grad_norm": 0.6163121461868286, + "learning_rate": 0.0002, + "loss": 1.5484, + "step": 4220 + }, + { + "epoch": 3.559108119478334, + "grad_norm": 0.5844616293907166, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 4230 + }, + { + "epoch": 3.567522086663862, + "grad_norm": 0.7104926109313965, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 4240 + }, + { + "epoch": 3.57593605384939, + "grad_norm": 0.5055213570594788, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4250 + }, + { + "epoch": 3.584350021034918, + "grad_norm": 0.611676812171936, + "learning_rate": 0.0002, + "loss": 1.482, + "step": 4260 + }, + { + "epoch": 3.592763988220446, + "grad_norm": 0.6326440572738647, + "learning_rate": 0.0002, + "loss": 1.5048, + "step": 4270 + }, + { + "epoch": 3.6011779554059737, + "grad_norm": 0.6290925741195679, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 4280 + }, + { + "epoch": 3.609591922591502, + "grad_norm": 0.5691978931427002, + "learning_rate": 0.0002, + "loss": 1.5654, + "step": 4290 + }, + { + "epoch": 3.61800588977703, + "grad_norm": 0.6071329116821289, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 4300 + }, + { + "epoch": 3.626419856962558, + "grad_norm": 0.606573224067688, + "learning_rate": 0.0002, + "loss": 1.5336, + "step": 4310 + }, + { + "epoch": 3.6348338241480858, + "grad_norm": 0.5515419244766235, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 4320 + }, + { + "epoch": 3.643247791333614, + "grad_norm": 0.5964660048484802, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 4330 + }, + { + "epoch": 3.6516617585191415, + "grad_norm": 0.5774146914482117, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 4340 + }, + { + "epoch": 3.6600757257046697, + "grad_norm": 0.5732731223106384, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 4350 + }, + { + "epoch": 3.6684896928901978, + "grad_norm": 0.7354163527488708, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 4360 + }, + { + "epoch": 3.676903660075726, + "grad_norm": 0.6220902800559998, + "learning_rate": 0.0002, + "loss": 1.5225, + "step": 4370 + }, + { + "epoch": 3.6853176272612536, + "grad_norm": 0.6053991317749023, + "learning_rate": 0.0002, + "loss": 1.4838, + "step": 4380 + }, + { + "epoch": 3.6937315944467817, + "grad_norm": 0.67010897397995, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4390 + }, + { + "epoch": 3.70214556163231, + "grad_norm": 0.6139186024665833, + "learning_rate": 0.0002, + "loss": 1.5381, + "step": 4400 + }, + { + "epoch": 3.7105595288178375, + "grad_norm": 0.5433071851730347, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 4410 + }, + { + "epoch": 3.7189734960033656, + "grad_norm": 0.5453870296478271, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 4420 + }, + { + "epoch": 3.7273874631888937, + "grad_norm": 0.6401727199554443, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 4430 + }, + { + "epoch": 3.735801430374422, + "grad_norm": 0.6049367189407349, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 4440 + }, + { + "epoch": 3.7442153975599495, + "grad_norm": 0.5740529298782349, + "learning_rate": 0.0002, + "loss": 1.5268, + "step": 4450 + }, + { + "epoch": 3.7526293647454776, + "grad_norm": 0.6521880626678467, + "learning_rate": 0.0002, + "loss": 1.5183, + "step": 4460 + }, + { + "epoch": 3.7610433319310053, + "grad_norm": 0.7096368074417114, + "learning_rate": 0.0002, + "loss": 1.5741, + "step": 4470 + }, + { + "epoch": 3.7694572991165334, + "grad_norm": 0.5886474251747131, + "learning_rate": 0.0002, + "loss": 1.5786, + "step": 4480 + }, + { + "epoch": 3.7778712663020615, + "grad_norm": 0.5821043252944946, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 4490 + }, + { + "epoch": 3.7862852334875896, + "grad_norm": 0.628892183303833, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 4500 + }, + { + "epoch": 3.7946992006731173, + "grad_norm": 0.5962669849395752, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 4510 + }, + { + "epoch": 3.8031131678586454, + "grad_norm": 0.6635549068450928, + "learning_rate": 0.0002, + "loss": 1.5267, + "step": 4520 + }, + { + "epoch": 3.811527135044173, + "grad_norm": 0.6010760068893433, + "learning_rate": 0.0002, + "loss": 1.5058, + "step": 4530 + }, + { + "epoch": 3.819941102229701, + "grad_norm": 0.6322658658027649, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 4540 + }, + { + "epoch": 3.8283550694152293, + "grad_norm": 0.5893137454986572, + "learning_rate": 0.0002, + "loss": 1.5029, + "step": 4550 + }, + { + "epoch": 3.8367690366007574, + "grad_norm": 0.7829602360725403, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 4560 + }, + { + "epoch": 3.845183003786285, + "grad_norm": 0.6190396547317505, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 4570 + }, + { + "epoch": 3.853596970971813, + "grad_norm": 0.6662813425064087, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 4580 + }, + { + "epoch": 3.8620109381573413, + "grad_norm": 0.5809855461120605, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 4590 + }, + { + "epoch": 3.870424905342869, + "grad_norm": 0.5779069662094116, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4600 + }, + { + "epoch": 3.878838872528397, + "grad_norm": 0.5603038668632507, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 4610 + }, + { + "epoch": 3.887252839713925, + "grad_norm": 0.6274181008338928, + "learning_rate": 0.0002, + "loss": 1.5372, + "step": 4620 + }, + { + "epoch": 3.8956668068994533, + "grad_norm": 0.6810959577560425, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 4630 + }, + { + "epoch": 3.904080774084981, + "grad_norm": 0.5647315979003906, + "learning_rate": 0.0002, + "loss": 1.4956, + "step": 4640 + }, + { + "epoch": 3.912494741270509, + "grad_norm": 0.6830295324325562, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 4650 + }, + { + "epoch": 3.920908708456037, + "grad_norm": 0.652565598487854, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 4660 + }, + { + "epoch": 3.929322675641565, + "grad_norm": 0.5806284546852112, + "learning_rate": 0.0002, + "loss": 1.4772, + "step": 4670 + }, + { + "epoch": 3.937736642827093, + "grad_norm": 0.6825073957443237, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 4680 + }, + { + "epoch": 3.946150610012621, + "grad_norm": 0.6149451732635498, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 4690 + }, + { + "epoch": 3.954564577198149, + "grad_norm": 0.6152557134628296, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 4700 + }, + { + "epoch": 3.962978544383677, + "grad_norm": 0.6239011883735657, + "learning_rate": 0.0002, + "loss": 1.4897, + "step": 4710 + }, + { + "epoch": 3.971392511569205, + "grad_norm": 0.6485443115234375, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 4720 + }, + { + "epoch": 3.9798064787547327, + "grad_norm": 0.6449228525161743, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 4730 + }, + { + "epoch": 3.988220445940261, + "grad_norm": 0.6526407599449158, + "learning_rate": 0.0002, + "loss": 1.5087, + "step": 4740 + }, + { + "epoch": 3.996634413125789, + "grad_norm": 0.6277706027030945, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4750 + }, + { + "epoch": 4.0, + "eval_loss": 1.871641755104065, + "eval_runtime": 37.9637, + "eval_samples_per_second": 13.566, + "eval_steps_per_second": 1.712, + "step": 4754 + }, + { + "epoch": 4.005048380311317, + "grad_norm": 0.6994837522506714, + "learning_rate": 0.0002, + "loss": 1.4744, + "step": 4760 + }, + { + "epoch": 4.013462347496845, + "grad_norm": 0.8728373050689697, + "learning_rate": 0.0002, + "loss": 1.4433, + "step": 4770 + }, + { + "epoch": 4.021876314682372, + "grad_norm": 0.688679575920105, + "learning_rate": 0.0002, + "loss": 1.3329, + "step": 4780 + }, + { + "epoch": 4.0302902818679005, + "grad_norm": 0.6313387155532837, + "learning_rate": 0.0002, + "loss": 1.3999, + "step": 4790 + }, + { + "epoch": 4.038704249053429, + "grad_norm": 0.6577984690666199, + "learning_rate": 0.0002, + "loss": 1.3346, + "step": 4800 + }, + { + "epoch": 4.047118216238957, + "grad_norm": 0.7938185930252075, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 4810 + }, + { + "epoch": 4.055532183424485, + "grad_norm": 0.760399580001831, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 4820 + }, + { + "epoch": 4.063946150610013, + "grad_norm": 0.7329602241516113, + "learning_rate": 0.0002, + "loss": 1.4321, + "step": 4830 + }, + { + "epoch": 4.07236011779554, + "grad_norm": 0.7778576016426086, + "learning_rate": 0.0002, + "loss": 1.4133, + "step": 4840 + }, + { + "epoch": 4.080774084981068, + "grad_norm": 0.8235865235328674, + "learning_rate": 0.0002, + "loss": 1.4372, + "step": 4850 + }, + { + "epoch": 4.089188052166596, + "grad_norm": 0.7743754386901855, + "learning_rate": 0.0002, + "loss": 1.3719, + "step": 4860 + }, + { + "epoch": 4.0976020193521245, + "grad_norm": 0.8145367503166199, + "learning_rate": 0.0002, + "loss": 1.3787, + "step": 4870 + }, + { + "epoch": 4.106015986537653, + "grad_norm": 0.8517307639122009, + "learning_rate": 0.0002, + "loss": 1.356, + "step": 4880 + }, + { + "epoch": 4.114429953723181, + "grad_norm": 0.8208953142166138, + "learning_rate": 0.0002, + "loss": 1.4191, + "step": 4890 + }, + { + "epoch": 4.122843920908709, + "grad_norm": 0.8437790870666504, + "learning_rate": 0.0002, + "loss": 1.3189, + "step": 4900 + }, + { + "epoch": 4.131257888094236, + "grad_norm": 0.716672420501709, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 4910 + }, + { + "epoch": 4.139671855279764, + "grad_norm": 0.7656235098838806, + "learning_rate": 0.0002, + "loss": 1.4392, + "step": 4920 + }, + { + "epoch": 4.148085822465292, + "grad_norm": 0.7209306955337524, + "learning_rate": 0.0002, + "loss": 1.3408, + "step": 4930 + }, + { + "epoch": 4.1564997896508205, + "grad_norm": 0.7731267809867859, + "learning_rate": 0.0002, + "loss": 1.3639, + "step": 4940 + }, + { + "epoch": 4.164913756836349, + "grad_norm": 0.7477553486824036, + "learning_rate": 0.0002, + "loss": 1.4151, + "step": 4950 + }, + { + "epoch": 4.173327724021877, + "grad_norm": 0.7372981309890747, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 4960 + }, + { + "epoch": 4.181741691207404, + "grad_norm": 0.6582154035568237, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 4970 + }, + { + "epoch": 4.190155658392932, + "grad_norm": 0.7003206610679626, + "learning_rate": 0.0002, + "loss": 1.3343, + "step": 4980 + }, + { + "epoch": 4.19856962557846, + "grad_norm": 0.735223650932312, + "learning_rate": 0.0002, + "loss": 1.4098, + "step": 4990 + }, + { + "epoch": 4.206983592763988, + "grad_norm": 0.7832302451133728, + "learning_rate": 0.0002, + "loss": 1.3564, + "step": 5000 + }, + { + "epoch": 4.215397559949516, + "grad_norm": 0.8819546103477478, + "learning_rate": 0.0002, + "loss": 1.3622, + "step": 5010 + }, + { + "epoch": 4.2238115271350445, + "grad_norm": 0.9325336813926697, + "learning_rate": 0.0002, + "loss": 1.4438, + "step": 5020 + }, + { + "epoch": 4.232225494320572, + "grad_norm": 0.7007517218589783, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 5030 + }, + { + "epoch": 4.2406394615061, + "grad_norm": 0.7118321061134338, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 5040 + }, + { + "epoch": 4.249053428691628, + "grad_norm": 0.6578946709632874, + "learning_rate": 0.0002, + "loss": 1.2365, + "step": 5050 + }, + { + "epoch": 4.257467395877156, + "grad_norm": 0.9438983798027039, + "learning_rate": 0.0002, + "loss": 1.3696, + "step": 5060 + }, + { + "epoch": 4.265881363062684, + "grad_norm": 0.703037679195404, + "learning_rate": 0.0002, + "loss": 1.3868, + "step": 5070 + }, + { + "epoch": 4.274295330248212, + "grad_norm": 0.7286025285720825, + "learning_rate": 0.0002, + "loss": 1.3687, + "step": 5080 + }, + { + "epoch": 4.28270929743374, + "grad_norm": 0.750689685344696, + "learning_rate": 0.0002, + "loss": 1.3605, + "step": 5090 + }, + { + "epoch": 4.291123264619268, + "grad_norm": 0.869753360748291, + "learning_rate": 0.0002, + "loss": 1.5089, + "step": 5100 + }, + { + "epoch": 4.299537231804796, + "grad_norm": 0.8712980151176453, + "learning_rate": 0.0002, + "loss": 1.4128, + "step": 5110 + }, + { + "epoch": 4.307951198990324, + "grad_norm": 0.690263569355011, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 5120 + }, + { + "epoch": 4.316365166175852, + "grad_norm": 0.7114760279655457, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 5130 + }, + { + "epoch": 4.32477913336138, + "grad_norm": 0.7588112354278564, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 5140 + }, + { + "epoch": 4.333193100546908, + "grad_norm": 0.7556202411651611, + "learning_rate": 0.0002, + "loss": 1.4408, + "step": 5150 + }, + { + "epoch": 4.341607067732435, + "grad_norm": 0.8357610702514648, + "learning_rate": 0.0002, + "loss": 1.4203, + "step": 5160 + }, + { + "epoch": 4.3500210349179635, + "grad_norm": 0.8054035902023315, + "learning_rate": 0.0002, + "loss": 1.3348, + "step": 5170 + }, + { + "epoch": 4.358435002103492, + "grad_norm": 0.7637107968330383, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 5180 + }, + { + "epoch": 4.36684896928902, + "grad_norm": 0.757481038570404, + "learning_rate": 0.0002, + "loss": 1.3744, + "step": 5190 + }, + { + "epoch": 4.375262936474548, + "grad_norm": 0.7185863852500916, + "learning_rate": 0.0002, + "loss": 1.3622, + "step": 5200 + }, + { + "epoch": 4.383676903660076, + "grad_norm": 0.7326455116271973, + "learning_rate": 0.0002, + "loss": 1.3896, + "step": 5210 + }, + { + "epoch": 4.392090870845603, + "grad_norm": 0.7980523109436035, + "learning_rate": 0.0002, + "loss": 1.4098, + "step": 5220 + }, + { + "epoch": 4.400504838031131, + "grad_norm": 0.8526999354362488, + "learning_rate": 0.0002, + "loss": 1.3783, + "step": 5230 + }, + { + "epoch": 4.4089188052166595, + "grad_norm": 0.7012337446212769, + "learning_rate": 0.0002, + "loss": 1.4022, + "step": 5240 + }, + { + "epoch": 4.417332772402188, + "grad_norm": 0.8217827677726746, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 5250 + }, + { + "epoch": 4.425746739587716, + "grad_norm": 0.7141005396842957, + "learning_rate": 0.0002, + "loss": 1.3482, + "step": 5260 + }, + { + "epoch": 4.434160706773244, + "grad_norm": 0.7094302177429199, + "learning_rate": 0.0002, + "loss": 1.3699, + "step": 5270 + }, + { + "epoch": 4.442574673958772, + "grad_norm": 0.7234613299369812, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 5280 + }, + { + "epoch": 4.450988641144299, + "grad_norm": 0.7530457973480225, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 5290 + }, + { + "epoch": 4.459402608329827, + "grad_norm": 0.7300912141799927, + "learning_rate": 0.0002, + "loss": 1.3944, + "step": 5300 + }, + { + "epoch": 4.467816575515355, + "grad_norm": 0.825443685054779, + "learning_rate": 0.0002, + "loss": 1.3844, + "step": 5310 + }, + { + "epoch": 4.4762305427008835, + "grad_norm": 0.7559658885002136, + "learning_rate": 0.0002, + "loss": 1.3648, + "step": 5320 + }, + { + "epoch": 4.484644509886412, + "grad_norm": 0.8817561268806458, + "learning_rate": 0.0002, + "loss": 1.4364, + "step": 5330 + }, + { + "epoch": 4.49305847707194, + "grad_norm": 0.8203575611114502, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 5340 + }, + { + "epoch": 4.501472444257468, + "grad_norm": 0.7677690982818604, + "learning_rate": 0.0002, + "loss": 1.3996, + "step": 5350 + }, + { + "epoch": 4.509886411442995, + "grad_norm": 0.657085120677948, + "learning_rate": 0.0002, + "loss": 1.4142, + "step": 5360 + }, + { + "epoch": 4.518300378628523, + "grad_norm": 0.7939504384994507, + "learning_rate": 0.0002, + "loss": 1.3722, + "step": 5370 + }, + { + "epoch": 4.526714345814051, + "grad_norm": 0.6971889138221741, + "learning_rate": 0.0002, + "loss": 1.4361, + "step": 5380 + }, + { + "epoch": 4.535128312999579, + "grad_norm": 0.6984175443649292, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 5390 + }, + { + "epoch": 4.5435422801851075, + "grad_norm": 0.8504858613014221, + "learning_rate": 0.0002, + "loss": 1.341, + "step": 5400 + }, + { + "epoch": 4.551956247370635, + "grad_norm": 0.9134073853492737, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 5410 + }, + { + "epoch": 4.560370214556163, + "grad_norm": 0.7765598893165588, + "learning_rate": 0.0002, + "loss": 1.4375, + "step": 5420 + }, + { + "epoch": 4.568784181741691, + "grad_norm": 0.6991009712219238, + "learning_rate": 0.0002, + "loss": 1.4832, + "step": 5430 + }, + { + "epoch": 4.577198148927219, + "grad_norm": 0.8393039107322693, + "learning_rate": 0.0002, + "loss": 1.4021, + "step": 5440 + }, + { + "epoch": 4.585612116112747, + "grad_norm": 0.7685918211936951, + "learning_rate": 0.0002, + "loss": 1.3976, + "step": 5450 + }, + { + "epoch": 4.594026083298275, + "grad_norm": 0.7135679721832275, + "learning_rate": 0.0002, + "loss": 1.3883, + "step": 5460 + }, + { + "epoch": 4.6024400504838034, + "grad_norm": 0.6728870868682861, + "learning_rate": 0.0002, + "loss": 1.4083, + "step": 5470 + }, + { + "epoch": 4.610854017669331, + "grad_norm": 0.7139479517936707, + "learning_rate": 0.0002, + "loss": 1.3698, + "step": 5480 + }, + { + "epoch": 4.619267984854859, + "grad_norm": 0.8476598858833313, + "learning_rate": 0.0002, + "loss": 1.3498, + "step": 5490 + }, + { + "epoch": 4.627681952040387, + "grad_norm": 0.8034361004829407, + "learning_rate": 0.0002, + "loss": 1.3389, + "step": 5500 + }, + { + "epoch": 4.636095919225915, + "grad_norm": 0.7452183961868286, + "learning_rate": 0.0002, + "loss": 1.4179, + "step": 5510 + }, + { + "epoch": 4.644509886411443, + "grad_norm": 0.8394148945808411, + "learning_rate": 0.0002, + "loss": 1.4031, + "step": 5520 + }, + { + "epoch": 4.652923853596971, + "grad_norm": 0.7480153441429138, + "learning_rate": 0.0002, + "loss": 1.4561, + "step": 5530 + }, + { + "epoch": 4.661337820782499, + "grad_norm": 0.7781714797019958, + "learning_rate": 0.0002, + "loss": 1.378, + "step": 5540 + }, + { + "epoch": 4.669751787968027, + "grad_norm": 1.0058213472366333, + "learning_rate": 0.0002, + "loss": 1.3924, + "step": 5550 + }, + { + "epoch": 4.678165755153555, + "grad_norm": 0.7403179407119751, + "learning_rate": 0.0002, + "loss": 1.4198, + "step": 5560 + }, + { + "epoch": 4.686579722339083, + "grad_norm": 0.7270476818084717, + "learning_rate": 0.0002, + "loss": 1.4328, + "step": 5570 + }, + { + "epoch": 4.694993689524611, + "grad_norm": 0.760877788066864, + "learning_rate": 0.0002, + "loss": 1.378, + "step": 5580 + }, + { + "epoch": 4.703407656710139, + "grad_norm": 0.8097004890441895, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 5590 + }, + { + "epoch": 4.711821623895667, + "grad_norm": 0.9096523523330688, + "learning_rate": 0.0002, + "loss": 1.3661, + "step": 5600 + }, + { + "epoch": 4.720235591081195, + "grad_norm": 0.7262444496154785, + "learning_rate": 0.0002, + "loss": 1.4012, + "step": 5610 + }, + { + "epoch": 4.7286495582667225, + "grad_norm": 0.8207762837409973, + "learning_rate": 0.0002, + "loss": 1.422, + "step": 5620 + }, + { + "epoch": 4.737063525452251, + "grad_norm": 0.8089601993560791, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 5630 + }, + { + "epoch": 4.745477492637779, + "grad_norm": 0.7609543800354004, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 5640 + }, + { + "epoch": 4.753891459823307, + "grad_norm": 0.7273501753807068, + "learning_rate": 0.0002, + "loss": 1.4085, + "step": 5650 + }, + { + "epoch": 4.762305427008835, + "grad_norm": 0.7800219058990479, + "learning_rate": 0.0002, + "loss": 1.3849, + "step": 5660 + }, + { + "epoch": 4.770719394194362, + "grad_norm": 0.8558377623558044, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 5670 + }, + { + "epoch": 4.77913336137989, + "grad_norm": 0.7131547927856445, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 5680 + }, + { + "epoch": 4.787547328565418, + "grad_norm": 0.7651025056838989, + "learning_rate": 0.0002, + "loss": 1.407, + "step": 5690 + }, + { + "epoch": 4.7959612957509465, + "grad_norm": 0.8129976391792297, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 5700 + }, + { + "epoch": 4.804375262936475, + "grad_norm": 0.8019895553588867, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 5710 + }, + { + "epoch": 4.812789230122003, + "grad_norm": 0.7692018151283264, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 5720 + }, + { + "epoch": 4.821203197307531, + "grad_norm": 0.6893943548202515, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 5730 + }, + { + "epoch": 4.829617164493058, + "grad_norm": 0.6881810426712036, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 5740 + }, + { + "epoch": 4.838031131678586, + "grad_norm": 0.7838267683982849, + "learning_rate": 0.0002, + "loss": 1.4775, + "step": 5750 + }, + { + "epoch": 4.846445098864114, + "grad_norm": 0.727799117565155, + "learning_rate": 0.0002, + "loss": 1.3857, + "step": 5760 + }, + { + "epoch": 4.8548590660496425, + "grad_norm": 0.7458277344703674, + "learning_rate": 0.0002, + "loss": 1.4685, + "step": 5770 + }, + { + "epoch": 4.863273033235171, + "grad_norm": 0.903802216053009, + "learning_rate": 0.0002, + "loss": 1.4426, + "step": 5780 + }, + { + "epoch": 4.871687000420699, + "grad_norm": 0.7983472347259521, + "learning_rate": 0.0002, + "loss": 1.451, + "step": 5790 + }, + { + "epoch": 4.880100967606227, + "grad_norm": 0.6894361972808838, + "learning_rate": 0.0002, + "loss": 1.4534, + "step": 5800 + }, + { + "epoch": 4.888514934791754, + "grad_norm": 0.7499409317970276, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 5810 + }, + { + "epoch": 4.896928901977282, + "grad_norm": 0.7362820506095886, + "learning_rate": 0.0002, + "loss": 1.4253, + "step": 5820 + }, + { + "epoch": 4.90534286916281, + "grad_norm": 0.8341619968414307, + "learning_rate": 0.0002, + "loss": 1.3763, + "step": 5830 + }, + { + "epoch": 4.913756836348338, + "grad_norm": 0.9604470133781433, + "learning_rate": 0.0002, + "loss": 1.3748, + "step": 5840 + }, + { + "epoch": 4.9221708035338665, + "grad_norm": 0.8916844129562378, + "learning_rate": 0.0002, + "loss": 1.3658, + "step": 5850 + }, + { + "epoch": 4.930584770719394, + "grad_norm": 0.8519647121429443, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 5860 + }, + { + "epoch": 4.938998737904922, + "grad_norm": 0.7946906089782715, + "learning_rate": 0.0002, + "loss": 1.424, + "step": 5870 + }, + { + "epoch": 4.94741270509045, + "grad_norm": 0.7843789458274841, + "learning_rate": 0.0002, + "loss": 1.4071, + "step": 5880 + }, + { + "epoch": 4.955826672275978, + "grad_norm": 0.707618772983551, + "learning_rate": 0.0002, + "loss": 1.4021, + "step": 5890 + }, + { + "epoch": 4.964240639461506, + "grad_norm": 0.7704206109046936, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 5900 + }, + { + "epoch": 4.972654606647034, + "grad_norm": 0.7160256505012512, + "learning_rate": 0.0002, + "loss": 1.4456, + "step": 5910 + }, + { + "epoch": 4.981068573832562, + "grad_norm": 0.7020420432090759, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 5920 + }, + { + "epoch": 4.98948254101809, + "grad_norm": 0.7576286792755127, + "learning_rate": 0.0002, + "loss": 1.4037, + "step": 5930 + }, + { + "epoch": 4.997896508203618, + "grad_norm": 0.8573036789894104, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 5940 + }, + { + "epoch": 4.999579301640724, + "eval_loss": 1.9353811740875244, + "eval_runtime": 37.9208, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 5942 + } + ], + "logging_steps": 10, + "max_steps": 9504, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.7500559420358656e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1502d478cfbb1424f707352d007b740bde5e373 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-5942/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df2b79d3acefeedef5a0229881de39ec68ef9b40046a60d7976a49f7e6b3b936 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ddc54a30306eb450d8a0bab8b59976c957d1e6d6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b550c6fb37ebab4c946fa28d7dd8ef81f3e06eb6addb32229bae94c960e2946c +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1156b68bf74a1e6a384134df8aae6de37a85baa9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c76177c28be06b4676d4cfc29c9d51e132c88d59f9f1f18de61ef1d0ad426488 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..49c6aa9c6911c6e0207230c9932fdff6528b8862 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a097e50fb002ecd808587da0d513b7600feddbfe54140d1afd35f54d81ae974 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..35f4beb5ec1490e09e1f60d088fa916c080a994f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14292654b4b970cbe29282170061263fd878014175c090c19c21eb65c6808883 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7499bebc8e24b0b6efe3636f4964f8d0c1e4c43c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/trainer_state.json @@ -0,0 +1,5072 @@ +{ + "best_metric": 1.8055059909820557, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 7131, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008413967185527976, + "grad_norm": 0.5458821654319763, + "learning_rate": 0.0002, + "loss": 2.56, + "step": 10 + }, + { + "epoch": 0.016827934371055953, + "grad_norm": 0.7293308973312378, + "learning_rate": 0.0002, + "loss": 2.3235, + "step": 20 + }, + { + "epoch": 0.02524190155658393, + "grad_norm": 0.47792306542396545, + "learning_rate": 0.0002, + "loss": 2.0815, + "step": 30 + }, + { + "epoch": 0.033655868742111905, + "grad_norm": 0.5944402813911438, + "learning_rate": 0.0002, + "loss": 1.9718, + "step": 40 + }, + { + "epoch": 0.04206983592763988, + "grad_norm": 0.5415359735488892, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 50 + }, + { + "epoch": 0.05048380311316786, + "grad_norm": 0.535713791847229, + "learning_rate": 0.0002, + "loss": 1.8953, + "step": 60 + }, + { + "epoch": 0.058897770298695834, + "grad_norm": 0.5184146761894226, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 70 + }, + { + "epoch": 0.06731173748422381, + "grad_norm": 0.458926796913147, + "learning_rate": 0.0002, + "loss": 1.8396, + "step": 80 + }, + { + "epoch": 0.07572570466975179, + "grad_norm": 0.4780142307281494, + "learning_rate": 0.0002, + "loss": 1.8677, + "step": 90 + }, + { + "epoch": 0.08413967185527976, + "grad_norm": 0.79965740442276, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 100 + }, + { + "epoch": 0.09255363904080774, + "grad_norm": 0.4498862028121948, + "learning_rate": 0.0002, + "loss": 1.9081, + "step": 110 + }, + { + "epoch": 0.10096760622633572, + "grad_norm": 0.39338430762290955, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 120 + }, + { + "epoch": 0.10938157341186369, + "grad_norm": 0.9588953852653503, + "learning_rate": 0.0002, + "loss": 1.8637, + "step": 130 + }, + { + "epoch": 0.11779554059739167, + "grad_norm": 0.41675639152526855, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 140 + }, + { + "epoch": 0.12620950778291964, + "grad_norm": 0.44519832730293274, + "learning_rate": 0.0002, + "loss": 1.8904, + "step": 150 + }, + { + "epoch": 0.13462347496844762, + "grad_norm": 0.4176260530948639, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 160 + }, + { + "epoch": 0.1430374421539756, + "grad_norm": 0.35840365290641785, + "learning_rate": 0.0002, + "loss": 1.8398, + "step": 170 + }, + { + "epoch": 0.15145140933950357, + "grad_norm": 0.3794495463371277, + "learning_rate": 0.0002, + "loss": 1.8666, + "step": 180 + }, + { + "epoch": 0.15986537652503155, + "grad_norm": 0.4563522934913635, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 190 + }, + { + "epoch": 0.16827934371055953, + "grad_norm": 0.37057486176490784, + "learning_rate": 0.0002, + "loss": 1.8893, + "step": 200 + }, + { + "epoch": 0.1766933108960875, + "grad_norm": 0.44081518054008484, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 210 + }, + { + "epoch": 0.18510727808161548, + "grad_norm": 0.46078577637672424, + "learning_rate": 0.0002, + "loss": 1.9048, + "step": 220 + }, + { + "epoch": 0.19352124526714345, + "grad_norm": 0.36132094264030457, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 230 + }, + { + "epoch": 0.20193521245267143, + "grad_norm": 0.3747289180755615, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 240 + }, + { + "epoch": 0.2103491796381994, + "grad_norm": 0.3540179133415222, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 250 + }, + { + "epoch": 0.21876314682372738, + "grad_norm": 0.3461375832557678, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 260 + }, + { + "epoch": 0.22717711400925536, + "grad_norm": 0.3436960279941559, + "learning_rate": 0.0002, + "loss": 1.8509, + "step": 270 + }, + { + "epoch": 0.23559108119478334, + "grad_norm": 0.35403719544410706, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 280 + }, + { + "epoch": 0.2440050483803113, + "grad_norm": 0.37142616510391235, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 290 + }, + { + "epoch": 0.2524190155658393, + "grad_norm": 0.3307955861091614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 300 + }, + { + "epoch": 0.2608329827513673, + "grad_norm": 0.32855314016342163, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 310 + }, + { + "epoch": 0.26924694993689524, + "grad_norm": 0.3299003839492798, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 320 + }, + { + "epoch": 0.27766091712242325, + "grad_norm": 0.44311287999153137, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 330 + }, + { + "epoch": 0.2860748843079512, + "grad_norm": 0.32989758253097534, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 340 + }, + { + "epoch": 0.2944888514934792, + "grad_norm": 0.34400200843811035, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 350 + }, + { + "epoch": 0.30290281867900715, + "grad_norm": 0.36286211013793945, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 360 + }, + { + "epoch": 0.31131678586453515, + "grad_norm": 0.406827837228775, + "learning_rate": 0.0002, + "loss": 1.8025, + "step": 370 + }, + { + "epoch": 0.3197307530500631, + "grad_norm": 0.36299195885658264, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 380 + }, + { + "epoch": 0.3281447202355911, + "grad_norm": 0.3477257192134857, + "learning_rate": 0.0002, + "loss": 1.837, + "step": 390 + }, + { + "epoch": 0.33655868742111905, + "grad_norm": 0.3730369210243225, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 400 + }, + { + "epoch": 0.34497265460664706, + "grad_norm": 0.4644559919834137, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 410 + }, + { + "epoch": 0.353386621792175, + "grad_norm": 0.406576544046402, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 420 + }, + { + "epoch": 0.361800588977703, + "grad_norm": 0.3612699508666992, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 430 + }, + { + "epoch": 0.37021455616323096, + "grad_norm": 0.3243742287158966, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 440 + }, + { + "epoch": 0.37862852334875896, + "grad_norm": 0.36671221256256104, + "learning_rate": 0.0002, + "loss": 1.8851, + "step": 450 + }, + { + "epoch": 0.3870424905342869, + "grad_norm": 0.3565002381801605, + "learning_rate": 0.0002, + "loss": 1.8853, + "step": 460 + }, + { + "epoch": 0.3954564577198149, + "grad_norm": 0.34630221128463745, + "learning_rate": 0.0002, + "loss": 1.8923, + "step": 470 + }, + { + "epoch": 0.40387042490534286, + "grad_norm": 0.3353537321090698, + "learning_rate": 0.0002, + "loss": 1.8234, + "step": 480 + }, + { + "epoch": 0.41228439209087087, + "grad_norm": 0.4015921950340271, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 490 + }, + { + "epoch": 0.4206983592763988, + "grad_norm": 0.5489419102668762, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 500 + }, + { + "epoch": 0.4291123264619268, + "grad_norm": 0.4193589985370636, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 510 + }, + { + "epoch": 0.43752629364745477, + "grad_norm": 0.3418922424316406, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 520 + }, + { + "epoch": 0.44594026083298277, + "grad_norm": 0.32668185234069824, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 530 + }, + { + "epoch": 0.4543542280185107, + "grad_norm": 0.3094325661659241, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 540 + }, + { + "epoch": 0.4627681952040387, + "grad_norm": 0.3743017315864563, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 550 + }, + { + "epoch": 0.47118216238956667, + "grad_norm": 0.3295630216598511, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 560 + }, + { + "epoch": 0.4795961295750947, + "grad_norm": 1.6124513149261475, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 570 + }, + { + "epoch": 0.4880100967606226, + "grad_norm": 0.3245585858821869, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 580 + }, + { + "epoch": 0.49642406394615063, + "grad_norm": 0.3332934081554413, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 590 + }, + { + "epoch": 0.5048380311316786, + "grad_norm": 0.3836138844490051, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 600 + }, + { + "epoch": 0.5132519983172066, + "grad_norm": 0.32953888177871704, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 610 + }, + { + "epoch": 0.5216659655027346, + "grad_norm": 0.36291512846946716, + "learning_rate": 0.0002, + "loss": 1.7729, + "step": 620 + }, + { + "epoch": 0.5300799326882625, + "grad_norm": 0.3237783908843994, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 630 + }, + { + "epoch": 0.5384938998737905, + "grad_norm": 0.38882696628570557, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 640 + }, + { + "epoch": 0.5469078670593185, + "grad_norm": 0.37821972370147705, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 650 + }, + { + "epoch": 0.5553218342448465, + "grad_norm": 0.3556285500526428, + "learning_rate": 0.0002, + "loss": 1.8075, + "step": 660 + }, + { + "epoch": 0.5637358014303744, + "grad_norm": 0.347499281167984, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 670 + }, + { + "epoch": 0.5721497686159024, + "grad_norm": 0.3176489472389221, + "learning_rate": 0.0002, + "loss": 1.8066, + "step": 680 + }, + { + "epoch": 0.5805637358014304, + "grad_norm": 0.30220088362693787, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 690 + }, + { + "epoch": 0.5889777029869584, + "grad_norm": 0.3711601793766022, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 700 + }, + { + "epoch": 0.5973916701724863, + "grad_norm": 0.3311759829521179, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 710 + }, + { + "epoch": 0.6058056373580143, + "grad_norm": 0.34824270009994507, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 720 + }, + { + "epoch": 0.6142196045435423, + "grad_norm": 0.29668381810188293, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 730 + }, + { + "epoch": 0.6226335717290703, + "grad_norm": 0.36087489128112793, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 740 + }, + { + "epoch": 0.6310475389145982, + "grad_norm": 0.31590089201927185, + "learning_rate": 0.0002, + "loss": 1.7956, + "step": 750 + }, + { + "epoch": 0.6394615061001262, + "grad_norm": 0.37632957100868225, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 760 + }, + { + "epoch": 0.6478754732856542, + "grad_norm": 0.3360748589038849, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 770 + }, + { + "epoch": 0.6562894404711822, + "grad_norm": 0.3420640528202057, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 780 + }, + { + "epoch": 0.6647034076567101, + "grad_norm": 0.5734959244728088, + "learning_rate": 0.0002, + "loss": 1.8353, + "step": 790 + }, + { + "epoch": 0.6731173748422381, + "grad_norm": 0.36440837383270264, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 800 + }, + { + "epoch": 0.6815313420277661, + "grad_norm": 0.3179708421230316, + "learning_rate": 0.0002, + "loss": 1.7532, + "step": 810 + }, + { + "epoch": 0.6899453092132941, + "grad_norm": 0.34122881293296814, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 820 + }, + { + "epoch": 0.698359276398822, + "grad_norm": 0.31886112689971924, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 830 + }, + { + "epoch": 0.70677324358435, + "grad_norm": 0.31782326102256775, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 840 + }, + { + "epoch": 0.715187210769878, + "grad_norm": 0.36052989959716797, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 850 + }, + { + "epoch": 0.723601177955406, + "grad_norm": 0.28946155309677124, + "learning_rate": 0.0002, + "loss": 1.7891, + "step": 860 + }, + { + "epoch": 0.7320151451409339, + "grad_norm": 0.3095663785934448, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 870 + }, + { + "epoch": 0.7404291123264619, + "grad_norm": 0.3317491412162781, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 880 + }, + { + "epoch": 0.7488430795119899, + "grad_norm": 0.31324660778045654, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 890 + }, + { + "epoch": 0.7572570466975179, + "grad_norm": 0.3290475606918335, + "learning_rate": 0.0002, + "loss": 1.8753, + "step": 900 + }, + { + "epoch": 0.7656710138830458, + "grad_norm": 0.35690343379974365, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 910 + }, + { + "epoch": 0.7740849810685738, + "grad_norm": 0.39558273553848267, + "learning_rate": 0.0002, + "loss": 1.826, + "step": 920 + }, + { + "epoch": 0.7824989482541018, + "grad_norm": 0.34254348278045654, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 930 + }, + { + "epoch": 0.7909129154396298, + "grad_norm": 0.3560165464878082, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 940 + }, + { + "epoch": 0.7993268826251577, + "grad_norm": 0.30693164467811584, + "learning_rate": 0.0002, + "loss": 1.7992, + "step": 950 + }, + { + "epoch": 0.8077408498106857, + "grad_norm": 0.3394823372364044, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 960 + }, + { + "epoch": 0.8161548169962137, + "grad_norm": 0.3741514980792999, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 970 + }, + { + "epoch": 0.8245687841817417, + "grad_norm": 0.3655228316783905, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 980 + }, + { + "epoch": 0.8329827513672696, + "grad_norm": 0.3586033880710602, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 990 + }, + { + "epoch": 0.8413967185527976, + "grad_norm": 0.3459678888320923, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1000 + }, + { + "epoch": 0.8498106857383256, + "grad_norm": 0.3184349834918976, + "learning_rate": 0.0002, + "loss": 1.8498, + "step": 1010 + }, + { + "epoch": 0.8582246529238536, + "grad_norm": 0.3099786043167114, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 1020 + }, + { + "epoch": 0.8666386201093815, + "grad_norm": 0.30300915241241455, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 1030 + }, + { + "epoch": 0.8750525872949095, + "grad_norm": 0.3128705620765686, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 1040 + }, + { + "epoch": 0.8834665544804375, + "grad_norm": 0.3336263597011566, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1050 + }, + { + "epoch": 0.8918805216659655, + "grad_norm": 0.3801328241825104, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 1060 + }, + { + "epoch": 0.9002944888514934, + "grad_norm": 0.3122096359729767, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 1070 + }, + { + "epoch": 0.9087084560370214, + "grad_norm": 0.35990869998931885, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 1080 + }, + { + "epoch": 0.9171224232225494, + "grad_norm": 0.3321819305419922, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1090 + }, + { + "epoch": 0.9255363904080774, + "grad_norm": 0.4202139377593994, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1100 + }, + { + "epoch": 0.9339503575936053, + "grad_norm": 0.32559722661972046, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 1110 + }, + { + "epoch": 0.9423643247791333, + "grad_norm": 0.3098459839820862, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 1120 + }, + { + "epoch": 0.9507782919646613, + "grad_norm": 0.33917108178138733, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1130 + }, + { + "epoch": 0.9591922591501894, + "grad_norm": 0.4055837094783783, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1140 + }, + { + "epoch": 0.9676062263357172, + "grad_norm": 0.32508623600006104, + "learning_rate": 0.0002, + "loss": 1.8259, + "step": 1150 + }, + { + "epoch": 0.9760201935212452, + "grad_norm": 0.30150601267814636, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9844341607067733, + "grad_norm": 0.3042563199996948, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 1170 + }, + { + "epoch": 0.9928481278923013, + "grad_norm": 0.33254584670066833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1180 + }, + { + "epoch": 0.9995793016407236, + "eval_loss": 1.8077726364135742, + "eval_runtime": 38.4359, + "eval_samples_per_second": 13.399, + "eval_steps_per_second": 1.691, + "step": 1188 + }, + { + "epoch": 1.0012620950778293, + "grad_norm": 0.35073035955429077, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 1190 + }, + { + "epoch": 1.0096760622633572, + "grad_norm": 0.3217269778251648, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1200 + }, + { + "epoch": 1.018090029448885, + "grad_norm": 0.3635033369064331, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1210 + }, + { + "epoch": 1.0265039966344132, + "grad_norm": 0.32468414306640625, + "learning_rate": 0.0002, + "loss": 1.6949, + "step": 1220 + }, + { + "epoch": 1.034917963819941, + "grad_norm": 0.3307163417339325, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 1230 + }, + { + "epoch": 1.0433319310054692, + "grad_norm": 0.34381359815597534, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 1240 + }, + { + "epoch": 1.051745898190997, + "grad_norm": 0.35874804854393005, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 1250 + }, + { + "epoch": 1.060159865376525, + "grad_norm": 0.3615919351577759, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1260 + }, + { + "epoch": 1.068573832562053, + "grad_norm": 0.32835808396339417, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1270 + }, + { + "epoch": 1.076987799747581, + "grad_norm": 0.3876388370990753, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1280 + }, + { + "epoch": 1.0854017669331089, + "grad_norm": 0.39895930886268616, + "learning_rate": 0.0002, + "loss": 1.7442, + "step": 1290 + }, + { + "epoch": 1.093815734118637, + "grad_norm": 0.39081698656082153, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 1300 + }, + { + "epoch": 1.1022297013041649, + "grad_norm": 0.39974215626716614, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1310 + }, + { + "epoch": 1.110643668489693, + "grad_norm": 0.3887332081794739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1320 + }, + { + "epoch": 1.1190576356752209, + "grad_norm": 0.36216408014297485, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1330 + }, + { + "epoch": 1.1274716028607488, + "grad_norm": 0.36979028582572937, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1340 + }, + { + "epoch": 1.1358855700462769, + "grad_norm": 0.34052133560180664, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 1350 + }, + { + "epoch": 1.1442995372318048, + "grad_norm": 0.3467716574668884, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1360 + }, + { + "epoch": 1.1527135044173327, + "grad_norm": 0.35528799891471863, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1370 + }, + { + "epoch": 1.1611274716028608, + "grad_norm": 0.36282262206077576, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 1380 + }, + { + "epoch": 1.1695414387883887, + "grad_norm": 0.37355899810791016, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1390 + }, + { + "epoch": 1.1779554059739168, + "grad_norm": 0.37292736768722534, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1400 + }, + { + "epoch": 1.1863693731594447, + "grad_norm": 0.5892812013626099, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1410 + }, + { + "epoch": 1.1947833403449726, + "grad_norm": 0.3712292015552521, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 1420 + }, + { + "epoch": 1.2031973075305007, + "grad_norm": 0.3349577486515045, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1430 + }, + { + "epoch": 1.2116112747160286, + "grad_norm": 0.32591062784194946, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1440 + }, + { + "epoch": 1.2200252419015567, + "grad_norm": 0.3840635418891907, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 1450 + }, + { + "epoch": 1.2284392090870846, + "grad_norm": 0.37238365411758423, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1460 + }, + { + "epoch": 1.2368531762726125, + "grad_norm": 0.3731217682361603, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 1470 + }, + { + "epoch": 1.2452671434581406, + "grad_norm": 0.3318967819213867, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 1480 + }, + { + "epoch": 1.2536811106436685, + "grad_norm": 0.3784034848213196, + "learning_rate": 0.0002, + "loss": 1.7463, + "step": 1490 + }, + { + "epoch": 1.2620950778291964, + "grad_norm": 0.3541383147239685, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 1500 + }, + { + "epoch": 1.2705090450147245, + "grad_norm": 0.35312485694885254, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1510 + }, + { + "epoch": 1.2789230122002524, + "grad_norm": 0.35272929072380066, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1520 + }, + { + "epoch": 1.2873369793857803, + "grad_norm": 0.40988272428512573, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1530 + }, + { + "epoch": 1.2957509465713084, + "grad_norm": 0.3543946146965027, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1540 + }, + { + "epoch": 1.3041649137568363, + "grad_norm": 0.35639145970344543, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1550 + }, + { + "epoch": 1.3125788809423642, + "grad_norm": 0.3290826678276062, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1560 + }, + { + "epoch": 1.3209928481278923, + "grad_norm": 0.39264336228370667, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 1570 + }, + { + "epoch": 1.3294068153134202, + "grad_norm": 0.5390415191650391, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1580 + }, + { + "epoch": 1.3378207824989483, + "grad_norm": 0.5188116431236267, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1590 + }, + { + "epoch": 1.3462347496844762, + "grad_norm": 0.37445148825645447, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 1600 + }, + { + "epoch": 1.3546487168700043, + "grad_norm": 0.3296085298061371, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1610 + }, + { + "epoch": 1.3630626840555322, + "grad_norm": 0.39879581332206726, + "learning_rate": 0.0002, + "loss": 1.8107, + "step": 1620 + }, + { + "epoch": 1.37147665124106, + "grad_norm": 0.36092764139175415, + "learning_rate": 0.0002, + "loss": 1.6744, + "step": 1630 + }, + { + "epoch": 1.3798906184265882, + "grad_norm": 0.37011823058128357, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 1640 + }, + { + "epoch": 1.3883045856121161, + "grad_norm": 0.40863534808158875, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1650 + }, + { + "epoch": 1.396718552797644, + "grad_norm": 0.337001770734787, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 1660 + }, + { + "epoch": 1.4051325199831721, + "grad_norm": 0.35596707463264465, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1670 + }, + { + "epoch": 1.4135464871687, + "grad_norm": 0.3857671916484833, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 1680 + }, + { + "epoch": 1.421960454354228, + "grad_norm": 0.419502317905426, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 1690 + }, + { + "epoch": 1.430374421539756, + "grad_norm": 0.35459452867507935, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 1700 + }, + { + "epoch": 1.438788388725284, + "grad_norm": 0.37246978282928467, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 1710 + }, + { + "epoch": 1.4472023559108118, + "grad_norm": 0.33091893792152405, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 1720 + }, + { + "epoch": 1.45561632309634, + "grad_norm": 0.37029674649238586, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1730 + }, + { + "epoch": 1.4640302902818678, + "grad_norm": 0.374025821685791, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1740 + }, + { + "epoch": 1.472444257467396, + "grad_norm": 0.3416315019130707, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1750 + }, + { + "epoch": 1.4808582246529238, + "grad_norm": 0.36502841114997864, + "learning_rate": 0.0002, + "loss": 1.7093, + "step": 1760 + }, + { + "epoch": 1.489272191838452, + "grad_norm": 0.35458803176879883, + "learning_rate": 0.0002, + "loss": 1.6597, + "step": 1770 + }, + { + "epoch": 1.4976861590239798, + "grad_norm": 0.4462839663028717, + "learning_rate": 0.0002, + "loss": 1.675, + "step": 1780 + }, + { + "epoch": 1.5061001262095077, + "grad_norm": 0.34836092591285706, + "learning_rate": 0.0002, + "loss": 1.7267, + "step": 1790 + }, + { + "epoch": 1.5145140933950358, + "grad_norm": 0.3445749282836914, + "learning_rate": 0.0002, + "loss": 1.7295, + "step": 1800 + }, + { + "epoch": 1.5229280605805637, + "grad_norm": 0.36012160778045654, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1810 + }, + { + "epoch": 1.5313420277660916, + "grad_norm": 0.4052616059780121, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 1820 + }, + { + "epoch": 1.5397559949516197, + "grad_norm": 0.3966905474662781, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1830 + }, + { + "epoch": 1.5481699621371476, + "grad_norm": 0.35028719902038574, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1840 + }, + { + "epoch": 1.5565839293226755, + "grad_norm": 0.3936742842197418, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1850 + }, + { + "epoch": 1.5649978965082036, + "grad_norm": 0.34473296999931335, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 1860 + }, + { + "epoch": 1.5734118636937318, + "grad_norm": 0.4328365623950958, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1870 + }, + { + "epoch": 1.5818258308792594, + "grad_norm": 0.3566315472126007, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1880 + }, + { + "epoch": 1.5902397980647875, + "grad_norm": 0.3301256597042084, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 1890 + }, + { + "epoch": 1.5986537652503157, + "grad_norm": 0.3743041455745697, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 1900 + }, + { + "epoch": 1.6070677324358436, + "grad_norm": 0.3735344707965851, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 1910 + }, + { + "epoch": 1.6154816996213714, + "grad_norm": 0.42191144824028015, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1920 + }, + { + "epoch": 1.6238956668068996, + "grad_norm": 0.3787207305431366, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1930 + }, + { + "epoch": 1.6323096339924275, + "grad_norm": 0.35647350549697876, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1940 + }, + { + "epoch": 1.6407236011779553, + "grad_norm": 0.39791446924209595, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 1950 + }, + { + "epoch": 1.6491375683634835, + "grad_norm": 0.37341275811195374, + "learning_rate": 0.0002, + "loss": 1.7293, + "step": 1960 + }, + { + "epoch": 1.6575515355490114, + "grad_norm": 0.3722686469554901, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1970 + }, + { + "epoch": 1.6659655027345392, + "grad_norm": 0.37467387318611145, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 1980 + }, + { + "epoch": 1.6743794699200674, + "grad_norm": 0.37109461426734924, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 1990 + }, + { + "epoch": 1.6827934371055953, + "grad_norm": 0.4008837044239044, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 2000 + }, + { + "epoch": 1.6912074042911232, + "grad_norm": 0.3316999673843384, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2010 + }, + { + "epoch": 1.6996213714766513, + "grad_norm": 0.3683805465698242, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 2020 + }, + { + "epoch": 1.7080353386621794, + "grad_norm": 0.4163658320903778, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2030 + }, + { + "epoch": 1.716449305847707, + "grad_norm": 0.4245431125164032, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 2040 + }, + { + "epoch": 1.7248632730332352, + "grad_norm": 0.36732038855552673, + "learning_rate": 0.0002, + "loss": 1.7184, + "step": 2050 + }, + { + "epoch": 1.7332772402187633, + "grad_norm": 0.34981656074523926, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 2060 + }, + { + "epoch": 1.7416912074042912, + "grad_norm": 0.38588812947273254, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 2070 + }, + { + "epoch": 1.750105174589819, + "grad_norm": 0.39914557337760925, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 2080 + }, + { + "epoch": 1.7585191417753472, + "grad_norm": 0.36068692803382874, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 2090 + }, + { + "epoch": 1.766933108960875, + "grad_norm": 0.3983287215232849, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 2100 + }, + { + "epoch": 1.775347076146403, + "grad_norm": 0.45008400082588196, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2110 + }, + { + "epoch": 1.783761043331931, + "grad_norm": 0.3618052303791046, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2120 + }, + { + "epoch": 1.792175010517459, + "grad_norm": 0.38745400309562683, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 2130 + }, + { + "epoch": 1.8005889777029869, + "grad_norm": 0.3413826525211334, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 2140 + }, + { + "epoch": 1.809002944888515, + "grad_norm": 0.35983747243881226, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 2150 + }, + { + "epoch": 1.8174169120740429, + "grad_norm": 0.40926849842071533, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 2160 + }, + { + "epoch": 1.8258308792595708, + "grad_norm": 0.3543093800544739, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2170 + }, + { + "epoch": 1.8342448464450989, + "grad_norm": 0.42690935730934143, + "learning_rate": 0.0002, + "loss": 1.7812, + "step": 2180 + }, + { + "epoch": 1.842658813630627, + "grad_norm": 0.40282756090164185, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 2190 + }, + { + "epoch": 1.8510727808161547, + "grad_norm": 0.36568400263786316, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 2200 + }, + { + "epoch": 1.8594867480016828, + "grad_norm": 0.43159013986587524, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 2210 + }, + { + "epoch": 1.867900715187211, + "grad_norm": 0.3554118573665619, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 2220 + }, + { + "epoch": 1.8763146823727388, + "grad_norm": 0.43349072337150574, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 2230 + }, + { + "epoch": 1.8847286495582667, + "grad_norm": 0.36486536264419556, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 2240 + }, + { + "epoch": 1.8931426167437948, + "grad_norm": 0.39260047674179077, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 2250 + }, + { + "epoch": 1.9015565839293227, + "grad_norm": 0.3741776943206787, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 2260 + }, + { + "epoch": 1.9099705511148506, + "grad_norm": 0.3961946964263916, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2270 + }, + { + "epoch": 1.9183845183003787, + "grad_norm": 0.3659731149673462, + "learning_rate": 0.0002, + "loss": 1.737, + "step": 2280 + }, + { + "epoch": 1.9267984854859066, + "grad_norm": 0.34744107723236084, + "learning_rate": 0.0002, + "loss": 1.7342, + "step": 2290 + }, + { + "epoch": 1.9352124526714345, + "grad_norm": 0.3607442378997803, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2300 + }, + { + "epoch": 1.9436264198569626, + "grad_norm": 0.331464558839798, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 2310 + }, + { + "epoch": 1.9520403870424905, + "grad_norm": 0.3904414474964142, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2320 + }, + { + "epoch": 1.9604543542280184, + "grad_norm": 0.37584832310676575, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 2330 + }, + { + "epoch": 1.9688683214135465, + "grad_norm": 0.3698684275150299, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 2340 + }, + { + "epoch": 1.9772822885990746, + "grad_norm": 0.40571412444114685, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2350 + }, + { + "epoch": 1.9856962557846023, + "grad_norm": 0.40059587359428406, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 2360 + }, + { + "epoch": 1.9941102229701304, + "grad_norm": 0.4168248474597931, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2370 + }, + { + "epoch": 2.0, + "eval_loss": 1.8055059909820557, + "eval_runtime": 38.422, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.692, + "step": 2377 + }, + { + "epoch": 2.0025241901556585, + "grad_norm": 0.35205352306365967, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 2380 + }, + { + "epoch": 2.010938157341186, + "grad_norm": 0.3979377746582031, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2390 + }, + { + "epoch": 2.0193521245267143, + "grad_norm": 0.396491676568985, + "learning_rate": 0.0002, + "loss": 1.6421, + "step": 2400 + }, + { + "epoch": 2.0277660917122424, + "grad_norm": 0.44712209701538086, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 2410 + }, + { + "epoch": 2.03618005889777, + "grad_norm": 0.4454420208930969, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 2420 + }, + { + "epoch": 2.044594026083298, + "grad_norm": 0.4170038402080536, + "learning_rate": 0.0002, + "loss": 1.6635, + "step": 2430 + }, + { + "epoch": 2.0530079932688263, + "grad_norm": 0.4309595227241516, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 2440 + }, + { + "epoch": 2.0614219604543544, + "grad_norm": 0.4241602122783661, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 2450 + }, + { + "epoch": 2.069835927639882, + "grad_norm": 0.4370540678501129, + "learning_rate": 0.0002, + "loss": 1.6162, + "step": 2460 + }, + { + "epoch": 2.0782498948254102, + "grad_norm": 0.43985554575920105, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 2470 + }, + { + "epoch": 2.0866638620109383, + "grad_norm": 0.4158105254173279, + "learning_rate": 0.0002, + "loss": 1.6954, + "step": 2480 + }, + { + "epoch": 2.095077829196466, + "grad_norm": 0.441549152135849, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 2490 + }, + { + "epoch": 2.103491796381994, + "grad_norm": 0.385718435049057, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 2500 + }, + { + "epoch": 2.1119057635675222, + "grad_norm": 0.43146514892578125, + "learning_rate": 0.0002, + "loss": 1.5894, + "step": 2510 + }, + { + "epoch": 2.12031973075305, + "grad_norm": 0.41663315892219543, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 2520 + }, + { + "epoch": 2.128733697938578, + "grad_norm": 0.4410698115825653, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2530 + }, + { + "epoch": 2.137147665124106, + "grad_norm": 0.4472278952598572, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 2540 + }, + { + "epoch": 2.145561632309634, + "grad_norm": 0.3879167437553406, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 2550 + }, + { + "epoch": 2.153975599495162, + "grad_norm": 0.4212203025817871, + "learning_rate": 0.0002, + "loss": 1.6682, + "step": 2560 + }, + { + "epoch": 2.16238956668069, + "grad_norm": 0.42841723561286926, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2570 + }, + { + "epoch": 2.1708035338662177, + "grad_norm": 0.39272481203079224, + "learning_rate": 0.0002, + "loss": 1.5962, + "step": 2580 + }, + { + "epoch": 2.179217501051746, + "grad_norm": 0.4075261354446411, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 2590 + }, + { + "epoch": 2.187631468237274, + "grad_norm": 0.5358437895774841, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 2600 + }, + { + "epoch": 2.1960454354228016, + "grad_norm": 0.4738350212574005, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 2610 + }, + { + "epoch": 2.2044594026083297, + "grad_norm": 0.446789026260376, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2620 + }, + { + "epoch": 2.212873369793858, + "grad_norm": 0.4615374505519867, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 2630 + }, + { + "epoch": 2.221287336979386, + "grad_norm": 0.46901994943618774, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 2640 + }, + { + "epoch": 2.2297013041649136, + "grad_norm": 0.46267789602279663, + "learning_rate": 0.0002, + "loss": 1.6774, + "step": 2650 + }, + { + "epoch": 2.2381152713504417, + "grad_norm": 0.4383080005645752, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 2660 + }, + { + "epoch": 2.24652923853597, + "grad_norm": 0.4070609509944916, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 2670 + }, + { + "epoch": 2.2549432057214975, + "grad_norm": 0.4572339951992035, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 2680 + }, + { + "epoch": 2.2633571729070256, + "grad_norm": 0.393265038728714, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 2690 + }, + { + "epoch": 2.2717711400925538, + "grad_norm": 0.46144717931747437, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 2700 + }, + { + "epoch": 2.2801851072780814, + "grad_norm": 0.45077767968177795, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 2710 + }, + { + "epoch": 2.2885990744636096, + "grad_norm": 0.5697639584541321, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 2720 + }, + { + "epoch": 2.2970130416491377, + "grad_norm": 0.4855510890483856, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 2730 + }, + { + "epoch": 2.3054270088346653, + "grad_norm": 0.4440622627735138, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 2740 + }, + { + "epoch": 2.3138409760201935, + "grad_norm": 0.3904096782207489, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 2750 + }, + { + "epoch": 2.3222549432057216, + "grad_norm": 0.5225510597229004, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2760 + }, + { + "epoch": 2.3306689103912497, + "grad_norm": 0.44866397976875305, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 2770 + }, + { + "epoch": 2.3390828775767774, + "grad_norm": 0.5167056322097778, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 2780 + }, + { + "epoch": 2.3474968447623055, + "grad_norm": 0.45913267135620117, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 2790 + }, + { + "epoch": 2.3559108119478336, + "grad_norm": 0.45787590742111206, + "learning_rate": 0.0002, + "loss": 1.6564, + "step": 2800 + }, + { + "epoch": 2.3643247791333613, + "grad_norm": 0.4633352756500244, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 2810 + }, + { + "epoch": 2.3727387463188894, + "grad_norm": 0.46390071511268616, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 2820 + }, + { + "epoch": 2.3811527135044175, + "grad_norm": 0.4261005222797394, + "learning_rate": 0.0002, + "loss": 1.6039, + "step": 2830 + }, + { + "epoch": 2.389566680689945, + "grad_norm": 0.4283634424209595, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 2840 + }, + { + "epoch": 2.3979806478754733, + "grad_norm": 0.4955291450023651, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 2850 + }, + { + "epoch": 2.4063946150610014, + "grad_norm": 0.4740189015865326, + "learning_rate": 0.0002, + "loss": 1.6173, + "step": 2860 + }, + { + "epoch": 2.414808582246529, + "grad_norm": 0.4222276508808136, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2870 + }, + { + "epoch": 2.423222549432057, + "grad_norm": 0.4982149004936218, + "learning_rate": 0.0002, + "loss": 1.5602, + "step": 2880 + }, + { + "epoch": 2.4316365166175853, + "grad_norm": 0.5217409133911133, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2890 + }, + { + "epoch": 2.4400504838031134, + "grad_norm": 0.4555884897708893, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 2900 + }, + { + "epoch": 2.448464450988641, + "grad_norm": 0.43178579211235046, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2910 + }, + { + "epoch": 2.456878418174169, + "grad_norm": 0.4788478910923004, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2920 + }, + { + "epoch": 2.465292385359697, + "grad_norm": 0.43689873814582825, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2930 + }, + { + "epoch": 2.473706352545225, + "grad_norm": 0.5115197896957397, + "learning_rate": 0.0002, + "loss": 1.6196, + "step": 2940 + }, + { + "epoch": 2.482120319730753, + "grad_norm": 0.5290159583091736, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 2950 + }, + { + "epoch": 2.490534286916281, + "grad_norm": 0.46042463183403015, + "learning_rate": 0.0002, + "loss": 1.6499, + "step": 2960 + }, + { + "epoch": 2.498948254101809, + "grad_norm": 0.4359915852546692, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 2970 + }, + { + "epoch": 2.507362221287337, + "grad_norm": 0.46352964639663696, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 2980 + }, + { + "epoch": 2.515776188472865, + "grad_norm": 0.5324268341064453, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2990 + }, + { + "epoch": 2.5241901556583928, + "grad_norm": 0.5929607152938843, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 3000 + }, + { + "epoch": 2.532604122843921, + "grad_norm": 0.4811333417892456, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 3010 + }, + { + "epoch": 2.541018090029449, + "grad_norm": 0.4662701487541199, + "learning_rate": 0.0002, + "loss": 1.7023, + "step": 3020 + }, + { + "epoch": 2.549432057214977, + "grad_norm": 0.4582270681858063, + "learning_rate": 0.0002, + "loss": 1.5426, + "step": 3030 + }, + { + "epoch": 2.557846024400505, + "grad_norm": 0.4679982662200928, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 3040 + }, + { + "epoch": 2.566259991586033, + "grad_norm": 0.4380294680595398, + "learning_rate": 0.0002, + "loss": 1.5442, + "step": 3050 + }, + { + "epoch": 2.5746739587715606, + "grad_norm": 0.44295763969421387, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 3060 + }, + { + "epoch": 2.5830879259570887, + "grad_norm": 0.5131027698516846, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3070 + }, + { + "epoch": 2.591501893142617, + "grad_norm": 0.47567516565322876, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 3080 + }, + { + "epoch": 2.599915860328145, + "grad_norm": 0.49002596735954285, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 3090 + }, + { + "epoch": 2.6083298275136726, + "grad_norm": 0.44856327772140503, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 3100 + }, + { + "epoch": 2.6167437946992007, + "grad_norm": 0.4480142593383789, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 3110 + }, + { + "epoch": 2.6251577618847284, + "grad_norm": 0.4317494034767151, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 3120 + }, + { + "epoch": 2.6335717290702565, + "grad_norm": 0.42580848932266235, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 3130 + }, + { + "epoch": 2.6419856962557846, + "grad_norm": 0.4516814947128296, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 3140 + }, + { + "epoch": 2.6503996634413127, + "grad_norm": 0.4438435733318329, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 3150 + }, + { + "epoch": 2.6588136306268404, + "grad_norm": 0.4385356307029724, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 3160 + }, + { + "epoch": 2.6672275978123685, + "grad_norm": 0.5064112544059753, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 3170 + }, + { + "epoch": 2.6756415649978966, + "grad_norm": 0.49163177609443665, + "learning_rate": 0.0002, + "loss": 1.7189, + "step": 3180 + }, + { + "epoch": 2.6840555321834243, + "grad_norm": 0.49339258670806885, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 3190 + }, + { + "epoch": 2.6924694993689524, + "grad_norm": 0.440950870513916, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 3200 + }, + { + "epoch": 2.7008834665544805, + "grad_norm": 0.4283970594406128, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 3210 + }, + { + "epoch": 2.7092974337400086, + "grad_norm": 0.43875712156295776, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 3220 + }, + { + "epoch": 2.7177114009255363, + "grad_norm": 0.49332964420318604, + "learning_rate": 0.0002, + "loss": 1.6129, + "step": 3230 + }, + { + "epoch": 2.7261253681110644, + "grad_norm": 0.5225692391395569, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 3240 + }, + { + "epoch": 2.734539335296592, + "grad_norm": 0.4856489300727844, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 3250 + }, + { + "epoch": 2.74295330248212, + "grad_norm": 0.46918296813964844, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 3260 + }, + { + "epoch": 2.7513672696676483, + "grad_norm": 0.4802931249141693, + "learning_rate": 0.0002, + "loss": 1.6819, + "step": 3270 + }, + { + "epoch": 2.7597812368531764, + "grad_norm": 0.4485355615615845, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 3280 + }, + { + "epoch": 2.768195204038704, + "grad_norm": 0.43944594264030457, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 3290 + }, + { + "epoch": 2.7766091712242322, + "grad_norm": 0.46847742795944214, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 3300 + }, + { + "epoch": 2.7850231384097603, + "grad_norm": 0.4816027879714966, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 3310 + }, + { + "epoch": 2.793437105595288, + "grad_norm": 0.453960120677948, + "learning_rate": 0.0002, + "loss": 1.6293, + "step": 3320 + }, + { + "epoch": 2.801851072780816, + "grad_norm": 0.4816017150878906, + "learning_rate": 0.0002, + "loss": 1.6429, + "step": 3330 + }, + { + "epoch": 2.8102650399663442, + "grad_norm": 0.4461034834384918, + "learning_rate": 0.0002, + "loss": 1.6683, + "step": 3340 + }, + { + "epoch": 2.8186790071518724, + "grad_norm": 0.48821821808815, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 3350 + }, + { + "epoch": 2.8270929743374, + "grad_norm": 0.4574853777885437, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3360 + }, + { + "epoch": 2.835506941522928, + "grad_norm": 0.42062026262283325, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 3370 + }, + { + "epoch": 2.843920908708456, + "grad_norm": 0.4499834477901459, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 3380 + }, + { + "epoch": 2.852334875893984, + "grad_norm": 0.4780360758304596, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 3390 + }, + { + "epoch": 2.860748843079512, + "grad_norm": 0.45422887802124023, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 3400 + }, + { + "epoch": 2.86916281026504, + "grad_norm": 0.4590015709400177, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 3410 + }, + { + "epoch": 2.877576777450568, + "grad_norm": 0.45689624547958374, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 3420 + }, + { + "epoch": 2.885990744636096, + "grad_norm": 0.46953922510147095, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3430 + }, + { + "epoch": 2.8944047118216236, + "grad_norm": 0.4791966378688812, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 3440 + }, + { + "epoch": 2.9028186790071517, + "grad_norm": 0.4842296242713928, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 3450 + }, + { + "epoch": 2.91123264619268, + "grad_norm": 0.47219768166542053, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3460 + }, + { + "epoch": 2.919646613378208, + "grad_norm": 0.4622127115726471, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 3470 + }, + { + "epoch": 2.9280605805637356, + "grad_norm": 0.46832820773124695, + "learning_rate": 0.0002, + "loss": 1.6485, + "step": 3480 + }, + { + "epoch": 2.9364745477492638, + "grad_norm": 0.44582483172416687, + "learning_rate": 0.0002, + "loss": 1.6366, + "step": 3490 + }, + { + "epoch": 2.944888514934792, + "grad_norm": 0.4987219274044037, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 3500 + }, + { + "epoch": 2.9533024821203195, + "grad_norm": 0.43750956654548645, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 3510 + }, + { + "epoch": 2.9617164493058477, + "grad_norm": 0.49962925910949707, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 3520 + }, + { + "epoch": 2.9701304164913758, + "grad_norm": 0.5189590454101562, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 3530 + }, + { + "epoch": 2.978544383676904, + "grad_norm": 0.391317754983902, + "learning_rate": 0.0002, + "loss": 1.6688, + "step": 3540 + }, + { + "epoch": 2.9869583508624316, + "grad_norm": 0.44934695959091187, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 3550 + }, + { + "epoch": 2.9953723180479597, + "grad_norm": 0.4740142226219177, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 3560 + }, + { + "epoch": 2.9995793016407237, + "eval_loss": 1.8266887664794922, + "eval_runtime": 37.9445, + "eval_samples_per_second": 13.572, + "eval_steps_per_second": 1.713, + "step": 3565 + }, + { + "epoch": 3.003786285233488, + "grad_norm": 0.4523724615573883, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 3570 + }, + { + "epoch": 3.0122002524190155, + "grad_norm": 0.5261380076408386, + "learning_rate": 0.0002, + "loss": 1.526, + "step": 3580 + }, + { + "epoch": 3.0206142196045436, + "grad_norm": 0.48664888739585876, + "learning_rate": 0.0002, + "loss": 1.4946, + "step": 3590 + }, + { + "epoch": 3.0290281867900717, + "grad_norm": 0.5070882439613342, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 3600 + }, + { + "epoch": 3.0374421539755994, + "grad_norm": 0.5816011428833008, + "learning_rate": 0.0002, + "loss": 1.5316, + "step": 3610 + }, + { + "epoch": 3.0458561211611275, + "grad_norm": 0.6610211730003357, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 3620 + }, + { + "epoch": 3.0542700883466556, + "grad_norm": 0.5257703065872192, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 3630 + }, + { + "epoch": 3.0626840555321833, + "grad_norm": 0.5574390888214111, + "learning_rate": 0.0002, + "loss": 1.4438, + "step": 3640 + }, + { + "epoch": 3.0710980227177114, + "grad_norm": 0.5682297348976135, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3650 + }, + { + "epoch": 3.0795119899032395, + "grad_norm": 0.5798383355140686, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 3660 + }, + { + "epoch": 3.087925957088767, + "grad_norm": 0.5458289980888367, + "learning_rate": 0.0002, + "loss": 1.4339, + "step": 3670 + }, + { + "epoch": 3.0963399242742953, + "grad_norm": 0.5599102973937988, + "learning_rate": 0.0002, + "loss": 1.46, + "step": 3680 + }, + { + "epoch": 3.1047538914598234, + "grad_norm": 0.5023021697998047, + "learning_rate": 0.0002, + "loss": 1.4589, + "step": 3690 + }, + { + "epoch": 3.113167858645351, + "grad_norm": 0.5448206067085266, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 3700 + }, + { + "epoch": 3.121581825830879, + "grad_norm": 0.5760458707809448, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 3710 + }, + { + "epoch": 3.1299957930164073, + "grad_norm": 0.6018968224525452, + "learning_rate": 0.0002, + "loss": 1.4789, + "step": 3720 + }, + { + "epoch": 3.1384097602019354, + "grad_norm": 0.5767101049423218, + "learning_rate": 0.0002, + "loss": 1.5518, + "step": 3730 + }, + { + "epoch": 3.146823727387463, + "grad_norm": 0.5333963632583618, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 3740 + }, + { + "epoch": 3.155237694572991, + "grad_norm": 0.5918396711349487, + "learning_rate": 0.0002, + "loss": 1.4812, + "step": 3750 + }, + { + "epoch": 3.1636516617585193, + "grad_norm": 0.5931203365325928, + "learning_rate": 0.0002, + "loss": 1.4618, + "step": 3760 + }, + { + "epoch": 3.172065628944047, + "grad_norm": 0.6562168598175049, + "learning_rate": 0.0002, + "loss": 1.5592, + "step": 3770 + }, + { + "epoch": 3.180479596129575, + "grad_norm": 0.5820156335830688, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 3780 + }, + { + "epoch": 3.188893563315103, + "grad_norm": 0.5784737467765808, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 3790 + }, + { + "epoch": 3.197307530500631, + "grad_norm": 0.5506529808044434, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 3800 + }, + { + "epoch": 3.205721497686159, + "grad_norm": 0.6101595163345337, + "learning_rate": 0.0002, + "loss": 1.4819, + "step": 3810 + }, + { + "epoch": 3.214135464871687, + "grad_norm": 0.5597806572914124, + "learning_rate": 0.0002, + "loss": 1.5185, + "step": 3820 + }, + { + "epoch": 3.222549432057215, + "grad_norm": 0.5641011595726013, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 3830 + }, + { + "epoch": 3.230963399242743, + "grad_norm": 0.5892080068588257, + "learning_rate": 0.0002, + "loss": 1.4702, + "step": 3840 + }, + { + "epoch": 3.239377366428271, + "grad_norm": 0.6034760475158691, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 3850 + }, + { + "epoch": 3.247791333613799, + "grad_norm": 0.5112439393997192, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 3860 + }, + { + "epoch": 3.256205300799327, + "grad_norm": 0.56565922498703, + "learning_rate": 0.0002, + "loss": 1.5132, + "step": 3870 + }, + { + "epoch": 3.264619267984855, + "grad_norm": 0.6155247092247009, + "learning_rate": 0.0002, + "loss": 1.4892, + "step": 3880 + }, + { + "epoch": 3.273033235170383, + "grad_norm": 0.6064623594284058, + "learning_rate": 0.0002, + "loss": 1.5118, + "step": 3890 + }, + { + "epoch": 3.2814472023559107, + "grad_norm": 0.6313768029212952, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 3900 + }, + { + "epoch": 3.289861169541439, + "grad_norm": 0.5903939008712769, + "learning_rate": 0.0002, + "loss": 1.5551, + "step": 3910 + }, + { + "epoch": 3.298275136726967, + "grad_norm": 0.5770667195320129, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 3920 + }, + { + "epoch": 3.3066891039124946, + "grad_norm": 0.5785196423530579, + "learning_rate": 0.0002, + "loss": 1.5159, + "step": 3930 + }, + { + "epoch": 3.3151030710980227, + "grad_norm": 0.6468310356140137, + "learning_rate": 0.0002, + "loss": 1.5277, + "step": 3940 + }, + { + "epoch": 3.323517038283551, + "grad_norm": 0.6200279593467712, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 3950 + }, + { + "epoch": 3.3319310054690785, + "grad_norm": 0.5779302716255188, + "learning_rate": 0.0002, + "loss": 1.5264, + "step": 3960 + }, + { + "epoch": 3.3403449726546066, + "grad_norm": 0.5463796854019165, + "learning_rate": 0.0002, + "loss": 1.4861, + "step": 3970 + }, + { + "epoch": 3.3487589398401347, + "grad_norm": 0.6117855906486511, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 3980 + }, + { + "epoch": 3.357172907025663, + "grad_norm": 0.5554766058921814, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 3990 + }, + { + "epoch": 3.3655868742111905, + "grad_norm": 0.6012870073318481, + "learning_rate": 0.0002, + "loss": 1.5004, + "step": 4000 + }, + { + "epoch": 3.3740008413967186, + "grad_norm": 0.5443974137306213, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 4010 + }, + { + "epoch": 3.3824148085822463, + "grad_norm": 0.6636057496070862, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 4020 + }, + { + "epoch": 3.3908287757677744, + "grad_norm": 0.5801246166229248, + "learning_rate": 0.0002, + "loss": 1.5141, + "step": 4030 + }, + { + "epoch": 3.3992427429533025, + "grad_norm": 0.5668839812278748, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4040 + }, + { + "epoch": 3.4076567101388306, + "grad_norm": 0.7763481736183167, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 4050 + }, + { + "epoch": 3.4160706773243583, + "grad_norm": 0.6675992608070374, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 4060 + }, + { + "epoch": 3.4244846445098864, + "grad_norm": 0.6290077567100525, + "learning_rate": 0.0002, + "loss": 1.4959, + "step": 4070 + }, + { + "epoch": 3.4328986116954145, + "grad_norm": 0.6040239930152893, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 4080 + }, + { + "epoch": 3.441312578880942, + "grad_norm": 0.6237877607345581, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 4090 + }, + { + "epoch": 3.4497265460664703, + "grad_norm": 0.5343508124351501, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4100 + }, + { + "epoch": 3.4581405132519984, + "grad_norm": 0.6817412972450256, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 4110 + }, + { + "epoch": 3.466554480437526, + "grad_norm": 0.7115170359611511, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 4120 + }, + { + "epoch": 3.4749684476230542, + "grad_norm": 0.6127332448959351, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 4130 + }, + { + "epoch": 3.4833824148085824, + "grad_norm": 0.5745994448661804, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 4140 + }, + { + "epoch": 3.49179638199411, + "grad_norm": 0.6248795390129089, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4150 + }, + { + "epoch": 3.500210349179638, + "grad_norm": 0.5821124911308289, + "learning_rate": 0.0002, + "loss": 1.4885, + "step": 4160 + }, + { + "epoch": 3.5086243163651663, + "grad_norm": 0.561416506767273, + "learning_rate": 0.0002, + "loss": 1.4937, + "step": 4170 + }, + { + "epoch": 3.5170382835506944, + "grad_norm": 0.5848962664604187, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 4180 + }, + { + "epoch": 3.525452250736222, + "grad_norm": 0.5335569977760315, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 4190 + }, + { + "epoch": 3.53386621792175, + "grad_norm": 0.547964870929718, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 4200 + }, + { + "epoch": 3.542280185107278, + "grad_norm": 0.6157727241516113, + "learning_rate": 0.0002, + "loss": 1.4887, + "step": 4210 + }, + { + "epoch": 3.550694152292806, + "grad_norm": 0.6163121461868286, + "learning_rate": 0.0002, + "loss": 1.5484, + "step": 4220 + }, + { + "epoch": 3.559108119478334, + "grad_norm": 0.5844616293907166, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 4230 + }, + { + "epoch": 3.567522086663862, + "grad_norm": 0.7104926109313965, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 4240 + }, + { + "epoch": 3.57593605384939, + "grad_norm": 0.5055213570594788, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4250 + }, + { + "epoch": 3.584350021034918, + "grad_norm": 0.611676812171936, + "learning_rate": 0.0002, + "loss": 1.482, + "step": 4260 + }, + { + "epoch": 3.592763988220446, + "grad_norm": 0.6326440572738647, + "learning_rate": 0.0002, + "loss": 1.5048, + "step": 4270 + }, + { + "epoch": 3.6011779554059737, + "grad_norm": 0.6290925741195679, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 4280 + }, + { + "epoch": 3.609591922591502, + "grad_norm": 0.5691978931427002, + "learning_rate": 0.0002, + "loss": 1.5654, + "step": 4290 + }, + { + "epoch": 3.61800588977703, + "grad_norm": 0.6071329116821289, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 4300 + }, + { + "epoch": 3.626419856962558, + "grad_norm": 0.606573224067688, + "learning_rate": 0.0002, + "loss": 1.5336, + "step": 4310 + }, + { + "epoch": 3.6348338241480858, + "grad_norm": 0.5515419244766235, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 4320 + }, + { + "epoch": 3.643247791333614, + "grad_norm": 0.5964660048484802, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 4330 + }, + { + "epoch": 3.6516617585191415, + "grad_norm": 0.5774146914482117, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 4340 + }, + { + "epoch": 3.6600757257046697, + "grad_norm": 0.5732731223106384, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 4350 + }, + { + "epoch": 3.6684896928901978, + "grad_norm": 0.7354163527488708, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 4360 + }, + { + "epoch": 3.676903660075726, + "grad_norm": 0.6220902800559998, + "learning_rate": 0.0002, + "loss": 1.5225, + "step": 4370 + }, + { + "epoch": 3.6853176272612536, + "grad_norm": 0.6053991317749023, + "learning_rate": 0.0002, + "loss": 1.4838, + "step": 4380 + }, + { + "epoch": 3.6937315944467817, + "grad_norm": 0.67010897397995, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4390 + }, + { + "epoch": 3.70214556163231, + "grad_norm": 0.6139186024665833, + "learning_rate": 0.0002, + "loss": 1.5381, + "step": 4400 + }, + { + "epoch": 3.7105595288178375, + "grad_norm": 0.5433071851730347, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 4410 + }, + { + "epoch": 3.7189734960033656, + "grad_norm": 0.5453870296478271, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 4420 + }, + { + "epoch": 3.7273874631888937, + "grad_norm": 0.6401727199554443, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 4430 + }, + { + "epoch": 3.735801430374422, + "grad_norm": 0.6049367189407349, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 4440 + }, + { + "epoch": 3.7442153975599495, + "grad_norm": 0.5740529298782349, + "learning_rate": 0.0002, + "loss": 1.5268, + "step": 4450 + }, + { + "epoch": 3.7526293647454776, + "grad_norm": 0.6521880626678467, + "learning_rate": 0.0002, + "loss": 1.5183, + "step": 4460 + }, + { + "epoch": 3.7610433319310053, + "grad_norm": 0.7096368074417114, + "learning_rate": 0.0002, + "loss": 1.5741, + "step": 4470 + }, + { + "epoch": 3.7694572991165334, + "grad_norm": 0.5886474251747131, + "learning_rate": 0.0002, + "loss": 1.5786, + "step": 4480 + }, + { + "epoch": 3.7778712663020615, + "grad_norm": 0.5821043252944946, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 4490 + }, + { + "epoch": 3.7862852334875896, + "grad_norm": 0.628892183303833, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 4500 + }, + { + "epoch": 3.7946992006731173, + "grad_norm": 0.5962669849395752, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 4510 + }, + { + "epoch": 3.8031131678586454, + "grad_norm": 0.6635549068450928, + "learning_rate": 0.0002, + "loss": 1.5267, + "step": 4520 + }, + { + "epoch": 3.811527135044173, + "grad_norm": 0.6010760068893433, + "learning_rate": 0.0002, + "loss": 1.5058, + "step": 4530 + }, + { + "epoch": 3.819941102229701, + "grad_norm": 0.6322658658027649, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 4540 + }, + { + "epoch": 3.8283550694152293, + "grad_norm": 0.5893137454986572, + "learning_rate": 0.0002, + "loss": 1.5029, + "step": 4550 + }, + { + "epoch": 3.8367690366007574, + "grad_norm": 0.7829602360725403, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 4560 + }, + { + "epoch": 3.845183003786285, + "grad_norm": 0.6190396547317505, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 4570 + }, + { + "epoch": 3.853596970971813, + "grad_norm": 0.6662813425064087, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 4580 + }, + { + "epoch": 3.8620109381573413, + "grad_norm": 0.5809855461120605, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 4590 + }, + { + "epoch": 3.870424905342869, + "grad_norm": 0.5779069662094116, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4600 + }, + { + "epoch": 3.878838872528397, + "grad_norm": 0.5603038668632507, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 4610 + }, + { + "epoch": 3.887252839713925, + "grad_norm": 0.6274181008338928, + "learning_rate": 0.0002, + "loss": 1.5372, + "step": 4620 + }, + { + "epoch": 3.8956668068994533, + "grad_norm": 0.6810959577560425, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 4630 + }, + { + "epoch": 3.904080774084981, + "grad_norm": 0.5647315979003906, + "learning_rate": 0.0002, + "loss": 1.4956, + "step": 4640 + }, + { + "epoch": 3.912494741270509, + "grad_norm": 0.6830295324325562, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 4650 + }, + { + "epoch": 3.920908708456037, + "grad_norm": 0.652565598487854, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 4660 + }, + { + "epoch": 3.929322675641565, + "grad_norm": 0.5806284546852112, + "learning_rate": 0.0002, + "loss": 1.4772, + "step": 4670 + }, + { + "epoch": 3.937736642827093, + "grad_norm": 0.6825073957443237, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 4680 + }, + { + "epoch": 3.946150610012621, + "grad_norm": 0.6149451732635498, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 4690 + }, + { + "epoch": 3.954564577198149, + "grad_norm": 0.6152557134628296, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 4700 + }, + { + "epoch": 3.962978544383677, + "grad_norm": 0.6239011883735657, + "learning_rate": 0.0002, + "loss": 1.4897, + "step": 4710 + }, + { + "epoch": 3.971392511569205, + "grad_norm": 0.6485443115234375, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 4720 + }, + { + "epoch": 3.9798064787547327, + "grad_norm": 0.6449228525161743, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 4730 + }, + { + "epoch": 3.988220445940261, + "grad_norm": 0.6526407599449158, + "learning_rate": 0.0002, + "loss": 1.5087, + "step": 4740 + }, + { + "epoch": 3.996634413125789, + "grad_norm": 0.6277706027030945, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4750 + }, + { + "epoch": 4.0, + "eval_loss": 1.871641755104065, + "eval_runtime": 37.9637, + "eval_samples_per_second": 13.566, + "eval_steps_per_second": 1.712, + "step": 4754 + }, + { + "epoch": 4.005048380311317, + "grad_norm": 0.6994837522506714, + "learning_rate": 0.0002, + "loss": 1.4744, + "step": 4760 + }, + { + "epoch": 4.013462347496845, + "grad_norm": 0.8728373050689697, + "learning_rate": 0.0002, + "loss": 1.4433, + "step": 4770 + }, + { + "epoch": 4.021876314682372, + "grad_norm": 0.688679575920105, + "learning_rate": 0.0002, + "loss": 1.3329, + "step": 4780 + }, + { + "epoch": 4.0302902818679005, + "grad_norm": 0.6313387155532837, + "learning_rate": 0.0002, + "loss": 1.3999, + "step": 4790 + }, + { + "epoch": 4.038704249053429, + "grad_norm": 0.6577984690666199, + "learning_rate": 0.0002, + "loss": 1.3346, + "step": 4800 + }, + { + "epoch": 4.047118216238957, + "grad_norm": 0.7938185930252075, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 4810 + }, + { + "epoch": 4.055532183424485, + "grad_norm": 0.760399580001831, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 4820 + }, + { + "epoch": 4.063946150610013, + "grad_norm": 0.7329602241516113, + "learning_rate": 0.0002, + "loss": 1.4321, + "step": 4830 + }, + { + "epoch": 4.07236011779554, + "grad_norm": 0.7778576016426086, + "learning_rate": 0.0002, + "loss": 1.4133, + "step": 4840 + }, + { + "epoch": 4.080774084981068, + "grad_norm": 0.8235865235328674, + "learning_rate": 0.0002, + "loss": 1.4372, + "step": 4850 + }, + { + "epoch": 4.089188052166596, + "grad_norm": 0.7743754386901855, + "learning_rate": 0.0002, + "loss": 1.3719, + "step": 4860 + }, + { + "epoch": 4.0976020193521245, + "grad_norm": 0.8145367503166199, + "learning_rate": 0.0002, + "loss": 1.3787, + "step": 4870 + }, + { + "epoch": 4.106015986537653, + "grad_norm": 0.8517307639122009, + "learning_rate": 0.0002, + "loss": 1.356, + "step": 4880 + }, + { + "epoch": 4.114429953723181, + "grad_norm": 0.8208953142166138, + "learning_rate": 0.0002, + "loss": 1.4191, + "step": 4890 + }, + { + "epoch": 4.122843920908709, + "grad_norm": 0.8437790870666504, + "learning_rate": 0.0002, + "loss": 1.3189, + "step": 4900 + }, + { + "epoch": 4.131257888094236, + "grad_norm": 0.716672420501709, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 4910 + }, + { + "epoch": 4.139671855279764, + "grad_norm": 0.7656235098838806, + "learning_rate": 0.0002, + "loss": 1.4392, + "step": 4920 + }, + { + "epoch": 4.148085822465292, + "grad_norm": 0.7209306955337524, + "learning_rate": 0.0002, + "loss": 1.3408, + "step": 4930 + }, + { + "epoch": 4.1564997896508205, + "grad_norm": 0.7731267809867859, + "learning_rate": 0.0002, + "loss": 1.3639, + "step": 4940 + }, + { + "epoch": 4.164913756836349, + "grad_norm": 0.7477553486824036, + "learning_rate": 0.0002, + "loss": 1.4151, + "step": 4950 + }, + { + "epoch": 4.173327724021877, + "grad_norm": 0.7372981309890747, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 4960 + }, + { + "epoch": 4.181741691207404, + "grad_norm": 0.6582154035568237, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 4970 + }, + { + "epoch": 4.190155658392932, + "grad_norm": 0.7003206610679626, + "learning_rate": 0.0002, + "loss": 1.3343, + "step": 4980 + }, + { + "epoch": 4.19856962557846, + "grad_norm": 0.735223650932312, + "learning_rate": 0.0002, + "loss": 1.4098, + "step": 4990 + }, + { + "epoch": 4.206983592763988, + "grad_norm": 0.7832302451133728, + "learning_rate": 0.0002, + "loss": 1.3564, + "step": 5000 + }, + { + "epoch": 4.215397559949516, + "grad_norm": 0.8819546103477478, + "learning_rate": 0.0002, + "loss": 1.3622, + "step": 5010 + }, + { + "epoch": 4.2238115271350445, + "grad_norm": 0.9325336813926697, + "learning_rate": 0.0002, + "loss": 1.4438, + "step": 5020 + }, + { + "epoch": 4.232225494320572, + "grad_norm": 0.7007517218589783, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 5030 + }, + { + "epoch": 4.2406394615061, + "grad_norm": 0.7118321061134338, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 5040 + }, + { + "epoch": 4.249053428691628, + "grad_norm": 0.6578946709632874, + "learning_rate": 0.0002, + "loss": 1.2365, + "step": 5050 + }, + { + "epoch": 4.257467395877156, + "grad_norm": 0.9438983798027039, + "learning_rate": 0.0002, + "loss": 1.3696, + "step": 5060 + }, + { + "epoch": 4.265881363062684, + "grad_norm": 0.703037679195404, + "learning_rate": 0.0002, + "loss": 1.3868, + "step": 5070 + }, + { + "epoch": 4.274295330248212, + "grad_norm": 0.7286025285720825, + "learning_rate": 0.0002, + "loss": 1.3687, + "step": 5080 + }, + { + "epoch": 4.28270929743374, + "grad_norm": 0.750689685344696, + "learning_rate": 0.0002, + "loss": 1.3605, + "step": 5090 + }, + { + "epoch": 4.291123264619268, + "grad_norm": 0.869753360748291, + "learning_rate": 0.0002, + "loss": 1.5089, + "step": 5100 + }, + { + "epoch": 4.299537231804796, + "grad_norm": 0.8712980151176453, + "learning_rate": 0.0002, + "loss": 1.4128, + "step": 5110 + }, + { + "epoch": 4.307951198990324, + "grad_norm": 0.690263569355011, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 5120 + }, + { + "epoch": 4.316365166175852, + "grad_norm": 0.7114760279655457, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 5130 + }, + { + "epoch": 4.32477913336138, + "grad_norm": 0.7588112354278564, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 5140 + }, + { + "epoch": 4.333193100546908, + "grad_norm": 0.7556202411651611, + "learning_rate": 0.0002, + "loss": 1.4408, + "step": 5150 + }, + { + "epoch": 4.341607067732435, + "grad_norm": 0.8357610702514648, + "learning_rate": 0.0002, + "loss": 1.4203, + "step": 5160 + }, + { + "epoch": 4.3500210349179635, + "grad_norm": 0.8054035902023315, + "learning_rate": 0.0002, + "loss": 1.3348, + "step": 5170 + }, + { + "epoch": 4.358435002103492, + "grad_norm": 0.7637107968330383, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 5180 + }, + { + "epoch": 4.36684896928902, + "grad_norm": 0.757481038570404, + "learning_rate": 0.0002, + "loss": 1.3744, + "step": 5190 + }, + { + "epoch": 4.375262936474548, + "grad_norm": 0.7185863852500916, + "learning_rate": 0.0002, + "loss": 1.3622, + "step": 5200 + }, + { + "epoch": 4.383676903660076, + "grad_norm": 0.7326455116271973, + "learning_rate": 0.0002, + "loss": 1.3896, + "step": 5210 + }, + { + "epoch": 4.392090870845603, + "grad_norm": 0.7980523109436035, + "learning_rate": 0.0002, + "loss": 1.4098, + "step": 5220 + }, + { + "epoch": 4.400504838031131, + "grad_norm": 0.8526999354362488, + "learning_rate": 0.0002, + "loss": 1.3783, + "step": 5230 + }, + { + "epoch": 4.4089188052166595, + "grad_norm": 0.7012337446212769, + "learning_rate": 0.0002, + "loss": 1.4022, + "step": 5240 + }, + { + "epoch": 4.417332772402188, + "grad_norm": 0.8217827677726746, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 5250 + }, + { + "epoch": 4.425746739587716, + "grad_norm": 0.7141005396842957, + "learning_rate": 0.0002, + "loss": 1.3482, + "step": 5260 + }, + { + "epoch": 4.434160706773244, + "grad_norm": 0.7094302177429199, + "learning_rate": 0.0002, + "loss": 1.3699, + "step": 5270 + }, + { + "epoch": 4.442574673958772, + "grad_norm": 0.7234613299369812, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 5280 + }, + { + "epoch": 4.450988641144299, + "grad_norm": 0.7530457973480225, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 5290 + }, + { + "epoch": 4.459402608329827, + "grad_norm": 0.7300912141799927, + "learning_rate": 0.0002, + "loss": 1.3944, + "step": 5300 + }, + { + "epoch": 4.467816575515355, + "grad_norm": 0.825443685054779, + "learning_rate": 0.0002, + "loss": 1.3844, + "step": 5310 + }, + { + "epoch": 4.4762305427008835, + "grad_norm": 0.7559658885002136, + "learning_rate": 0.0002, + "loss": 1.3648, + "step": 5320 + }, + { + "epoch": 4.484644509886412, + "grad_norm": 0.8817561268806458, + "learning_rate": 0.0002, + "loss": 1.4364, + "step": 5330 + }, + { + "epoch": 4.49305847707194, + "grad_norm": 0.8203575611114502, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 5340 + }, + { + "epoch": 4.501472444257468, + "grad_norm": 0.7677690982818604, + "learning_rate": 0.0002, + "loss": 1.3996, + "step": 5350 + }, + { + "epoch": 4.509886411442995, + "grad_norm": 0.657085120677948, + "learning_rate": 0.0002, + "loss": 1.4142, + "step": 5360 + }, + { + "epoch": 4.518300378628523, + "grad_norm": 0.7939504384994507, + "learning_rate": 0.0002, + "loss": 1.3722, + "step": 5370 + }, + { + "epoch": 4.526714345814051, + "grad_norm": 0.6971889138221741, + "learning_rate": 0.0002, + "loss": 1.4361, + "step": 5380 + }, + { + "epoch": 4.535128312999579, + "grad_norm": 0.6984175443649292, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 5390 + }, + { + "epoch": 4.5435422801851075, + "grad_norm": 0.8504858613014221, + "learning_rate": 0.0002, + "loss": 1.341, + "step": 5400 + }, + { + "epoch": 4.551956247370635, + "grad_norm": 0.9134073853492737, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 5410 + }, + { + "epoch": 4.560370214556163, + "grad_norm": 0.7765598893165588, + "learning_rate": 0.0002, + "loss": 1.4375, + "step": 5420 + }, + { + "epoch": 4.568784181741691, + "grad_norm": 0.6991009712219238, + "learning_rate": 0.0002, + "loss": 1.4832, + "step": 5430 + }, + { + "epoch": 4.577198148927219, + "grad_norm": 0.8393039107322693, + "learning_rate": 0.0002, + "loss": 1.4021, + "step": 5440 + }, + { + "epoch": 4.585612116112747, + "grad_norm": 0.7685918211936951, + "learning_rate": 0.0002, + "loss": 1.3976, + "step": 5450 + }, + { + "epoch": 4.594026083298275, + "grad_norm": 0.7135679721832275, + "learning_rate": 0.0002, + "loss": 1.3883, + "step": 5460 + }, + { + "epoch": 4.6024400504838034, + "grad_norm": 0.6728870868682861, + "learning_rate": 0.0002, + "loss": 1.4083, + "step": 5470 + }, + { + "epoch": 4.610854017669331, + "grad_norm": 0.7139479517936707, + "learning_rate": 0.0002, + "loss": 1.3698, + "step": 5480 + }, + { + "epoch": 4.619267984854859, + "grad_norm": 0.8476598858833313, + "learning_rate": 0.0002, + "loss": 1.3498, + "step": 5490 + }, + { + "epoch": 4.627681952040387, + "grad_norm": 0.8034361004829407, + "learning_rate": 0.0002, + "loss": 1.3389, + "step": 5500 + }, + { + "epoch": 4.636095919225915, + "grad_norm": 0.7452183961868286, + "learning_rate": 0.0002, + "loss": 1.4179, + "step": 5510 + }, + { + "epoch": 4.644509886411443, + "grad_norm": 0.8394148945808411, + "learning_rate": 0.0002, + "loss": 1.4031, + "step": 5520 + }, + { + "epoch": 4.652923853596971, + "grad_norm": 0.7480153441429138, + "learning_rate": 0.0002, + "loss": 1.4561, + "step": 5530 + }, + { + "epoch": 4.661337820782499, + "grad_norm": 0.7781714797019958, + "learning_rate": 0.0002, + "loss": 1.378, + "step": 5540 + }, + { + "epoch": 4.669751787968027, + "grad_norm": 1.0058213472366333, + "learning_rate": 0.0002, + "loss": 1.3924, + "step": 5550 + }, + { + "epoch": 4.678165755153555, + "grad_norm": 0.7403179407119751, + "learning_rate": 0.0002, + "loss": 1.4198, + "step": 5560 + }, + { + "epoch": 4.686579722339083, + "grad_norm": 0.7270476818084717, + "learning_rate": 0.0002, + "loss": 1.4328, + "step": 5570 + }, + { + "epoch": 4.694993689524611, + "grad_norm": 0.760877788066864, + "learning_rate": 0.0002, + "loss": 1.378, + "step": 5580 + }, + { + "epoch": 4.703407656710139, + "grad_norm": 0.8097004890441895, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 5590 + }, + { + "epoch": 4.711821623895667, + "grad_norm": 0.9096523523330688, + "learning_rate": 0.0002, + "loss": 1.3661, + "step": 5600 + }, + { + "epoch": 4.720235591081195, + "grad_norm": 0.7262444496154785, + "learning_rate": 0.0002, + "loss": 1.4012, + "step": 5610 + }, + { + "epoch": 4.7286495582667225, + "grad_norm": 0.8207762837409973, + "learning_rate": 0.0002, + "loss": 1.422, + "step": 5620 + }, + { + "epoch": 4.737063525452251, + "grad_norm": 0.8089601993560791, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 5630 + }, + { + "epoch": 4.745477492637779, + "grad_norm": 0.7609543800354004, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 5640 + }, + { + "epoch": 4.753891459823307, + "grad_norm": 0.7273501753807068, + "learning_rate": 0.0002, + "loss": 1.4085, + "step": 5650 + }, + { + "epoch": 4.762305427008835, + "grad_norm": 0.7800219058990479, + "learning_rate": 0.0002, + "loss": 1.3849, + "step": 5660 + }, + { + "epoch": 4.770719394194362, + "grad_norm": 0.8558377623558044, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 5670 + }, + { + "epoch": 4.77913336137989, + "grad_norm": 0.7131547927856445, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 5680 + }, + { + "epoch": 4.787547328565418, + "grad_norm": 0.7651025056838989, + "learning_rate": 0.0002, + "loss": 1.407, + "step": 5690 + }, + { + "epoch": 4.7959612957509465, + "grad_norm": 0.8129976391792297, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 5700 + }, + { + "epoch": 4.804375262936475, + "grad_norm": 0.8019895553588867, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 5710 + }, + { + "epoch": 4.812789230122003, + "grad_norm": 0.7692018151283264, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 5720 + }, + { + "epoch": 4.821203197307531, + "grad_norm": 0.6893943548202515, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 5730 + }, + { + "epoch": 4.829617164493058, + "grad_norm": 0.6881810426712036, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 5740 + }, + { + "epoch": 4.838031131678586, + "grad_norm": 0.7838267683982849, + "learning_rate": 0.0002, + "loss": 1.4775, + "step": 5750 + }, + { + "epoch": 4.846445098864114, + "grad_norm": 0.727799117565155, + "learning_rate": 0.0002, + "loss": 1.3857, + "step": 5760 + }, + { + "epoch": 4.8548590660496425, + "grad_norm": 0.7458277344703674, + "learning_rate": 0.0002, + "loss": 1.4685, + "step": 5770 + }, + { + "epoch": 4.863273033235171, + "grad_norm": 0.903802216053009, + "learning_rate": 0.0002, + "loss": 1.4426, + "step": 5780 + }, + { + "epoch": 4.871687000420699, + "grad_norm": 0.7983472347259521, + "learning_rate": 0.0002, + "loss": 1.451, + "step": 5790 + }, + { + "epoch": 4.880100967606227, + "grad_norm": 0.6894361972808838, + "learning_rate": 0.0002, + "loss": 1.4534, + "step": 5800 + }, + { + "epoch": 4.888514934791754, + "grad_norm": 0.7499409317970276, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 5810 + }, + { + "epoch": 4.896928901977282, + "grad_norm": 0.7362820506095886, + "learning_rate": 0.0002, + "loss": 1.4253, + "step": 5820 + }, + { + "epoch": 4.90534286916281, + "grad_norm": 0.8341619968414307, + "learning_rate": 0.0002, + "loss": 1.3763, + "step": 5830 + }, + { + "epoch": 4.913756836348338, + "grad_norm": 0.9604470133781433, + "learning_rate": 0.0002, + "loss": 1.3748, + "step": 5840 + }, + { + "epoch": 4.9221708035338665, + "grad_norm": 0.8916844129562378, + "learning_rate": 0.0002, + "loss": 1.3658, + "step": 5850 + }, + { + "epoch": 4.930584770719394, + "grad_norm": 0.8519647121429443, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 5860 + }, + { + "epoch": 4.938998737904922, + "grad_norm": 0.7946906089782715, + "learning_rate": 0.0002, + "loss": 1.424, + "step": 5870 + }, + { + "epoch": 4.94741270509045, + "grad_norm": 0.7843789458274841, + "learning_rate": 0.0002, + "loss": 1.4071, + "step": 5880 + }, + { + "epoch": 4.955826672275978, + "grad_norm": 0.707618772983551, + "learning_rate": 0.0002, + "loss": 1.4021, + "step": 5890 + }, + { + "epoch": 4.964240639461506, + "grad_norm": 0.7704206109046936, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 5900 + }, + { + "epoch": 4.972654606647034, + "grad_norm": 0.7160256505012512, + "learning_rate": 0.0002, + "loss": 1.4456, + "step": 5910 + }, + { + "epoch": 4.981068573832562, + "grad_norm": 0.7020420432090759, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 5920 + }, + { + "epoch": 4.98948254101809, + "grad_norm": 0.7576286792755127, + "learning_rate": 0.0002, + "loss": 1.4037, + "step": 5930 + }, + { + "epoch": 4.997896508203618, + "grad_norm": 0.8573036789894104, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 5940 + }, + { + "epoch": 4.999579301640724, + "eval_loss": 1.9353811740875244, + "eval_runtime": 37.9208, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 5942 + }, + { + "epoch": 5.006310475389146, + "grad_norm": 0.8204267621040344, + "learning_rate": 0.0002, + "loss": 1.2418, + "step": 5950 + }, + { + "epoch": 5.014724442574674, + "grad_norm": 0.976840615272522, + "learning_rate": 0.0002, + "loss": 1.235, + "step": 5960 + }, + { + "epoch": 5.023138409760202, + "grad_norm": 0.8765613436698914, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 5970 + }, + { + "epoch": 5.03155237694573, + "grad_norm": 1.1793042421340942, + "learning_rate": 0.0002, + "loss": 1.2748, + "step": 5980 + }, + { + "epoch": 5.039966344131258, + "grad_norm": 0.971062958240509, + "learning_rate": 0.0002, + "loss": 1.2412, + "step": 5990 + }, + { + "epoch": 5.0483803113167856, + "grad_norm": 0.8649757504463196, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 6000 + }, + { + "epoch": 5.056794278502314, + "grad_norm": 0.9563034176826477, + "learning_rate": 0.0002, + "loss": 1.1654, + "step": 6010 + }, + { + "epoch": 5.065208245687842, + "grad_norm": 1.0093994140625, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 6020 + }, + { + "epoch": 5.07362221287337, + "grad_norm": 1.004213571548462, + "learning_rate": 0.0002, + "loss": 1.2519, + "step": 6030 + }, + { + "epoch": 5.082036180058898, + "grad_norm": 0.8307787179946899, + "learning_rate": 0.0002, + "loss": 1.2379, + "step": 6040 + }, + { + "epoch": 5.090450147244426, + "grad_norm": 0.9117848873138428, + "learning_rate": 0.0002, + "loss": 1.2282, + "step": 6050 + }, + { + "epoch": 5.098864114429953, + "grad_norm": 1.0269840955734253, + "learning_rate": 0.0002, + "loss": 1.2582, + "step": 6060 + }, + { + "epoch": 5.1072780816154815, + "grad_norm": 0.9079542756080627, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 6070 + }, + { + "epoch": 5.11569204880101, + "grad_norm": 0.885702908039093, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 6080 + }, + { + "epoch": 5.124106015986538, + "grad_norm": 0.9976128339767456, + "learning_rate": 0.0002, + "loss": 1.2406, + "step": 6090 + }, + { + "epoch": 5.132519983172066, + "grad_norm": 0.8472117185592651, + "learning_rate": 0.0002, + "loss": 1.3082, + "step": 6100 + }, + { + "epoch": 5.140933950357594, + "grad_norm": 1.0385161638259888, + "learning_rate": 0.0002, + "loss": 1.226, + "step": 6110 + }, + { + "epoch": 5.149347917543121, + "grad_norm": 0.8948383927345276, + "learning_rate": 0.0002, + "loss": 1.213, + "step": 6120 + }, + { + "epoch": 5.157761884728649, + "grad_norm": 1.2613716125488281, + "learning_rate": 0.0002, + "loss": 1.2213, + "step": 6130 + }, + { + "epoch": 5.166175851914177, + "grad_norm": 0.9933410286903381, + "learning_rate": 0.0002, + "loss": 1.2632, + "step": 6140 + }, + { + "epoch": 5.1745898190997055, + "grad_norm": 0.9673663973808289, + "learning_rate": 0.0002, + "loss": 1.1715, + "step": 6150 + }, + { + "epoch": 5.183003786285234, + "grad_norm": 0.9969648122787476, + "learning_rate": 0.0002, + "loss": 1.2947, + "step": 6160 + }, + { + "epoch": 5.191417753470762, + "grad_norm": 1.2163258790969849, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 6170 + }, + { + "epoch": 5.19983172065629, + "grad_norm": 0.9163419604301453, + "learning_rate": 0.0002, + "loss": 1.2221, + "step": 6180 + }, + { + "epoch": 5.208245687841817, + "grad_norm": 0.9225585460662842, + "learning_rate": 0.0002, + "loss": 1.2624, + "step": 6190 + }, + { + "epoch": 5.216659655027345, + "grad_norm": 0.9205296635627747, + "learning_rate": 0.0002, + "loss": 1.2932, + "step": 6200 + }, + { + "epoch": 5.225073622212873, + "grad_norm": 1.0655443668365479, + "learning_rate": 0.0002, + "loss": 1.1825, + "step": 6210 + }, + { + "epoch": 5.233487589398401, + "grad_norm": 1.0854865312576294, + "learning_rate": 0.0002, + "loss": 1.2613, + "step": 6220 + }, + { + "epoch": 5.2419015565839295, + "grad_norm": 0.8489186763763428, + "learning_rate": 0.0002, + "loss": 1.3045, + "step": 6230 + }, + { + "epoch": 5.250315523769458, + "grad_norm": 0.910391628742218, + "learning_rate": 0.0002, + "loss": 1.2708, + "step": 6240 + }, + { + "epoch": 5.258729490954985, + "grad_norm": 0.925507128238678, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 6250 + }, + { + "epoch": 5.267143458140513, + "grad_norm": 1.1069735288619995, + "learning_rate": 0.0002, + "loss": 1.3368, + "step": 6260 + }, + { + "epoch": 5.275557425326041, + "grad_norm": 0.9705119132995605, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 6270 + }, + { + "epoch": 5.283971392511569, + "grad_norm": 0.9752426147460938, + "learning_rate": 0.0002, + "loss": 1.2602, + "step": 6280 + }, + { + "epoch": 5.292385359697097, + "grad_norm": 1.021359920501709, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 6290 + }, + { + "epoch": 5.3007993268826255, + "grad_norm": 1.148606300354004, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 6300 + }, + { + "epoch": 5.309213294068153, + "grad_norm": 0.8909247517585754, + "learning_rate": 0.0002, + "loss": 1.2201, + "step": 6310 + }, + { + "epoch": 5.317627261253681, + "grad_norm": 0.9879156351089478, + "learning_rate": 0.0002, + "loss": 1.2376, + "step": 6320 + }, + { + "epoch": 5.326041228439209, + "grad_norm": 0.9473357200622559, + "learning_rate": 0.0002, + "loss": 1.2638, + "step": 6330 + }, + { + "epoch": 5.334455195624737, + "grad_norm": 1.1422028541564941, + "learning_rate": 0.0002, + "loss": 1.232, + "step": 6340 + }, + { + "epoch": 5.342869162810265, + "grad_norm": 0.9942235350608826, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 6350 + }, + { + "epoch": 5.351283129995793, + "grad_norm": 0.9535723924636841, + "learning_rate": 0.0002, + "loss": 1.3032, + "step": 6360 + }, + { + "epoch": 5.359697097181321, + "grad_norm": 0.9020892381668091, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 6370 + }, + { + "epoch": 5.368111064366849, + "grad_norm": 1.0626472234725952, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 6380 + }, + { + "epoch": 5.376525031552377, + "grad_norm": 1.1395848989486694, + "learning_rate": 0.0002, + "loss": 1.2555, + "step": 6390 + }, + { + "epoch": 5.384938998737905, + "grad_norm": 0.9274451732635498, + "learning_rate": 0.0002, + "loss": 1.2839, + "step": 6400 + }, + { + "epoch": 5.393352965923433, + "grad_norm": 0.8108699917793274, + "learning_rate": 0.0002, + "loss": 1.2819, + "step": 6410 + }, + { + "epoch": 5.401766933108961, + "grad_norm": 1.1805564165115356, + "learning_rate": 0.0002, + "loss": 1.2589, + "step": 6420 + }, + { + "epoch": 5.410180900294489, + "grad_norm": 0.8321298360824585, + "learning_rate": 0.0002, + "loss": 1.3549, + "step": 6430 + }, + { + "epoch": 5.418594867480017, + "grad_norm": 0.8981925249099731, + "learning_rate": 0.0002, + "loss": 1.2925, + "step": 6440 + }, + { + "epoch": 5.4270088346655445, + "grad_norm": 1.0730986595153809, + "learning_rate": 0.0002, + "loss": 1.258, + "step": 6450 + }, + { + "epoch": 5.435422801851073, + "grad_norm": 1.0584609508514404, + "learning_rate": 0.0002, + "loss": 1.26, + "step": 6460 + }, + { + "epoch": 5.443836769036601, + "grad_norm": 1.0792299509048462, + "learning_rate": 0.0002, + "loss": 1.2847, + "step": 6470 + }, + { + "epoch": 5.452250736222129, + "grad_norm": 0.9101872444152832, + "learning_rate": 0.0002, + "loss": 1.2035, + "step": 6480 + }, + { + "epoch": 5.460664703407657, + "grad_norm": 0.9910100698471069, + "learning_rate": 0.0002, + "loss": 1.2574, + "step": 6490 + }, + { + "epoch": 5.469078670593185, + "grad_norm": 1.041412353515625, + "learning_rate": 0.0002, + "loss": 1.3098, + "step": 6500 + }, + { + "epoch": 5.477492637778712, + "grad_norm": 1.0091687440872192, + "learning_rate": 0.0002, + "loss": 1.2812, + "step": 6510 + }, + { + "epoch": 5.48590660496424, + "grad_norm": 0.8755383491516113, + "learning_rate": 0.0002, + "loss": 1.2523, + "step": 6520 + }, + { + "epoch": 5.4943205721497685, + "grad_norm": 0.980212390422821, + "learning_rate": 0.0002, + "loss": 1.3042, + "step": 6530 + }, + { + "epoch": 5.502734539335297, + "grad_norm": 0.9356869459152222, + "learning_rate": 0.0002, + "loss": 1.2873, + "step": 6540 + }, + { + "epoch": 5.511148506520825, + "grad_norm": 0.9008095264434814, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 6550 + }, + { + "epoch": 5.519562473706353, + "grad_norm": 0.8908938765525818, + "learning_rate": 0.0002, + "loss": 1.2818, + "step": 6560 + }, + { + "epoch": 5.52797644089188, + "grad_norm": 1.1423932313919067, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 6570 + }, + { + "epoch": 5.536390408077408, + "grad_norm": 1.0508161783218384, + "learning_rate": 0.0002, + "loss": 1.3039, + "step": 6580 + }, + { + "epoch": 5.544804375262936, + "grad_norm": 0.8357517719268799, + "learning_rate": 0.0002, + "loss": 1.2446, + "step": 6590 + }, + { + "epoch": 5.5532183424484645, + "grad_norm": 0.9892540574073792, + "learning_rate": 0.0002, + "loss": 1.3037, + "step": 6600 + }, + { + "epoch": 5.561632309633993, + "grad_norm": 1.0048326253890991, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 6610 + }, + { + "epoch": 5.570046276819521, + "grad_norm": 0.9801995158195496, + "learning_rate": 0.0002, + "loss": 1.2152, + "step": 6620 + }, + { + "epoch": 5.578460244005049, + "grad_norm": 0.9899214506149292, + "learning_rate": 0.0002, + "loss": 1.2606, + "step": 6630 + }, + { + "epoch": 5.586874211190576, + "grad_norm": 1.1911814212799072, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 6640 + }, + { + "epoch": 5.595288178376104, + "grad_norm": 1.0368894338607788, + "learning_rate": 0.0002, + "loss": 1.3458, + "step": 6650 + }, + { + "epoch": 5.603702145561632, + "grad_norm": 1.1248382329940796, + "learning_rate": 0.0002, + "loss": 1.2595, + "step": 6660 + }, + { + "epoch": 5.61211611274716, + "grad_norm": 0.9765539765357971, + "learning_rate": 0.0002, + "loss": 1.2548, + "step": 6670 + }, + { + "epoch": 5.6205300799326885, + "grad_norm": 0.9810206890106201, + "learning_rate": 0.0002, + "loss": 1.3451, + "step": 6680 + }, + { + "epoch": 5.628944047118217, + "grad_norm": 1.100386619567871, + "learning_rate": 0.0002, + "loss": 1.2952, + "step": 6690 + }, + { + "epoch": 5.637358014303744, + "grad_norm": 0.8824519515037537, + "learning_rate": 0.0002, + "loss": 1.2467, + "step": 6700 + }, + { + "epoch": 5.645771981489272, + "grad_norm": 1.0864064693450928, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 6710 + }, + { + "epoch": 5.6541859486748, + "grad_norm": 1.1614511013031006, + "learning_rate": 0.0002, + "loss": 1.2479, + "step": 6720 + }, + { + "epoch": 5.662599915860328, + "grad_norm": 1.0762972831726074, + "learning_rate": 0.0002, + "loss": 1.2753, + "step": 6730 + }, + { + "epoch": 5.671013883045856, + "grad_norm": 0.9408974647521973, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 6740 + }, + { + "epoch": 5.679427850231384, + "grad_norm": 0.8906030058860779, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 6750 + }, + { + "epoch": 5.687841817416912, + "grad_norm": 0.9527303576469421, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 6760 + }, + { + "epoch": 5.69625578460244, + "grad_norm": 0.9471196532249451, + "learning_rate": 0.0002, + "loss": 1.322, + "step": 6770 + }, + { + "epoch": 5.704669751787968, + "grad_norm": 0.9186838865280151, + "learning_rate": 0.0002, + "loss": 1.2514, + "step": 6780 + }, + { + "epoch": 5.713083718973496, + "grad_norm": 0.9225441813468933, + "learning_rate": 0.0002, + "loss": 1.2347, + "step": 6790 + }, + { + "epoch": 5.721497686159024, + "grad_norm": 0.9712982773780823, + "learning_rate": 0.0002, + "loss": 1.1849, + "step": 6800 + }, + { + "epoch": 5.729911653344552, + "grad_norm": 1.0743170976638794, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 6810 + }, + { + "epoch": 5.73832562053008, + "grad_norm": 1.2738113403320312, + "learning_rate": 0.0002, + "loss": 1.2136, + "step": 6820 + }, + { + "epoch": 5.7467395877156076, + "grad_norm": 0.9386790990829468, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 6830 + }, + { + "epoch": 5.755153554901136, + "grad_norm": 1.0817769765853882, + "learning_rate": 0.0002, + "loss": 1.285, + "step": 6840 + }, + { + "epoch": 5.763567522086664, + "grad_norm": 1.1040263175964355, + "learning_rate": 0.0002, + "loss": 1.2247, + "step": 6850 + }, + { + "epoch": 5.771981489272192, + "grad_norm": 1.0656492710113525, + "learning_rate": 0.0002, + "loss": 1.2507, + "step": 6860 + }, + { + "epoch": 5.78039545645772, + "grad_norm": 0.9550157189369202, + "learning_rate": 0.0002, + "loss": 1.2999, + "step": 6870 + }, + { + "epoch": 5.788809423643248, + "grad_norm": 1.0130870342254639, + "learning_rate": 0.0002, + "loss": 1.3201, + "step": 6880 + }, + { + "epoch": 5.797223390828776, + "grad_norm": 1.0675787925720215, + "learning_rate": 0.0002, + "loss": 1.3392, + "step": 6890 + }, + { + "epoch": 5.8056373580143035, + "grad_norm": 0.9537774920463562, + "learning_rate": 0.0002, + "loss": 1.2949, + "step": 6900 + }, + { + "epoch": 5.814051325199832, + "grad_norm": 0.9640319347381592, + "learning_rate": 0.0002, + "loss": 1.2658, + "step": 6910 + }, + { + "epoch": 5.82246529238536, + "grad_norm": 0.8917992115020752, + "learning_rate": 0.0002, + "loss": 1.2199, + "step": 6920 + }, + { + "epoch": 5.830879259570888, + "grad_norm": 0.9881822466850281, + "learning_rate": 0.0002, + "loss": 1.373, + "step": 6930 + }, + { + "epoch": 5.839293226756416, + "grad_norm": 0.9136882424354553, + "learning_rate": 0.0002, + "loss": 1.323, + "step": 6940 + }, + { + "epoch": 5.847707193941943, + "grad_norm": 0.9086098074913025, + "learning_rate": 0.0002, + "loss": 1.3159, + "step": 6950 + }, + { + "epoch": 5.856121161127471, + "grad_norm": 0.9443018436431885, + "learning_rate": 0.0002, + "loss": 1.2624, + "step": 6960 + }, + { + "epoch": 5.864535128312999, + "grad_norm": 0.9915381669998169, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 6970 + }, + { + "epoch": 5.8729490954985275, + "grad_norm": 0.8939146995544434, + "learning_rate": 0.0002, + "loss": 1.337, + "step": 6980 + }, + { + "epoch": 5.881363062684056, + "grad_norm": 1.3672245740890503, + "learning_rate": 0.0002, + "loss": 1.2611, + "step": 6990 + }, + { + "epoch": 5.889777029869584, + "grad_norm": 1.0116257667541504, + "learning_rate": 0.0002, + "loss": 1.3012, + "step": 7000 + }, + { + "epoch": 5.898190997055112, + "grad_norm": 1.1561565399169922, + "learning_rate": 0.0002, + "loss": 1.3128, + "step": 7010 + }, + { + "epoch": 5.906604964240639, + "grad_norm": 0.9900678992271423, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 7020 + }, + { + "epoch": 5.915018931426167, + "grad_norm": 0.9297345876693726, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 7030 + }, + { + "epoch": 5.923432898611695, + "grad_norm": 0.9357825517654419, + "learning_rate": 0.0002, + "loss": 1.2317, + "step": 7040 + }, + { + "epoch": 5.931846865797223, + "grad_norm": 1.049317717552185, + "learning_rate": 0.0002, + "loss": 1.2303, + "step": 7050 + }, + { + "epoch": 5.9402608329827515, + "grad_norm": 0.950633704662323, + "learning_rate": 0.0002, + "loss": 1.3243, + "step": 7060 + }, + { + "epoch": 5.94867480016828, + "grad_norm": 0.854581892490387, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 7070 + }, + { + "epoch": 5.957088767353808, + "grad_norm": 0.9097039699554443, + "learning_rate": 0.0002, + "loss": 1.3252, + "step": 7080 + }, + { + "epoch": 5.965502734539335, + "grad_norm": 0.9072173237800598, + "learning_rate": 0.0002, + "loss": 1.291, + "step": 7090 + }, + { + "epoch": 5.973916701724863, + "grad_norm": 1.0470727682113647, + "learning_rate": 0.0002, + "loss": 1.2724, + "step": 7100 + }, + { + "epoch": 5.982330668910391, + "grad_norm": 1.2628462314605713, + "learning_rate": 0.0002, + "loss": 1.3324, + "step": 7110 + }, + { + "epoch": 5.990744636095919, + "grad_norm": 1.055279016494751, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 7120 + }, + { + "epoch": 5.9991586032814475, + "grad_norm": 0.966194212436676, + "learning_rate": 0.0002, + "loss": 1.3234, + "step": 7130 + }, + { + "epoch": 6.0, + "eval_loss": 2.0427448749542236, + "eval_runtime": 37.8426, + "eval_samples_per_second": 13.609, + "eval_steps_per_second": 1.718, + "step": 7131 + } + ], + "logging_steps": 10, + "max_steps": 9504, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.300067130443039e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1502d478cfbb1424f707352d007b740bde5e373 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-7131/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df2b79d3acefeedef5a0229881de39ec68ef9b40046a60d7976a49f7e6b3b936 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5322f8a46798c9f9f585b13e82028241b9621341 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aae23ae6783d269b6970fa7b8d2f0a1f2fab5480b7822808680363eda0810bb6 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c06afd6b3a5a404e779911f90d39ed7473457b7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35efff070d4b5549eccb52fcd5c5c01015d0c90e53e56776b01fc0bfa96e3464 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f4761a05d0fc7027566474f86993a848fb9c150f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d29b17e02e08ea002a69cf031f4a7f63c9e6cdb9a81b89ef25bcc4d24814d5dd +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f451e9b8f166df97b2436cbcc45efc9c1bf9ec87 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d62b9f612cf8891fea8d0ca32ca5dd7d386a32839809ea28d01ed4efd8643f3f +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d0728e792f0d1d3c4dba5107b680cc3830e2d4ea --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/trainer_state.json @@ -0,0 +1,5906 @@ +{ + "best_metric": 1.8055059909820557, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377", + "epoch": 6.999579301640724, + "eval_steps": 10, + "global_step": 8319, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008413967185527976, + "grad_norm": 0.5458821654319763, + "learning_rate": 0.0002, + "loss": 2.56, + "step": 10 + }, + { + "epoch": 0.016827934371055953, + "grad_norm": 0.7293308973312378, + "learning_rate": 0.0002, + "loss": 2.3235, + "step": 20 + }, + { + "epoch": 0.02524190155658393, + "grad_norm": 0.47792306542396545, + "learning_rate": 0.0002, + "loss": 2.0815, + "step": 30 + }, + { + "epoch": 0.033655868742111905, + "grad_norm": 0.5944402813911438, + "learning_rate": 0.0002, + "loss": 1.9718, + "step": 40 + }, + { + "epoch": 0.04206983592763988, + "grad_norm": 0.5415359735488892, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 50 + }, + { + "epoch": 0.05048380311316786, + "grad_norm": 0.535713791847229, + "learning_rate": 0.0002, + "loss": 1.8953, + "step": 60 + }, + { + "epoch": 0.058897770298695834, + "grad_norm": 0.5184146761894226, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 70 + }, + { + "epoch": 0.06731173748422381, + "grad_norm": 0.458926796913147, + "learning_rate": 0.0002, + "loss": 1.8396, + "step": 80 + }, + { + "epoch": 0.07572570466975179, + "grad_norm": 0.4780142307281494, + "learning_rate": 0.0002, + "loss": 1.8677, + "step": 90 + }, + { + "epoch": 0.08413967185527976, + "grad_norm": 0.79965740442276, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 100 + }, + { + "epoch": 0.09255363904080774, + "grad_norm": 0.4498862028121948, + "learning_rate": 0.0002, + "loss": 1.9081, + "step": 110 + }, + { + "epoch": 0.10096760622633572, + "grad_norm": 0.39338430762290955, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 120 + }, + { + "epoch": 0.10938157341186369, + "grad_norm": 0.9588953852653503, + "learning_rate": 0.0002, + "loss": 1.8637, + "step": 130 + }, + { + "epoch": 0.11779554059739167, + "grad_norm": 0.41675639152526855, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 140 + }, + { + "epoch": 0.12620950778291964, + "grad_norm": 0.44519832730293274, + "learning_rate": 0.0002, + "loss": 1.8904, + "step": 150 + }, + { + "epoch": 0.13462347496844762, + "grad_norm": 0.4176260530948639, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 160 + }, + { + "epoch": 0.1430374421539756, + "grad_norm": 0.35840365290641785, + "learning_rate": 0.0002, + "loss": 1.8398, + "step": 170 + }, + { + "epoch": 0.15145140933950357, + "grad_norm": 0.3794495463371277, + "learning_rate": 0.0002, + "loss": 1.8666, + "step": 180 + }, + { + "epoch": 0.15986537652503155, + "grad_norm": 0.4563522934913635, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 190 + }, + { + "epoch": 0.16827934371055953, + "grad_norm": 0.37057486176490784, + "learning_rate": 0.0002, + "loss": 1.8893, + "step": 200 + }, + { + "epoch": 0.1766933108960875, + "grad_norm": 0.44081518054008484, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 210 + }, + { + "epoch": 0.18510727808161548, + "grad_norm": 0.46078577637672424, + "learning_rate": 0.0002, + "loss": 1.9048, + "step": 220 + }, + { + "epoch": 0.19352124526714345, + "grad_norm": 0.36132094264030457, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 230 + }, + { + "epoch": 0.20193521245267143, + "grad_norm": 0.3747289180755615, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 240 + }, + { + "epoch": 0.2103491796381994, + "grad_norm": 0.3540179133415222, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 250 + }, + { + "epoch": 0.21876314682372738, + "grad_norm": 0.3461375832557678, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 260 + }, + { + "epoch": 0.22717711400925536, + "grad_norm": 0.3436960279941559, + "learning_rate": 0.0002, + "loss": 1.8509, + "step": 270 + }, + { + "epoch": 0.23559108119478334, + "grad_norm": 0.35403719544410706, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 280 + }, + { + "epoch": 0.2440050483803113, + "grad_norm": 0.37142616510391235, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 290 + }, + { + "epoch": 0.2524190155658393, + "grad_norm": 0.3307955861091614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 300 + }, + { + "epoch": 0.2608329827513673, + "grad_norm": 0.32855314016342163, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 310 + }, + { + "epoch": 0.26924694993689524, + "grad_norm": 0.3299003839492798, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 320 + }, + { + "epoch": 0.27766091712242325, + "grad_norm": 0.44311287999153137, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 330 + }, + { + "epoch": 0.2860748843079512, + "grad_norm": 0.32989758253097534, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 340 + }, + { + "epoch": 0.2944888514934792, + "grad_norm": 0.34400200843811035, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 350 + }, + { + "epoch": 0.30290281867900715, + "grad_norm": 0.36286211013793945, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 360 + }, + { + "epoch": 0.31131678586453515, + "grad_norm": 0.406827837228775, + "learning_rate": 0.0002, + "loss": 1.8025, + "step": 370 + }, + { + "epoch": 0.3197307530500631, + "grad_norm": 0.36299195885658264, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 380 + }, + { + "epoch": 0.3281447202355911, + "grad_norm": 0.3477257192134857, + "learning_rate": 0.0002, + "loss": 1.837, + "step": 390 + }, + { + "epoch": 0.33655868742111905, + "grad_norm": 0.3730369210243225, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 400 + }, + { + "epoch": 0.34497265460664706, + "grad_norm": 0.4644559919834137, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 410 + }, + { + "epoch": 0.353386621792175, + "grad_norm": 0.406576544046402, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 420 + }, + { + "epoch": 0.361800588977703, + "grad_norm": 0.3612699508666992, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 430 + }, + { + "epoch": 0.37021455616323096, + "grad_norm": 0.3243742287158966, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 440 + }, + { + "epoch": 0.37862852334875896, + "grad_norm": 0.36671221256256104, + "learning_rate": 0.0002, + "loss": 1.8851, + "step": 450 + }, + { + "epoch": 0.3870424905342869, + "grad_norm": 0.3565002381801605, + "learning_rate": 0.0002, + "loss": 1.8853, + "step": 460 + }, + { + "epoch": 0.3954564577198149, + "grad_norm": 0.34630221128463745, + "learning_rate": 0.0002, + "loss": 1.8923, + "step": 470 + }, + { + "epoch": 0.40387042490534286, + "grad_norm": 0.3353537321090698, + "learning_rate": 0.0002, + "loss": 1.8234, + "step": 480 + }, + { + "epoch": 0.41228439209087087, + "grad_norm": 0.4015921950340271, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 490 + }, + { + "epoch": 0.4206983592763988, + "grad_norm": 0.5489419102668762, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 500 + }, + { + "epoch": 0.4291123264619268, + "grad_norm": 0.4193589985370636, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 510 + }, + { + "epoch": 0.43752629364745477, + "grad_norm": 0.3418922424316406, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 520 + }, + { + "epoch": 0.44594026083298277, + "grad_norm": 0.32668185234069824, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 530 + }, + { + "epoch": 0.4543542280185107, + "grad_norm": 0.3094325661659241, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 540 + }, + { + "epoch": 0.4627681952040387, + "grad_norm": 0.3743017315864563, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 550 + }, + { + "epoch": 0.47118216238956667, + "grad_norm": 0.3295630216598511, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 560 + }, + { + "epoch": 0.4795961295750947, + "grad_norm": 1.6124513149261475, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 570 + }, + { + "epoch": 0.4880100967606226, + "grad_norm": 0.3245585858821869, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 580 + }, + { + "epoch": 0.49642406394615063, + "grad_norm": 0.3332934081554413, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 590 + }, + { + "epoch": 0.5048380311316786, + "grad_norm": 0.3836138844490051, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 600 + }, + { + "epoch": 0.5132519983172066, + "grad_norm": 0.32953888177871704, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 610 + }, + { + "epoch": 0.5216659655027346, + "grad_norm": 0.36291512846946716, + "learning_rate": 0.0002, + "loss": 1.7729, + "step": 620 + }, + { + "epoch": 0.5300799326882625, + "grad_norm": 0.3237783908843994, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 630 + }, + { + "epoch": 0.5384938998737905, + "grad_norm": 0.38882696628570557, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 640 + }, + { + "epoch": 0.5469078670593185, + "grad_norm": 0.37821972370147705, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 650 + }, + { + "epoch": 0.5553218342448465, + "grad_norm": 0.3556285500526428, + "learning_rate": 0.0002, + "loss": 1.8075, + "step": 660 + }, + { + "epoch": 0.5637358014303744, + "grad_norm": 0.347499281167984, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 670 + }, + { + "epoch": 0.5721497686159024, + "grad_norm": 0.3176489472389221, + "learning_rate": 0.0002, + "loss": 1.8066, + "step": 680 + }, + { + "epoch": 0.5805637358014304, + "grad_norm": 0.30220088362693787, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 690 + }, + { + "epoch": 0.5889777029869584, + "grad_norm": 0.3711601793766022, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 700 + }, + { + "epoch": 0.5973916701724863, + "grad_norm": 0.3311759829521179, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 710 + }, + { + "epoch": 0.6058056373580143, + "grad_norm": 0.34824270009994507, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 720 + }, + { + "epoch": 0.6142196045435423, + "grad_norm": 0.29668381810188293, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 730 + }, + { + "epoch": 0.6226335717290703, + "grad_norm": 0.36087489128112793, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 740 + }, + { + "epoch": 0.6310475389145982, + "grad_norm": 0.31590089201927185, + "learning_rate": 0.0002, + "loss": 1.7956, + "step": 750 + }, + { + "epoch": 0.6394615061001262, + "grad_norm": 0.37632957100868225, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 760 + }, + { + "epoch": 0.6478754732856542, + "grad_norm": 0.3360748589038849, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 770 + }, + { + "epoch": 0.6562894404711822, + "grad_norm": 0.3420640528202057, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 780 + }, + { + "epoch": 0.6647034076567101, + "grad_norm": 0.5734959244728088, + "learning_rate": 0.0002, + "loss": 1.8353, + "step": 790 + }, + { + "epoch": 0.6731173748422381, + "grad_norm": 0.36440837383270264, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 800 + }, + { + "epoch": 0.6815313420277661, + "grad_norm": 0.3179708421230316, + "learning_rate": 0.0002, + "loss": 1.7532, + "step": 810 + }, + { + "epoch": 0.6899453092132941, + "grad_norm": 0.34122881293296814, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 820 + }, + { + "epoch": 0.698359276398822, + "grad_norm": 0.31886112689971924, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 830 + }, + { + "epoch": 0.70677324358435, + "grad_norm": 0.31782326102256775, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 840 + }, + { + "epoch": 0.715187210769878, + "grad_norm": 0.36052989959716797, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 850 + }, + { + "epoch": 0.723601177955406, + "grad_norm": 0.28946155309677124, + "learning_rate": 0.0002, + "loss": 1.7891, + "step": 860 + }, + { + "epoch": 0.7320151451409339, + "grad_norm": 0.3095663785934448, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 870 + }, + { + "epoch": 0.7404291123264619, + "grad_norm": 0.3317491412162781, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 880 + }, + { + "epoch": 0.7488430795119899, + "grad_norm": 0.31324660778045654, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 890 + }, + { + "epoch": 0.7572570466975179, + "grad_norm": 0.3290475606918335, + "learning_rate": 0.0002, + "loss": 1.8753, + "step": 900 + }, + { + "epoch": 0.7656710138830458, + "grad_norm": 0.35690343379974365, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 910 + }, + { + "epoch": 0.7740849810685738, + "grad_norm": 0.39558273553848267, + "learning_rate": 0.0002, + "loss": 1.826, + "step": 920 + }, + { + "epoch": 0.7824989482541018, + "grad_norm": 0.34254348278045654, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 930 + }, + { + "epoch": 0.7909129154396298, + "grad_norm": 0.3560165464878082, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 940 + }, + { + "epoch": 0.7993268826251577, + "grad_norm": 0.30693164467811584, + "learning_rate": 0.0002, + "loss": 1.7992, + "step": 950 + }, + { + "epoch": 0.8077408498106857, + "grad_norm": 0.3394823372364044, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 960 + }, + { + "epoch": 0.8161548169962137, + "grad_norm": 0.3741514980792999, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 970 + }, + { + "epoch": 0.8245687841817417, + "grad_norm": 0.3655228316783905, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 980 + }, + { + "epoch": 0.8329827513672696, + "grad_norm": 0.3586033880710602, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 990 + }, + { + "epoch": 0.8413967185527976, + "grad_norm": 0.3459678888320923, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1000 + }, + { + "epoch": 0.8498106857383256, + "grad_norm": 0.3184349834918976, + "learning_rate": 0.0002, + "loss": 1.8498, + "step": 1010 + }, + { + "epoch": 0.8582246529238536, + "grad_norm": 0.3099786043167114, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 1020 + }, + { + "epoch": 0.8666386201093815, + "grad_norm": 0.30300915241241455, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 1030 + }, + { + "epoch": 0.8750525872949095, + "grad_norm": 0.3128705620765686, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 1040 + }, + { + "epoch": 0.8834665544804375, + "grad_norm": 0.3336263597011566, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1050 + }, + { + "epoch": 0.8918805216659655, + "grad_norm": 0.3801328241825104, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 1060 + }, + { + "epoch": 0.9002944888514934, + "grad_norm": 0.3122096359729767, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 1070 + }, + { + "epoch": 0.9087084560370214, + "grad_norm": 0.35990869998931885, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 1080 + }, + { + "epoch": 0.9171224232225494, + "grad_norm": 0.3321819305419922, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1090 + }, + { + "epoch": 0.9255363904080774, + "grad_norm": 0.4202139377593994, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1100 + }, + { + "epoch": 0.9339503575936053, + "grad_norm": 0.32559722661972046, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 1110 + }, + { + "epoch": 0.9423643247791333, + "grad_norm": 0.3098459839820862, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 1120 + }, + { + "epoch": 0.9507782919646613, + "grad_norm": 0.33917108178138733, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1130 + }, + { + "epoch": 0.9591922591501894, + "grad_norm": 0.4055837094783783, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1140 + }, + { + "epoch": 0.9676062263357172, + "grad_norm": 0.32508623600006104, + "learning_rate": 0.0002, + "loss": 1.8259, + "step": 1150 + }, + { + "epoch": 0.9760201935212452, + "grad_norm": 0.30150601267814636, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9844341607067733, + "grad_norm": 0.3042563199996948, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 1170 + }, + { + "epoch": 0.9928481278923013, + "grad_norm": 0.33254584670066833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1180 + }, + { + "epoch": 0.9995793016407236, + "eval_loss": 1.8077726364135742, + "eval_runtime": 38.4359, + "eval_samples_per_second": 13.399, + "eval_steps_per_second": 1.691, + "step": 1188 + }, + { + "epoch": 1.0012620950778293, + "grad_norm": 0.35073035955429077, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 1190 + }, + { + "epoch": 1.0096760622633572, + "grad_norm": 0.3217269778251648, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1200 + }, + { + "epoch": 1.018090029448885, + "grad_norm": 0.3635033369064331, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1210 + }, + { + "epoch": 1.0265039966344132, + "grad_norm": 0.32468414306640625, + "learning_rate": 0.0002, + "loss": 1.6949, + "step": 1220 + }, + { + "epoch": 1.034917963819941, + "grad_norm": 0.3307163417339325, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 1230 + }, + { + "epoch": 1.0433319310054692, + "grad_norm": 0.34381359815597534, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 1240 + }, + { + "epoch": 1.051745898190997, + "grad_norm": 0.35874804854393005, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 1250 + }, + { + "epoch": 1.060159865376525, + "grad_norm": 0.3615919351577759, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1260 + }, + { + "epoch": 1.068573832562053, + "grad_norm": 0.32835808396339417, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1270 + }, + { + "epoch": 1.076987799747581, + "grad_norm": 0.3876388370990753, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1280 + }, + { + "epoch": 1.0854017669331089, + "grad_norm": 0.39895930886268616, + "learning_rate": 0.0002, + "loss": 1.7442, + "step": 1290 + }, + { + "epoch": 1.093815734118637, + "grad_norm": 0.39081698656082153, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 1300 + }, + { + "epoch": 1.1022297013041649, + "grad_norm": 0.39974215626716614, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1310 + }, + { + "epoch": 1.110643668489693, + "grad_norm": 0.3887332081794739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1320 + }, + { + "epoch": 1.1190576356752209, + "grad_norm": 0.36216408014297485, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1330 + }, + { + "epoch": 1.1274716028607488, + "grad_norm": 0.36979028582572937, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1340 + }, + { + "epoch": 1.1358855700462769, + "grad_norm": 0.34052133560180664, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 1350 + }, + { + "epoch": 1.1442995372318048, + "grad_norm": 0.3467716574668884, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1360 + }, + { + "epoch": 1.1527135044173327, + "grad_norm": 0.35528799891471863, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1370 + }, + { + "epoch": 1.1611274716028608, + "grad_norm": 0.36282262206077576, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 1380 + }, + { + "epoch": 1.1695414387883887, + "grad_norm": 0.37355899810791016, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1390 + }, + { + "epoch": 1.1779554059739168, + "grad_norm": 0.37292736768722534, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1400 + }, + { + "epoch": 1.1863693731594447, + "grad_norm": 0.5892812013626099, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1410 + }, + { + "epoch": 1.1947833403449726, + "grad_norm": 0.3712292015552521, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 1420 + }, + { + "epoch": 1.2031973075305007, + "grad_norm": 0.3349577486515045, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1430 + }, + { + "epoch": 1.2116112747160286, + "grad_norm": 0.32591062784194946, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1440 + }, + { + "epoch": 1.2200252419015567, + "grad_norm": 0.3840635418891907, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 1450 + }, + { + "epoch": 1.2284392090870846, + "grad_norm": 0.37238365411758423, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1460 + }, + { + "epoch": 1.2368531762726125, + "grad_norm": 0.3731217682361603, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 1470 + }, + { + "epoch": 1.2452671434581406, + "grad_norm": 0.3318967819213867, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 1480 + }, + { + "epoch": 1.2536811106436685, + "grad_norm": 0.3784034848213196, + "learning_rate": 0.0002, + "loss": 1.7463, + "step": 1490 + }, + { + "epoch": 1.2620950778291964, + "grad_norm": 0.3541383147239685, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 1500 + }, + { + "epoch": 1.2705090450147245, + "grad_norm": 0.35312485694885254, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1510 + }, + { + "epoch": 1.2789230122002524, + "grad_norm": 0.35272929072380066, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1520 + }, + { + "epoch": 1.2873369793857803, + "grad_norm": 0.40988272428512573, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1530 + }, + { + "epoch": 1.2957509465713084, + "grad_norm": 0.3543946146965027, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1540 + }, + { + "epoch": 1.3041649137568363, + "grad_norm": 0.35639145970344543, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1550 + }, + { + "epoch": 1.3125788809423642, + "grad_norm": 0.3290826678276062, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1560 + }, + { + "epoch": 1.3209928481278923, + "grad_norm": 0.39264336228370667, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 1570 + }, + { + "epoch": 1.3294068153134202, + "grad_norm": 0.5390415191650391, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1580 + }, + { + "epoch": 1.3378207824989483, + "grad_norm": 0.5188116431236267, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1590 + }, + { + "epoch": 1.3462347496844762, + "grad_norm": 0.37445148825645447, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 1600 + }, + { + "epoch": 1.3546487168700043, + "grad_norm": 0.3296085298061371, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1610 + }, + { + "epoch": 1.3630626840555322, + "grad_norm": 0.39879581332206726, + "learning_rate": 0.0002, + "loss": 1.8107, + "step": 1620 + }, + { + "epoch": 1.37147665124106, + "grad_norm": 0.36092764139175415, + "learning_rate": 0.0002, + "loss": 1.6744, + "step": 1630 + }, + { + "epoch": 1.3798906184265882, + "grad_norm": 0.37011823058128357, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 1640 + }, + { + "epoch": 1.3883045856121161, + "grad_norm": 0.40863534808158875, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1650 + }, + { + "epoch": 1.396718552797644, + "grad_norm": 0.337001770734787, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 1660 + }, + { + "epoch": 1.4051325199831721, + "grad_norm": 0.35596707463264465, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1670 + }, + { + "epoch": 1.4135464871687, + "grad_norm": 0.3857671916484833, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 1680 + }, + { + "epoch": 1.421960454354228, + "grad_norm": 0.419502317905426, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 1690 + }, + { + "epoch": 1.430374421539756, + "grad_norm": 0.35459452867507935, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 1700 + }, + { + "epoch": 1.438788388725284, + "grad_norm": 0.37246978282928467, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 1710 + }, + { + "epoch": 1.4472023559108118, + "grad_norm": 0.33091893792152405, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 1720 + }, + { + "epoch": 1.45561632309634, + "grad_norm": 0.37029674649238586, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1730 + }, + { + "epoch": 1.4640302902818678, + "grad_norm": 0.374025821685791, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1740 + }, + { + "epoch": 1.472444257467396, + "grad_norm": 0.3416315019130707, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1750 + }, + { + "epoch": 1.4808582246529238, + "grad_norm": 0.36502841114997864, + "learning_rate": 0.0002, + "loss": 1.7093, + "step": 1760 + }, + { + "epoch": 1.489272191838452, + "grad_norm": 0.35458803176879883, + "learning_rate": 0.0002, + "loss": 1.6597, + "step": 1770 + }, + { + "epoch": 1.4976861590239798, + "grad_norm": 0.4462839663028717, + "learning_rate": 0.0002, + "loss": 1.675, + "step": 1780 + }, + { + "epoch": 1.5061001262095077, + "grad_norm": 0.34836092591285706, + "learning_rate": 0.0002, + "loss": 1.7267, + "step": 1790 + }, + { + "epoch": 1.5145140933950358, + "grad_norm": 0.3445749282836914, + "learning_rate": 0.0002, + "loss": 1.7295, + "step": 1800 + }, + { + "epoch": 1.5229280605805637, + "grad_norm": 0.36012160778045654, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1810 + }, + { + "epoch": 1.5313420277660916, + "grad_norm": 0.4052616059780121, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 1820 + }, + { + "epoch": 1.5397559949516197, + "grad_norm": 0.3966905474662781, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1830 + }, + { + "epoch": 1.5481699621371476, + "grad_norm": 0.35028719902038574, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1840 + }, + { + "epoch": 1.5565839293226755, + "grad_norm": 0.3936742842197418, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1850 + }, + { + "epoch": 1.5649978965082036, + "grad_norm": 0.34473296999931335, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 1860 + }, + { + "epoch": 1.5734118636937318, + "grad_norm": 0.4328365623950958, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1870 + }, + { + "epoch": 1.5818258308792594, + "grad_norm": 0.3566315472126007, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1880 + }, + { + "epoch": 1.5902397980647875, + "grad_norm": 0.3301256597042084, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 1890 + }, + { + "epoch": 1.5986537652503157, + "grad_norm": 0.3743041455745697, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 1900 + }, + { + "epoch": 1.6070677324358436, + "grad_norm": 0.3735344707965851, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 1910 + }, + { + "epoch": 1.6154816996213714, + "grad_norm": 0.42191144824028015, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1920 + }, + { + "epoch": 1.6238956668068996, + "grad_norm": 0.3787207305431366, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1930 + }, + { + "epoch": 1.6323096339924275, + "grad_norm": 0.35647350549697876, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1940 + }, + { + "epoch": 1.6407236011779553, + "grad_norm": 0.39791446924209595, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 1950 + }, + { + "epoch": 1.6491375683634835, + "grad_norm": 0.37341275811195374, + "learning_rate": 0.0002, + "loss": 1.7293, + "step": 1960 + }, + { + "epoch": 1.6575515355490114, + "grad_norm": 0.3722686469554901, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1970 + }, + { + "epoch": 1.6659655027345392, + "grad_norm": 0.37467387318611145, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 1980 + }, + { + "epoch": 1.6743794699200674, + "grad_norm": 0.37109461426734924, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 1990 + }, + { + "epoch": 1.6827934371055953, + "grad_norm": 0.4008837044239044, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 2000 + }, + { + "epoch": 1.6912074042911232, + "grad_norm": 0.3316999673843384, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2010 + }, + { + "epoch": 1.6996213714766513, + "grad_norm": 0.3683805465698242, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 2020 + }, + { + "epoch": 1.7080353386621794, + "grad_norm": 0.4163658320903778, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2030 + }, + { + "epoch": 1.716449305847707, + "grad_norm": 0.4245431125164032, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 2040 + }, + { + "epoch": 1.7248632730332352, + "grad_norm": 0.36732038855552673, + "learning_rate": 0.0002, + "loss": 1.7184, + "step": 2050 + }, + { + "epoch": 1.7332772402187633, + "grad_norm": 0.34981656074523926, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 2060 + }, + { + "epoch": 1.7416912074042912, + "grad_norm": 0.38588812947273254, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 2070 + }, + { + "epoch": 1.750105174589819, + "grad_norm": 0.39914557337760925, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 2080 + }, + { + "epoch": 1.7585191417753472, + "grad_norm": 0.36068692803382874, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 2090 + }, + { + "epoch": 1.766933108960875, + "grad_norm": 0.3983287215232849, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 2100 + }, + { + "epoch": 1.775347076146403, + "grad_norm": 0.45008400082588196, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2110 + }, + { + "epoch": 1.783761043331931, + "grad_norm": 0.3618052303791046, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2120 + }, + { + "epoch": 1.792175010517459, + "grad_norm": 0.38745400309562683, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 2130 + }, + { + "epoch": 1.8005889777029869, + "grad_norm": 0.3413826525211334, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 2140 + }, + { + "epoch": 1.809002944888515, + "grad_norm": 0.35983747243881226, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 2150 + }, + { + "epoch": 1.8174169120740429, + "grad_norm": 0.40926849842071533, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 2160 + }, + { + "epoch": 1.8258308792595708, + "grad_norm": 0.3543093800544739, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2170 + }, + { + "epoch": 1.8342448464450989, + "grad_norm": 0.42690935730934143, + "learning_rate": 0.0002, + "loss": 1.7812, + "step": 2180 + }, + { + "epoch": 1.842658813630627, + "grad_norm": 0.40282756090164185, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 2190 + }, + { + "epoch": 1.8510727808161547, + "grad_norm": 0.36568400263786316, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 2200 + }, + { + "epoch": 1.8594867480016828, + "grad_norm": 0.43159013986587524, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 2210 + }, + { + "epoch": 1.867900715187211, + "grad_norm": 0.3554118573665619, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 2220 + }, + { + "epoch": 1.8763146823727388, + "grad_norm": 0.43349072337150574, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 2230 + }, + { + "epoch": 1.8847286495582667, + "grad_norm": 0.36486536264419556, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 2240 + }, + { + "epoch": 1.8931426167437948, + "grad_norm": 0.39260047674179077, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 2250 + }, + { + "epoch": 1.9015565839293227, + "grad_norm": 0.3741776943206787, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 2260 + }, + { + "epoch": 1.9099705511148506, + "grad_norm": 0.3961946964263916, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2270 + }, + { + "epoch": 1.9183845183003787, + "grad_norm": 0.3659731149673462, + "learning_rate": 0.0002, + "loss": 1.737, + "step": 2280 + }, + { + "epoch": 1.9267984854859066, + "grad_norm": 0.34744107723236084, + "learning_rate": 0.0002, + "loss": 1.7342, + "step": 2290 + }, + { + "epoch": 1.9352124526714345, + "grad_norm": 0.3607442378997803, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2300 + }, + { + "epoch": 1.9436264198569626, + "grad_norm": 0.331464558839798, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 2310 + }, + { + "epoch": 1.9520403870424905, + "grad_norm": 0.3904414474964142, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2320 + }, + { + "epoch": 1.9604543542280184, + "grad_norm": 0.37584832310676575, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 2330 + }, + { + "epoch": 1.9688683214135465, + "grad_norm": 0.3698684275150299, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 2340 + }, + { + "epoch": 1.9772822885990746, + "grad_norm": 0.40571412444114685, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2350 + }, + { + "epoch": 1.9856962557846023, + "grad_norm": 0.40059587359428406, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 2360 + }, + { + "epoch": 1.9941102229701304, + "grad_norm": 0.4168248474597931, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2370 + }, + { + "epoch": 2.0, + "eval_loss": 1.8055059909820557, + "eval_runtime": 38.422, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.692, + "step": 2377 + }, + { + "epoch": 2.0025241901556585, + "grad_norm": 0.35205352306365967, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 2380 + }, + { + "epoch": 2.010938157341186, + "grad_norm": 0.3979377746582031, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2390 + }, + { + "epoch": 2.0193521245267143, + "grad_norm": 0.396491676568985, + "learning_rate": 0.0002, + "loss": 1.6421, + "step": 2400 + }, + { + "epoch": 2.0277660917122424, + "grad_norm": 0.44712209701538086, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 2410 + }, + { + "epoch": 2.03618005889777, + "grad_norm": 0.4454420208930969, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 2420 + }, + { + "epoch": 2.044594026083298, + "grad_norm": 0.4170038402080536, + "learning_rate": 0.0002, + "loss": 1.6635, + "step": 2430 + }, + { + "epoch": 2.0530079932688263, + "grad_norm": 0.4309595227241516, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 2440 + }, + { + "epoch": 2.0614219604543544, + "grad_norm": 0.4241602122783661, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 2450 + }, + { + "epoch": 2.069835927639882, + "grad_norm": 0.4370540678501129, + "learning_rate": 0.0002, + "loss": 1.6162, + "step": 2460 + }, + { + "epoch": 2.0782498948254102, + "grad_norm": 0.43985554575920105, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 2470 + }, + { + "epoch": 2.0866638620109383, + "grad_norm": 0.4158105254173279, + "learning_rate": 0.0002, + "loss": 1.6954, + "step": 2480 + }, + { + "epoch": 2.095077829196466, + "grad_norm": 0.441549152135849, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 2490 + }, + { + "epoch": 2.103491796381994, + "grad_norm": 0.385718435049057, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 2500 + }, + { + "epoch": 2.1119057635675222, + "grad_norm": 0.43146514892578125, + "learning_rate": 0.0002, + "loss": 1.5894, + "step": 2510 + }, + { + "epoch": 2.12031973075305, + "grad_norm": 0.41663315892219543, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 2520 + }, + { + "epoch": 2.128733697938578, + "grad_norm": 0.4410698115825653, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2530 + }, + { + "epoch": 2.137147665124106, + "grad_norm": 0.4472278952598572, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 2540 + }, + { + "epoch": 2.145561632309634, + "grad_norm": 0.3879167437553406, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 2550 + }, + { + "epoch": 2.153975599495162, + "grad_norm": 0.4212203025817871, + "learning_rate": 0.0002, + "loss": 1.6682, + "step": 2560 + }, + { + "epoch": 2.16238956668069, + "grad_norm": 0.42841723561286926, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2570 + }, + { + "epoch": 2.1708035338662177, + "grad_norm": 0.39272481203079224, + "learning_rate": 0.0002, + "loss": 1.5962, + "step": 2580 + }, + { + "epoch": 2.179217501051746, + "grad_norm": 0.4075261354446411, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 2590 + }, + { + "epoch": 2.187631468237274, + "grad_norm": 0.5358437895774841, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 2600 + }, + { + "epoch": 2.1960454354228016, + "grad_norm": 0.4738350212574005, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 2610 + }, + { + "epoch": 2.2044594026083297, + "grad_norm": 0.446789026260376, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2620 + }, + { + "epoch": 2.212873369793858, + "grad_norm": 0.4615374505519867, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 2630 + }, + { + "epoch": 2.221287336979386, + "grad_norm": 0.46901994943618774, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 2640 + }, + { + "epoch": 2.2297013041649136, + "grad_norm": 0.46267789602279663, + "learning_rate": 0.0002, + "loss": 1.6774, + "step": 2650 + }, + { + "epoch": 2.2381152713504417, + "grad_norm": 0.4383080005645752, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 2660 + }, + { + "epoch": 2.24652923853597, + "grad_norm": 0.4070609509944916, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 2670 + }, + { + "epoch": 2.2549432057214975, + "grad_norm": 0.4572339951992035, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 2680 + }, + { + "epoch": 2.2633571729070256, + "grad_norm": 0.393265038728714, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 2690 + }, + { + "epoch": 2.2717711400925538, + "grad_norm": 0.46144717931747437, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 2700 + }, + { + "epoch": 2.2801851072780814, + "grad_norm": 0.45077767968177795, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 2710 + }, + { + "epoch": 2.2885990744636096, + "grad_norm": 0.5697639584541321, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 2720 + }, + { + "epoch": 2.2970130416491377, + "grad_norm": 0.4855510890483856, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 2730 + }, + { + "epoch": 2.3054270088346653, + "grad_norm": 0.4440622627735138, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 2740 + }, + { + "epoch": 2.3138409760201935, + "grad_norm": 0.3904096782207489, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 2750 + }, + { + "epoch": 2.3222549432057216, + "grad_norm": 0.5225510597229004, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2760 + }, + { + "epoch": 2.3306689103912497, + "grad_norm": 0.44866397976875305, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 2770 + }, + { + "epoch": 2.3390828775767774, + "grad_norm": 0.5167056322097778, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 2780 + }, + { + "epoch": 2.3474968447623055, + "grad_norm": 0.45913267135620117, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 2790 + }, + { + "epoch": 2.3559108119478336, + "grad_norm": 0.45787590742111206, + "learning_rate": 0.0002, + "loss": 1.6564, + "step": 2800 + }, + { + "epoch": 2.3643247791333613, + "grad_norm": 0.4633352756500244, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 2810 + }, + { + "epoch": 2.3727387463188894, + "grad_norm": 0.46390071511268616, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 2820 + }, + { + "epoch": 2.3811527135044175, + "grad_norm": 0.4261005222797394, + "learning_rate": 0.0002, + "loss": 1.6039, + "step": 2830 + }, + { + "epoch": 2.389566680689945, + "grad_norm": 0.4283634424209595, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 2840 + }, + { + "epoch": 2.3979806478754733, + "grad_norm": 0.4955291450023651, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 2850 + }, + { + "epoch": 2.4063946150610014, + "grad_norm": 0.4740189015865326, + "learning_rate": 0.0002, + "loss": 1.6173, + "step": 2860 + }, + { + "epoch": 2.414808582246529, + "grad_norm": 0.4222276508808136, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2870 + }, + { + "epoch": 2.423222549432057, + "grad_norm": 0.4982149004936218, + "learning_rate": 0.0002, + "loss": 1.5602, + "step": 2880 + }, + { + "epoch": 2.4316365166175853, + "grad_norm": 0.5217409133911133, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2890 + }, + { + "epoch": 2.4400504838031134, + "grad_norm": 0.4555884897708893, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 2900 + }, + { + "epoch": 2.448464450988641, + "grad_norm": 0.43178579211235046, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2910 + }, + { + "epoch": 2.456878418174169, + "grad_norm": 0.4788478910923004, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2920 + }, + { + "epoch": 2.465292385359697, + "grad_norm": 0.43689873814582825, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2930 + }, + { + "epoch": 2.473706352545225, + "grad_norm": 0.5115197896957397, + "learning_rate": 0.0002, + "loss": 1.6196, + "step": 2940 + }, + { + "epoch": 2.482120319730753, + "grad_norm": 0.5290159583091736, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 2950 + }, + { + "epoch": 2.490534286916281, + "grad_norm": 0.46042463183403015, + "learning_rate": 0.0002, + "loss": 1.6499, + "step": 2960 + }, + { + "epoch": 2.498948254101809, + "grad_norm": 0.4359915852546692, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 2970 + }, + { + "epoch": 2.507362221287337, + "grad_norm": 0.46352964639663696, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 2980 + }, + { + "epoch": 2.515776188472865, + "grad_norm": 0.5324268341064453, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2990 + }, + { + "epoch": 2.5241901556583928, + "grad_norm": 0.5929607152938843, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 3000 + }, + { + "epoch": 2.532604122843921, + "grad_norm": 0.4811333417892456, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 3010 + }, + { + "epoch": 2.541018090029449, + "grad_norm": 0.4662701487541199, + "learning_rate": 0.0002, + "loss": 1.7023, + "step": 3020 + }, + { + "epoch": 2.549432057214977, + "grad_norm": 0.4582270681858063, + "learning_rate": 0.0002, + "loss": 1.5426, + "step": 3030 + }, + { + "epoch": 2.557846024400505, + "grad_norm": 0.4679982662200928, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 3040 + }, + { + "epoch": 2.566259991586033, + "grad_norm": 0.4380294680595398, + "learning_rate": 0.0002, + "loss": 1.5442, + "step": 3050 + }, + { + "epoch": 2.5746739587715606, + "grad_norm": 0.44295763969421387, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 3060 + }, + { + "epoch": 2.5830879259570887, + "grad_norm": 0.5131027698516846, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3070 + }, + { + "epoch": 2.591501893142617, + "grad_norm": 0.47567516565322876, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 3080 + }, + { + "epoch": 2.599915860328145, + "grad_norm": 0.49002596735954285, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 3090 + }, + { + "epoch": 2.6083298275136726, + "grad_norm": 0.44856327772140503, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 3100 + }, + { + "epoch": 2.6167437946992007, + "grad_norm": 0.4480142593383789, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 3110 + }, + { + "epoch": 2.6251577618847284, + "grad_norm": 0.4317494034767151, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 3120 + }, + { + "epoch": 2.6335717290702565, + "grad_norm": 0.42580848932266235, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 3130 + }, + { + "epoch": 2.6419856962557846, + "grad_norm": 0.4516814947128296, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 3140 + }, + { + "epoch": 2.6503996634413127, + "grad_norm": 0.4438435733318329, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 3150 + }, + { + "epoch": 2.6588136306268404, + "grad_norm": 0.4385356307029724, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 3160 + }, + { + "epoch": 2.6672275978123685, + "grad_norm": 0.5064112544059753, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 3170 + }, + { + "epoch": 2.6756415649978966, + "grad_norm": 0.49163177609443665, + "learning_rate": 0.0002, + "loss": 1.7189, + "step": 3180 + }, + { + "epoch": 2.6840555321834243, + "grad_norm": 0.49339258670806885, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 3190 + }, + { + "epoch": 2.6924694993689524, + "grad_norm": 0.440950870513916, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 3200 + }, + { + "epoch": 2.7008834665544805, + "grad_norm": 0.4283970594406128, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 3210 + }, + { + "epoch": 2.7092974337400086, + "grad_norm": 0.43875712156295776, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 3220 + }, + { + "epoch": 2.7177114009255363, + "grad_norm": 0.49332964420318604, + "learning_rate": 0.0002, + "loss": 1.6129, + "step": 3230 + }, + { + "epoch": 2.7261253681110644, + "grad_norm": 0.5225692391395569, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 3240 + }, + { + "epoch": 2.734539335296592, + "grad_norm": 0.4856489300727844, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 3250 + }, + { + "epoch": 2.74295330248212, + "grad_norm": 0.46918296813964844, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 3260 + }, + { + "epoch": 2.7513672696676483, + "grad_norm": 0.4802931249141693, + "learning_rate": 0.0002, + "loss": 1.6819, + "step": 3270 + }, + { + "epoch": 2.7597812368531764, + "grad_norm": 0.4485355615615845, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 3280 + }, + { + "epoch": 2.768195204038704, + "grad_norm": 0.43944594264030457, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 3290 + }, + { + "epoch": 2.7766091712242322, + "grad_norm": 0.46847742795944214, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 3300 + }, + { + "epoch": 2.7850231384097603, + "grad_norm": 0.4816027879714966, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 3310 + }, + { + "epoch": 2.793437105595288, + "grad_norm": 0.453960120677948, + "learning_rate": 0.0002, + "loss": 1.6293, + "step": 3320 + }, + { + "epoch": 2.801851072780816, + "grad_norm": 0.4816017150878906, + "learning_rate": 0.0002, + "loss": 1.6429, + "step": 3330 + }, + { + "epoch": 2.8102650399663442, + "grad_norm": 0.4461034834384918, + "learning_rate": 0.0002, + "loss": 1.6683, + "step": 3340 + }, + { + "epoch": 2.8186790071518724, + "grad_norm": 0.48821821808815, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 3350 + }, + { + "epoch": 2.8270929743374, + "grad_norm": 0.4574853777885437, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3360 + }, + { + "epoch": 2.835506941522928, + "grad_norm": 0.42062026262283325, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 3370 + }, + { + "epoch": 2.843920908708456, + "grad_norm": 0.4499834477901459, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 3380 + }, + { + "epoch": 2.852334875893984, + "grad_norm": 0.4780360758304596, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 3390 + }, + { + "epoch": 2.860748843079512, + "grad_norm": 0.45422887802124023, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 3400 + }, + { + "epoch": 2.86916281026504, + "grad_norm": 0.4590015709400177, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 3410 + }, + { + "epoch": 2.877576777450568, + "grad_norm": 0.45689624547958374, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 3420 + }, + { + "epoch": 2.885990744636096, + "grad_norm": 0.46953922510147095, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3430 + }, + { + "epoch": 2.8944047118216236, + "grad_norm": 0.4791966378688812, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 3440 + }, + { + "epoch": 2.9028186790071517, + "grad_norm": 0.4842296242713928, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 3450 + }, + { + "epoch": 2.91123264619268, + "grad_norm": 0.47219768166542053, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3460 + }, + { + "epoch": 2.919646613378208, + "grad_norm": 0.4622127115726471, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 3470 + }, + { + "epoch": 2.9280605805637356, + "grad_norm": 0.46832820773124695, + "learning_rate": 0.0002, + "loss": 1.6485, + "step": 3480 + }, + { + "epoch": 2.9364745477492638, + "grad_norm": 0.44582483172416687, + "learning_rate": 0.0002, + "loss": 1.6366, + "step": 3490 + }, + { + "epoch": 2.944888514934792, + "grad_norm": 0.4987219274044037, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 3500 + }, + { + "epoch": 2.9533024821203195, + "grad_norm": 0.43750956654548645, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 3510 + }, + { + "epoch": 2.9617164493058477, + "grad_norm": 0.49962925910949707, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 3520 + }, + { + "epoch": 2.9701304164913758, + "grad_norm": 0.5189590454101562, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 3530 + }, + { + "epoch": 2.978544383676904, + "grad_norm": 0.391317754983902, + "learning_rate": 0.0002, + "loss": 1.6688, + "step": 3540 + }, + { + "epoch": 2.9869583508624316, + "grad_norm": 0.44934695959091187, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 3550 + }, + { + "epoch": 2.9953723180479597, + "grad_norm": 0.4740142226219177, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 3560 + }, + { + "epoch": 2.9995793016407237, + "eval_loss": 1.8266887664794922, + "eval_runtime": 37.9445, + "eval_samples_per_second": 13.572, + "eval_steps_per_second": 1.713, + "step": 3565 + }, + { + "epoch": 3.003786285233488, + "grad_norm": 0.4523724615573883, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 3570 + }, + { + "epoch": 3.0122002524190155, + "grad_norm": 0.5261380076408386, + "learning_rate": 0.0002, + "loss": 1.526, + "step": 3580 + }, + { + "epoch": 3.0206142196045436, + "grad_norm": 0.48664888739585876, + "learning_rate": 0.0002, + "loss": 1.4946, + "step": 3590 + }, + { + "epoch": 3.0290281867900717, + "grad_norm": 0.5070882439613342, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 3600 + }, + { + "epoch": 3.0374421539755994, + "grad_norm": 0.5816011428833008, + "learning_rate": 0.0002, + "loss": 1.5316, + "step": 3610 + }, + { + "epoch": 3.0458561211611275, + "grad_norm": 0.6610211730003357, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 3620 + }, + { + "epoch": 3.0542700883466556, + "grad_norm": 0.5257703065872192, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 3630 + }, + { + "epoch": 3.0626840555321833, + "grad_norm": 0.5574390888214111, + "learning_rate": 0.0002, + "loss": 1.4438, + "step": 3640 + }, + { + "epoch": 3.0710980227177114, + "grad_norm": 0.5682297348976135, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3650 + }, + { + "epoch": 3.0795119899032395, + "grad_norm": 0.5798383355140686, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 3660 + }, + { + "epoch": 3.087925957088767, + "grad_norm": 0.5458289980888367, + "learning_rate": 0.0002, + "loss": 1.4339, + "step": 3670 + }, + { + "epoch": 3.0963399242742953, + "grad_norm": 0.5599102973937988, + "learning_rate": 0.0002, + "loss": 1.46, + "step": 3680 + }, + { + "epoch": 3.1047538914598234, + "grad_norm": 0.5023021697998047, + "learning_rate": 0.0002, + "loss": 1.4589, + "step": 3690 + }, + { + "epoch": 3.113167858645351, + "grad_norm": 0.5448206067085266, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 3700 + }, + { + "epoch": 3.121581825830879, + "grad_norm": 0.5760458707809448, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 3710 + }, + { + "epoch": 3.1299957930164073, + "grad_norm": 0.6018968224525452, + "learning_rate": 0.0002, + "loss": 1.4789, + "step": 3720 + }, + { + "epoch": 3.1384097602019354, + "grad_norm": 0.5767101049423218, + "learning_rate": 0.0002, + "loss": 1.5518, + "step": 3730 + }, + { + "epoch": 3.146823727387463, + "grad_norm": 0.5333963632583618, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 3740 + }, + { + "epoch": 3.155237694572991, + "grad_norm": 0.5918396711349487, + "learning_rate": 0.0002, + "loss": 1.4812, + "step": 3750 + }, + { + "epoch": 3.1636516617585193, + "grad_norm": 0.5931203365325928, + "learning_rate": 0.0002, + "loss": 1.4618, + "step": 3760 + }, + { + "epoch": 3.172065628944047, + "grad_norm": 0.6562168598175049, + "learning_rate": 0.0002, + "loss": 1.5592, + "step": 3770 + }, + { + "epoch": 3.180479596129575, + "grad_norm": 0.5820156335830688, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 3780 + }, + { + "epoch": 3.188893563315103, + "grad_norm": 0.5784737467765808, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 3790 + }, + { + "epoch": 3.197307530500631, + "grad_norm": 0.5506529808044434, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 3800 + }, + { + "epoch": 3.205721497686159, + "grad_norm": 0.6101595163345337, + "learning_rate": 0.0002, + "loss": 1.4819, + "step": 3810 + }, + { + "epoch": 3.214135464871687, + "grad_norm": 0.5597806572914124, + "learning_rate": 0.0002, + "loss": 1.5185, + "step": 3820 + }, + { + "epoch": 3.222549432057215, + "grad_norm": 0.5641011595726013, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 3830 + }, + { + "epoch": 3.230963399242743, + "grad_norm": 0.5892080068588257, + "learning_rate": 0.0002, + "loss": 1.4702, + "step": 3840 + }, + { + "epoch": 3.239377366428271, + "grad_norm": 0.6034760475158691, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 3850 + }, + { + "epoch": 3.247791333613799, + "grad_norm": 0.5112439393997192, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 3860 + }, + { + "epoch": 3.256205300799327, + "grad_norm": 0.56565922498703, + "learning_rate": 0.0002, + "loss": 1.5132, + "step": 3870 + }, + { + "epoch": 3.264619267984855, + "grad_norm": 0.6155247092247009, + "learning_rate": 0.0002, + "loss": 1.4892, + "step": 3880 + }, + { + "epoch": 3.273033235170383, + "grad_norm": 0.6064623594284058, + "learning_rate": 0.0002, + "loss": 1.5118, + "step": 3890 + }, + { + "epoch": 3.2814472023559107, + "grad_norm": 0.6313768029212952, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 3900 + }, + { + "epoch": 3.289861169541439, + "grad_norm": 0.5903939008712769, + "learning_rate": 0.0002, + "loss": 1.5551, + "step": 3910 + }, + { + "epoch": 3.298275136726967, + "grad_norm": 0.5770667195320129, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 3920 + }, + { + "epoch": 3.3066891039124946, + "grad_norm": 0.5785196423530579, + "learning_rate": 0.0002, + "loss": 1.5159, + "step": 3930 + }, + { + "epoch": 3.3151030710980227, + "grad_norm": 0.6468310356140137, + "learning_rate": 0.0002, + "loss": 1.5277, + "step": 3940 + }, + { + "epoch": 3.323517038283551, + "grad_norm": 0.6200279593467712, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 3950 + }, + { + "epoch": 3.3319310054690785, + "grad_norm": 0.5779302716255188, + "learning_rate": 0.0002, + "loss": 1.5264, + "step": 3960 + }, + { + "epoch": 3.3403449726546066, + "grad_norm": 0.5463796854019165, + "learning_rate": 0.0002, + "loss": 1.4861, + "step": 3970 + }, + { + "epoch": 3.3487589398401347, + "grad_norm": 0.6117855906486511, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 3980 + }, + { + "epoch": 3.357172907025663, + "grad_norm": 0.5554766058921814, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 3990 + }, + { + "epoch": 3.3655868742111905, + "grad_norm": 0.6012870073318481, + "learning_rate": 0.0002, + "loss": 1.5004, + "step": 4000 + }, + { + "epoch": 3.3740008413967186, + "grad_norm": 0.5443974137306213, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 4010 + }, + { + "epoch": 3.3824148085822463, + "grad_norm": 0.6636057496070862, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 4020 + }, + { + "epoch": 3.3908287757677744, + "grad_norm": 0.5801246166229248, + "learning_rate": 0.0002, + "loss": 1.5141, + "step": 4030 + }, + { + "epoch": 3.3992427429533025, + "grad_norm": 0.5668839812278748, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4040 + }, + { + "epoch": 3.4076567101388306, + "grad_norm": 0.7763481736183167, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 4050 + }, + { + "epoch": 3.4160706773243583, + "grad_norm": 0.6675992608070374, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 4060 + }, + { + "epoch": 3.4244846445098864, + "grad_norm": 0.6290077567100525, + "learning_rate": 0.0002, + "loss": 1.4959, + "step": 4070 + }, + { + "epoch": 3.4328986116954145, + "grad_norm": 0.6040239930152893, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 4080 + }, + { + "epoch": 3.441312578880942, + "grad_norm": 0.6237877607345581, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 4090 + }, + { + "epoch": 3.4497265460664703, + "grad_norm": 0.5343508124351501, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4100 + }, + { + "epoch": 3.4581405132519984, + "grad_norm": 0.6817412972450256, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 4110 + }, + { + "epoch": 3.466554480437526, + "grad_norm": 0.7115170359611511, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 4120 + }, + { + "epoch": 3.4749684476230542, + "grad_norm": 0.6127332448959351, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 4130 + }, + { + "epoch": 3.4833824148085824, + "grad_norm": 0.5745994448661804, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 4140 + }, + { + "epoch": 3.49179638199411, + "grad_norm": 0.6248795390129089, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4150 + }, + { + "epoch": 3.500210349179638, + "grad_norm": 0.5821124911308289, + "learning_rate": 0.0002, + "loss": 1.4885, + "step": 4160 + }, + { + "epoch": 3.5086243163651663, + "grad_norm": 0.561416506767273, + "learning_rate": 0.0002, + "loss": 1.4937, + "step": 4170 + }, + { + "epoch": 3.5170382835506944, + "grad_norm": 0.5848962664604187, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 4180 + }, + { + "epoch": 3.525452250736222, + "grad_norm": 0.5335569977760315, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 4190 + }, + { + "epoch": 3.53386621792175, + "grad_norm": 0.547964870929718, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 4200 + }, + { + "epoch": 3.542280185107278, + "grad_norm": 0.6157727241516113, + "learning_rate": 0.0002, + "loss": 1.4887, + "step": 4210 + }, + { + "epoch": 3.550694152292806, + "grad_norm": 0.6163121461868286, + "learning_rate": 0.0002, + "loss": 1.5484, + "step": 4220 + }, + { + "epoch": 3.559108119478334, + "grad_norm": 0.5844616293907166, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 4230 + }, + { + "epoch": 3.567522086663862, + "grad_norm": 0.7104926109313965, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 4240 + }, + { + "epoch": 3.57593605384939, + "grad_norm": 0.5055213570594788, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4250 + }, + { + "epoch": 3.584350021034918, + "grad_norm": 0.611676812171936, + "learning_rate": 0.0002, + "loss": 1.482, + "step": 4260 + }, + { + "epoch": 3.592763988220446, + "grad_norm": 0.6326440572738647, + "learning_rate": 0.0002, + "loss": 1.5048, + "step": 4270 + }, + { + "epoch": 3.6011779554059737, + "grad_norm": 0.6290925741195679, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 4280 + }, + { + "epoch": 3.609591922591502, + "grad_norm": 0.5691978931427002, + "learning_rate": 0.0002, + "loss": 1.5654, + "step": 4290 + }, + { + "epoch": 3.61800588977703, + "grad_norm": 0.6071329116821289, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 4300 + }, + { + "epoch": 3.626419856962558, + "grad_norm": 0.606573224067688, + "learning_rate": 0.0002, + "loss": 1.5336, + "step": 4310 + }, + { + "epoch": 3.6348338241480858, + "grad_norm": 0.5515419244766235, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 4320 + }, + { + "epoch": 3.643247791333614, + "grad_norm": 0.5964660048484802, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 4330 + }, + { + "epoch": 3.6516617585191415, + "grad_norm": 0.5774146914482117, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 4340 + }, + { + "epoch": 3.6600757257046697, + "grad_norm": 0.5732731223106384, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 4350 + }, + { + "epoch": 3.6684896928901978, + "grad_norm": 0.7354163527488708, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 4360 + }, + { + "epoch": 3.676903660075726, + "grad_norm": 0.6220902800559998, + "learning_rate": 0.0002, + "loss": 1.5225, + "step": 4370 + }, + { + "epoch": 3.6853176272612536, + "grad_norm": 0.6053991317749023, + "learning_rate": 0.0002, + "loss": 1.4838, + "step": 4380 + }, + { + "epoch": 3.6937315944467817, + "grad_norm": 0.67010897397995, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4390 + }, + { + "epoch": 3.70214556163231, + "grad_norm": 0.6139186024665833, + "learning_rate": 0.0002, + "loss": 1.5381, + "step": 4400 + }, + { + "epoch": 3.7105595288178375, + "grad_norm": 0.5433071851730347, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 4410 + }, + { + "epoch": 3.7189734960033656, + "grad_norm": 0.5453870296478271, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 4420 + }, + { + "epoch": 3.7273874631888937, + "grad_norm": 0.6401727199554443, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 4430 + }, + { + "epoch": 3.735801430374422, + "grad_norm": 0.6049367189407349, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 4440 + }, + { + "epoch": 3.7442153975599495, + "grad_norm": 0.5740529298782349, + "learning_rate": 0.0002, + "loss": 1.5268, + "step": 4450 + }, + { + "epoch": 3.7526293647454776, + "grad_norm": 0.6521880626678467, + "learning_rate": 0.0002, + "loss": 1.5183, + "step": 4460 + }, + { + "epoch": 3.7610433319310053, + "grad_norm": 0.7096368074417114, + "learning_rate": 0.0002, + "loss": 1.5741, + "step": 4470 + }, + { + "epoch": 3.7694572991165334, + "grad_norm": 0.5886474251747131, + "learning_rate": 0.0002, + "loss": 1.5786, + "step": 4480 + }, + { + "epoch": 3.7778712663020615, + "grad_norm": 0.5821043252944946, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 4490 + }, + { + "epoch": 3.7862852334875896, + "grad_norm": 0.628892183303833, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 4500 + }, + { + "epoch": 3.7946992006731173, + "grad_norm": 0.5962669849395752, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 4510 + }, + { + "epoch": 3.8031131678586454, + "grad_norm": 0.6635549068450928, + "learning_rate": 0.0002, + "loss": 1.5267, + "step": 4520 + }, + { + "epoch": 3.811527135044173, + "grad_norm": 0.6010760068893433, + "learning_rate": 0.0002, + "loss": 1.5058, + "step": 4530 + }, + { + "epoch": 3.819941102229701, + "grad_norm": 0.6322658658027649, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 4540 + }, + { + "epoch": 3.8283550694152293, + "grad_norm": 0.5893137454986572, + "learning_rate": 0.0002, + "loss": 1.5029, + "step": 4550 + }, + { + "epoch": 3.8367690366007574, + "grad_norm": 0.7829602360725403, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 4560 + }, + { + "epoch": 3.845183003786285, + "grad_norm": 0.6190396547317505, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 4570 + }, + { + "epoch": 3.853596970971813, + "grad_norm": 0.6662813425064087, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 4580 + }, + { + "epoch": 3.8620109381573413, + "grad_norm": 0.5809855461120605, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 4590 + }, + { + "epoch": 3.870424905342869, + "grad_norm": 0.5779069662094116, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4600 + }, + { + "epoch": 3.878838872528397, + "grad_norm": 0.5603038668632507, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 4610 + }, + { + "epoch": 3.887252839713925, + "grad_norm": 0.6274181008338928, + "learning_rate": 0.0002, + "loss": 1.5372, + "step": 4620 + }, + { + "epoch": 3.8956668068994533, + "grad_norm": 0.6810959577560425, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 4630 + }, + { + "epoch": 3.904080774084981, + "grad_norm": 0.5647315979003906, + "learning_rate": 0.0002, + "loss": 1.4956, + "step": 4640 + }, + { + "epoch": 3.912494741270509, + "grad_norm": 0.6830295324325562, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 4650 + }, + { + "epoch": 3.920908708456037, + "grad_norm": 0.652565598487854, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 4660 + }, + { + "epoch": 3.929322675641565, + "grad_norm": 0.5806284546852112, + "learning_rate": 0.0002, + "loss": 1.4772, + "step": 4670 + }, + { + "epoch": 3.937736642827093, + "grad_norm": 0.6825073957443237, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 4680 + }, + { + "epoch": 3.946150610012621, + "grad_norm": 0.6149451732635498, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 4690 + }, + { + "epoch": 3.954564577198149, + "grad_norm": 0.6152557134628296, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 4700 + }, + { + "epoch": 3.962978544383677, + "grad_norm": 0.6239011883735657, + "learning_rate": 0.0002, + "loss": 1.4897, + "step": 4710 + }, + { + "epoch": 3.971392511569205, + "grad_norm": 0.6485443115234375, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 4720 + }, + { + "epoch": 3.9798064787547327, + "grad_norm": 0.6449228525161743, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 4730 + }, + { + "epoch": 3.988220445940261, + "grad_norm": 0.6526407599449158, + "learning_rate": 0.0002, + "loss": 1.5087, + "step": 4740 + }, + { + "epoch": 3.996634413125789, + "grad_norm": 0.6277706027030945, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4750 + }, + { + "epoch": 4.0, + "eval_loss": 1.871641755104065, + "eval_runtime": 37.9637, + "eval_samples_per_second": 13.566, + "eval_steps_per_second": 1.712, + "step": 4754 + }, + { + "epoch": 4.005048380311317, + "grad_norm": 0.6994837522506714, + "learning_rate": 0.0002, + "loss": 1.4744, + "step": 4760 + }, + { + "epoch": 4.013462347496845, + "grad_norm": 0.8728373050689697, + "learning_rate": 0.0002, + "loss": 1.4433, + "step": 4770 + }, + { + "epoch": 4.021876314682372, + "grad_norm": 0.688679575920105, + "learning_rate": 0.0002, + "loss": 1.3329, + "step": 4780 + }, + { + "epoch": 4.0302902818679005, + "grad_norm": 0.6313387155532837, + "learning_rate": 0.0002, + "loss": 1.3999, + "step": 4790 + }, + { + "epoch": 4.038704249053429, + "grad_norm": 0.6577984690666199, + "learning_rate": 0.0002, + "loss": 1.3346, + "step": 4800 + }, + { + "epoch": 4.047118216238957, + "grad_norm": 0.7938185930252075, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 4810 + }, + { + "epoch": 4.055532183424485, + "grad_norm": 0.760399580001831, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 4820 + }, + { + "epoch": 4.063946150610013, + "grad_norm": 0.7329602241516113, + "learning_rate": 0.0002, + "loss": 1.4321, + "step": 4830 + }, + { + "epoch": 4.07236011779554, + "grad_norm": 0.7778576016426086, + "learning_rate": 0.0002, + "loss": 1.4133, + "step": 4840 + }, + { + "epoch": 4.080774084981068, + "grad_norm": 0.8235865235328674, + "learning_rate": 0.0002, + "loss": 1.4372, + "step": 4850 + }, + { + "epoch": 4.089188052166596, + "grad_norm": 0.7743754386901855, + "learning_rate": 0.0002, + "loss": 1.3719, + "step": 4860 + }, + { + "epoch": 4.0976020193521245, + "grad_norm": 0.8145367503166199, + "learning_rate": 0.0002, + "loss": 1.3787, + "step": 4870 + }, + { + "epoch": 4.106015986537653, + "grad_norm": 0.8517307639122009, + "learning_rate": 0.0002, + "loss": 1.356, + "step": 4880 + }, + { + "epoch": 4.114429953723181, + "grad_norm": 0.8208953142166138, + "learning_rate": 0.0002, + "loss": 1.4191, + "step": 4890 + }, + { + "epoch": 4.122843920908709, + "grad_norm": 0.8437790870666504, + "learning_rate": 0.0002, + "loss": 1.3189, + "step": 4900 + }, + { + "epoch": 4.131257888094236, + "grad_norm": 0.716672420501709, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 4910 + }, + { + "epoch": 4.139671855279764, + "grad_norm": 0.7656235098838806, + "learning_rate": 0.0002, + "loss": 1.4392, + "step": 4920 + }, + { + "epoch": 4.148085822465292, + "grad_norm": 0.7209306955337524, + "learning_rate": 0.0002, + "loss": 1.3408, + "step": 4930 + }, + { + "epoch": 4.1564997896508205, + "grad_norm": 0.7731267809867859, + "learning_rate": 0.0002, + "loss": 1.3639, + "step": 4940 + }, + { + "epoch": 4.164913756836349, + "grad_norm": 0.7477553486824036, + "learning_rate": 0.0002, + "loss": 1.4151, + "step": 4950 + }, + { + "epoch": 4.173327724021877, + "grad_norm": 0.7372981309890747, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 4960 + }, + { + "epoch": 4.181741691207404, + "grad_norm": 0.6582154035568237, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 4970 + }, + { + "epoch": 4.190155658392932, + "grad_norm": 0.7003206610679626, + "learning_rate": 0.0002, + "loss": 1.3343, + "step": 4980 + }, + { + "epoch": 4.19856962557846, + "grad_norm": 0.735223650932312, + "learning_rate": 0.0002, + "loss": 1.4098, + "step": 4990 + }, + { + "epoch": 4.206983592763988, + "grad_norm": 0.7832302451133728, + "learning_rate": 0.0002, + "loss": 1.3564, + "step": 5000 + }, + { + "epoch": 4.215397559949516, + "grad_norm": 0.8819546103477478, + "learning_rate": 0.0002, + "loss": 1.3622, + "step": 5010 + }, + { + "epoch": 4.2238115271350445, + "grad_norm": 0.9325336813926697, + "learning_rate": 0.0002, + "loss": 1.4438, + "step": 5020 + }, + { + "epoch": 4.232225494320572, + "grad_norm": 0.7007517218589783, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 5030 + }, + { + "epoch": 4.2406394615061, + "grad_norm": 0.7118321061134338, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 5040 + }, + { + "epoch": 4.249053428691628, + "grad_norm": 0.6578946709632874, + "learning_rate": 0.0002, + "loss": 1.2365, + "step": 5050 + }, + { + "epoch": 4.257467395877156, + "grad_norm": 0.9438983798027039, + "learning_rate": 0.0002, + "loss": 1.3696, + "step": 5060 + }, + { + "epoch": 4.265881363062684, + "grad_norm": 0.703037679195404, + "learning_rate": 0.0002, + "loss": 1.3868, + "step": 5070 + }, + { + "epoch": 4.274295330248212, + "grad_norm": 0.7286025285720825, + "learning_rate": 0.0002, + "loss": 1.3687, + "step": 5080 + }, + { + "epoch": 4.28270929743374, + "grad_norm": 0.750689685344696, + "learning_rate": 0.0002, + "loss": 1.3605, + "step": 5090 + }, + { + "epoch": 4.291123264619268, + "grad_norm": 0.869753360748291, + "learning_rate": 0.0002, + "loss": 1.5089, + "step": 5100 + }, + { + "epoch": 4.299537231804796, + "grad_norm": 0.8712980151176453, + "learning_rate": 0.0002, + "loss": 1.4128, + "step": 5110 + }, + { + "epoch": 4.307951198990324, + "grad_norm": 0.690263569355011, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 5120 + }, + { + "epoch": 4.316365166175852, + "grad_norm": 0.7114760279655457, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 5130 + }, + { + "epoch": 4.32477913336138, + "grad_norm": 0.7588112354278564, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 5140 + }, + { + "epoch": 4.333193100546908, + "grad_norm": 0.7556202411651611, + "learning_rate": 0.0002, + "loss": 1.4408, + "step": 5150 + }, + { + "epoch": 4.341607067732435, + "grad_norm": 0.8357610702514648, + "learning_rate": 0.0002, + "loss": 1.4203, + "step": 5160 + }, + { + "epoch": 4.3500210349179635, + "grad_norm": 0.8054035902023315, + "learning_rate": 0.0002, + "loss": 1.3348, + "step": 5170 + }, + { + "epoch": 4.358435002103492, + "grad_norm": 0.7637107968330383, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 5180 + }, + { + "epoch": 4.36684896928902, + "grad_norm": 0.757481038570404, + "learning_rate": 0.0002, + "loss": 1.3744, + "step": 5190 + }, + { + "epoch": 4.375262936474548, + "grad_norm": 0.7185863852500916, + "learning_rate": 0.0002, + "loss": 1.3622, + "step": 5200 + }, + { + "epoch": 4.383676903660076, + "grad_norm": 0.7326455116271973, + "learning_rate": 0.0002, + "loss": 1.3896, + "step": 5210 + }, + { + "epoch": 4.392090870845603, + "grad_norm": 0.7980523109436035, + "learning_rate": 0.0002, + "loss": 1.4098, + "step": 5220 + }, + { + "epoch": 4.400504838031131, + "grad_norm": 0.8526999354362488, + "learning_rate": 0.0002, + "loss": 1.3783, + "step": 5230 + }, + { + "epoch": 4.4089188052166595, + "grad_norm": 0.7012337446212769, + "learning_rate": 0.0002, + "loss": 1.4022, + "step": 5240 + }, + { + "epoch": 4.417332772402188, + "grad_norm": 0.8217827677726746, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 5250 + }, + { + "epoch": 4.425746739587716, + "grad_norm": 0.7141005396842957, + "learning_rate": 0.0002, + "loss": 1.3482, + "step": 5260 + }, + { + "epoch": 4.434160706773244, + "grad_norm": 0.7094302177429199, + "learning_rate": 0.0002, + "loss": 1.3699, + "step": 5270 + }, + { + "epoch": 4.442574673958772, + "grad_norm": 0.7234613299369812, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 5280 + }, + { + "epoch": 4.450988641144299, + "grad_norm": 0.7530457973480225, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 5290 + }, + { + "epoch": 4.459402608329827, + "grad_norm": 0.7300912141799927, + "learning_rate": 0.0002, + "loss": 1.3944, + "step": 5300 + }, + { + "epoch": 4.467816575515355, + "grad_norm": 0.825443685054779, + "learning_rate": 0.0002, + "loss": 1.3844, + "step": 5310 + }, + { + "epoch": 4.4762305427008835, + "grad_norm": 0.7559658885002136, + "learning_rate": 0.0002, + "loss": 1.3648, + "step": 5320 + }, + { + "epoch": 4.484644509886412, + "grad_norm": 0.8817561268806458, + "learning_rate": 0.0002, + "loss": 1.4364, + "step": 5330 + }, + { + "epoch": 4.49305847707194, + "grad_norm": 0.8203575611114502, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 5340 + }, + { + "epoch": 4.501472444257468, + "grad_norm": 0.7677690982818604, + "learning_rate": 0.0002, + "loss": 1.3996, + "step": 5350 + }, + { + "epoch": 4.509886411442995, + "grad_norm": 0.657085120677948, + "learning_rate": 0.0002, + "loss": 1.4142, + "step": 5360 + }, + { + "epoch": 4.518300378628523, + "grad_norm": 0.7939504384994507, + "learning_rate": 0.0002, + "loss": 1.3722, + "step": 5370 + }, + { + "epoch": 4.526714345814051, + "grad_norm": 0.6971889138221741, + "learning_rate": 0.0002, + "loss": 1.4361, + "step": 5380 + }, + { + "epoch": 4.535128312999579, + "grad_norm": 0.6984175443649292, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 5390 + }, + { + "epoch": 4.5435422801851075, + "grad_norm": 0.8504858613014221, + "learning_rate": 0.0002, + "loss": 1.341, + "step": 5400 + }, + { + "epoch": 4.551956247370635, + "grad_norm": 0.9134073853492737, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 5410 + }, + { + "epoch": 4.560370214556163, + "grad_norm": 0.7765598893165588, + "learning_rate": 0.0002, + "loss": 1.4375, + "step": 5420 + }, + { + "epoch": 4.568784181741691, + "grad_norm": 0.6991009712219238, + "learning_rate": 0.0002, + "loss": 1.4832, + "step": 5430 + }, + { + "epoch": 4.577198148927219, + "grad_norm": 0.8393039107322693, + "learning_rate": 0.0002, + "loss": 1.4021, + "step": 5440 + }, + { + "epoch": 4.585612116112747, + "grad_norm": 0.7685918211936951, + "learning_rate": 0.0002, + "loss": 1.3976, + "step": 5450 + }, + { + "epoch": 4.594026083298275, + "grad_norm": 0.7135679721832275, + "learning_rate": 0.0002, + "loss": 1.3883, + "step": 5460 + }, + { + "epoch": 4.6024400504838034, + "grad_norm": 0.6728870868682861, + "learning_rate": 0.0002, + "loss": 1.4083, + "step": 5470 + }, + { + "epoch": 4.610854017669331, + "grad_norm": 0.7139479517936707, + "learning_rate": 0.0002, + "loss": 1.3698, + "step": 5480 + }, + { + "epoch": 4.619267984854859, + "grad_norm": 0.8476598858833313, + "learning_rate": 0.0002, + "loss": 1.3498, + "step": 5490 + }, + { + "epoch": 4.627681952040387, + "grad_norm": 0.8034361004829407, + "learning_rate": 0.0002, + "loss": 1.3389, + "step": 5500 + }, + { + "epoch": 4.636095919225915, + "grad_norm": 0.7452183961868286, + "learning_rate": 0.0002, + "loss": 1.4179, + "step": 5510 + }, + { + "epoch": 4.644509886411443, + "grad_norm": 0.8394148945808411, + "learning_rate": 0.0002, + "loss": 1.4031, + "step": 5520 + }, + { + "epoch": 4.652923853596971, + "grad_norm": 0.7480153441429138, + "learning_rate": 0.0002, + "loss": 1.4561, + "step": 5530 + }, + { + "epoch": 4.661337820782499, + "grad_norm": 0.7781714797019958, + "learning_rate": 0.0002, + "loss": 1.378, + "step": 5540 + }, + { + "epoch": 4.669751787968027, + "grad_norm": 1.0058213472366333, + "learning_rate": 0.0002, + "loss": 1.3924, + "step": 5550 + }, + { + "epoch": 4.678165755153555, + "grad_norm": 0.7403179407119751, + "learning_rate": 0.0002, + "loss": 1.4198, + "step": 5560 + }, + { + "epoch": 4.686579722339083, + "grad_norm": 0.7270476818084717, + "learning_rate": 0.0002, + "loss": 1.4328, + "step": 5570 + }, + { + "epoch": 4.694993689524611, + "grad_norm": 0.760877788066864, + "learning_rate": 0.0002, + "loss": 1.378, + "step": 5580 + }, + { + "epoch": 4.703407656710139, + "grad_norm": 0.8097004890441895, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 5590 + }, + { + "epoch": 4.711821623895667, + "grad_norm": 0.9096523523330688, + "learning_rate": 0.0002, + "loss": 1.3661, + "step": 5600 + }, + { + "epoch": 4.720235591081195, + "grad_norm": 0.7262444496154785, + "learning_rate": 0.0002, + "loss": 1.4012, + "step": 5610 + }, + { + "epoch": 4.7286495582667225, + "grad_norm": 0.8207762837409973, + "learning_rate": 0.0002, + "loss": 1.422, + "step": 5620 + }, + { + "epoch": 4.737063525452251, + "grad_norm": 0.8089601993560791, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 5630 + }, + { + "epoch": 4.745477492637779, + "grad_norm": 0.7609543800354004, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 5640 + }, + { + "epoch": 4.753891459823307, + "grad_norm": 0.7273501753807068, + "learning_rate": 0.0002, + "loss": 1.4085, + "step": 5650 + }, + { + "epoch": 4.762305427008835, + "grad_norm": 0.7800219058990479, + "learning_rate": 0.0002, + "loss": 1.3849, + "step": 5660 + }, + { + "epoch": 4.770719394194362, + "grad_norm": 0.8558377623558044, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 5670 + }, + { + "epoch": 4.77913336137989, + "grad_norm": 0.7131547927856445, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 5680 + }, + { + "epoch": 4.787547328565418, + "grad_norm": 0.7651025056838989, + "learning_rate": 0.0002, + "loss": 1.407, + "step": 5690 + }, + { + "epoch": 4.7959612957509465, + "grad_norm": 0.8129976391792297, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 5700 + }, + { + "epoch": 4.804375262936475, + "grad_norm": 0.8019895553588867, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 5710 + }, + { + "epoch": 4.812789230122003, + "grad_norm": 0.7692018151283264, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 5720 + }, + { + "epoch": 4.821203197307531, + "grad_norm": 0.6893943548202515, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 5730 + }, + { + "epoch": 4.829617164493058, + "grad_norm": 0.6881810426712036, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 5740 + }, + { + "epoch": 4.838031131678586, + "grad_norm": 0.7838267683982849, + "learning_rate": 0.0002, + "loss": 1.4775, + "step": 5750 + }, + { + "epoch": 4.846445098864114, + "grad_norm": 0.727799117565155, + "learning_rate": 0.0002, + "loss": 1.3857, + "step": 5760 + }, + { + "epoch": 4.8548590660496425, + "grad_norm": 0.7458277344703674, + "learning_rate": 0.0002, + "loss": 1.4685, + "step": 5770 + }, + { + "epoch": 4.863273033235171, + "grad_norm": 0.903802216053009, + "learning_rate": 0.0002, + "loss": 1.4426, + "step": 5780 + }, + { + "epoch": 4.871687000420699, + "grad_norm": 0.7983472347259521, + "learning_rate": 0.0002, + "loss": 1.451, + "step": 5790 + }, + { + "epoch": 4.880100967606227, + "grad_norm": 0.6894361972808838, + "learning_rate": 0.0002, + "loss": 1.4534, + "step": 5800 + }, + { + "epoch": 4.888514934791754, + "grad_norm": 0.7499409317970276, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 5810 + }, + { + "epoch": 4.896928901977282, + "grad_norm": 0.7362820506095886, + "learning_rate": 0.0002, + "loss": 1.4253, + "step": 5820 + }, + { + "epoch": 4.90534286916281, + "grad_norm": 0.8341619968414307, + "learning_rate": 0.0002, + "loss": 1.3763, + "step": 5830 + }, + { + "epoch": 4.913756836348338, + "grad_norm": 0.9604470133781433, + "learning_rate": 0.0002, + "loss": 1.3748, + "step": 5840 + }, + { + "epoch": 4.9221708035338665, + "grad_norm": 0.8916844129562378, + "learning_rate": 0.0002, + "loss": 1.3658, + "step": 5850 + }, + { + "epoch": 4.930584770719394, + "grad_norm": 0.8519647121429443, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 5860 + }, + { + "epoch": 4.938998737904922, + "grad_norm": 0.7946906089782715, + "learning_rate": 0.0002, + "loss": 1.424, + "step": 5870 + }, + { + "epoch": 4.94741270509045, + "grad_norm": 0.7843789458274841, + "learning_rate": 0.0002, + "loss": 1.4071, + "step": 5880 + }, + { + "epoch": 4.955826672275978, + "grad_norm": 0.707618772983551, + "learning_rate": 0.0002, + "loss": 1.4021, + "step": 5890 + }, + { + "epoch": 4.964240639461506, + "grad_norm": 0.7704206109046936, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 5900 + }, + { + "epoch": 4.972654606647034, + "grad_norm": 0.7160256505012512, + "learning_rate": 0.0002, + "loss": 1.4456, + "step": 5910 + }, + { + "epoch": 4.981068573832562, + "grad_norm": 0.7020420432090759, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 5920 + }, + { + "epoch": 4.98948254101809, + "grad_norm": 0.7576286792755127, + "learning_rate": 0.0002, + "loss": 1.4037, + "step": 5930 + }, + { + "epoch": 4.997896508203618, + "grad_norm": 0.8573036789894104, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 5940 + }, + { + "epoch": 4.999579301640724, + "eval_loss": 1.9353811740875244, + "eval_runtime": 37.9208, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 5942 + }, + { + "epoch": 5.006310475389146, + "grad_norm": 0.8204267621040344, + "learning_rate": 0.0002, + "loss": 1.2418, + "step": 5950 + }, + { + "epoch": 5.014724442574674, + "grad_norm": 0.976840615272522, + "learning_rate": 0.0002, + "loss": 1.235, + "step": 5960 + }, + { + "epoch": 5.023138409760202, + "grad_norm": 0.8765613436698914, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 5970 + }, + { + "epoch": 5.03155237694573, + "grad_norm": 1.1793042421340942, + "learning_rate": 0.0002, + "loss": 1.2748, + "step": 5980 + }, + { + "epoch": 5.039966344131258, + "grad_norm": 0.971062958240509, + "learning_rate": 0.0002, + "loss": 1.2412, + "step": 5990 + }, + { + "epoch": 5.0483803113167856, + "grad_norm": 0.8649757504463196, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 6000 + }, + { + "epoch": 5.056794278502314, + "grad_norm": 0.9563034176826477, + "learning_rate": 0.0002, + "loss": 1.1654, + "step": 6010 + }, + { + "epoch": 5.065208245687842, + "grad_norm": 1.0093994140625, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 6020 + }, + { + "epoch": 5.07362221287337, + "grad_norm": 1.004213571548462, + "learning_rate": 0.0002, + "loss": 1.2519, + "step": 6030 + }, + { + "epoch": 5.082036180058898, + "grad_norm": 0.8307787179946899, + "learning_rate": 0.0002, + "loss": 1.2379, + "step": 6040 + }, + { + "epoch": 5.090450147244426, + "grad_norm": 0.9117848873138428, + "learning_rate": 0.0002, + "loss": 1.2282, + "step": 6050 + }, + { + "epoch": 5.098864114429953, + "grad_norm": 1.0269840955734253, + "learning_rate": 0.0002, + "loss": 1.2582, + "step": 6060 + }, + { + "epoch": 5.1072780816154815, + "grad_norm": 0.9079542756080627, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 6070 + }, + { + "epoch": 5.11569204880101, + "grad_norm": 0.885702908039093, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 6080 + }, + { + "epoch": 5.124106015986538, + "grad_norm": 0.9976128339767456, + "learning_rate": 0.0002, + "loss": 1.2406, + "step": 6090 + }, + { + "epoch": 5.132519983172066, + "grad_norm": 0.8472117185592651, + "learning_rate": 0.0002, + "loss": 1.3082, + "step": 6100 + }, + { + "epoch": 5.140933950357594, + "grad_norm": 1.0385161638259888, + "learning_rate": 0.0002, + "loss": 1.226, + "step": 6110 + }, + { + "epoch": 5.149347917543121, + "grad_norm": 0.8948383927345276, + "learning_rate": 0.0002, + "loss": 1.213, + "step": 6120 + }, + { + "epoch": 5.157761884728649, + "grad_norm": 1.2613716125488281, + "learning_rate": 0.0002, + "loss": 1.2213, + "step": 6130 + }, + { + "epoch": 5.166175851914177, + "grad_norm": 0.9933410286903381, + "learning_rate": 0.0002, + "loss": 1.2632, + "step": 6140 + }, + { + "epoch": 5.1745898190997055, + "grad_norm": 0.9673663973808289, + "learning_rate": 0.0002, + "loss": 1.1715, + "step": 6150 + }, + { + "epoch": 5.183003786285234, + "grad_norm": 0.9969648122787476, + "learning_rate": 0.0002, + "loss": 1.2947, + "step": 6160 + }, + { + "epoch": 5.191417753470762, + "grad_norm": 1.2163258790969849, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 6170 + }, + { + "epoch": 5.19983172065629, + "grad_norm": 0.9163419604301453, + "learning_rate": 0.0002, + "loss": 1.2221, + "step": 6180 + }, + { + "epoch": 5.208245687841817, + "grad_norm": 0.9225585460662842, + "learning_rate": 0.0002, + "loss": 1.2624, + "step": 6190 + }, + { + "epoch": 5.216659655027345, + "grad_norm": 0.9205296635627747, + "learning_rate": 0.0002, + "loss": 1.2932, + "step": 6200 + }, + { + "epoch": 5.225073622212873, + "grad_norm": 1.0655443668365479, + "learning_rate": 0.0002, + "loss": 1.1825, + "step": 6210 + }, + { + "epoch": 5.233487589398401, + "grad_norm": 1.0854865312576294, + "learning_rate": 0.0002, + "loss": 1.2613, + "step": 6220 + }, + { + "epoch": 5.2419015565839295, + "grad_norm": 0.8489186763763428, + "learning_rate": 0.0002, + "loss": 1.3045, + "step": 6230 + }, + { + "epoch": 5.250315523769458, + "grad_norm": 0.910391628742218, + "learning_rate": 0.0002, + "loss": 1.2708, + "step": 6240 + }, + { + "epoch": 5.258729490954985, + "grad_norm": 0.925507128238678, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 6250 + }, + { + "epoch": 5.267143458140513, + "grad_norm": 1.1069735288619995, + "learning_rate": 0.0002, + "loss": 1.3368, + "step": 6260 + }, + { + "epoch": 5.275557425326041, + "grad_norm": 0.9705119132995605, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 6270 + }, + { + "epoch": 5.283971392511569, + "grad_norm": 0.9752426147460938, + "learning_rate": 0.0002, + "loss": 1.2602, + "step": 6280 + }, + { + "epoch": 5.292385359697097, + "grad_norm": 1.021359920501709, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 6290 + }, + { + "epoch": 5.3007993268826255, + "grad_norm": 1.148606300354004, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 6300 + }, + { + "epoch": 5.309213294068153, + "grad_norm": 0.8909247517585754, + "learning_rate": 0.0002, + "loss": 1.2201, + "step": 6310 + }, + { + "epoch": 5.317627261253681, + "grad_norm": 0.9879156351089478, + "learning_rate": 0.0002, + "loss": 1.2376, + "step": 6320 + }, + { + "epoch": 5.326041228439209, + "grad_norm": 0.9473357200622559, + "learning_rate": 0.0002, + "loss": 1.2638, + "step": 6330 + }, + { + "epoch": 5.334455195624737, + "grad_norm": 1.1422028541564941, + "learning_rate": 0.0002, + "loss": 1.232, + "step": 6340 + }, + { + "epoch": 5.342869162810265, + "grad_norm": 0.9942235350608826, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 6350 + }, + { + "epoch": 5.351283129995793, + "grad_norm": 0.9535723924636841, + "learning_rate": 0.0002, + "loss": 1.3032, + "step": 6360 + }, + { + "epoch": 5.359697097181321, + "grad_norm": 0.9020892381668091, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 6370 + }, + { + "epoch": 5.368111064366849, + "grad_norm": 1.0626472234725952, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 6380 + }, + { + "epoch": 5.376525031552377, + "grad_norm": 1.1395848989486694, + "learning_rate": 0.0002, + "loss": 1.2555, + "step": 6390 + }, + { + "epoch": 5.384938998737905, + "grad_norm": 0.9274451732635498, + "learning_rate": 0.0002, + "loss": 1.2839, + "step": 6400 + }, + { + "epoch": 5.393352965923433, + "grad_norm": 0.8108699917793274, + "learning_rate": 0.0002, + "loss": 1.2819, + "step": 6410 + }, + { + "epoch": 5.401766933108961, + "grad_norm": 1.1805564165115356, + "learning_rate": 0.0002, + "loss": 1.2589, + "step": 6420 + }, + { + "epoch": 5.410180900294489, + "grad_norm": 0.8321298360824585, + "learning_rate": 0.0002, + "loss": 1.3549, + "step": 6430 + }, + { + "epoch": 5.418594867480017, + "grad_norm": 0.8981925249099731, + "learning_rate": 0.0002, + "loss": 1.2925, + "step": 6440 + }, + { + "epoch": 5.4270088346655445, + "grad_norm": 1.0730986595153809, + "learning_rate": 0.0002, + "loss": 1.258, + "step": 6450 + }, + { + "epoch": 5.435422801851073, + "grad_norm": 1.0584609508514404, + "learning_rate": 0.0002, + "loss": 1.26, + "step": 6460 + }, + { + "epoch": 5.443836769036601, + "grad_norm": 1.0792299509048462, + "learning_rate": 0.0002, + "loss": 1.2847, + "step": 6470 + }, + { + "epoch": 5.452250736222129, + "grad_norm": 0.9101872444152832, + "learning_rate": 0.0002, + "loss": 1.2035, + "step": 6480 + }, + { + "epoch": 5.460664703407657, + "grad_norm": 0.9910100698471069, + "learning_rate": 0.0002, + "loss": 1.2574, + "step": 6490 + }, + { + "epoch": 5.469078670593185, + "grad_norm": 1.041412353515625, + "learning_rate": 0.0002, + "loss": 1.3098, + "step": 6500 + }, + { + "epoch": 5.477492637778712, + "grad_norm": 1.0091687440872192, + "learning_rate": 0.0002, + "loss": 1.2812, + "step": 6510 + }, + { + "epoch": 5.48590660496424, + "grad_norm": 0.8755383491516113, + "learning_rate": 0.0002, + "loss": 1.2523, + "step": 6520 + }, + { + "epoch": 5.4943205721497685, + "grad_norm": 0.980212390422821, + "learning_rate": 0.0002, + "loss": 1.3042, + "step": 6530 + }, + { + "epoch": 5.502734539335297, + "grad_norm": 0.9356869459152222, + "learning_rate": 0.0002, + "loss": 1.2873, + "step": 6540 + }, + { + "epoch": 5.511148506520825, + "grad_norm": 0.9008095264434814, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 6550 + }, + { + "epoch": 5.519562473706353, + "grad_norm": 0.8908938765525818, + "learning_rate": 0.0002, + "loss": 1.2818, + "step": 6560 + }, + { + "epoch": 5.52797644089188, + "grad_norm": 1.1423932313919067, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 6570 + }, + { + "epoch": 5.536390408077408, + "grad_norm": 1.0508161783218384, + "learning_rate": 0.0002, + "loss": 1.3039, + "step": 6580 + }, + { + "epoch": 5.544804375262936, + "grad_norm": 0.8357517719268799, + "learning_rate": 0.0002, + "loss": 1.2446, + "step": 6590 + }, + { + "epoch": 5.5532183424484645, + "grad_norm": 0.9892540574073792, + "learning_rate": 0.0002, + "loss": 1.3037, + "step": 6600 + }, + { + "epoch": 5.561632309633993, + "grad_norm": 1.0048326253890991, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 6610 + }, + { + "epoch": 5.570046276819521, + "grad_norm": 0.9801995158195496, + "learning_rate": 0.0002, + "loss": 1.2152, + "step": 6620 + }, + { + "epoch": 5.578460244005049, + "grad_norm": 0.9899214506149292, + "learning_rate": 0.0002, + "loss": 1.2606, + "step": 6630 + }, + { + "epoch": 5.586874211190576, + "grad_norm": 1.1911814212799072, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 6640 + }, + { + "epoch": 5.595288178376104, + "grad_norm": 1.0368894338607788, + "learning_rate": 0.0002, + "loss": 1.3458, + "step": 6650 + }, + { + "epoch": 5.603702145561632, + "grad_norm": 1.1248382329940796, + "learning_rate": 0.0002, + "loss": 1.2595, + "step": 6660 + }, + { + "epoch": 5.61211611274716, + "grad_norm": 0.9765539765357971, + "learning_rate": 0.0002, + "loss": 1.2548, + "step": 6670 + }, + { + "epoch": 5.6205300799326885, + "grad_norm": 0.9810206890106201, + "learning_rate": 0.0002, + "loss": 1.3451, + "step": 6680 + }, + { + "epoch": 5.628944047118217, + "grad_norm": 1.100386619567871, + "learning_rate": 0.0002, + "loss": 1.2952, + "step": 6690 + }, + { + "epoch": 5.637358014303744, + "grad_norm": 0.8824519515037537, + "learning_rate": 0.0002, + "loss": 1.2467, + "step": 6700 + }, + { + "epoch": 5.645771981489272, + "grad_norm": 1.0864064693450928, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 6710 + }, + { + "epoch": 5.6541859486748, + "grad_norm": 1.1614511013031006, + "learning_rate": 0.0002, + "loss": 1.2479, + "step": 6720 + }, + { + "epoch": 5.662599915860328, + "grad_norm": 1.0762972831726074, + "learning_rate": 0.0002, + "loss": 1.2753, + "step": 6730 + }, + { + "epoch": 5.671013883045856, + "grad_norm": 0.9408974647521973, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 6740 + }, + { + "epoch": 5.679427850231384, + "grad_norm": 0.8906030058860779, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 6750 + }, + { + "epoch": 5.687841817416912, + "grad_norm": 0.9527303576469421, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 6760 + }, + { + "epoch": 5.69625578460244, + "grad_norm": 0.9471196532249451, + "learning_rate": 0.0002, + "loss": 1.322, + "step": 6770 + }, + { + "epoch": 5.704669751787968, + "grad_norm": 0.9186838865280151, + "learning_rate": 0.0002, + "loss": 1.2514, + "step": 6780 + }, + { + "epoch": 5.713083718973496, + "grad_norm": 0.9225441813468933, + "learning_rate": 0.0002, + "loss": 1.2347, + "step": 6790 + }, + { + "epoch": 5.721497686159024, + "grad_norm": 0.9712982773780823, + "learning_rate": 0.0002, + "loss": 1.1849, + "step": 6800 + }, + { + "epoch": 5.729911653344552, + "grad_norm": 1.0743170976638794, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 6810 + }, + { + "epoch": 5.73832562053008, + "grad_norm": 1.2738113403320312, + "learning_rate": 0.0002, + "loss": 1.2136, + "step": 6820 + }, + { + "epoch": 5.7467395877156076, + "grad_norm": 0.9386790990829468, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 6830 + }, + { + "epoch": 5.755153554901136, + "grad_norm": 1.0817769765853882, + "learning_rate": 0.0002, + "loss": 1.285, + "step": 6840 + }, + { + "epoch": 5.763567522086664, + "grad_norm": 1.1040263175964355, + "learning_rate": 0.0002, + "loss": 1.2247, + "step": 6850 + }, + { + "epoch": 5.771981489272192, + "grad_norm": 1.0656492710113525, + "learning_rate": 0.0002, + "loss": 1.2507, + "step": 6860 + }, + { + "epoch": 5.78039545645772, + "grad_norm": 0.9550157189369202, + "learning_rate": 0.0002, + "loss": 1.2999, + "step": 6870 + }, + { + "epoch": 5.788809423643248, + "grad_norm": 1.0130870342254639, + "learning_rate": 0.0002, + "loss": 1.3201, + "step": 6880 + }, + { + "epoch": 5.797223390828776, + "grad_norm": 1.0675787925720215, + "learning_rate": 0.0002, + "loss": 1.3392, + "step": 6890 + }, + { + "epoch": 5.8056373580143035, + "grad_norm": 0.9537774920463562, + "learning_rate": 0.0002, + "loss": 1.2949, + "step": 6900 + }, + { + "epoch": 5.814051325199832, + "grad_norm": 0.9640319347381592, + "learning_rate": 0.0002, + "loss": 1.2658, + "step": 6910 + }, + { + "epoch": 5.82246529238536, + "grad_norm": 0.8917992115020752, + "learning_rate": 0.0002, + "loss": 1.2199, + "step": 6920 + }, + { + "epoch": 5.830879259570888, + "grad_norm": 0.9881822466850281, + "learning_rate": 0.0002, + "loss": 1.373, + "step": 6930 + }, + { + "epoch": 5.839293226756416, + "grad_norm": 0.9136882424354553, + "learning_rate": 0.0002, + "loss": 1.323, + "step": 6940 + }, + { + "epoch": 5.847707193941943, + "grad_norm": 0.9086098074913025, + "learning_rate": 0.0002, + "loss": 1.3159, + "step": 6950 + }, + { + "epoch": 5.856121161127471, + "grad_norm": 0.9443018436431885, + "learning_rate": 0.0002, + "loss": 1.2624, + "step": 6960 + }, + { + "epoch": 5.864535128312999, + "grad_norm": 0.9915381669998169, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 6970 + }, + { + "epoch": 5.8729490954985275, + "grad_norm": 0.8939146995544434, + "learning_rate": 0.0002, + "loss": 1.337, + "step": 6980 + }, + { + "epoch": 5.881363062684056, + "grad_norm": 1.3672245740890503, + "learning_rate": 0.0002, + "loss": 1.2611, + "step": 6990 + }, + { + "epoch": 5.889777029869584, + "grad_norm": 1.0116257667541504, + "learning_rate": 0.0002, + "loss": 1.3012, + "step": 7000 + }, + { + "epoch": 5.898190997055112, + "grad_norm": 1.1561565399169922, + "learning_rate": 0.0002, + "loss": 1.3128, + "step": 7010 + }, + { + "epoch": 5.906604964240639, + "grad_norm": 0.9900678992271423, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 7020 + }, + { + "epoch": 5.915018931426167, + "grad_norm": 0.9297345876693726, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 7030 + }, + { + "epoch": 5.923432898611695, + "grad_norm": 0.9357825517654419, + "learning_rate": 0.0002, + "loss": 1.2317, + "step": 7040 + }, + { + "epoch": 5.931846865797223, + "grad_norm": 1.049317717552185, + "learning_rate": 0.0002, + "loss": 1.2303, + "step": 7050 + }, + { + "epoch": 5.9402608329827515, + "grad_norm": 0.950633704662323, + "learning_rate": 0.0002, + "loss": 1.3243, + "step": 7060 + }, + { + "epoch": 5.94867480016828, + "grad_norm": 0.854581892490387, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 7070 + }, + { + "epoch": 5.957088767353808, + "grad_norm": 0.9097039699554443, + "learning_rate": 0.0002, + "loss": 1.3252, + "step": 7080 + }, + { + "epoch": 5.965502734539335, + "grad_norm": 0.9072173237800598, + "learning_rate": 0.0002, + "loss": 1.291, + "step": 7090 + }, + { + "epoch": 5.973916701724863, + "grad_norm": 1.0470727682113647, + "learning_rate": 0.0002, + "loss": 1.2724, + "step": 7100 + }, + { + "epoch": 5.982330668910391, + "grad_norm": 1.2628462314605713, + "learning_rate": 0.0002, + "loss": 1.3324, + "step": 7110 + }, + { + "epoch": 5.990744636095919, + "grad_norm": 1.055279016494751, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 7120 + }, + { + "epoch": 5.9991586032814475, + "grad_norm": 0.966194212436676, + "learning_rate": 0.0002, + "loss": 1.3234, + "step": 7130 + }, + { + "epoch": 6.0, + "eval_loss": 2.0427448749542236, + "eval_runtime": 37.8426, + "eval_samples_per_second": 13.609, + "eval_steps_per_second": 1.718, + "step": 7131 + }, + { + "epoch": 6.007572570466976, + "grad_norm": 1.4037928581237793, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 7140 + }, + { + "epoch": 6.015986537652503, + "grad_norm": 1.1081010103225708, + "learning_rate": 0.0002, + "loss": 1.047, + "step": 7150 + }, + { + "epoch": 6.024400504838031, + "grad_norm": 1.1585499048233032, + "learning_rate": 0.0002, + "loss": 1.1368, + "step": 7160 + }, + { + "epoch": 6.032814472023559, + "grad_norm": 1.0822780132293701, + "learning_rate": 0.0002, + "loss": 1.0192, + "step": 7170 + }, + { + "epoch": 6.041228439209087, + "grad_norm": 0.9662094712257385, + "learning_rate": 0.0002, + "loss": 1.0755, + "step": 7180 + }, + { + "epoch": 6.049642406394615, + "grad_norm": 1.063936710357666, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 7190 + }, + { + "epoch": 6.058056373580143, + "grad_norm": 1.0349032878875732, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 7200 + }, + { + "epoch": 6.066470340765671, + "grad_norm": 1.0312575101852417, + "learning_rate": 0.0002, + "loss": 1.0591, + "step": 7210 + }, + { + "epoch": 6.074884307951199, + "grad_norm": 1.1942846775054932, + "learning_rate": 0.0002, + "loss": 1.1824, + "step": 7220 + }, + { + "epoch": 6.083298275136727, + "grad_norm": 1.0816049575805664, + "learning_rate": 0.0002, + "loss": 1.1034, + "step": 7230 + }, + { + "epoch": 6.091712242322255, + "grad_norm": 0.9985513687133789, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 7240 + }, + { + "epoch": 6.100126209507783, + "grad_norm": 1.2573972940444946, + "learning_rate": 0.0002, + "loss": 1.0367, + "step": 7250 + }, + { + "epoch": 6.108540176693311, + "grad_norm": 1.1182395219802856, + "learning_rate": 0.0002, + "loss": 1.1051, + "step": 7260 + }, + { + "epoch": 6.116954143878839, + "grad_norm": 0.9679344296455383, + "learning_rate": 0.0002, + "loss": 1.1219, + "step": 7270 + }, + { + "epoch": 6.1253681110643665, + "grad_norm": 1.0913981199264526, + "learning_rate": 0.0002, + "loss": 1.1192, + "step": 7280 + }, + { + "epoch": 6.133782078249895, + "grad_norm": 1.1291013956069946, + "learning_rate": 0.0002, + "loss": 1.0411, + "step": 7290 + }, + { + "epoch": 6.142196045435423, + "grad_norm": 1.2679595947265625, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 7300 + }, + { + "epoch": 6.150610012620951, + "grad_norm": 1.2350026369094849, + "learning_rate": 0.0002, + "loss": 1.0875, + "step": 7310 + }, + { + "epoch": 6.159023979806479, + "grad_norm": 1.3213104009628296, + "learning_rate": 0.0002, + "loss": 1.1139, + "step": 7320 + }, + { + "epoch": 6.167437946992007, + "grad_norm": 1.1924850940704346, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 7330 + }, + { + "epoch": 6.175851914177534, + "grad_norm": 1.1890000104904175, + "learning_rate": 0.0002, + "loss": 1.1242, + "step": 7340 + }, + { + "epoch": 6.184265881363062, + "grad_norm": 1.3821455240249634, + "learning_rate": 0.0002, + "loss": 1.1341, + "step": 7350 + }, + { + "epoch": 6.1926798485485905, + "grad_norm": 1.1217057704925537, + "learning_rate": 0.0002, + "loss": 1.0748, + "step": 7360 + }, + { + "epoch": 6.201093815734119, + "grad_norm": 1.2441548109054565, + "learning_rate": 0.0002, + "loss": 1.159, + "step": 7370 + }, + { + "epoch": 6.209507782919647, + "grad_norm": 1.0837615728378296, + "learning_rate": 0.0002, + "loss": 1.1199, + "step": 7380 + }, + { + "epoch": 6.217921750105175, + "grad_norm": 1.164304256439209, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 7390 + }, + { + "epoch": 6.226335717290702, + "grad_norm": 1.3129467964172363, + "learning_rate": 0.0002, + "loss": 1.1325, + "step": 7400 + }, + { + "epoch": 6.23474968447623, + "grad_norm": 1.1938153505325317, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 7410 + }, + { + "epoch": 6.243163651661758, + "grad_norm": 1.4348443746566772, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 7420 + }, + { + "epoch": 6.2515776188472865, + "grad_norm": 1.132301926612854, + "learning_rate": 0.0002, + "loss": 1.0778, + "step": 7430 + }, + { + "epoch": 6.259991586032815, + "grad_norm": 1.136966586112976, + "learning_rate": 0.0002, + "loss": 1.1148, + "step": 7440 + }, + { + "epoch": 6.268405553218343, + "grad_norm": 1.12801194190979, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 7450 + }, + { + "epoch": 6.276819520403871, + "grad_norm": 1.0246902704238892, + "learning_rate": 0.0002, + "loss": 1.0408, + "step": 7460 + }, + { + "epoch": 6.285233487589398, + "grad_norm": 1.1066974401474, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 7470 + }, + { + "epoch": 6.293647454774926, + "grad_norm": 1.012710690498352, + "learning_rate": 0.0002, + "loss": 1.1589, + "step": 7480 + }, + { + "epoch": 6.302061421960454, + "grad_norm": 1.2227119207382202, + "learning_rate": 0.0002, + "loss": 1.1049, + "step": 7490 + }, + { + "epoch": 6.310475389145982, + "grad_norm": 0.9736923575401306, + "learning_rate": 0.0002, + "loss": 1.1376, + "step": 7500 + }, + { + "epoch": 6.3188893563315105, + "grad_norm": 1.2945268154144287, + "learning_rate": 0.0002, + "loss": 1.1017, + "step": 7510 + }, + { + "epoch": 6.327303323517039, + "grad_norm": 1.1579312086105347, + "learning_rate": 0.0002, + "loss": 1.0724, + "step": 7520 + }, + { + "epoch": 6.335717290702567, + "grad_norm": 1.2404558658599854, + "learning_rate": 0.0002, + "loss": 1.0899, + "step": 7530 + }, + { + "epoch": 6.344131257888094, + "grad_norm": 1.4673258066177368, + "learning_rate": 0.0002, + "loss": 1.1635, + "step": 7540 + }, + { + "epoch": 6.352545225073622, + "grad_norm": 1.2268997430801392, + "learning_rate": 0.0002, + "loss": 1.128, + "step": 7550 + }, + { + "epoch": 6.36095919225915, + "grad_norm": 0.9772747159004211, + "learning_rate": 0.0002, + "loss": 1.0932, + "step": 7560 + }, + { + "epoch": 6.369373159444678, + "grad_norm": 1.0205204486846924, + "learning_rate": 0.0002, + "loss": 1.1214, + "step": 7570 + }, + { + "epoch": 6.377787126630206, + "grad_norm": 1.2227109670639038, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 7580 + }, + { + "epoch": 6.3862010938157345, + "grad_norm": 1.0708507299423218, + "learning_rate": 0.0002, + "loss": 1.1115, + "step": 7590 + }, + { + "epoch": 6.394615061001262, + "grad_norm": 1.1427522897720337, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 7600 + }, + { + "epoch": 6.40302902818679, + "grad_norm": 1.0706431865692139, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 7610 + }, + { + "epoch": 6.411442995372318, + "grad_norm": 1.1358282566070557, + "learning_rate": 0.0002, + "loss": 1.0933, + "step": 7620 + }, + { + "epoch": 6.419856962557846, + "grad_norm": 1.4011822938919067, + "learning_rate": 0.0002, + "loss": 1.1075, + "step": 7630 + }, + { + "epoch": 6.428270929743374, + "grad_norm": 1.5616450309753418, + "learning_rate": 0.0002, + "loss": 1.1269, + "step": 7640 + }, + { + "epoch": 6.436684896928902, + "grad_norm": 1.1442687511444092, + "learning_rate": 0.0002, + "loss": 1.0953, + "step": 7650 + }, + { + "epoch": 6.44509886411443, + "grad_norm": 1.164803147315979, + "learning_rate": 0.0002, + "loss": 1.1341, + "step": 7660 + }, + { + "epoch": 6.453512831299958, + "grad_norm": 1.3184553384780884, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 7670 + }, + { + "epoch": 6.461926798485486, + "grad_norm": 1.2701894044876099, + "learning_rate": 0.0002, + "loss": 1.1526, + "step": 7680 + }, + { + "epoch": 6.470340765671014, + "grad_norm": 1.1998416185379028, + "learning_rate": 0.0002, + "loss": 1.2119, + "step": 7690 + }, + { + "epoch": 6.478754732856542, + "grad_norm": 1.156459927558899, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 7700 + }, + { + "epoch": 6.48716870004207, + "grad_norm": 1.0217190980911255, + "learning_rate": 0.0002, + "loss": 1.2122, + "step": 7710 + }, + { + "epoch": 6.495582667227598, + "grad_norm": 1.230372428894043, + "learning_rate": 0.0002, + "loss": 1.0917, + "step": 7720 + }, + { + "epoch": 6.5039966344131255, + "grad_norm": 1.105675220489502, + "learning_rate": 0.0002, + "loss": 1.119, + "step": 7730 + }, + { + "epoch": 6.512410601598654, + "grad_norm": 1.1623669862747192, + "learning_rate": 0.0002, + "loss": 1.0758, + "step": 7740 + }, + { + "epoch": 6.520824568784182, + "grad_norm": 1.2884684801101685, + "learning_rate": 0.0002, + "loss": 1.1548, + "step": 7750 + }, + { + "epoch": 6.52923853596971, + "grad_norm": 1.1785279512405396, + "learning_rate": 0.0002, + "loss": 1.142, + "step": 7760 + }, + { + "epoch": 6.537652503155238, + "grad_norm": 1.0607101917266846, + "learning_rate": 0.0002, + "loss": 1.1598, + "step": 7770 + }, + { + "epoch": 6.546066470340766, + "grad_norm": 1.21990168094635, + "learning_rate": 0.0002, + "loss": 1.1472, + "step": 7780 + }, + { + "epoch": 6.554480437526293, + "grad_norm": 1.1498621702194214, + "learning_rate": 0.0002, + "loss": 1.1468, + "step": 7790 + }, + { + "epoch": 6.562894404711821, + "grad_norm": 1.263929009437561, + "learning_rate": 0.0002, + "loss": 1.1847, + "step": 7800 + }, + { + "epoch": 6.5713083718973495, + "grad_norm": 1.1580625772476196, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 7810 + }, + { + "epoch": 6.579722339082878, + "grad_norm": 1.4431294202804565, + "learning_rate": 0.0002, + "loss": 1.1313, + "step": 7820 + }, + { + "epoch": 6.588136306268406, + "grad_norm": 1.1309990882873535, + "learning_rate": 0.0002, + "loss": 1.1944, + "step": 7830 + }, + { + "epoch": 6.596550273453934, + "grad_norm": 1.0543386936187744, + "learning_rate": 0.0002, + "loss": 1.1156, + "step": 7840 + }, + { + "epoch": 6.604964240639461, + "grad_norm": 1.2180639505386353, + "learning_rate": 0.0002, + "loss": 1.0945, + "step": 7850 + }, + { + "epoch": 6.613378207824989, + "grad_norm": 1.0631271600723267, + "learning_rate": 0.0002, + "loss": 1.1318, + "step": 7860 + }, + { + "epoch": 6.621792175010517, + "grad_norm": 1.138885498046875, + "learning_rate": 0.0002, + "loss": 1.1792, + "step": 7870 + }, + { + "epoch": 6.630206142196045, + "grad_norm": 1.1117745637893677, + "learning_rate": 0.0002, + "loss": 1.1805, + "step": 7880 + }, + { + "epoch": 6.6386201093815735, + "grad_norm": 1.3734886646270752, + "learning_rate": 0.0002, + "loss": 1.15, + "step": 7890 + }, + { + "epoch": 6.647034076567102, + "grad_norm": 1.236003041267395, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 7900 + }, + { + "epoch": 6.65544804375263, + "grad_norm": 1.2206000089645386, + "learning_rate": 0.0002, + "loss": 1.1718, + "step": 7910 + }, + { + "epoch": 6.663862010938157, + "grad_norm": 1.2842656373977661, + "learning_rate": 0.0002, + "loss": 1.1637, + "step": 7920 + }, + { + "epoch": 6.672275978123685, + "grad_norm": 1.2365005016326904, + "learning_rate": 0.0002, + "loss": 1.2219, + "step": 7930 + }, + { + "epoch": 6.680689945309213, + "grad_norm": 1.256620168685913, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 7940 + }, + { + "epoch": 6.689103912494741, + "grad_norm": 1.3232917785644531, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 7950 + }, + { + "epoch": 6.6975178796802695, + "grad_norm": 1.2470088005065918, + "learning_rate": 0.0002, + "loss": 1.2042, + "step": 7960 + }, + { + "epoch": 6.705931846865798, + "grad_norm": 1.0511926412582397, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 7970 + }, + { + "epoch": 6.714345814051326, + "grad_norm": 1.107310175895691, + "learning_rate": 0.0002, + "loss": 1.118, + "step": 7980 + }, + { + "epoch": 6.722759781236853, + "grad_norm": 1.4069843292236328, + "learning_rate": 0.0002, + "loss": 1.2109, + "step": 7990 + }, + { + "epoch": 6.731173748422381, + "grad_norm": 1.0800836086273193, + "learning_rate": 0.0002, + "loss": 1.1298, + "step": 8000 + }, + { + "epoch": 6.739587715607909, + "grad_norm": 1.1676300764083862, + "learning_rate": 0.0002, + "loss": 1.1824, + "step": 8010 + }, + { + "epoch": 6.748001682793437, + "grad_norm": 1.0579663515090942, + "learning_rate": 0.0002, + "loss": 1.1253, + "step": 8020 + }, + { + "epoch": 6.756415649978965, + "grad_norm": 1.2770029306411743, + "learning_rate": 0.0002, + "loss": 1.1542, + "step": 8030 + }, + { + "epoch": 6.764829617164493, + "grad_norm": 1.0981038808822632, + "learning_rate": 0.0002, + "loss": 1.1519, + "step": 8040 + }, + { + "epoch": 6.773243584350021, + "grad_norm": 1.1194742918014526, + "learning_rate": 0.0002, + "loss": 1.1422, + "step": 8050 + }, + { + "epoch": 6.781657551535549, + "grad_norm": 1.0130012035369873, + "learning_rate": 0.0002, + "loss": 1.1463, + "step": 8060 + }, + { + "epoch": 6.790071518721077, + "grad_norm": 1.2051167488098145, + "learning_rate": 0.0002, + "loss": 1.2008, + "step": 8070 + }, + { + "epoch": 6.798485485906605, + "grad_norm": 1.095689058303833, + "learning_rate": 0.0002, + "loss": 1.142, + "step": 8080 + }, + { + "epoch": 6.806899453092133, + "grad_norm": 1.2275174856185913, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 8090 + }, + { + "epoch": 6.815313420277661, + "grad_norm": 1.1439805030822754, + "learning_rate": 0.0002, + "loss": 1.1453, + "step": 8100 + }, + { + "epoch": 6.8237273874631885, + "grad_norm": 1.276331901550293, + "learning_rate": 0.0002, + "loss": 1.1624, + "step": 8110 + }, + { + "epoch": 6.832141354648717, + "grad_norm": 1.0450139045715332, + "learning_rate": 0.0002, + "loss": 1.1686, + "step": 8120 + }, + { + "epoch": 6.840555321834245, + "grad_norm": 1.1189453601837158, + "learning_rate": 0.0002, + "loss": 1.1783, + "step": 8130 + }, + { + "epoch": 6.848969289019773, + "grad_norm": 1.194640874862671, + "learning_rate": 0.0002, + "loss": 1.1093, + "step": 8140 + }, + { + "epoch": 6.857383256205301, + "grad_norm": 1.095372200012207, + "learning_rate": 0.0002, + "loss": 1.1559, + "step": 8150 + }, + { + "epoch": 6.865797223390829, + "grad_norm": 1.2416104078292847, + "learning_rate": 0.0002, + "loss": 1.165, + "step": 8160 + }, + { + "epoch": 6.874211190576357, + "grad_norm": 1.2402868270874023, + "learning_rate": 0.0002, + "loss": 1.2174, + "step": 8170 + }, + { + "epoch": 6.882625157761884, + "grad_norm": 1.1317291259765625, + "learning_rate": 0.0002, + "loss": 1.1306, + "step": 8180 + }, + { + "epoch": 6.8910391249474126, + "grad_norm": 1.0581914186477661, + "learning_rate": 0.0002, + "loss": 1.1944, + "step": 8190 + }, + { + "epoch": 6.899453092132941, + "grad_norm": 1.3540890216827393, + "learning_rate": 0.0002, + "loss": 1.1271, + "step": 8200 + }, + { + "epoch": 6.907867059318469, + "grad_norm": 1.213672399520874, + "learning_rate": 0.0002, + "loss": 1.2119, + "step": 8210 + }, + { + "epoch": 6.916281026503997, + "grad_norm": 1.2654485702514648, + "learning_rate": 0.0002, + "loss": 1.1406, + "step": 8220 + }, + { + "epoch": 6.924694993689524, + "grad_norm": 1.203903317451477, + "learning_rate": 0.0002, + "loss": 1.205, + "step": 8230 + }, + { + "epoch": 6.933108960875052, + "grad_norm": 1.1332030296325684, + "learning_rate": 0.0002, + "loss": 1.1635, + "step": 8240 + }, + { + "epoch": 6.94152292806058, + "grad_norm": 1.2699192762374878, + "learning_rate": 0.0002, + "loss": 1.1148, + "step": 8250 + }, + { + "epoch": 6.9499368952461085, + "grad_norm": 1.2728958129882812, + "learning_rate": 0.0002, + "loss": 1.1831, + "step": 8260 + }, + { + "epoch": 6.958350862431637, + "grad_norm": 1.238410472869873, + "learning_rate": 0.0002, + "loss": 1.1757, + "step": 8270 + }, + { + "epoch": 6.966764829617165, + "grad_norm": 1.403863549232483, + "learning_rate": 0.0002, + "loss": 1.1499, + "step": 8280 + }, + { + "epoch": 6.975178796802693, + "grad_norm": 1.1096396446228027, + "learning_rate": 0.0002, + "loss": 1.1515, + "step": 8290 + }, + { + "epoch": 6.98359276398822, + "grad_norm": 1.1043379306793213, + "learning_rate": 0.0002, + "loss": 1.2049, + "step": 8300 + }, + { + "epoch": 6.992006731173748, + "grad_norm": 1.391754388809204, + "learning_rate": 0.0002, + "loss": 1.1255, + "step": 8310 + }, + { + "epoch": 6.999579301640724, + "eval_loss": 2.1421656608581543, + "eval_runtime": 37.8262, + "eval_samples_per_second": 13.615, + "eval_steps_per_second": 1.718, + "step": 8319 + } + ], + "logging_steps": 10, + "max_steps": 9504, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.850078318850212e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1502d478cfbb1424f707352d007b740bde5e373 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-8319/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df2b79d3acefeedef5a0229881de39ec68ef9b40046a60d7976a49f7e6b3b936 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..877a1445c41083bf3775e855f5f8d04a996d35e4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1550273ea68bc5a4e788e74c1c78bf524ec1b61043bf5f20b357927ff807a309 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c78b121fcf1c09de960a5ad8805a36487a4ebbd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f50e70ec2342f84423f0873190aa676fd8b4c8f338b92928987c390474e1f228 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..466d6085e3ccf304eda44eb2738464a7b9bafcd5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92900cc70e521535f52ac8e843c2753bcf71d5073db21bdc901368b594c00657 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..affbf7ce62b8cef2cd803c35235c37107c3a7913 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f92060c6b01adb16ebce9a8e91b1e7112c58c4535a075036e50137831397c75 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6be0e4e303145eb3bbd12fbdaf5eb56198a31cd4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/trainer_state.json @@ -0,0 +1,6747 @@ +{ + "best_metric": 1.8055059909820557, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377", + "epoch": 7.996634413125789, + "eval_steps": 10, + "global_step": 9504, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008413967185527976, + "grad_norm": 0.5458821654319763, + "learning_rate": 0.0002, + "loss": 2.56, + "step": 10 + }, + { + "epoch": 0.016827934371055953, + "grad_norm": 0.7293308973312378, + "learning_rate": 0.0002, + "loss": 2.3235, + "step": 20 + }, + { + "epoch": 0.02524190155658393, + "grad_norm": 0.47792306542396545, + "learning_rate": 0.0002, + "loss": 2.0815, + "step": 30 + }, + { + "epoch": 0.033655868742111905, + "grad_norm": 0.5944402813911438, + "learning_rate": 0.0002, + "loss": 1.9718, + "step": 40 + }, + { + "epoch": 0.04206983592763988, + "grad_norm": 0.5415359735488892, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 50 + }, + { + "epoch": 0.05048380311316786, + "grad_norm": 0.535713791847229, + "learning_rate": 0.0002, + "loss": 1.8953, + "step": 60 + }, + { + "epoch": 0.058897770298695834, + "grad_norm": 0.5184146761894226, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 70 + }, + { + "epoch": 0.06731173748422381, + "grad_norm": 0.458926796913147, + "learning_rate": 0.0002, + "loss": 1.8396, + "step": 80 + }, + { + "epoch": 0.07572570466975179, + "grad_norm": 0.4780142307281494, + "learning_rate": 0.0002, + "loss": 1.8677, + "step": 90 + }, + { + "epoch": 0.08413967185527976, + "grad_norm": 0.79965740442276, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 100 + }, + { + "epoch": 0.09255363904080774, + "grad_norm": 0.4498862028121948, + "learning_rate": 0.0002, + "loss": 1.9081, + "step": 110 + }, + { + "epoch": 0.10096760622633572, + "grad_norm": 0.39338430762290955, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 120 + }, + { + "epoch": 0.10938157341186369, + "grad_norm": 0.9588953852653503, + "learning_rate": 0.0002, + "loss": 1.8637, + "step": 130 + }, + { + "epoch": 0.11779554059739167, + "grad_norm": 0.41675639152526855, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 140 + }, + { + "epoch": 0.12620950778291964, + "grad_norm": 0.44519832730293274, + "learning_rate": 0.0002, + "loss": 1.8904, + "step": 150 + }, + { + "epoch": 0.13462347496844762, + "grad_norm": 0.4176260530948639, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 160 + }, + { + "epoch": 0.1430374421539756, + "grad_norm": 0.35840365290641785, + "learning_rate": 0.0002, + "loss": 1.8398, + "step": 170 + }, + { + "epoch": 0.15145140933950357, + "grad_norm": 0.3794495463371277, + "learning_rate": 0.0002, + "loss": 1.8666, + "step": 180 + }, + { + "epoch": 0.15986537652503155, + "grad_norm": 0.4563522934913635, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 190 + }, + { + "epoch": 0.16827934371055953, + "grad_norm": 0.37057486176490784, + "learning_rate": 0.0002, + "loss": 1.8893, + "step": 200 + }, + { + "epoch": 0.1766933108960875, + "grad_norm": 0.44081518054008484, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 210 + }, + { + "epoch": 0.18510727808161548, + "grad_norm": 0.46078577637672424, + "learning_rate": 0.0002, + "loss": 1.9048, + "step": 220 + }, + { + "epoch": 0.19352124526714345, + "grad_norm": 0.36132094264030457, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 230 + }, + { + "epoch": 0.20193521245267143, + "grad_norm": 0.3747289180755615, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 240 + }, + { + "epoch": 0.2103491796381994, + "grad_norm": 0.3540179133415222, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 250 + }, + { + "epoch": 0.21876314682372738, + "grad_norm": 0.3461375832557678, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 260 + }, + { + "epoch": 0.22717711400925536, + "grad_norm": 0.3436960279941559, + "learning_rate": 0.0002, + "loss": 1.8509, + "step": 270 + }, + { + "epoch": 0.23559108119478334, + "grad_norm": 0.35403719544410706, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 280 + }, + { + "epoch": 0.2440050483803113, + "grad_norm": 0.37142616510391235, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 290 + }, + { + "epoch": 0.2524190155658393, + "grad_norm": 0.3307955861091614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 300 + }, + { + "epoch": 0.2608329827513673, + "grad_norm": 0.32855314016342163, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 310 + }, + { + "epoch": 0.26924694993689524, + "grad_norm": 0.3299003839492798, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 320 + }, + { + "epoch": 0.27766091712242325, + "grad_norm": 0.44311287999153137, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 330 + }, + { + "epoch": 0.2860748843079512, + "grad_norm": 0.32989758253097534, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 340 + }, + { + "epoch": 0.2944888514934792, + "grad_norm": 0.34400200843811035, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 350 + }, + { + "epoch": 0.30290281867900715, + "grad_norm": 0.36286211013793945, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 360 + }, + { + "epoch": 0.31131678586453515, + "grad_norm": 0.406827837228775, + "learning_rate": 0.0002, + "loss": 1.8025, + "step": 370 + }, + { + "epoch": 0.3197307530500631, + "grad_norm": 0.36299195885658264, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 380 + }, + { + "epoch": 0.3281447202355911, + "grad_norm": 0.3477257192134857, + "learning_rate": 0.0002, + "loss": 1.837, + "step": 390 + }, + { + "epoch": 0.33655868742111905, + "grad_norm": 0.3730369210243225, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 400 + }, + { + "epoch": 0.34497265460664706, + "grad_norm": 0.4644559919834137, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 410 + }, + { + "epoch": 0.353386621792175, + "grad_norm": 0.406576544046402, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 420 + }, + { + "epoch": 0.361800588977703, + "grad_norm": 0.3612699508666992, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 430 + }, + { + "epoch": 0.37021455616323096, + "grad_norm": 0.3243742287158966, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 440 + }, + { + "epoch": 0.37862852334875896, + "grad_norm": 0.36671221256256104, + "learning_rate": 0.0002, + "loss": 1.8851, + "step": 450 + }, + { + "epoch": 0.3870424905342869, + "grad_norm": 0.3565002381801605, + "learning_rate": 0.0002, + "loss": 1.8853, + "step": 460 + }, + { + "epoch": 0.3954564577198149, + "grad_norm": 0.34630221128463745, + "learning_rate": 0.0002, + "loss": 1.8923, + "step": 470 + }, + { + "epoch": 0.40387042490534286, + "grad_norm": 0.3353537321090698, + "learning_rate": 0.0002, + "loss": 1.8234, + "step": 480 + }, + { + "epoch": 0.41228439209087087, + "grad_norm": 0.4015921950340271, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 490 + }, + { + "epoch": 0.4206983592763988, + "grad_norm": 0.5489419102668762, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 500 + }, + { + "epoch": 0.4291123264619268, + "grad_norm": 0.4193589985370636, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 510 + }, + { + "epoch": 0.43752629364745477, + "grad_norm": 0.3418922424316406, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 520 + }, + { + "epoch": 0.44594026083298277, + "grad_norm": 0.32668185234069824, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 530 + }, + { + "epoch": 0.4543542280185107, + "grad_norm": 0.3094325661659241, + "learning_rate": 0.0002, + "loss": 1.7501, + "step": 540 + }, + { + "epoch": 0.4627681952040387, + "grad_norm": 0.3743017315864563, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 550 + }, + { + "epoch": 0.47118216238956667, + "grad_norm": 0.3295630216598511, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 560 + }, + { + "epoch": 0.4795961295750947, + "grad_norm": 1.6124513149261475, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 570 + }, + { + "epoch": 0.4880100967606226, + "grad_norm": 0.3245585858821869, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 580 + }, + { + "epoch": 0.49642406394615063, + "grad_norm": 0.3332934081554413, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 590 + }, + { + "epoch": 0.5048380311316786, + "grad_norm": 0.3836138844490051, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 600 + }, + { + "epoch": 0.5132519983172066, + "grad_norm": 0.32953888177871704, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 610 + }, + { + "epoch": 0.5216659655027346, + "grad_norm": 0.36291512846946716, + "learning_rate": 0.0002, + "loss": 1.7729, + "step": 620 + }, + { + "epoch": 0.5300799326882625, + "grad_norm": 0.3237783908843994, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 630 + }, + { + "epoch": 0.5384938998737905, + "grad_norm": 0.38882696628570557, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 640 + }, + { + "epoch": 0.5469078670593185, + "grad_norm": 0.37821972370147705, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 650 + }, + { + "epoch": 0.5553218342448465, + "grad_norm": 0.3556285500526428, + "learning_rate": 0.0002, + "loss": 1.8075, + "step": 660 + }, + { + "epoch": 0.5637358014303744, + "grad_norm": 0.347499281167984, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 670 + }, + { + "epoch": 0.5721497686159024, + "grad_norm": 0.3176489472389221, + "learning_rate": 0.0002, + "loss": 1.8066, + "step": 680 + }, + { + "epoch": 0.5805637358014304, + "grad_norm": 0.30220088362693787, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 690 + }, + { + "epoch": 0.5889777029869584, + "grad_norm": 0.3711601793766022, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 700 + }, + { + "epoch": 0.5973916701724863, + "grad_norm": 0.3311759829521179, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 710 + }, + { + "epoch": 0.6058056373580143, + "grad_norm": 0.34824270009994507, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 720 + }, + { + "epoch": 0.6142196045435423, + "grad_norm": 0.29668381810188293, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 730 + }, + { + "epoch": 0.6226335717290703, + "grad_norm": 0.36087489128112793, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 740 + }, + { + "epoch": 0.6310475389145982, + "grad_norm": 0.31590089201927185, + "learning_rate": 0.0002, + "loss": 1.7956, + "step": 750 + }, + { + "epoch": 0.6394615061001262, + "grad_norm": 0.37632957100868225, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 760 + }, + { + "epoch": 0.6478754732856542, + "grad_norm": 0.3360748589038849, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 770 + }, + { + "epoch": 0.6562894404711822, + "grad_norm": 0.3420640528202057, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 780 + }, + { + "epoch": 0.6647034076567101, + "grad_norm": 0.5734959244728088, + "learning_rate": 0.0002, + "loss": 1.8353, + "step": 790 + }, + { + "epoch": 0.6731173748422381, + "grad_norm": 0.36440837383270264, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 800 + }, + { + "epoch": 0.6815313420277661, + "grad_norm": 0.3179708421230316, + "learning_rate": 0.0002, + "loss": 1.7532, + "step": 810 + }, + { + "epoch": 0.6899453092132941, + "grad_norm": 0.34122881293296814, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 820 + }, + { + "epoch": 0.698359276398822, + "grad_norm": 0.31886112689971924, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 830 + }, + { + "epoch": 0.70677324358435, + "grad_norm": 0.31782326102256775, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 840 + }, + { + "epoch": 0.715187210769878, + "grad_norm": 0.36052989959716797, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 850 + }, + { + "epoch": 0.723601177955406, + "grad_norm": 0.28946155309677124, + "learning_rate": 0.0002, + "loss": 1.7891, + "step": 860 + }, + { + "epoch": 0.7320151451409339, + "grad_norm": 0.3095663785934448, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 870 + }, + { + "epoch": 0.7404291123264619, + "grad_norm": 0.3317491412162781, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 880 + }, + { + "epoch": 0.7488430795119899, + "grad_norm": 0.31324660778045654, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 890 + }, + { + "epoch": 0.7572570466975179, + "grad_norm": 0.3290475606918335, + "learning_rate": 0.0002, + "loss": 1.8753, + "step": 900 + }, + { + "epoch": 0.7656710138830458, + "grad_norm": 0.35690343379974365, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 910 + }, + { + "epoch": 0.7740849810685738, + "grad_norm": 0.39558273553848267, + "learning_rate": 0.0002, + "loss": 1.826, + "step": 920 + }, + { + "epoch": 0.7824989482541018, + "grad_norm": 0.34254348278045654, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 930 + }, + { + "epoch": 0.7909129154396298, + "grad_norm": 0.3560165464878082, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 940 + }, + { + "epoch": 0.7993268826251577, + "grad_norm": 0.30693164467811584, + "learning_rate": 0.0002, + "loss": 1.7992, + "step": 950 + }, + { + "epoch": 0.8077408498106857, + "grad_norm": 0.3394823372364044, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 960 + }, + { + "epoch": 0.8161548169962137, + "grad_norm": 0.3741514980792999, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 970 + }, + { + "epoch": 0.8245687841817417, + "grad_norm": 0.3655228316783905, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 980 + }, + { + "epoch": 0.8329827513672696, + "grad_norm": 0.3586033880710602, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 990 + }, + { + "epoch": 0.8413967185527976, + "grad_norm": 0.3459678888320923, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1000 + }, + { + "epoch": 0.8498106857383256, + "grad_norm": 0.3184349834918976, + "learning_rate": 0.0002, + "loss": 1.8498, + "step": 1010 + }, + { + "epoch": 0.8582246529238536, + "grad_norm": 0.3099786043167114, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 1020 + }, + { + "epoch": 0.8666386201093815, + "grad_norm": 0.30300915241241455, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 1030 + }, + { + "epoch": 0.8750525872949095, + "grad_norm": 0.3128705620765686, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 1040 + }, + { + "epoch": 0.8834665544804375, + "grad_norm": 0.3336263597011566, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1050 + }, + { + "epoch": 0.8918805216659655, + "grad_norm": 0.3801328241825104, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 1060 + }, + { + "epoch": 0.9002944888514934, + "grad_norm": 0.3122096359729767, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 1070 + }, + { + "epoch": 0.9087084560370214, + "grad_norm": 0.35990869998931885, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 1080 + }, + { + "epoch": 0.9171224232225494, + "grad_norm": 0.3321819305419922, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1090 + }, + { + "epoch": 0.9255363904080774, + "grad_norm": 0.4202139377593994, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1100 + }, + { + "epoch": 0.9339503575936053, + "grad_norm": 0.32559722661972046, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 1110 + }, + { + "epoch": 0.9423643247791333, + "grad_norm": 0.3098459839820862, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 1120 + }, + { + "epoch": 0.9507782919646613, + "grad_norm": 0.33917108178138733, + "learning_rate": 0.0002, + "loss": 1.8252, + "step": 1130 + }, + { + "epoch": 0.9591922591501894, + "grad_norm": 0.4055837094783783, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1140 + }, + { + "epoch": 0.9676062263357172, + "grad_norm": 0.32508623600006104, + "learning_rate": 0.0002, + "loss": 1.8259, + "step": 1150 + }, + { + "epoch": 0.9760201935212452, + "grad_norm": 0.30150601267814636, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9844341607067733, + "grad_norm": 0.3042563199996948, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 1170 + }, + { + "epoch": 0.9928481278923013, + "grad_norm": 0.33254584670066833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1180 + }, + { + "epoch": 0.9995793016407236, + "eval_loss": 1.8077726364135742, + "eval_runtime": 38.4359, + "eval_samples_per_second": 13.399, + "eval_steps_per_second": 1.691, + "step": 1188 + }, + { + "epoch": 1.0012620950778293, + "grad_norm": 0.35073035955429077, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 1190 + }, + { + "epoch": 1.0096760622633572, + "grad_norm": 0.3217269778251648, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1200 + }, + { + "epoch": 1.018090029448885, + "grad_norm": 0.3635033369064331, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1210 + }, + { + "epoch": 1.0265039966344132, + "grad_norm": 0.32468414306640625, + "learning_rate": 0.0002, + "loss": 1.6949, + "step": 1220 + }, + { + "epoch": 1.034917963819941, + "grad_norm": 0.3307163417339325, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 1230 + }, + { + "epoch": 1.0433319310054692, + "grad_norm": 0.34381359815597534, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 1240 + }, + { + "epoch": 1.051745898190997, + "grad_norm": 0.35874804854393005, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 1250 + }, + { + "epoch": 1.060159865376525, + "grad_norm": 0.3615919351577759, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1260 + }, + { + "epoch": 1.068573832562053, + "grad_norm": 0.32835808396339417, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1270 + }, + { + "epoch": 1.076987799747581, + "grad_norm": 0.3876388370990753, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1280 + }, + { + "epoch": 1.0854017669331089, + "grad_norm": 0.39895930886268616, + "learning_rate": 0.0002, + "loss": 1.7442, + "step": 1290 + }, + { + "epoch": 1.093815734118637, + "grad_norm": 0.39081698656082153, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 1300 + }, + { + "epoch": 1.1022297013041649, + "grad_norm": 0.39974215626716614, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1310 + }, + { + "epoch": 1.110643668489693, + "grad_norm": 0.3887332081794739, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1320 + }, + { + "epoch": 1.1190576356752209, + "grad_norm": 0.36216408014297485, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1330 + }, + { + "epoch": 1.1274716028607488, + "grad_norm": 0.36979028582572937, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1340 + }, + { + "epoch": 1.1358855700462769, + "grad_norm": 0.34052133560180664, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 1350 + }, + { + "epoch": 1.1442995372318048, + "grad_norm": 0.3467716574668884, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1360 + }, + { + "epoch": 1.1527135044173327, + "grad_norm": 0.35528799891471863, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1370 + }, + { + "epoch": 1.1611274716028608, + "grad_norm": 0.36282262206077576, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 1380 + }, + { + "epoch": 1.1695414387883887, + "grad_norm": 0.37355899810791016, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1390 + }, + { + "epoch": 1.1779554059739168, + "grad_norm": 0.37292736768722534, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1400 + }, + { + "epoch": 1.1863693731594447, + "grad_norm": 0.5892812013626099, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1410 + }, + { + "epoch": 1.1947833403449726, + "grad_norm": 0.3712292015552521, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 1420 + }, + { + "epoch": 1.2031973075305007, + "grad_norm": 0.3349577486515045, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1430 + }, + { + "epoch": 1.2116112747160286, + "grad_norm": 0.32591062784194946, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1440 + }, + { + "epoch": 1.2200252419015567, + "grad_norm": 0.3840635418891907, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 1450 + }, + { + "epoch": 1.2284392090870846, + "grad_norm": 0.37238365411758423, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1460 + }, + { + "epoch": 1.2368531762726125, + "grad_norm": 0.3731217682361603, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 1470 + }, + { + "epoch": 1.2452671434581406, + "grad_norm": 0.3318967819213867, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 1480 + }, + { + "epoch": 1.2536811106436685, + "grad_norm": 0.3784034848213196, + "learning_rate": 0.0002, + "loss": 1.7463, + "step": 1490 + }, + { + "epoch": 1.2620950778291964, + "grad_norm": 0.3541383147239685, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 1500 + }, + { + "epoch": 1.2705090450147245, + "grad_norm": 0.35312485694885254, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1510 + }, + { + "epoch": 1.2789230122002524, + "grad_norm": 0.35272929072380066, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1520 + }, + { + "epoch": 1.2873369793857803, + "grad_norm": 0.40988272428512573, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1530 + }, + { + "epoch": 1.2957509465713084, + "grad_norm": 0.3543946146965027, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1540 + }, + { + "epoch": 1.3041649137568363, + "grad_norm": 0.35639145970344543, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1550 + }, + { + "epoch": 1.3125788809423642, + "grad_norm": 0.3290826678276062, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1560 + }, + { + "epoch": 1.3209928481278923, + "grad_norm": 0.39264336228370667, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 1570 + }, + { + "epoch": 1.3294068153134202, + "grad_norm": 0.5390415191650391, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1580 + }, + { + "epoch": 1.3378207824989483, + "grad_norm": 0.5188116431236267, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1590 + }, + { + "epoch": 1.3462347496844762, + "grad_norm": 0.37445148825645447, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 1600 + }, + { + "epoch": 1.3546487168700043, + "grad_norm": 0.3296085298061371, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1610 + }, + { + "epoch": 1.3630626840555322, + "grad_norm": 0.39879581332206726, + "learning_rate": 0.0002, + "loss": 1.8107, + "step": 1620 + }, + { + "epoch": 1.37147665124106, + "grad_norm": 0.36092764139175415, + "learning_rate": 0.0002, + "loss": 1.6744, + "step": 1630 + }, + { + "epoch": 1.3798906184265882, + "grad_norm": 0.37011823058128357, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 1640 + }, + { + "epoch": 1.3883045856121161, + "grad_norm": 0.40863534808158875, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1650 + }, + { + "epoch": 1.396718552797644, + "grad_norm": 0.337001770734787, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 1660 + }, + { + "epoch": 1.4051325199831721, + "grad_norm": 0.35596707463264465, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1670 + }, + { + "epoch": 1.4135464871687, + "grad_norm": 0.3857671916484833, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 1680 + }, + { + "epoch": 1.421960454354228, + "grad_norm": 0.419502317905426, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 1690 + }, + { + "epoch": 1.430374421539756, + "grad_norm": 0.35459452867507935, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 1700 + }, + { + "epoch": 1.438788388725284, + "grad_norm": 0.37246978282928467, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 1710 + }, + { + "epoch": 1.4472023559108118, + "grad_norm": 0.33091893792152405, + "learning_rate": 0.0002, + "loss": 1.6762, + "step": 1720 + }, + { + "epoch": 1.45561632309634, + "grad_norm": 0.37029674649238586, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 1730 + }, + { + "epoch": 1.4640302902818678, + "grad_norm": 0.374025821685791, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1740 + }, + { + "epoch": 1.472444257467396, + "grad_norm": 0.3416315019130707, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1750 + }, + { + "epoch": 1.4808582246529238, + "grad_norm": 0.36502841114997864, + "learning_rate": 0.0002, + "loss": 1.7093, + "step": 1760 + }, + { + "epoch": 1.489272191838452, + "grad_norm": 0.35458803176879883, + "learning_rate": 0.0002, + "loss": 1.6597, + "step": 1770 + }, + { + "epoch": 1.4976861590239798, + "grad_norm": 0.4462839663028717, + "learning_rate": 0.0002, + "loss": 1.675, + "step": 1780 + }, + { + "epoch": 1.5061001262095077, + "grad_norm": 0.34836092591285706, + "learning_rate": 0.0002, + "loss": 1.7267, + "step": 1790 + }, + { + "epoch": 1.5145140933950358, + "grad_norm": 0.3445749282836914, + "learning_rate": 0.0002, + "loss": 1.7295, + "step": 1800 + }, + { + "epoch": 1.5229280605805637, + "grad_norm": 0.36012160778045654, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 1810 + }, + { + "epoch": 1.5313420277660916, + "grad_norm": 0.4052616059780121, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 1820 + }, + { + "epoch": 1.5397559949516197, + "grad_norm": 0.3966905474662781, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1830 + }, + { + "epoch": 1.5481699621371476, + "grad_norm": 0.35028719902038574, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 1840 + }, + { + "epoch": 1.5565839293226755, + "grad_norm": 0.3936742842197418, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1850 + }, + { + "epoch": 1.5649978965082036, + "grad_norm": 0.34473296999931335, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 1860 + }, + { + "epoch": 1.5734118636937318, + "grad_norm": 0.4328365623950958, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1870 + }, + { + "epoch": 1.5818258308792594, + "grad_norm": 0.3566315472126007, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1880 + }, + { + "epoch": 1.5902397980647875, + "grad_norm": 0.3301256597042084, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 1890 + }, + { + "epoch": 1.5986537652503157, + "grad_norm": 0.3743041455745697, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 1900 + }, + { + "epoch": 1.6070677324358436, + "grad_norm": 0.3735344707965851, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 1910 + }, + { + "epoch": 1.6154816996213714, + "grad_norm": 0.42191144824028015, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1920 + }, + { + "epoch": 1.6238956668068996, + "grad_norm": 0.3787207305431366, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1930 + }, + { + "epoch": 1.6323096339924275, + "grad_norm": 0.35647350549697876, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1940 + }, + { + "epoch": 1.6407236011779553, + "grad_norm": 0.39791446924209595, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 1950 + }, + { + "epoch": 1.6491375683634835, + "grad_norm": 0.37341275811195374, + "learning_rate": 0.0002, + "loss": 1.7293, + "step": 1960 + }, + { + "epoch": 1.6575515355490114, + "grad_norm": 0.3722686469554901, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1970 + }, + { + "epoch": 1.6659655027345392, + "grad_norm": 0.37467387318611145, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 1980 + }, + { + "epoch": 1.6743794699200674, + "grad_norm": 0.37109461426734924, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 1990 + }, + { + "epoch": 1.6827934371055953, + "grad_norm": 0.4008837044239044, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 2000 + }, + { + "epoch": 1.6912074042911232, + "grad_norm": 0.3316999673843384, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2010 + }, + { + "epoch": 1.6996213714766513, + "grad_norm": 0.3683805465698242, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 2020 + }, + { + "epoch": 1.7080353386621794, + "grad_norm": 0.4163658320903778, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2030 + }, + { + "epoch": 1.716449305847707, + "grad_norm": 0.4245431125164032, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 2040 + }, + { + "epoch": 1.7248632730332352, + "grad_norm": 0.36732038855552673, + "learning_rate": 0.0002, + "loss": 1.7184, + "step": 2050 + }, + { + "epoch": 1.7332772402187633, + "grad_norm": 0.34981656074523926, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 2060 + }, + { + "epoch": 1.7416912074042912, + "grad_norm": 0.38588812947273254, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 2070 + }, + { + "epoch": 1.750105174589819, + "grad_norm": 0.39914557337760925, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 2080 + }, + { + "epoch": 1.7585191417753472, + "grad_norm": 0.36068692803382874, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 2090 + }, + { + "epoch": 1.766933108960875, + "grad_norm": 0.3983287215232849, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 2100 + }, + { + "epoch": 1.775347076146403, + "grad_norm": 0.45008400082588196, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2110 + }, + { + "epoch": 1.783761043331931, + "grad_norm": 0.3618052303791046, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2120 + }, + { + "epoch": 1.792175010517459, + "grad_norm": 0.38745400309562683, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 2130 + }, + { + "epoch": 1.8005889777029869, + "grad_norm": 0.3413826525211334, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 2140 + }, + { + "epoch": 1.809002944888515, + "grad_norm": 0.35983747243881226, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 2150 + }, + { + "epoch": 1.8174169120740429, + "grad_norm": 0.40926849842071533, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 2160 + }, + { + "epoch": 1.8258308792595708, + "grad_norm": 0.3543093800544739, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2170 + }, + { + "epoch": 1.8342448464450989, + "grad_norm": 0.42690935730934143, + "learning_rate": 0.0002, + "loss": 1.7812, + "step": 2180 + }, + { + "epoch": 1.842658813630627, + "grad_norm": 0.40282756090164185, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 2190 + }, + { + "epoch": 1.8510727808161547, + "grad_norm": 0.36568400263786316, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 2200 + }, + { + "epoch": 1.8594867480016828, + "grad_norm": 0.43159013986587524, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 2210 + }, + { + "epoch": 1.867900715187211, + "grad_norm": 0.3554118573665619, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 2220 + }, + { + "epoch": 1.8763146823727388, + "grad_norm": 0.43349072337150574, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 2230 + }, + { + "epoch": 1.8847286495582667, + "grad_norm": 0.36486536264419556, + "learning_rate": 0.0002, + "loss": 1.7302, + "step": 2240 + }, + { + "epoch": 1.8931426167437948, + "grad_norm": 0.39260047674179077, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 2250 + }, + { + "epoch": 1.9015565839293227, + "grad_norm": 0.3741776943206787, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 2260 + }, + { + "epoch": 1.9099705511148506, + "grad_norm": 0.3961946964263916, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2270 + }, + { + "epoch": 1.9183845183003787, + "grad_norm": 0.3659731149673462, + "learning_rate": 0.0002, + "loss": 1.737, + "step": 2280 + }, + { + "epoch": 1.9267984854859066, + "grad_norm": 0.34744107723236084, + "learning_rate": 0.0002, + "loss": 1.7342, + "step": 2290 + }, + { + "epoch": 1.9352124526714345, + "grad_norm": 0.3607442378997803, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2300 + }, + { + "epoch": 1.9436264198569626, + "grad_norm": 0.331464558839798, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 2310 + }, + { + "epoch": 1.9520403870424905, + "grad_norm": 0.3904414474964142, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2320 + }, + { + "epoch": 1.9604543542280184, + "grad_norm": 0.37584832310676575, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 2330 + }, + { + "epoch": 1.9688683214135465, + "grad_norm": 0.3698684275150299, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 2340 + }, + { + "epoch": 1.9772822885990746, + "grad_norm": 0.40571412444114685, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2350 + }, + { + "epoch": 1.9856962557846023, + "grad_norm": 0.40059587359428406, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 2360 + }, + { + "epoch": 1.9941102229701304, + "grad_norm": 0.4168248474597931, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2370 + }, + { + "epoch": 2.0, + "eval_loss": 1.8055059909820557, + "eval_runtime": 38.422, + "eval_samples_per_second": 13.404, + "eval_steps_per_second": 1.692, + "step": 2377 + }, + { + "epoch": 2.0025241901556585, + "grad_norm": 0.35205352306365967, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 2380 + }, + { + "epoch": 2.010938157341186, + "grad_norm": 0.3979377746582031, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2390 + }, + { + "epoch": 2.0193521245267143, + "grad_norm": 0.396491676568985, + "learning_rate": 0.0002, + "loss": 1.6421, + "step": 2400 + }, + { + "epoch": 2.0277660917122424, + "grad_norm": 0.44712209701538086, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 2410 + }, + { + "epoch": 2.03618005889777, + "grad_norm": 0.4454420208930969, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 2420 + }, + { + "epoch": 2.044594026083298, + "grad_norm": 0.4170038402080536, + "learning_rate": 0.0002, + "loss": 1.6635, + "step": 2430 + }, + { + "epoch": 2.0530079932688263, + "grad_norm": 0.4309595227241516, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 2440 + }, + { + "epoch": 2.0614219604543544, + "grad_norm": 0.4241602122783661, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 2450 + }, + { + "epoch": 2.069835927639882, + "grad_norm": 0.4370540678501129, + "learning_rate": 0.0002, + "loss": 1.6162, + "step": 2460 + }, + { + "epoch": 2.0782498948254102, + "grad_norm": 0.43985554575920105, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 2470 + }, + { + "epoch": 2.0866638620109383, + "grad_norm": 0.4158105254173279, + "learning_rate": 0.0002, + "loss": 1.6954, + "step": 2480 + }, + { + "epoch": 2.095077829196466, + "grad_norm": 0.441549152135849, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 2490 + }, + { + "epoch": 2.103491796381994, + "grad_norm": 0.385718435049057, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 2500 + }, + { + "epoch": 2.1119057635675222, + "grad_norm": 0.43146514892578125, + "learning_rate": 0.0002, + "loss": 1.5894, + "step": 2510 + }, + { + "epoch": 2.12031973075305, + "grad_norm": 0.41663315892219543, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 2520 + }, + { + "epoch": 2.128733697938578, + "grad_norm": 0.4410698115825653, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2530 + }, + { + "epoch": 2.137147665124106, + "grad_norm": 0.4472278952598572, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 2540 + }, + { + "epoch": 2.145561632309634, + "grad_norm": 0.3879167437553406, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 2550 + }, + { + "epoch": 2.153975599495162, + "grad_norm": 0.4212203025817871, + "learning_rate": 0.0002, + "loss": 1.6682, + "step": 2560 + }, + { + "epoch": 2.16238956668069, + "grad_norm": 0.42841723561286926, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2570 + }, + { + "epoch": 2.1708035338662177, + "grad_norm": 0.39272481203079224, + "learning_rate": 0.0002, + "loss": 1.5962, + "step": 2580 + }, + { + "epoch": 2.179217501051746, + "grad_norm": 0.4075261354446411, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 2590 + }, + { + "epoch": 2.187631468237274, + "grad_norm": 0.5358437895774841, + "learning_rate": 0.0002, + "loss": 1.6601, + "step": 2600 + }, + { + "epoch": 2.1960454354228016, + "grad_norm": 0.4738350212574005, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 2610 + }, + { + "epoch": 2.2044594026083297, + "grad_norm": 0.446789026260376, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2620 + }, + { + "epoch": 2.212873369793858, + "grad_norm": 0.4615374505519867, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 2630 + }, + { + "epoch": 2.221287336979386, + "grad_norm": 0.46901994943618774, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 2640 + }, + { + "epoch": 2.2297013041649136, + "grad_norm": 0.46267789602279663, + "learning_rate": 0.0002, + "loss": 1.6774, + "step": 2650 + }, + { + "epoch": 2.2381152713504417, + "grad_norm": 0.4383080005645752, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 2660 + }, + { + "epoch": 2.24652923853597, + "grad_norm": 0.4070609509944916, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 2670 + }, + { + "epoch": 2.2549432057214975, + "grad_norm": 0.4572339951992035, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 2680 + }, + { + "epoch": 2.2633571729070256, + "grad_norm": 0.393265038728714, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 2690 + }, + { + "epoch": 2.2717711400925538, + "grad_norm": 0.46144717931747437, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 2700 + }, + { + "epoch": 2.2801851072780814, + "grad_norm": 0.45077767968177795, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 2710 + }, + { + "epoch": 2.2885990744636096, + "grad_norm": 0.5697639584541321, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 2720 + }, + { + "epoch": 2.2970130416491377, + "grad_norm": 0.4855510890483856, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 2730 + }, + { + "epoch": 2.3054270088346653, + "grad_norm": 0.4440622627735138, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 2740 + }, + { + "epoch": 2.3138409760201935, + "grad_norm": 0.3904096782207489, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 2750 + }, + { + "epoch": 2.3222549432057216, + "grad_norm": 0.5225510597229004, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2760 + }, + { + "epoch": 2.3306689103912497, + "grad_norm": 0.44866397976875305, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 2770 + }, + { + "epoch": 2.3390828775767774, + "grad_norm": 0.5167056322097778, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 2780 + }, + { + "epoch": 2.3474968447623055, + "grad_norm": 0.45913267135620117, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 2790 + }, + { + "epoch": 2.3559108119478336, + "grad_norm": 0.45787590742111206, + "learning_rate": 0.0002, + "loss": 1.6564, + "step": 2800 + }, + { + "epoch": 2.3643247791333613, + "grad_norm": 0.4633352756500244, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 2810 + }, + { + "epoch": 2.3727387463188894, + "grad_norm": 0.46390071511268616, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 2820 + }, + { + "epoch": 2.3811527135044175, + "grad_norm": 0.4261005222797394, + "learning_rate": 0.0002, + "loss": 1.6039, + "step": 2830 + }, + { + "epoch": 2.389566680689945, + "grad_norm": 0.4283634424209595, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 2840 + }, + { + "epoch": 2.3979806478754733, + "grad_norm": 0.4955291450023651, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 2850 + }, + { + "epoch": 2.4063946150610014, + "grad_norm": 0.4740189015865326, + "learning_rate": 0.0002, + "loss": 1.6173, + "step": 2860 + }, + { + "epoch": 2.414808582246529, + "grad_norm": 0.4222276508808136, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2870 + }, + { + "epoch": 2.423222549432057, + "grad_norm": 0.4982149004936218, + "learning_rate": 0.0002, + "loss": 1.5602, + "step": 2880 + }, + { + "epoch": 2.4316365166175853, + "grad_norm": 0.5217409133911133, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2890 + }, + { + "epoch": 2.4400504838031134, + "grad_norm": 0.4555884897708893, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 2900 + }, + { + "epoch": 2.448464450988641, + "grad_norm": 0.43178579211235046, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2910 + }, + { + "epoch": 2.456878418174169, + "grad_norm": 0.4788478910923004, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2920 + }, + { + "epoch": 2.465292385359697, + "grad_norm": 0.43689873814582825, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2930 + }, + { + "epoch": 2.473706352545225, + "grad_norm": 0.5115197896957397, + "learning_rate": 0.0002, + "loss": 1.6196, + "step": 2940 + }, + { + "epoch": 2.482120319730753, + "grad_norm": 0.5290159583091736, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 2950 + }, + { + "epoch": 2.490534286916281, + "grad_norm": 0.46042463183403015, + "learning_rate": 0.0002, + "loss": 1.6499, + "step": 2960 + }, + { + "epoch": 2.498948254101809, + "grad_norm": 0.4359915852546692, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 2970 + }, + { + "epoch": 2.507362221287337, + "grad_norm": 0.46352964639663696, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 2980 + }, + { + "epoch": 2.515776188472865, + "grad_norm": 0.5324268341064453, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2990 + }, + { + "epoch": 2.5241901556583928, + "grad_norm": 0.5929607152938843, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 3000 + }, + { + "epoch": 2.532604122843921, + "grad_norm": 0.4811333417892456, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 3010 + }, + { + "epoch": 2.541018090029449, + "grad_norm": 0.4662701487541199, + "learning_rate": 0.0002, + "loss": 1.7023, + "step": 3020 + }, + { + "epoch": 2.549432057214977, + "grad_norm": 0.4582270681858063, + "learning_rate": 0.0002, + "loss": 1.5426, + "step": 3030 + }, + { + "epoch": 2.557846024400505, + "grad_norm": 0.4679982662200928, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 3040 + }, + { + "epoch": 2.566259991586033, + "grad_norm": 0.4380294680595398, + "learning_rate": 0.0002, + "loss": 1.5442, + "step": 3050 + }, + { + "epoch": 2.5746739587715606, + "grad_norm": 0.44295763969421387, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 3060 + }, + { + "epoch": 2.5830879259570887, + "grad_norm": 0.5131027698516846, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3070 + }, + { + "epoch": 2.591501893142617, + "grad_norm": 0.47567516565322876, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 3080 + }, + { + "epoch": 2.599915860328145, + "grad_norm": 0.49002596735954285, + "learning_rate": 0.0002, + "loss": 1.5671, + "step": 3090 + }, + { + "epoch": 2.6083298275136726, + "grad_norm": 0.44856327772140503, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 3100 + }, + { + "epoch": 2.6167437946992007, + "grad_norm": 0.4480142593383789, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 3110 + }, + { + "epoch": 2.6251577618847284, + "grad_norm": 0.4317494034767151, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 3120 + }, + { + "epoch": 2.6335717290702565, + "grad_norm": 0.42580848932266235, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 3130 + }, + { + "epoch": 2.6419856962557846, + "grad_norm": 0.4516814947128296, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 3140 + }, + { + "epoch": 2.6503996634413127, + "grad_norm": 0.4438435733318329, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 3150 + }, + { + "epoch": 2.6588136306268404, + "grad_norm": 0.4385356307029724, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 3160 + }, + { + "epoch": 2.6672275978123685, + "grad_norm": 0.5064112544059753, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 3170 + }, + { + "epoch": 2.6756415649978966, + "grad_norm": 0.49163177609443665, + "learning_rate": 0.0002, + "loss": 1.7189, + "step": 3180 + }, + { + "epoch": 2.6840555321834243, + "grad_norm": 0.49339258670806885, + "learning_rate": 0.0002, + "loss": 1.7323, + "step": 3190 + }, + { + "epoch": 2.6924694993689524, + "grad_norm": 0.440950870513916, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 3200 + }, + { + "epoch": 2.7008834665544805, + "grad_norm": 0.4283970594406128, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 3210 + }, + { + "epoch": 2.7092974337400086, + "grad_norm": 0.43875712156295776, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 3220 + }, + { + "epoch": 2.7177114009255363, + "grad_norm": 0.49332964420318604, + "learning_rate": 0.0002, + "loss": 1.6129, + "step": 3230 + }, + { + "epoch": 2.7261253681110644, + "grad_norm": 0.5225692391395569, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 3240 + }, + { + "epoch": 2.734539335296592, + "grad_norm": 0.4856489300727844, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 3250 + }, + { + "epoch": 2.74295330248212, + "grad_norm": 0.46918296813964844, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 3260 + }, + { + "epoch": 2.7513672696676483, + "grad_norm": 0.4802931249141693, + "learning_rate": 0.0002, + "loss": 1.6819, + "step": 3270 + }, + { + "epoch": 2.7597812368531764, + "grad_norm": 0.4485355615615845, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 3280 + }, + { + "epoch": 2.768195204038704, + "grad_norm": 0.43944594264030457, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 3290 + }, + { + "epoch": 2.7766091712242322, + "grad_norm": 0.46847742795944214, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 3300 + }, + { + "epoch": 2.7850231384097603, + "grad_norm": 0.4816027879714966, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 3310 + }, + { + "epoch": 2.793437105595288, + "grad_norm": 0.453960120677948, + "learning_rate": 0.0002, + "loss": 1.6293, + "step": 3320 + }, + { + "epoch": 2.801851072780816, + "grad_norm": 0.4816017150878906, + "learning_rate": 0.0002, + "loss": 1.6429, + "step": 3330 + }, + { + "epoch": 2.8102650399663442, + "grad_norm": 0.4461034834384918, + "learning_rate": 0.0002, + "loss": 1.6683, + "step": 3340 + }, + { + "epoch": 2.8186790071518724, + "grad_norm": 0.48821821808815, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 3350 + }, + { + "epoch": 2.8270929743374, + "grad_norm": 0.4574853777885437, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3360 + }, + { + "epoch": 2.835506941522928, + "grad_norm": 0.42062026262283325, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 3370 + }, + { + "epoch": 2.843920908708456, + "grad_norm": 0.4499834477901459, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 3380 + }, + { + "epoch": 2.852334875893984, + "grad_norm": 0.4780360758304596, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 3390 + }, + { + "epoch": 2.860748843079512, + "grad_norm": 0.45422887802124023, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 3400 + }, + { + "epoch": 2.86916281026504, + "grad_norm": 0.4590015709400177, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 3410 + }, + { + "epoch": 2.877576777450568, + "grad_norm": 0.45689624547958374, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 3420 + }, + { + "epoch": 2.885990744636096, + "grad_norm": 0.46953922510147095, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3430 + }, + { + "epoch": 2.8944047118216236, + "grad_norm": 0.4791966378688812, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 3440 + }, + { + "epoch": 2.9028186790071517, + "grad_norm": 0.4842296242713928, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 3450 + }, + { + "epoch": 2.91123264619268, + "grad_norm": 0.47219768166542053, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3460 + }, + { + "epoch": 2.919646613378208, + "grad_norm": 0.4622127115726471, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 3470 + }, + { + "epoch": 2.9280605805637356, + "grad_norm": 0.46832820773124695, + "learning_rate": 0.0002, + "loss": 1.6485, + "step": 3480 + }, + { + "epoch": 2.9364745477492638, + "grad_norm": 0.44582483172416687, + "learning_rate": 0.0002, + "loss": 1.6366, + "step": 3490 + }, + { + "epoch": 2.944888514934792, + "grad_norm": 0.4987219274044037, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 3500 + }, + { + "epoch": 2.9533024821203195, + "grad_norm": 0.43750956654548645, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 3510 + }, + { + "epoch": 2.9617164493058477, + "grad_norm": 0.49962925910949707, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 3520 + }, + { + "epoch": 2.9701304164913758, + "grad_norm": 0.5189590454101562, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 3530 + }, + { + "epoch": 2.978544383676904, + "grad_norm": 0.391317754983902, + "learning_rate": 0.0002, + "loss": 1.6688, + "step": 3540 + }, + { + "epoch": 2.9869583508624316, + "grad_norm": 0.44934695959091187, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 3550 + }, + { + "epoch": 2.9953723180479597, + "grad_norm": 0.4740142226219177, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 3560 + }, + { + "epoch": 2.9995793016407237, + "eval_loss": 1.8266887664794922, + "eval_runtime": 37.9445, + "eval_samples_per_second": 13.572, + "eval_steps_per_second": 1.713, + "step": 3565 + }, + { + "epoch": 3.003786285233488, + "grad_norm": 0.4523724615573883, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 3570 + }, + { + "epoch": 3.0122002524190155, + "grad_norm": 0.5261380076408386, + "learning_rate": 0.0002, + "loss": 1.526, + "step": 3580 + }, + { + "epoch": 3.0206142196045436, + "grad_norm": 0.48664888739585876, + "learning_rate": 0.0002, + "loss": 1.4946, + "step": 3590 + }, + { + "epoch": 3.0290281867900717, + "grad_norm": 0.5070882439613342, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 3600 + }, + { + "epoch": 3.0374421539755994, + "grad_norm": 0.5816011428833008, + "learning_rate": 0.0002, + "loss": 1.5316, + "step": 3610 + }, + { + "epoch": 3.0458561211611275, + "grad_norm": 0.6610211730003357, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 3620 + }, + { + "epoch": 3.0542700883466556, + "grad_norm": 0.5257703065872192, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 3630 + }, + { + "epoch": 3.0626840555321833, + "grad_norm": 0.5574390888214111, + "learning_rate": 0.0002, + "loss": 1.4438, + "step": 3640 + }, + { + "epoch": 3.0710980227177114, + "grad_norm": 0.5682297348976135, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3650 + }, + { + "epoch": 3.0795119899032395, + "grad_norm": 0.5798383355140686, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 3660 + }, + { + "epoch": 3.087925957088767, + "grad_norm": 0.5458289980888367, + "learning_rate": 0.0002, + "loss": 1.4339, + "step": 3670 + }, + { + "epoch": 3.0963399242742953, + "grad_norm": 0.5599102973937988, + "learning_rate": 0.0002, + "loss": 1.46, + "step": 3680 + }, + { + "epoch": 3.1047538914598234, + "grad_norm": 0.5023021697998047, + "learning_rate": 0.0002, + "loss": 1.4589, + "step": 3690 + }, + { + "epoch": 3.113167858645351, + "grad_norm": 0.5448206067085266, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 3700 + }, + { + "epoch": 3.121581825830879, + "grad_norm": 0.5760458707809448, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 3710 + }, + { + "epoch": 3.1299957930164073, + "grad_norm": 0.6018968224525452, + "learning_rate": 0.0002, + "loss": 1.4789, + "step": 3720 + }, + { + "epoch": 3.1384097602019354, + "grad_norm": 0.5767101049423218, + "learning_rate": 0.0002, + "loss": 1.5518, + "step": 3730 + }, + { + "epoch": 3.146823727387463, + "grad_norm": 0.5333963632583618, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 3740 + }, + { + "epoch": 3.155237694572991, + "grad_norm": 0.5918396711349487, + "learning_rate": 0.0002, + "loss": 1.4812, + "step": 3750 + }, + { + "epoch": 3.1636516617585193, + "grad_norm": 0.5931203365325928, + "learning_rate": 0.0002, + "loss": 1.4618, + "step": 3760 + }, + { + "epoch": 3.172065628944047, + "grad_norm": 0.6562168598175049, + "learning_rate": 0.0002, + "loss": 1.5592, + "step": 3770 + }, + { + "epoch": 3.180479596129575, + "grad_norm": 0.5820156335830688, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 3780 + }, + { + "epoch": 3.188893563315103, + "grad_norm": 0.5784737467765808, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 3790 + }, + { + "epoch": 3.197307530500631, + "grad_norm": 0.5506529808044434, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 3800 + }, + { + "epoch": 3.205721497686159, + "grad_norm": 0.6101595163345337, + "learning_rate": 0.0002, + "loss": 1.4819, + "step": 3810 + }, + { + "epoch": 3.214135464871687, + "grad_norm": 0.5597806572914124, + "learning_rate": 0.0002, + "loss": 1.5185, + "step": 3820 + }, + { + "epoch": 3.222549432057215, + "grad_norm": 0.5641011595726013, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 3830 + }, + { + "epoch": 3.230963399242743, + "grad_norm": 0.5892080068588257, + "learning_rate": 0.0002, + "loss": 1.4702, + "step": 3840 + }, + { + "epoch": 3.239377366428271, + "grad_norm": 0.6034760475158691, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 3850 + }, + { + "epoch": 3.247791333613799, + "grad_norm": 0.5112439393997192, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 3860 + }, + { + "epoch": 3.256205300799327, + "grad_norm": 0.56565922498703, + "learning_rate": 0.0002, + "loss": 1.5132, + "step": 3870 + }, + { + "epoch": 3.264619267984855, + "grad_norm": 0.6155247092247009, + "learning_rate": 0.0002, + "loss": 1.4892, + "step": 3880 + }, + { + "epoch": 3.273033235170383, + "grad_norm": 0.6064623594284058, + "learning_rate": 0.0002, + "loss": 1.5118, + "step": 3890 + }, + { + "epoch": 3.2814472023559107, + "grad_norm": 0.6313768029212952, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 3900 + }, + { + "epoch": 3.289861169541439, + "grad_norm": 0.5903939008712769, + "learning_rate": 0.0002, + "loss": 1.5551, + "step": 3910 + }, + { + "epoch": 3.298275136726967, + "grad_norm": 0.5770667195320129, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 3920 + }, + { + "epoch": 3.3066891039124946, + "grad_norm": 0.5785196423530579, + "learning_rate": 0.0002, + "loss": 1.5159, + "step": 3930 + }, + { + "epoch": 3.3151030710980227, + "grad_norm": 0.6468310356140137, + "learning_rate": 0.0002, + "loss": 1.5277, + "step": 3940 + }, + { + "epoch": 3.323517038283551, + "grad_norm": 0.6200279593467712, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 3950 + }, + { + "epoch": 3.3319310054690785, + "grad_norm": 0.5779302716255188, + "learning_rate": 0.0002, + "loss": 1.5264, + "step": 3960 + }, + { + "epoch": 3.3403449726546066, + "grad_norm": 0.5463796854019165, + "learning_rate": 0.0002, + "loss": 1.4861, + "step": 3970 + }, + { + "epoch": 3.3487589398401347, + "grad_norm": 0.6117855906486511, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 3980 + }, + { + "epoch": 3.357172907025663, + "grad_norm": 0.5554766058921814, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 3990 + }, + { + "epoch": 3.3655868742111905, + "grad_norm": 0.6012870073318481, + "learning_rate": 0.0002, + "loss": 1.5004, + "step": 4000 + }, + { + "epoch": 3.3740008413967186, + "grad_norm": 0.5443974137306213, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 4010 + }, + { + "epoch": 3.3824148085822463, + "grad_norm": 0.6636057496070862, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 4020 + }, + { + "epoch": 3.3908287757677744, + "grad_norm": 0.5801246166229248, + "learning_rate": 0.0002, + "loss": 1.5141, + "step": 4030 + }, + { + "epoch": 3.3992427429533025, + "grad_norm": 0.5668839812278748, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4040 + }, + { + "epoch": 3.4076567101388306, + "grad_norm": 0.7763481736183167, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 4050 + }, + { + "epoch": 3.4160706773243583, + "grad_norm": 0.6675992608070374, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 4060 + }, + { + "epoch": 3.4244846445098864, + "grad_norm": 0.6290077567100525, + "learning_rate": 0.0002, + "loss": 1.4959, + "step": 4070 + }, + { + "epoch": 3.4328986116954145, + "grad_norm": 0.6040239930152893, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 4080 + }, + { + "epoch": 3.441312578880942, + "grad_norm": 0.6237877607345581, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 4090 + }, + { + "epoch": 3.4497265460664703, + "grad_norm": 0.5343508124351501, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4100 + }, + { + "epoch": 3.4581405132519984, + "grad_norm": 0.6817412972450256, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 4110 + }, + { + "epoch": 3.466554480437526, + "grad_norm": 0.7115170359611511, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 4120 + }, + { + "epoch": 3.4749684476230542, + "grad_norm": 0.6127332448959351, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 4130 + }, + { + "epoch": 3.4833824148085824, + "grad_norm": 0.5745994448661804, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 4140 + }, + { + "epoch": 3.49179638199411, + "grad_norm": 0.6248795390129089, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4150 + }, + { + "epoch": 3.500210349179638, + "grad_norm": 0.5821124911308289, + "learning_rate": 0.0002, + "loss": 1.4885, + "step": 4160 + }, + { + "epoch": 3.5086243163651663, + "grad_norm": 0.561416506767273, + "learning_rate": 0.0002, + "loss": 1.4937, + "step": 4170 + }, + { + "epoch": 3.5170382835506944, + "grad_norm": 0.5848962664604187, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 4180 + }, + { + "epoch": 3.525452250736222, + "grad_norm": 0.5335569977760315, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 4190 + }, + { + "epoch": 3.53386621792175, + "grad_norm": 0.547964870929718, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 4200 + }, + { + "epoch": 3.542280185107278, + "grad_norm": 0.6157727241516113, + "learning_rate": 0.0002, + "loss": 1.4887, + "step": 4210 + }, + { + "epoch": 3.550694152292806, + "grad_norm": 0.6163121461868286, + "learning_rate": 0.0002, + "loss": 1.5484, + "step": 4220 + }, + { + "epoch": 3.559108119478334, + "grad_norm": 0.5844616293907166, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 4230 + }, + { + "epoch": 3.567522086663862, + "grad_norm": 0.7104926109313965, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 4240 + }, + { + "epoch": 3.57593605384939, + "grad_norm": 0.5055213570594788, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4250 + }, + { + "epoch": 3.584350021034918, + "grad_norm": 0.611676812171936, + "learning_rate": 0.0002, + "loss": 1.482, + "step": 4260 + }, + { + "epoch": 3.592763988220446, + "grad_norm": 0.6326440572738647, + "learning_rate": 0.0002, + "loss": 1.5048, + "step": 4270 + }, + { + "epoch": 3.6011779554059737, + "grad_norm": 0.6290925741195679, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 4280 + }, + { + "epoch": 3.609591922591502, + "grad_norm": 0.5691978931427002, + "learning_rate": 0.0002, + "loss": 1.5654, + "step": 4290 + }, + { + "epoch": 3.61800588977703, + "grad_norm": 0.6071329116821289, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 4300 + }, + { + "epoch": 3.626419856962558, + "grad_norm": 0.606573224067688, + "learning_rate": 0.0002, + "loss": 1.5336, + "step": 4310 + }, + { + "epoch": 3.6348338241480858, + "grad_norm": 0.5515419244766235, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 4320 + }, + { + "epoch": 3.643247791333614, + "grad_norm": 0.5964660048484802, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 4330 + }, + { + "epoch": 3.6516617585191415, + "grad_norm": 0.5774146914482117, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 4340 + }, + { + "epoch": 3.6600757257046697, + "grad_norm": 0.5732731223106384, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 4350 + }, + { + "epoch": 3.6684896928901978, + "grad_norm": 0.7354163527488708, + "learning_rate": 0.0002, + "loss": 1.5682, + "step": 4360 + }, + { + "epoch": 3.676903660075726, + "grad_norm": 0.6220902800559998, + "learning_rate": 0.0002, + "loss": 1.5225, + "step": 4370 + }, + { + "epoch": 3.6853176272612536, + "grad_norm": 0.6053991317749023, + "learning_rate": 0.0002, + "loss": 1.4838, + "step": 4380 + }, + { + "epoch": 3.6937315944467817, + "grad_norm": 0.67010897397995, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4390 + }, + { + "epoch": 3.70214556163231, + "grad_norm": 0.6139186024665833, + "learning_rate": 0.0002, + "loss": 1.5381, + "step": 4400 + }, + { + "epoch": 3.7105595288178375, + "grad_norm": 0.5433071851730347, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 4410 + }, + { + "epoch": 3.7189734960033656, + "grad_norm": 0.5453870296478271, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 4420 + }, + { + "epoch": 3.7273874631888937, + "grad_norm": 0.6401727199554443, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 4430 + }, + { + "epoch": 3.735801430374422, + "grad_norm": 0.6049367189407349, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 4440 + }, + { + "epoch": 3.7442153975599495, + "grad_norm": 0.5740529298782349, + "learning_rate": 0.0002, + "loss": 1.5268, + "step": 4450 + }, + { + "epoch": 3.7526293647454776, + "grad_norm": 0.6521880626678467, + "learning_rate": 0.0002, + "loss": 1.5183, + "step": 4460 + }, + { + "epoch": 3.7610433319310053, + "grad_norm": 0.7096368074417114, + "learning_rate": 0.0002, + "loss": 1.5741, + "step": 4470 + }, + { + "epoch": 3.7694572991165334, + "grad_norm": 0.5886474251747131, + "learning_rate": 0.0002, + "loss": 1.5786, + "step": 4480 + }, + { + "epoch": 3.7778712663020615, + "grad_norm": 0.5821043252944946, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 4490 + }, + { + "epoch": 3.7862852334875896, + "grad_norm": 0.628892183303833, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 4500 + }, + { + "epoch": 3.7946992006731173, + "grad_norm": 0.5962669849395752, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 4510 + }, + { + "epoch": 3.8031131678586454, + "grad_norm": 0.6635549068450928, + "learning_rate": 0.0002, + "loss": 1.5267, + "step": 4520 + }, + { + "epoch": 3.811527135044173, + "grad_norm": 0.6010760068893433, + "learning_rate": 0.0002, + "loss": 1.5058, + "step": 4530 + }, + { + "epoch": 3.819941102229701, + "grad_norm": 0.6322658658027649, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 4540 + }, + { + "epoch": 3.8283550694152293, + "grad_norm": 0.5893137454986572, + "learning_rate": 0.0002, + "loss": 1.5029, + "step": 4550 + }, + { + "epoch": 3.8367690366007574, + "grad_norm": 0.7829602360725403, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 4560 + }, + { + "epoch": 3.845183003786285, + "grad_norm": 0.6190396547317505, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 4570 + }, + { + "epoch": 3.853596970971813, + "grad_norm": 0.6662813425064087, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 4580 + }, + { + "epoch": 3.8620109381573413, + "grad_norm": 0.5809855461120605, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 4590 + }, + { + "epoch": 3.870424905342869, + "grad_norm": 0.5779069662094116, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4600 + }, + { + "epoch": 3.878838872528397, + "grad_norm": 0.5603038668632507, + "learning_rate": 0.0002, + "loss": 1.498, + "step": 4610 + }, + { + "epoch": 3.887252839713925, + "grad_norm": 0.6274181008338928, + "learning_rate": 0.0002, + "loss": 1.5372, + "step": 4620 + }, + { + "epoch": 3.8956668068994533, + "grad_norm": 0.6810959577560425, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 4630 + }, + { + "epoch": 3.904080774084981, + "grad_norm": 0.5647315979003906, + "learning_rate": 0.0002, + "loss": 1.4956, + "step": 4640 + }, + { + "epoch": 3.912494741270509, + "grad_norm": 0.6830295324325562, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 4650 + }, + { + "epoch": 3.920908708456037, + "grad_norm": 0.652565598487854, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 4660 + }, + { + "epoch": 3.929322675641565, + "grad_norm": 0.5806284546852112, + "learning_rate": 0.0002, + "loss": 1.4772, + "step": 4670 + }, + { + "epoch": 3.937736642827093, + "grad_norm": 0.6825073957443237, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 4680 + }, + { + "epoch": 3.946150610012621, + "grad_norm": 0.6149451732635498, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 4690 + }, + { + "epoch": 3.954564577198149, + "grad_norm": 0.6152557134628296, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 4700 + }, + { + "epoch": 3.962978544383677, + "grad_norm": 0.6239011883735657, + "learning_rate": 0.0002, + "loss": 1.4897, + "step": 4710 + }, + { + "epoch": 3.971392511569205, + "grad_norm": 0.6485443115234375, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 4720 + }, + { + "epoch": 3.9798064787547327, + "grad_norm": 0.6449228525161743, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 4730 + }, + { + "epoch": 3.988220445940261, + "grad_norm": 0.6526407599449158, + "learning_rate": 0.0002, + "loss": 1.5087, + "step": 4740 + }, + { + "epoch": 3.996634413125789, + "grad_norm": 0.6277706027030945, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4750 + }, + { + "epoch": 4.0, + "eval_loss": 1.871641755104065, + "eval_runtime": 37.9637, + "eval_samples_per_second": 13.566, + "eval_steps_per_second": 1.712, + "step": 4754 + }, + { + "epoch": 4.005048380311317, + "grad_norm": 0.6994837522506714, + "learning_rate": 0.0002, + "loss": 1.4744, + "step": 4760 + }, + { + "epoch": 4.013462347496845, + "grad_norm": 0.8728373050689697, + "learning_rate": 0.0002, + "loss": 1.4433, + "step": 4770 + }, + { + "epoch": 4.021876314682372, + "grad_norm": 0.688679575920105, + "learning_rate": 0.0002, + "loss": 1.3329, + "step": 4780 + }, + { + "epoch": 4.0302902818679005, + "grad_norm": 0.6313387155532837, + "learning_rate": 0.0002, + "loss": 1.3999, + "step": 4790 + }, + { + "epoch": 4.038704249053429, + "grad_norm": 0.6577984690666199, + "learning_rate": 0.0002, + "loss": 1.3346, + "step": 4800 + }, + { + "epoch": 4.047118216238957, + "grad_norm": 0.7938185930252075, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 4810 + }, + { + "epoch": 4.055532183424485, + "grad_norm": 0.760399580001831, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 4820 + }, + { + "epoch": 4.063946150610013, + "grad_norm": 0.7329602241516113, + "learning_rate": 0.0002, + "loss": 1.4321, + "step": 4830 + }, + { + "epoch": 4.07236011779554, + "grad_norm": 0.7778576016426086, + "learning_rate": 0.0002, + "loss": 1.4133, + "step": 4840 + }, + { + "epoch": 4.080774084981068, + "grad_norm": 0.8235865235328674, + "learning_rate": 0.0002, + "loss": 1.4372, + "step": 4850 + }, + { + "epoch": 4.089188052166596, + "grad_norm": 0.7743754386901855, + "learning_rate": 0.0002, + "loss": 1.3719, + "step": 4860 + }, + { + "epoch": 4.0976020193521245, + "grad_norm": 0.8145367503166199, + "learning_rate": 0.0002, + "loss": 1.3787, + "step": 4870 + }, + { + "epoch": 4.106015986537653, + "grad_norm": 0.8517307639122009, + "learning_rate": 0.0002, + "loss": 1.356, + "step": 4880 + }, + { + "epoch": 4.114429953723181, + "grad_norm": 0.8208953142166138, + "learning_rate": 0.0002, + "loss": 1.4191, + "step": 4890 + }, + { + "epoch": 4.122843920908709, + "grad_norm": 0.8437790870666504, + "learning_rate": 0.0002, + "loss": 1.3189, + "step": 4900 + }, + { + "epoch": 4.131257888094236, + "grad_norm": 0.716672420501709, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 4910 + }, + { + "epoch": 4.139671855279764, + "grad_norm": 0.7656235098838806, + "learning_rate": 0.0002, + "loss": 1.4392, + "step": 4920 + }, + { + "epoch": 4.148085822465292, + "grad_norm": 0.7209306955337524, + "learning_rate": 0.0002, + "loss": 1.3408, + "step": 4930 + }, + { + "epoch": 4.1564997896508205, + "grad_norm": 0.7731267809867859, + "learning_rate": 0.0002, + "loss": 1.3639, + "step": 4940 + }, + { + "epoch": 4.164913756836349, + "grad_norm": 0.7477553486824036, + "learning_rate": 0.0002, + "loss": 1.4151, + "step": 4950 + }, + { + "epoch": 4.173327724021877, + "grad_norm": 0.7372981309890747, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 4960 + }, + { + "epoch": 4.181741691207404, + "grad_norm": 0.6582154035568237, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 4970 + }, + { + "epoch": 4.190155658392932, + "grad_norm": 0.7003206610679626, + "learning_rate": 0.0002, + "loss": 1.3343, + "step": 4980 + }, + { + "epoch": 4.19856962557846, + "grad_norm": 0.735223650932312, + "learning_rate": 0.0002, + "loss": 1.4098, + "step": 4990 + }, + { + "epoch": 4.206983592763988, + "grad_norm": 0.7832302451133728, + "learning_rate": 0.0002, + "loss": 1.3564, + "step": 5000 + }, + { + "epoch": 4.215397559949516, + "grad_norm": 0.8819546103477478, + "learning_rate": 0.0002, + "loss": 1.3622, + "step": 5010 + }, + { + "epoch": 4.2238115271350445, + "grad_norm": 0.9325336813926697, + "learning_rate": 0.0002, + "loss": 1.4438, + "step": 5020 + }, + { + "epoch": 4.232225494320572, + "grad_norm": 0.7007517218589783, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 5030 + }, + { + "epoch": 4.2406394615061, + "grad_norm": 0.7118321061134338, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 5040 + }, + { + "epoch": 4.249053428691628, + "grad_norm": 0.6578946709632874, + "learning_rate": 0.0002, + "loss": 1.2365, + "step": 5050 + }, + { + "epoch": 4.257467395877156, + "grad_norm": 0.9438983798027039, + "learning_rate": 0.0002, + "loss": 1.3696, + "step": 5060 + }, + { + "epoch": 4.265881363062684, + "grad_norm": 0.703037679195404, + "learning_rate": 0.0002, + "loss": 1.3868, + "step": 5070 + }, + { + "epoch": 4.274295330248212, + "grad_norm": 0.7286025285720825, + "learning_rate": 0.0002, + "loss": 1.3687, + "step": 5080 + }, + { + "epoch": 4.28270929743374, + "grad_norm": 0.750689685344696, + "learning_rate": 0.0002, + "loss": 1.3605, + "step": 5090 + }, + { + "epoch": 4.291123264619268, + "grad_norm": 0.869753360748291, + "learning_rate": 0.0002, + "loss": 1.5089, + "step": 5100 + }, + { + "epoch": 4.299537231804796, + "grad_norm": 0.8712980151176453, + "learning_rate": 0.0002, + "loss": 1.4128, + "step": 5110 + }, + { + "epoch": 4.307951198990324, + "grad_norm": 0.690263569355011, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 5120 + }, + { + "epoch": 4.316365166175852, + "grad_norm": 0.7114760279655457, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 5130 + }, + { + "epoch": 4.32477913336138, + "grad_norm": 0.7588112354278564, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 5140 + }, + { + "epoch": 4.333193100546908, + "grad_norm": 0.7556202411651611, + "learning_rate": 0.0002, + "loss": 1.4408, + "step": 5150 + }, + { + "epoch": 4.341607067732435, + "grad_norm": 0.8357610702514648, + "learning_rate": 0.0002, + "loss": 1.4203, + "step": 5160 + }, + { + "epoch": 4.3500210349179635, + "grad_norm": 0.8054035902023315, + "learning_rate": 0.0002, + "loss": 1.3348, + "step": 5170 + }, + { + "epoch": 4.358435002103492, + "grad_norm": 0.7637107968330383, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 5180 + }, + { + "epoch": 4.36684896928902, + "grad_norm": 0.757481038570404, + "learning_rate": 0.0002, + "loss": 1.3744, + "step": 5190 + }, + { + "epoch": 4.375262936474548, + "grad_norm": 0.7185863852500916, + "learning_rate": 0.0002, + "loss": 1.3622, + "step": 5200 + }, + { + "epoch": 4.383676903660076, + "grad_norm": 0.7326455116271973, + "learning_rate": 0.0002, + "loss": 1.3896, + "step": 5210 + }, + { + "epoch": 4.392090870845603, + "grad_norm": 0.7980523109436035, + "learning_rate": 0.0002, + "loss": 1.4098, + "step": 5220 + }, + { + "epoch": 4.400504838031131, + "grad_norm": 0.8526999354362488, + "learning_rate": 0.0002, + "loss": 1.3783, + "step": 5230 + }, + { + "epoch": 4.4089188052166595, + "grad_norm": 0.7012337446212769, + "learning_rate": 0.0002, + "loss": 1.4022, + "step": 5240 + }, + { + "epoch": 4.417332772402188, + "grad_norm": 0.8217827677726746, + "learning_rate": 0.0002, + "loss": 1.3552, + "step": 5250 + }, + { + "epoch": 4.425746739587716, + "grad_norm": 0.7141005396842957, + "learning_rate": 0.0002, + "loss": 1.3482, + "step": 5260 + }, + { + "epoch": 4.434160706773244, + "grad_norm": 0.7094302177429199, + "learning_rate": 0.0002, + "loss": 1.3699, + "step": 5270 + }, + { + "epoch": 4.442574673958772, + "grad_norm": 0.7234613299369812, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 5280 + }, + { + "epoch": 4.450988641144299, + "grad_norm": 0.7530457973480225, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 5290 + }, + { + "epoch": 4.459402608329827, + "grad_norm": 0.7300912141799927, + "learning_rate": 0.0002, + "loss": 1.3944, + "step": 5300 + }, + { + "epoch": 4.467816575515355, + "grad_norm": 0.825443685054779, + "learning_rate": 0.0002, + "loss": 1.3844, + "step": 5310 + }, + { + "epoch": 4.4762305427008835, + "grad_norm": 0.7559658885002136, + "learning_rate": 0.0002, + "loss": 1.3648, + "step": 5320 + }, + { + "epoch": 4.484644509886412, + "grad_norm": 0.8817561268806458, + "learning_rate": 0.0002, + "loss": 1.4364, + "step": 5330 + }, + { + "epoch": 4.49305847707194, + "grad_norm": 0.8203575611114502, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 5340 + }, + { + "epoch": 4.501472444257468, + "grad_norm": 0.7677690982818604, + "learning_rate": 0.0002, + "loss": 1.3996, + "step": 5350 + }, + { + "epoch": 4.509886411442995, + "grad_norm": 0.657085120677948, + "learning_rate": 0.0002, + "loss": 1.4142, + "step": 5360 + }, + { + "epoch": 4.518300378628523, + "grad_norm": 0.7939504384994507, + "learning_rate": 0.0002, + "loss": 1.3722, + "step": 5370 + }, + { + "epoch": 4.526714345814051, + "grad_norm": 0.6971889138221741, + "learning_rate": 0.0002, + "loss": 1.4361, + "step": 5380 + }, + { + "epoch": 4.535128312999579, + "grad_norm": 0.6984175443649292, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 5390 + }, + { + "epoch": 4.5435422801851075, + "grad_norm": 0.8504858613014221, + "learning_rate": 0.0002, + "loss": 1.341, + "step": 5400 + }, + { + "epoch": 4.551956247370635, + "grad_norm": 0.9134073853492737, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 5410 + }, + { + "epoch": 4.560370214556163, + "grad_norm": 0.7765598893165588, + "learning_rate": 0.0002, + "loss": 1.4375, + "step": 5420 + }, + { + "epoch": 4.568784181741691, + "grad_norm": 0.6991009712219238, + "learning_rate": 0.0002, + "loss": 1.4832, + "step": 5430 + }, + { + "epoch": 4.577198148927219, + "grad_norm": 0.8393039107322693, + "learning_rate": 0.0002, + "loss": 1.4021, + "step": 5440 + }, + { + "epoch": 4.585612116112747, + "grad_norm": 0.7685918211936951, + "learning_rate": 0.0002, + "loss": 1.3976, + "step": 5450 + }, + { + "epoch": 4.594026083298275, + "grad_norm": 0.7135679721832275, + "learning_rate": 0.0002, + "loss": 1.3883, + "step": 5460 + }, + { + "epoch": 4.6024400504838034, + "grad_norm": 0.6728870868682861, + "learning_rate": 0.0002, + "loss": 1.4083, + "step": 5470 + }, + { + "epoch": 4.610854017669331, + "grad_norm": 0.7139479517936707, + "learning_rate": 0.0002, + "loss": 1.3698, + "step": 5480 + }, + { + "epoch": 4.619267984854859, + "grad_norm": 0.8476598858833313, + "learning_rate": 0.0002, + "loss": 1.3498, + "step": 5490 + }, + { + "epoch": 4.627681952040387, + "grad_norm": 0.8034361004829407, + "learning_rate": 0.0002, + "loss": 1.3389, + "step": 5500 + }, + { + "epoch": 4.636095919225915, + "grad_norm": 0.7452183961868286, + "learning_rate": 0.0002, + "loss": 1.4179, + "step": 5510 + }, + { + "epoch": 4.644509886411443, + "grad_norm": 0.8394148945808411, + "learning_rate": 0.0002, + "loss": 1.4031, + "step": 5520 + }, + { + "epoch": 4.652923853596971, + "grad_norm": 0.7480153441429138, + "learning_rate": 0.0002, + "loss": 1.4561, + "step": 5530 + }, + { + "epoch": 4.661337820782499, + "grad_norm": 0.7781714797019958, + "learning_rate": 0.0002, + "loss": 1.378, + "step": 5540 + }, + { + "epoch": 4.669751787968027, + "grad_norm": 1.0058213472366333, + "learning_rate": 0.0002, + "loss": 1.3924, + "step": 5550 + }, + { + "epoch": 4.678165755153555, + "grad_norm": 0.7403179407119751, + "learning_rate": 0.0002, + "loss": 1.4198, + "step": 5560 + }, + { + "epoch": 4.686579722339083, + "grad_norm": 0.7270476818084717, + "learning_rate": 0.0002, + "loss": 1.4328, + "step": 5570 + }, + { + "epoch": 4.694993689524611, + "grad_norm": 0.760877788066864, + "learning_rate": 0.0002, + "loss": 1.378, + "step": 5580 + }, + { + "epoch": 4.703407656710139, + "grad_norm": 0.8097004890441895, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 5590 + }, + { + "epoch": 4.711821623895667, + "grad_norm": 0.9096523523330688, + "learning_rate": 0.0002, + "loss": 1.3661, + "step": 5600 + }, + { + "epoch": 4.720235591081195, + "grad_norm": 0.7262444496154785, + "learning_rate": 0.0002, + "loss": 1.4012, + "step": 5610 + }, + { + "epoch": 4.7286495582667225, + "grad_norm": 0.8207762837409973, + "learning_rate": 0.0002, + "loss": 1.422, + "step": 5620 + }, + { + "epoch": 4.737063525452251, + "grad_norm": 0.8089601993560791, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 5630 + }, + { + "epoch": 4.745477492637779, + "grad_norm": 0.7609543800354004, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 5640 + }, + { + "epoch": 4.753891459823307, + "grad_norm": 0.7273501753807068, + "learning_rate": 0.0002, + "loss": 1.4085, + "step": 5650 + }, + { + "epoch": 4.762305427008835, + "grad_norm": 0.7800219058990479, + "learning_rate": 0.0002, + "loss": 1.3849, + "step": 5660 + }, + { + "epoch": 4.770719394194362, + "grad_norm": 0.8558377623558044, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 5670 + }, + { + "epoch": 4.77913336137989, + "grad_norm": 0.7131547927856445, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 5680 + }, + { + "epoch": 4.787547328565418, + "grad_norm": 0.7651025056838989, + "learning_rate": 0.0002, + "loss": 1.407, + "step": 5690 + }, + { + "epoch": 4.7959612957509465, + "grad_norm": 0.8129976391792297, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 5700 + }, + { + "epoch": 4.804375262936475, + "grad_norm": 0.8019895553588867, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 5710 + }, + { + "epoch": 4.812789230122003, + "grad_norm": 0.7692018151283264, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 5720 + }, + { + "epoch": 4.821203197307531, + "grad_norm": 0.6893943548202515, + "learning_rate": 0.0002, + "loss": 1.419, + "step": 5730 + }, + { + "epoch": 4.829617164493058, + "grad_norm": 0.6881810426712036, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 5740 + }, + { + "epoch": 4.838031131678586, + "grad_norm": 0.7838267683982849, + "learning_rate": 0.0002, + "loss": 1.4775, + "step": 5750 + }, + { + "epoch": 4.846445098864114, + "grad_norm": 0.727799117565155, + "learning_rate": 0.0002, + "loss": 1.3857, + "step": 5760 + }, + { + "epoch": 4.8548590660496425, + "grad_norm": 0.7458277344703674, + "learning_rate": 0.0002, + "loss": 1.4685, + "step": 5770 + }, + { + "epoch": 4.863273033235171, + "grad_norm": 0.903802216053009, + "learning_rate": 0.0002, + "loss": 1.4426, + "step": 5780 + }, + { + "epoch": 4.871687000420699, + "grad_norm": 0.7983472347259521, + "learning_rate": 0.0002, + "loss": 1.451, + "step": 5790 + }, + { + "epoch": 4.880100967606227, + "grad_norm": 0.6894361972808838, + "learning_rate": 0.0002, + "loss": 1.4534, + "step": 5800 + }, + { + "epoch": 4.888514934791754, + "grad_norm": 0.7499409317970276, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 5810 + }, + { + "epoch": 4.896928901977282, + "grad_norm": 0.7362820506095886, + "learning_rate": 0.0002, + "loss": 1.4253, + "step": 5820 + }, + { + "epoch": 4.90534286916281, + "grad_norm": 0.8341619968414307, + "learning_rate": 0.0002, + "loss": 1.3763, + "step": 5830 + }, + { + "epoch": 4.913756836348338, + "grad_norm": 0.9604470133781433, + "learning_rate": 0.0002, + "loss": 1.3748, + "step": 5840 + }, + { + "epoch": 4.9221708035338665, + "grad_norm": 0.8916844129562378, + "learning_rate": 0.0002, + "loss": 1.3658, + "step": 5850 + }, + { + "epoch": 4.930584770719394, + "grad_norm": 0.8519647121429443, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 5860 + }, + { + "epoch": 4.938998737904922, + "grad_norm": 0.7946906089782715, + "learning_rate": 0.0002, + "loss": 1.424, + "step": 5870 + }, + { + "epoch": 4.94741270509045, + "grad_norm": 0.7843789458274841, + "learning_rate": 0.0002, + "loss": 1.4071, + "step": 5880 + }, + { + "epoch": 4.955826672275978, + "grad_norm": 0.707618772983551, + "learning_rate": 0.0002, + "loss": 1.4021, + "step": 5890 + }, + { + "epoch": 4.964240639461506, + "grad_norm": 0.7704206109046936, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 5900 + }, + { + "epoch": 4.972654606647034, + "grad_norm": 0.7160256505012512, + "learning_rate": 0.0002, + "loss": 1.4456, + "step": 5910 + }, + { + "epoch": 4.981068573832562, + "grad_norm": 0.7020420432090759, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 5920 + }, + { + "epoch": 4.98948254101809, + "grad_norm": 0.7576286792755127, + "learning_rate": 0.0002, + "loss": 1.4037, + "step": 5930 + }, + { + "epoch": 4.997896508203618, + "grad_norm": 0.8573036789894104, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 5940 + }, + { + "epoch": 4.999579301640724, + "eval_loss": 1.9353811740875244, + "eval_runtime": 37.9208, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 5942 + }, + { + "epoch": 5.006310475389146, + "grad_norm": 0.8204267621040344, + "learning_rate": 0.0002, + "loss": 1.2418, + "step": 5950 + }, + { + "epoch": 5.014724442574674, + "grad_norm": 0.976840615272522, + "learning_rate": 0.0002, + "loss": 1.235, + "step": 5960 + }, + { + "epoch": 5.023138409760202, + "grad_norm": 0.8765613436698914, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 5970 + }, + { + "epoch": 5.03155237694573, + "grad_norm": 1.1793042421340942, + "learning_rate": 0.0002, + "loss": 1.2748, + "step": 5980 + }, + { + "epoch": 5.039966344131258, + "grad_norm": 0.971062958240509, + "learning_rate": 0.0002, + "loss": 1.2412, + "step": 5990 + }, + { + "epoch": 5.0483803113167856, + "grad_norm": 0.8649757504463196, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 6000 + }, + { + "epoch": 5.056794278502314, + "grad_norm": 0.9563034176826477, + "learning_rate": 0.0002, + "loss": 1.1654, + "step": 6010 + }, + { + "epoch": 5.065208245687842, + "grad_norm": 1.0093994140625, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 6020 + }, + { + "epoch": 5.07362221287337, + "grad_norm": 1.004213571548462, + "learning_rate": 0.0002, + "loss": 1.2519, + "step": 6030 + }, + { + "epoch": 5.082036180058898, + "grad_norm": 0.8307787179946899, + "learning_rate": 0.0002, + "loss": 1.2379, + "step": 6040 + }, + { + "epoch": 5.090450147244426, + "grad_norm": 0.9117848873138428, + "learning_rate": 0.0002, + "loss": 1.2282, + "step": 6050 + }, + { + "epoch": 5.098864114429953, + "grad_norm": 1.0269840955734253, + "learning_rate": 0.0002, + "loss": 1.2582, + "step": 6060 + }, + { + "epoch": 5.1072780816154815, + "grad_norm": 0.9079542756080627, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 6070 + }, + { + "epoch": 5.11569204880101, + "grad_norm": 0.885702908039093, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 6080 + }, + { + "epoch": 5.124106015986538, + "grad_norm": 0.9976128339767456, + "learning_rate": 0.0002, + "loss": 1.2406, + "step": 6090 + }, + { + "epoch": 5.132519983172066, + "grad_norm": 0.8472117185592651, + "learning_rate": 0.0002, + "loss": 1.3082, + "step": 6100 + }, + { + "epoch": 5.140933950357594, + "grad_norm": 1.0385161638259888, + "learning_rate": 0.0002, + "loss": 1.226, + "step": 6110 + }, + { + "epoch": 5.149347917543121, + "grad_norm": 0.8948383927345276, + "learning_rate": 0.0002, + "loss": 1.213, + "step": 6120 + }, + { + "epoch": 5.157761884728649, + "grad_norm": 1.2613716125488281, + "learning_rate": 0.0002, + "loss": 1.2213, + "step": 6130 + }, + { + "epoch": 5.166175851914177, + "grad_norm": 0.9933410286903381, + "learning_rate": 0.0002, + "loss": 1.2632, + "step": 6140 + }, + { + "epoch": 5.1745898190997055, + "grad_norm": 0.9673663973808289, + "learning_rate": 0.0002, + "loss": 1.1715, + "step": 6150 + }, + { + "epoch": 5.183003786285234, + "grad_norm": 0.9969648122787476, + "learning_rate": 0.0002, + "loss": 1.2947, + "step": 6160 + }, + { + "epoch": 5.191417753470762, + "grad_norm": 1.2163258790969849, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 6170 + }, + { + "epoch": 5.19983172065629, + "grad_norm": 0.9163419604301453, + "learning_rate": 0.0002, + "loss": 1.2221, + "step": 6180 + }, + { + "epoch": 5.208245687841817, + "grad_norm": 0.9225585460662842, + "learning_rate": 0.0002, + "loss": 1.2624, + "step": 6190 + }, + { + "epoch": 5.216659655027345, + "grad_norm": 0.9205296635627747, + "learning_rate": 0.0002, + "loss": 1.2932, + "step": 6200 + }, + { + "epoch": 5.225073622212873, + "grad_norm": 1.0655443668365479, + "learning_rate": 0.0002, + "loss": 1.1825, + "step": 6210 + }, + { + "epoch": 5.233487589398401, + "grad_norm": 1.0854865312576294, + "learning_rate": 0.0002, + "loss": 1.2613, + "step": 6220 + }, + { + "epoch": 5.2419015565839295, + "grad_norm": 0.8489186763763428, + "learning_rate": 0.0002, + "loss": 1.3045, + "step": 6230 + }, + { + "epoch": 5.250315523769458, + "grad_norm": 0.910391628742218, + "learning_rate": 0.0002, + "loss": 1.2708, + "step": 6240 + }, + { + "epoch": 5.258729490954985, + "grad_norm": 0.925507128238678, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 6250 + }, + { + "epoch": 5.267143458140513, + "grad_norm": 1.1069735288619995, + "learning_rate": 0.0002, + "loss": 1.3368, + "step": 6260 + }, + { + "epoch": 5.275557425326041, + "grad_norm": 0.9705119132995605, + "learning_rate": 0.0002, + "loss": 1.2505, + "step": 6270 + }, + { + "epoch": 5.283971392511569, + "grad_norm": 0.9752426147460938, + "learning_rate": 0.0002, + "loss": 1.2602, + "step": 6280 + }, + { + "epoch": 5.292385359697097, + "grad_norm": 1.021359920501709, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 6290 + }, + { + "epoch": 5.3007993268826255, + "grad_norm": 1.148606300354004, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 6300 + }, + { + "epoch": 5.309213294068153, + "grad_norm": 0.8909247517585754, + "learning_rate": 0.0002, + "loss": 1.2201, + "step": 6310 + }, + { + "epoch": 5.317627261253681, + "grad_norm": 0.9879156351089478, + "learning_rate": 0.0002, + "loss": 1.2376, + "step": 6320 + }, + { + "epoch": 5.326041228439209, + "grad_norm": 0.9473357200622559, + "learning_rate": 0.0002, + "loss": 1.2638, + "step": 6330 + }, + { + "epoch": 5.334455195624737, + "grad_norm": 1.1422028541564941, + "learning_rate": 0.0002, + "loss": 1.232, + "step": 6340 + }, + { + "epoch": 5.342869162810265, + "grad_norm": 0.9942235350608826, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 6350 + }, + { + "epoch": 5.351283129995793, + "grad_norm": 0.9535723924636841, + "learning_rate": 0.0002, + "loss": 1.3032, + "step": 6360 + }, + { + "epoch": 5.359697097181321, + "grad_norm": 0.9020892381668091, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 6370 + }, + { + "epoch": 5.368111064366849, + "grad_norm": 1.0626472234725952, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 6380 + }, + { + "epoch": 5.376525031552377, + "grad_norm": 1.1395848989486694, + "learning_rate": 0.0002, + "loss": 1.2555, + "step": 6390 + }, + { + "epoch": 5.384938998737905, + "grad_norm": 0.9274451732635498, + "learning_rate": 0.0002, + "loss": 1.2839, + "step": 6400 + }, + { + "epoch": 5.393352965923433, + "grad_norm": 0.8108699917793274, + "learning_rate": 0.0002, + "loss": 1.2819, + "step": 6410 + }, + { + "epoch": 5.401766933108961, + "grad_norm": 1.1805564165115356, + "learning_rate": 0.0002, + "loss": 1.2589, + "step": 6420 + }, + { + "epoch": 5.410180900294489, + "grad_norm": 0.8321298360824585, + "learning_rate": 0.0002, + "loss": 1.3549, + "step": 6430 + }, + { + "epoch": 5.418594867480017, + "grad_norm": 0.8981925249099731, + "learning_rate": 0.0002, + "loss": 1.2925, + "step": 6440 + }, + { + "epoch": 5.4270088346655445, + "grad_norm": 1.0730986595153809, + "learning_rate": 0.0002, + "loss": 1.258, + "step": 6450 + }, + { + "epoch": 5.435422801851073, + "grad_norm": 1.0584609508514404, + "learning_rate": 0.0002, + "loss": 1.26, + "step": 6460 + }, + { + "epoch": 5.443836769036601, + "grad_norm": 1.0792299509048462, + "learning_rate": 0.0002, + "loss": 1.2847, + "step": 6470 + }, + { + "epoch": 5.452250736222129, + "grad_norm": 0.9101872444152832, + "learning_rate": 0.0002, + "loss": 1.2035, + "step": 6480 + }, + { + "epoch": 5.460664703407657, + "grad_norm": 0.9910100698471069, + "learning_rate": 0.0002, + "loss": 1.2574, + "step": 6490 + }, + { + "epoch": 5.469078670593185, + "grad_norm": 1.041412353515625, + "learning_rate": 0.0002, + "loss": 1.3098, + "step": 6500 + }, + { + "epoch": 5.477492637778712, + "grad_norm": 1.0091687440872192, + "learning_rate": 0.0002, + "loss": 1.2812, + "step": 6510 + }, + { + "epoch": 5.48590660496424, + "grad_norm": 0.8755383491516113, + "learning_rate": 0.0002, + "loss": 1.2523, + "step": 6520 + }, + { + "epoch": 5.4943205721497685, + "grad_norm": 0.980212390422821, + "learning_rate": 0.0002, + "loss": 1.3042, + "step": 6530 + }, + { + "epoch": 5.502734539335297, + "grad_norm": 0.9356869459152222, + "learning_rate": 0.0002, + "loss": 1.2873, + "step": 6540 + }, + { + "epoch": 5.511148506520825, + "grad_norm": 0.9008095264434814, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 6550 + }, + { + "epoch": 5.519562473706353, + "grad_norm": 0.8908938765525818, + "learning_rate": 0.0002, + "loss": 1.2818, + "step": 6560 + }, + { + "epoch": 5.52797644089188, + "grad_norm": 1.1423932313919067, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 6570 + }, + { + "epoch": 5.536390408077408, + "grad_norm": 1.0508161783218384, + "learning_rate": 0.0002, + "loss": 1.3039, + "step": 6580 + }, + { + "epoch": 5.544804375262936, + "grad_norm": 0.8357517719268799, + "learning_rate": 0.0002, + "loss": 1.2446, + "step": 6590 + }, + { + "epoch": 5.5532183424484645, + "grad_norm": 0.9892540574073792, + "learning_rate": 0.0002, + "loss": 1.3037, + "step": 6600 + }, + { + "epoch": 5.561632309633993, + "grad_norm": 1.0048326253890991, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 6610 + }, + { + "epoch": 5.570046276819521, + "grad_norm": 0.9801995158195496, + "learning_rate": 0.0002, + "loss": 1.2152, + "step": 6620 + }, + { + "epoch": 5.578460244005049, + "grad_norm": 0.9899214506149292, + "learning_rate": 0.0002, + "loss": 1.2606, + "step": 6630 + }, + { + "epoch": 5.586874211190576, + "grad_norm": 1.1911814212799072, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 6640 + }, + { + "epoch": 5.595288178376104, + "grad_norm": 1.0368894338607788, + "learning_rate": 0.0002, + "loss": 1.3458, + "step": 6650 + }, + { + "epoch": 5.603702145561632, + "grad_norm": 1.1248382329940796, + "learning_rate": 0.0002, + "loss": 1.2595, + "step": 6660 + }, + { + "epoch": 5.61211611274716, + "grad_norm": 0.9765539765357971, + "learning_rate": 0.0002, + "loss": 1.2548, + "step": 6670 + }, + { + "epoch": 5.6205300799326885, + "grad_norm": 0.9810206890106201, + "learning_rate": 0.0002, + "loss": 1.3451, + "step": 6680 + }, + { + "epoch": 5.628944047118217, + "grad_norm": 1.100386619567871, + "learning_rate": 0.0002, + "loss": 1.2952, + "step": 6690 + }, + { + "epoch": 5.637358014303744, + "grad_norm": 0.8824519515037537, + "learning_rate": 0.0002, + "loss": 1.2467, + "step": 6700 + }, + { + "epoch": 5.645771981489272, + "grad_norm": 1.0864064693450928, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 6710 + }, + { + "epoch": 5.6541859486748, + "grad_norm": 1.1614511013031006, + "learning_rate": 0.0002, + "loss": 1.2479, + "step": 6720 + }, + { + "epoch": 5.662599915860328, + "grad_norm": 1.0762972831726074, + "learning_rate": 0.0002, + "loss": 1.2753, + "step": 6730 + }, + { + "epoch": 5.671013883045856, + "grad_norm": 0.9408974647521973, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 6740 + }, + { + "epoch": 5.679427850231384, + "grad_norm": 0.8906030058860779, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 6750 + }, + { + "epoch": 5.687841817416912, + "grad_norm": 0.9527303576469421, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 6760 + }, + { + "epoch": 5.69625578460244, + "grad_norm": 0.9471196532249451, + "learning_rate": 0.0002, + "loss": 1.322, + "step": 6770 + }, + { + "epoch": 5.704669751787968, + "grad_norm": 0.9186838865280151, + "learning_rate": 0.0002, + "loss": 1.2514, + "step": 6780 + }, + { + "epoch": 5.713083718973496, + "grad_norm": 0.9225441813468933, + "learning_rate": 0.0002, + "loss": 1.2347, + "step": 6790 + }, + { + "epoch": 5.721497686159024, + "grad_norm": 0.9712982773780823, + "learning_rate": 0.0002, + "loss": 1.1849, + "step": 6800 + }, + { + "epoch": 5.729911653344552, + "grad_norm": 1.0743170976638794, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 6810 + }, + { + "epoch": 5.73832562053008, + "grad_norm": 1.2738113403320312, + "learning_rate": 0.0002, + "loss": 1.2136, + "step": 6820 + }, + { + "epoch": 5.7467395877156076, + "grad_norm": 0.9386790990829468, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 6830 + }, + { + "epoch": 5.755153554901136, + "grad_norm": 1.0817769765853882, + "learning_rate": 0.0002, + "loss": 1.285, + "step": 6840 + }, + { + "epoch": 5.763567522086664, + "grad_norm": 1.1040263175964355, + "learning_rate": 0.0002, + "loss": 1.2247, + "step": 6850 + }, + { + "epoch": 5.771981489272192, + "grad_norm": 1.0656492710113525, + "learning_rate": 0.0002, + "loss": 1.2507, + "step": 6860 + }, + { + "epoch": 5.78039545645772, + "grad_norm": 0.9550157189369202, + "learning_rate": 0.0002, + "loss": 1.2999, + "step": 6870 + }, + { + "epoch": 5.788809423643248, + "grad_norm": 1.0130870342254639, + "learning_rate": 0.0002, + "loss": 1.3201, + "step": 6880 + }, + { + "epoch": 5.797223390828776, + "grad_norm": 1.0675787925720215, + "learning_rate": 0.0002, + "loss": 1.3392, + "step": 6890 + }, + { + "epoch": 5.8056373580143035, + "grad_norm": 0.9537774920463562, + "learning_rate": 0.0002, + "loss": 1.2949, + "step": 6900 + }, + { + "epoch": 5.814051325199832, + "grad_norm": 0.9640319347381592, + "learning_rate": 0.0002, + "loss": 1.2658, + "step": 6910 + }, + { + "epoch": 5.82246529238536, + "grad_norm": 0.8917992115020752, + "learning_rate": 0.0002, + "loss": 1.2199, + "step": 6920 + }, + { + "epoch": 5.830879259570888, + "grad_norm": 0.9881822466850281, + "learning_rate": 0.0002, + "loss": 1.373, + "step": 6930 + }, + { + "epoch": 5.839293226756416, + "grad_norm": 0.9136882424354553, + "learning_rate": 0.0002, + "loss": 1.323, + "step": 6940 + }, + { + "epoch": 5.847707193941943, + "grad_norm": 0.9086098074913025, + "learning_rate": 0.0002, + "loss": 1.3159, + "step": 6950 + }, + { + "epoch": 5.856121161127471, + "grad_norm": 0.9443018436431885, + "learning_rate": 0.0002, + "loss": 1.2624, + "step": 6960 + }, + { + "epoch": 5.864535128312999, + "grad_norm": 0.9915381669998169, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 6970 + }, + { + "epoch": 5.8729490954985275, + "grad_norm": 0.8939146995544434, + "learning_rate": 0.0002, + "loss": 1.337, + "step": 6980 + }, + { + "epoch": 5.881363062684056, + "grad_norm": 1.3672245740890503, + "learning_rate": 0.0002, + "loss": 1.2611, + "step": 6990 + }, + { + "epoch": 5.889777029869584, + "grad_norm": 1.0116257667541504, + "learning_rate": 0.0002, + "loss": 1.3012, + "step": 7000 + }, + { + "epoch": 5.898190997055112, + "grad_norm": 1.1561565399169922, + "learning_rate": 0.0002, + "loss": 1.3128, + "step": 7010 + }, + { + "epoch": 5.906604964240639, + "grad_norm": 0.9900678992271423, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 7020 + }, + { + "epoch": 5.915018931426167, + "grad_norm": 0.9297345876693726, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 7030 + }, + { + "epoch": 5.923432898611695, + "grad_norm": 0.9357825517654419, + "learning_rate": 0.0002, + "loss": 1.2317, + "step": 7040 + }, + { + "epoch": 5.931846865797223, + "grad_norm": 1.049317717552185, + "learning_rate": 0.0002, + "loss": 1.2303, + "step": 7050 + }, + { + "epoch": 5.9402608329827515, + "grad_norm": 0.950633704662323, + "learning_rate": 0.0002, + "loss": 1.3243, + "step": 7060 + }, + { + "epoch": 5.94867480016828, + "grad_norm": 0.854581892490387, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 7070 + }, + { + "epoch": 5.957088767353808, + "grad_norm": 0.9097039699554443, + "learning_rate": 0.0002, + "loss": 1.3252, + "step": 7080 + }, + { + "epoch": 5.965502734539335, + "grad_norm": 0.9072173237800598, + "learning_rate": 0.0002, + "loss": 1.291, + "step": 7090 + }, + { + "epoch": 5.973916701724863, + "grad_norm": 1.0470727682113647, + "learning_rate": 0.0002, + "loss": 1.2724, + "step": 7100 + }, + { + "epoch": 5.982330668910391, + "grad_norm": 1.2628462314605713, + "learning_rate": 0.0002, + "loss": 1.3324, + "step": 7110 + }, + { + "epoch": 5.990744636095919, + "grad_norm": 1.055279016494751, + "learning_rate": 0.0002, + "loss": 1.2701, + "step": 7120 + }, + { + "epoch": 5.9991586032814475, + "grad_norm": 0.966194212436676, + "learning_rate": 0.0002, + "loss": 1.3234, + "step": 7130 + }, + { + "epoch": 6.0, + "eval_loss": 2.0427448749542236, + "eval_runtime": 37.8426, + "eval_samples_per_second": 13.609, + "eval_steps_per_second": 1.718, + "step": 7131 + }, + { + "epoch": 6.007572570466976, + "grad_norm": 1.4037928581237793, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 7140 + }, + { + "epoch": 6.015986537652503, + "grad_norm": 1.1081010103225708, + "learning_rate": 0.0002, + "loss": 1.047, + "step": 7150 + }, + { + "epoch": 6.024400504838031, + "grad_norm": 1.1585499048233032, + "learning_rate": 0.0002, + "loss": 1.1368, + "step": 7160 + }, + { + "epoch": 6.032814472023559, + "grad_norm": 1.0822780132293701, + "learning_rate": 0.0002, + "loss": 1.0192, + "step": 7170 + }, + { + "epoch": 6.041228439209087, + "grad_norm": 0.9662094712257385, + "learning_rate": 0.0002, + "loss": 1.0755, + "step": 7180 + }, + { + "epoch": 6.049642406394615, + "grad_norm": 1.063936710357666, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 7190 + }, + { + "epoch": 6.058056373580143, + "grad_norm": 1.0349032878875732, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 7200 + }, + { + "epoch": 6.066470340765671, + "grad_norm": 1.0312575101852417, + "learning_rate": 0.0002, + "loss": 1.0591, + "step": 7210 + }, + { + "epoch": 6.074884307951199, + "grad_norm": 1.1942846775054932, + "learning_rate": 0.0002, + "loss": 1.1824, + "step": 7220 + }, + { + "epoch": 6.083298275136727, + "grad_norm": 1.0816049575805664, + "learning_rate": 0.0002, + "loss": 1.1034, + "step": 7230 + }, + { + "epoch": 6.091712242322255, + "grad_norm": 0.9985513687133789, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 7240 + }, + { + "epoch": 6.100126209507783, + "grad_norm": 1.2573972940444946, + "learning_rate": 0.0002, + "loss": 1.0367, + "step": 7250 + }, + { + "epoch": 6.108540176693311, + "grad_norm": 1.1182395219802856, + "learning_rate": 0.0002, + "loss": 1.1051, + "step": 7260 + }, + { + "epoch": 6.116954143878839, + "grad_norm": 0.9679344296455383, + "learning_rate": 0.0002, + "loss": 1.1219, + "step": 7270 + }, + { + "epoch": 6.1253681110643665, + "grad_norm": 1.0913981199264526, + "learning_rate": 0.0002, + "loss": 1.1192, + "step": 7280 + }, + { + "epoch": 6.133782078249895, + "grad_norm": 1.1291013956069946, + "learning_rate": 0.0002, + "loss": 1.0411, + "step": 7290 + }, + { + "epoch": 6.142196045435423, + "grad_norm": 1.2679595947265625, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 7300 + }, + { + "epoch": 6.150610012620951, + "grad_norm": 1.2350026369094849, + "learning_rate": 0.0002, + "loss": 1.0875, + "step": 7310 + }, + { + "epoch": 6.159023979806479, + "grad_norm": 1.3213104009628296, + "learning_rate": 0.0002, + "loss": 1.1139, + "step": 7320 + }, + { + "epoch": 6.167437946992007, + "grad_norm": 1.1924850940704346, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 7330 + }, + { + "epoch": 6.175851914177534, + "grad_norm": 1.1890000104904175, + "learning_rate": 0.0002, + "loss": 1.1242, + "step": 7340 + }, + { + "epoch": 6.184265881363062, + "grad_norm": 1.3821455240249634, + "learning_rate": 0.0002, + "loss": 1.1341, + "step": 7350 + }, + { + "epoch": 6.1926798485485905, + "grad_norm": 1.1217057704925537, + "learning_rate": 0.0002, + "loss": 1.0748, + "step": 7360 + }, + { + "epoch": 6.201093815734119, + "grad_norm": 1.2441548109054565, + "learning_rate": 0.0002, + "loss": 1.159, + "step": 7370 + }, + { + "epoch": 6.209507782919647, + "grad_norm": 1.0837615728378296, + "learning_rate": 0.0002, + "loss": 1.1199, + "step": 7380 + }, + { + "epoch": 6.217921750105175, + "grad_norm": 1.164304256439209, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 7390 + }, + { + "epoch": 6.226335717290702, + "grad_norm": 1.3129467964172363, + "learning_rate": 0.0002, + "loss": 1.1325, + "step": 7400 + }, + { + "epoch": 6.23474968447623, + "grad_norm": 1.1938153505325317, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 7410 + }, + { + "epoch": 6.243163651661758, + "grad_norm": 1.4348443746566772, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 7420 + }, + { + "epoch": 6.2515776188472865, + "grad_norm": 1.132301926612854, + "learning_rate": 0.0002, + "loss": 1.0778, + "step": 7430 + }, + { + "epoch": 6.259991586032815, + "grad_norm": 1.136966586112976, + "learning_rate": 0.0002, + "loss": 1.1148, + "step": 7440 + }, + { + "epoch": 6.268405553218343, + "grad_norm": 1.12801194190979, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 7450 + }, + { + "epoch": 6.276819520403871, + "grad_norm": 1.0246902704238892, + "learning_rate": 0.0002, + "loss": 1.0408, + "step": 7460 + }, + { + "epoch": 6.285233487589398, + "grad_norm": 1.1066974401474, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 7470 + }, + { + "epoch": 6.293647454774926, + "grad_norm": 1.012710690498352, + "learning_rate": 0.0002, + "loss": 1.1589, + "step": 7480 + }, + { + "epoch": 6.302061421960454, + "grad_norm": 1.2227119207382202, + "learning_rate": 0.0002, + "loss": 1.1049, + "step": 7490 + }, + { + "epoch": 6.310475389145982, + "grad_norm": 0.9736923575401306, + "learning_rate": 0.0002, + "loss": 1.1376, + "step": 7500 + }, + { + "epoch": 6.3188893563315105, + "grad_norm": 1.2945268154144287, + "learning_rate": 0.0002, + "loss": 1.1017, + "step": 7510 + }, + { + "epoch": 6.327303323517039, + "grad_norm": 1.1579312086105347, + "learning_rate": 0.0002, + "loss": 1.0724, + "step": 7520 + }, + { + "epoch": 6.335717290702567, + "grad_norm": 1.2404558658599854, + "learning_rate": 0.0002, + "loss": 1.0899, + "step": 7530 + }, + { + "epoch": 6.344131257888094, + "grad_norm": 1.4673258066177368, + "learning_rate": 0.0002, + "loss": 1.1635, + "step": 7540 + }, + { + "epoch": 6.352545225073622, + "grad_norm": 1.2268997430801392, + "learning_rate": 0.0002, + "loss": 1.128, + "step": 7550 + }, + { + "epoch": 6.36095919225915, + "grad_norm": 0.9772747159004211, + "learning_rate": 0.0002, + "loss": 1.0932, + "step": 7560 + }, + { + "epoch": 6.369373159444678, + "grad_norm": 1.0205204486846924, + "learning_rate": 0.0002, + "loss": 1.1214, + "step": 7570 + }, + { + "epoch": 6.377787126630206, + "grad_norm": 1.2227109670639038, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 7580 + }, + { + "epoch": 6.3862010938157345, + "grad_norm": 1.0708507299423218, + "learning_rate": 0.0002, + "loss": 1.1115, + "step": 7590 + }, + { + "epoch": 6.394615061001262, + "grad_norm": 1.1427522897720337, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 7600 + }, + { + "epoch": 6.40302902818679, + "grad_norm": 1.0706431865692139, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 7610 + }, + { + "epoch": 6.411442995372318, + "grad_norm": 1.1358282566070557, + "learning_rate": 0.0002, + "loss": 1.0933, + "step": 7620 + }, + { + "epoch": 6.419856962557846, + "grad_norm": 1.4011822938919067, + "learning_rate": 0.0002, + "loss": 1.1075, + "step": 7630 + }, + { + "epoch": 6.428270929743374, + "grad_norm": 1.5616450309753418, + "learning_rate": 0.0002, + "loss": 1.1269, + "step": 7640 + }, + { + "epoch": 6.436684896928902, + "grad_norm": 1.1442687511444092, + "learning_rate": 0.0002, + "loss": 1.0953, + "step": 7650 + }, + { + "epoch": 6.44509886411443, + "grad_norm": 1.164803147315979, + "learning_rate": 0.0002, + "loss": 1.1341, + "step": 7660 + }, + { + "epoch": 6.453512831299958, + "grad_norm": 1.3184553384780884, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 7670 + }, + { + "epoch": 6.461926798485486, + "grad_norm": 1.2701894044876099, + "learning_rate": 0.0002, + "loss": 1.1526, + "step": 7680 + }, + { + "epoch": 6.470340765671014, + "grad_norm": 1.1998416185379028, + "learning_rate": 0.0002, + "loss": 1.2119, + "step": 7690 + }, + { + "epoch": 6.478754732856542, + "grad_norm": 1.156459927558899, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 7700 + }, + { + "epoch": 6.48716870004207, + "grad_norm": 1.0217190980911255, + "learning_rate": 0.0002, + "loss": 1.2122, + "step": 7710 + }, + { + "epoch": 6.495582667227598, + "grad_norm": 1.230372428894043, + "learning_rate": 0.0002, + "loss": 1.0917, + "step": 7720 + }, + { + "epoch": 6.5039966344131255, + "grad_norm": 1.105675220489502, + "learning_rate": 0.0002, + "loss": 1.119, + "step": 7730 + }, + { + "epoch": 6.512410601598654, + "grad_norm": 1.1623669862747192, + "learning_rate": 0.0002, + "loss": 1.0758, + "step": 7740 + }, + { + "epoch": 6.520824568784182, + "grad_norm": 1.2884684801101685, + "learning_rate": 0.0002, + "loss": 1.1548, + "step": 7750 + }, + { + "epoch": 6.52923853596971, + "grad_norm": 1.1785279512405396, + "learning_rate": 0.0002, + "loss": 1.142, + "step": 7760 + }, + { + "epoch": 6.537652503155238, + "grad_norm": 1.0607101917266846, + "learning_rate": 0.0002, + "loss": 1.1598, + "step": 7770 + }, + { + "epoch": 6.546066470340766, + "grad_norm": 1.21990168094635, + "learning_rate": 0.0002, + "loss": 1.1472, + "step": 7780 + }, + { + "epoch": 6.554480437526293, + "grad_norm": 1.1498621702194214, + "learning_rate": 0.0002, + "loss": 1.1468, + "step": 7790 + }, + { + "epoch": 6.562894404711821, + "grad_norm": 1.263929009437561, + "learning_rate": 0.0002, + "loss": 1.1847, + "step": 7800 + }, + { + "epoch": 6.5713083718973495, + "grad_norm": 1.1580625772476196, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 7810 + }, + { + "epoch": 6.579722339082878, + "grad_norm": 1.4431294202804565, + "learning_rate": 0.0002, + "loss": 1.1313, + "step": 7820 + }, + { + "epoch": 6.588136306268406, + "grad_norm": 1.1309990882873535, + "learning_rate": 0.0002, + "loss": 1.1944, + "step": 7830 + }, + { + "epoch": 6.596550273453934, + "grad_norm": 1.0543386936187744, + "learning_rate": 0.0002, + "loss": 1.1156, + "step": 7840 + }, + { + "epoch": 6.604964240639461, + "grad_norm": 1.2180639505386353, + "learning_rate": 0.0002, + "loss": 1.0945, + "step": 7850 + }, + { + "epoch": 6.613378207824989, + "grad_norm": 1.0631271600723267, + "learning_rate": 0.0002, + "loss": 1.1318, + "step": 7860 + }, + { + "epoch": 6.621792175010517, + "grad_norm": 1.138885498046875, + "learning_rate": 0.0002, + "loss": 1.1792, + "step": 7870 + }, + { + "epoch": 6.630206142196045, + "grad_norm": 1.1117745637893677, + "learning_rate": 0.0002, + "loss": 1.1805, + "step": 7880 + }, + { + "epoch": 6.6386201093815735, + "grad_norm": 1.3734886646270752, + "learning_rate": 0.0002, + "loss": 1.15, + "step": 7890 + }, + { + "epoch": 6.647034076567102, + "grad_norm": 1.236003041267395, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 7900 + }, + { + "epoch": 6.65544804375263, + "grad_norm": 1.2206000089645386, + "learning_rate": 0.0002, + "loss": 1.1718, + "step": 7910 + }, + { + "epoch": 6.663862010938157, + "grad_norm": 1.2842656373977661, + "learning_rate": 0.0002, + "loss": 1.1637, + "step": 7920 + }, + { + "epoch": 6.672275978123685, + "grad_norm": 1.2365005016326904, + "learning_rate": 0.0002, + "loss": 1.2219, + "step": 7930 + }, + { + "epoch": 6.680689945309213, + "grad_norm": 1.256620168685913, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 7940 + }, + { + "epoch": 6.689103912494741, + "grad_norm": 1.3232917785644531, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 7950 + }, + { + "epoch": 6.6975178796802695, + "grad_norm": 1.2470088005065918, + "learning_rate": 0.0002, + "loss": 1.2042, + "step": 7960 + }, + { + "epoch": 6.705931846865798, + "grad_norm": 1.0511926412582397, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 7970 + }, + { + "epoch": 6.714345814051326, + "grad_norm": 1.107310175895691, + "learning_rate": 0.0002, + "loss": 1.118, + "step": 7980 + }, + { + "epoch": 6.722759781236853, + "grad_norm": 1.4069843292236328, + "learning_rate": 0.0002, + "loss": 1.2109, + "step": 7990 + }, + { + "epoch": 6.731173748422381, + "grad_norm": 1.0800836086273193, + "learning_rate": 0.0002, + "loss": 1.1298, + "step": 8000 + }, + { + "epoch": 6.739587715607909, + "grad_norm": 1.1676300764083862, + "learning_rate": 0.0002, + "loss": 1.1824, + "step": 8010 + }, + { + "epoch": 6.748001682793437, + "grad_norm": 1.0579663515090942, + "learning_rate": 0.0002, + "loss": 1.1253, + "step": 8020 + }, + { + "epoch": 6.756415649978965, + "grad_norm": 1.2770029306411743, + "learning_rate": 0.0002, + "loss": 1.1542, + "step": 8030 + }, + { + "epoch": 6.764829617164493, + "grad_norm": 1.0981038808822632, + "learning_rate": 0.0002, + "loss": 1.1519, + "step": 8040 + }, + { + "epoch": 6.773243584350021, + "grad_norm": 1.1194742918014526, + "learning_rate": 0.0002, + "loss": 1.1422, + "step": 8050 + }, + { + "epoch": 6.781657551535549, + "grad_norm": 1.0130012035369873, + "learning_rate": 0.0002, + "loss": 1.1463, + "step": 8060 + }, + { + "epoch": 6.790071518721077, + "grad_norm": 1.2051167488098145, + "learning_rate": 0.0002, + "loss": 1.2008, + "step": 8070 + }, + { + "epoch": 6.798485485906605, + "grad_norm": 1.095689058303833, + "learning_rate": 0.0002, + "loss": 1.142, + "step": 8080 + }, + { + "epoch": 6.806899453092133, + "grad_norm": 1.2275174856185913, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 8090 + }, + { + "epoch": 6.815313420277661, + "grad_norm": 1.1439805030822754, + "learning_rate": 0.0002, + "loss": 1.1453, + "step": 8100 + }, + { + "epoch": 6.8237273874631885, + "grad_norm": 1.276331901550293, + "learning_rate": 0.0002, + "loss": 1.1624, + "step": 8110 + }, + { + "epoch": 6.832141354648717, + "grad_norm": 1.0450139045715332, + "learning_rate": 0.0002, + "loss": 1.1686, + "step": 8120 + }, + { + "epoch": 6.840555321834245, + "grad_norm": 1.1189453601837158, + "learning_rate": 0.0002, + "loss": 1.1783, + "step": 8130 + }, + { + "epoch": 6.848969289019773, + "grad_norm": 1.194640874862671, + "learning_rate": 0.0002, + "loss": 1.1093, + "step": 8140 + }, + { + "epoch": 6.857383256205301, + "grad_norm": 1.095372200012207, + "learning_rate": 0.0002, + "loss": 1.1559, + "step": 8150 + }, + { + "epoch": 6.865797223390829, + "grad_norm": 1.2416104078292847, + "learning_rate": 0.0002, + "loss": 1.165, + "step": 8160 + }, + { + "epoch": 6.874211190576357, + "grad_norm": 1.2402868270874023, + "learning_rate": 0.0002, + "loss": 1.2174, + "step": 8170 + }, + { + "epoch": 6.882625157761884, + "grad_norm": 1.1317291259765625, + "learning_rate": 0.0002, + "loss": 1.1306, + "step": 8180 + }, + { + "epoch": 6.8910391249474126, + "grad_norm": 1.0581914186477661, + "learning_rate": 0.0002, + "loss": 1.1944, + "step": 8190 + }, + { + "epoch": 6.899453092132941, + "grad_norm": 1.3540890216827393, + "learning_rate": 0.0002, + "loss": 1.1271, + "step": 8200 + }, + { + "epoch": 6.907867059318469, + "grad_norm": 1.213672399520874, + "learning_rate": 0.0002, + "loss": 1.2119, + "step": 8210 + }, + { + "epoch": 6.916281026503997, + "grad_norm": 1.2654485702514648, + "learning_rate": 0.0002, + "loss": 1.1406, + "step": 8220 + }, + { + "epoch": 6.924694993689524, + "grad_norm": 1.203903317451477, + "learning_rate": 0.0002, + "loss": 1.205, + "step": 8230 + }, + { + "epoch": 6.933108960875052, + "grad_norm": 1.1332030296325684, + "learning_rate": 0.0002, + "loss": 1.1635, + "step": 8240 + }, + { + "epoch": 6.94152292806058, + "grad_norm": 1.2699192762374878, + "learning_rate": 0.0002, + "loss": 1.1148, + "step": 8250 + }, + { + "epoch": 6.9499368952461085, + "grad_norm": 1.2728958129882812, + "learning_rate": 0.0002, + "loss": 1.1831, + "step": 8260 + }, + { + "epoch": 6.958350862431637, + "grad_norm": 1.238410472869873, + "learning_rate": 0.0002, + "loss": 1.1757, + "step": 8270 + }, + { + "epoch": 6.966764829617165, + "grad_norm": 1.403863549232483, + "learning_rate": 0.0002, + "loss": 1.1499, + "step": 8280 + }, + { + "epoch": 6.975178796802693, + "grad_norm": 1.1096396446228027, + "learning_rate": 0.0002, + "loss": 1.1515, + "step": 8290 + }, + { + "epoch": 6.98359276398822, + "grad_norm": 1.1043379306793213, + "learning_rate": 0.0002, + "loss": 1.2049, + "step": 8300 + }, + { + "epoch": 6.992006731173748, + "grad_norm": 1.391754388809204, + "learning_rate": 0.0002, + "loss": 1.1255, + "step": 8310 + }, + { + "epoch": 6.999579301640724, + "eval_loss": 2.1421656608581543, + "eval_runtime": 37.8262, + "eval_samples_per_second": 13.615, + "eval_steps_per_second": 1.718, + "step": 8319 + }, + { + "epoch": 7.000420698359276, + "grad_norm": 1.1739230155944824, + "learning_rate": 0.0002, + "loss": 1.1107, + "step": 8320 + }, + { + "epoch": 7.008834665544804, + "grad_norm": 1.5428645610809326, + "learning_rate": 0.0002, + "loss": 1.0066, + "step": 8330 + }, + { + "epoch": 7.0172486327303325, + "grad_norm": 1.307463526725769, + "learning_rate": 0.0002, + "loss": 0.9885, + "step": 8340 + }, + { + "epoch": 7.025662599915861, + "grad_norm": 1.4964789152145386, + "learning_rate": 0.0002, + "loss": 0.9098, + "step": 8350 + }, + { + "epoch": 7.034076567101389, + "grad_norm": 1.2289477586746216, + "learning_rate": 0.0002, + "loss": 0.8976, + "step": 8360 + }, + { + "epoch": 7.042490534286916, + "grad_norm": 1.325327754020691, + "learning_rate": 0.0002, + "loss": 0.9254, + "step": 8370 + }, + { + "epoch": 7.050904501472444, + "grad_norm": 1.4672988653182983, + "learning_rate": 0.0002, + "loss": 0.8967, + "step": 8380 + }, + { + "epoch": 7.059318468657972, + "grad_norm": 1.4184634685516357, + "learning_rate": 0.0002, + "loss": 0.8927, + "step": 8390 + }, + { + "epoch": 7.0677324358435, + "grad_norm": 1.3103536367416382, + "learning_rate": 0.0002, + "loss": 0.9129, + "step": 8400 + }, + { + "epoch": 7.076146403029028, + "grad_norm": 1.2364518642425537, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 8410 + }, + { + "epoch": 7.0845603702145565, + "grad_norm": 1.3712464570999146, + "learning_rate": 0.0002, + "loss": 0.8776, + "step": 8420 + }, + { + "epoch": 7.092974337400084, + "grad_norm": 1.4655892848968506, + "learning_rate": 0.0002, + "loss": 0.9685, + "step": 8430 + }, + { + "epoch": 7.101388304585612, + "grad_norm": 1.3276227712631226, + "learning_rate": 0.0002, + "loss": 0.9276, + "step": 8440 + }, + { + "epoch": 7.10980227177114, + "grad_norm": 1.1355878114700317, + "learning_rate": 0.0002, + "loss": 0.9695, + "step": 8450 + }, + { + "epoch": 7.118216238956668, + "grad_norm": 1.2767117023468018, + "learning_rate": 0.0002, + "loss": 0.9673, + "step": 8460 + }, + { + "epoch": 7.126630206142196, + "grad_norm": 1.4915258884429932, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 8470 + }, + { + "epoch": 7.135044173327724, + "grad_norm": 1.355043649673462, + "learning_rate": 0.0002, + "loss": 0.9469, + "step": 8480 + }, + { + "epoch": 7.143458140513252, + "grad_norm": 1.0848617553710938, + "learning_rate": 0.0002, + "loss": 0.9626, + "step": 8490 + }, + { + "epoch": 7.15187210769878, + "grad_norm": 1.5321701765060425, + "learning_rate": 0.0002, + "loss": 0.92, + "step": 8500 + }, + { + "epoch": 7.160286074884308, + "grad_norm": 1.4917421340942383, + "learning_rate": 0.0002, + "loss": 0.9787, + "step": 8510 + }, + { + "epoch": 7.168700042069836, + "grad_norm": 1.4249778985977173, + "learning_rate": 0.0002, + "loss": 0.9709, + "step": 8520 + }, + { + "epoch": 7.177114009255364, + "grad_norm": 1.5257216691970825, + "learning_rate": 0.0002, + "loss": 0.9023, + "step": 8530 + }, + { + "epoch": 7.185527976440892, + "grad_norm": 1.4094327688217163, + "learning_rate": 0.0002, + "loss": 0.9818, + "step": 8540 + }, + { + "epoch": 7.19394194362642, + "grad_norm": 1.5506917238235474, + "learning_rate": 0.0002, + "loss": 0.9676, + "step": 8550 + }, + { + "epoch": 7.2023559108119475, + "grad_norm": 1.336599588394165, + "learning_rate": 0.0002, + "loss": 1.0494, + "step": 8560 + }, + { + "epoch": 7.210769877997476, + "grad_norm": 1.2018364667892456, + "learning_rate": 0.0002, + "loss": 0.9902, + "step": 8570 + }, + { + "epoch": 7.219183845183004, + "grad_norm": 1.198525071144104, + "learning_rate": 0.0002, + "loss": 0.9329, + "step": 8580 + }, + { + "epoch": 7.227597812368532, + "grad_norm": 1.4427133798599243, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 8590 + }, + { + "epoch": 7.23601177955406, + "grad_norm": 1.3134386539459229, + "learning_rate": 0.0002, + "loss": 0.9827, + "step": 8600 + }, + { + "epoch": 7.244425746739588, + "grad_norm": 1.4141706228256226, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 8610 + }, + { + "epoch": 7.252839713925115, + "grad_norm": 1.4951153993606567, + "learning_rate": 0.0002, + "loss": 1.023, + "step": 8620 + }, + { + "epoch": 7.261253681110643, + "grad_norm": 1.383599042892456, + "learning_rate": 0.0002, + "loss": 0.9595, + "step": 8630 + }, + { + "epoch": 7.2696676482961715, + "grad_norm": 1.2315951585769653, + "learning_rate": 0.0002, + "loss": 0.9775, + "step": 8640 + }, + { + "epoch": 7.2780816154817, + "grad_norm": 1.253337025642395, + "learning_rate": 0.0002, + "loss": 0.9946, + "step": 8650 + }, + { + "epoch": 7.286495582667228, + "grad_norm": 1.2234476804733276, + "learning_rate": 0.0002, + "loss": 1.0381, + "step": 8660 + }, + { + "epoch": 7.294909549852756, + "grad_norm": 1.395650863647461, + "learning_rate": 0.0002, + "loss": 0.9774, + "step": 8670 + }, + { + "epoch": 7.303323517038283, + "grad_norm": 1.2411445379257202, + "learning_rate": 0.0002, + "loss": 0.9234, + "step": 8680 + }, + { + "epoch": 7.311737484223811, + "grad_norm": 1.22808837890625, + "learning_rate": 0.0002, + "loss": 0.975, + "step": 8690 + }, + { + "epoch": 7.320151451409339, + "grad_norm": 1.5197114944458008, + "learning_rate": 0.0002, + "loss": 0.9808, + "step": 8700 + }, + { + "epoch": 7.328565418594867, + "grad_norm": 1.3072681427001953, + "learning_rate": 0.0002, + "loss": 0.96, + "step": 8710 + }, + { + "epoch": 7.3369793857803955, + "grad_norm": 1.3035615682601929, + "learning_rate": 0.0002, + "loss": 0.9386, + "step": 8720 + }, + { + "epoch": 7.345393352965924, + "grad_norm": 1.2765713930130005, + "learning_rate": 0.0002, + "loss": 0.9666, + "step": 8730 + }, + { + "epoch": 7.353807320151452, + "grad_norm": 1.419601321220398, + "learning_rate": 0.0002, + "loss": 0.9581, + "step": 8740 + }, + { + "epoch": 7.362221287336979, + "grad_norm": 1.376158595085144, + "learning_rate": 0.0002, + "loss": 1.0378, + "step": 8750 + }, + { + "epoch": 7.370635254522507, + "grad_norm": 1.3880754709243774, + "learning_rate": 0.0002, + "loss": 0.9947, + "step": 8760 + }, + { + "epoch": 7.379049221708035, + "grad_norm": 1.2978262901306152, + "learning_rate": 0.0002, + "loss": 1.0512, + "step": 8770 + }, + { + "epoch": 7.387463188893563, + "grad_norm": 1.5811840295791626, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 8780 + }, + { + "epoch": 7.3958771560790915, + "grad_norm": 1.3790863752365112, + "learning_rate": 0.0002, + "loss": 0.9977, + "step": 8790 + }, + { + "epoch": 7.40429112326462, + "grad_norm": 1.475306510925293, + "learning_rate": 0.0002, + "loss": 1.008, + "step": 8800 + }, + { + "epoch": 7.412705090450148, + "grad_norm": 1.1038212776184082, + "learning_rate": 0.0002, + "loss": 0.9752, + "step": 8810 + }, + { + "epoch": 7.421119057635675, + "grad_norm": 1.5204451084136963, + "learning_rate": 0.0002, + "loss": 1.0048, + "step": 8820 + }, + { + "epoch": 7.429533024821203, + "grad_norm": 1.7151343822479248, + "learning_rate": 0.0002, + "loss": 1.019, + "step": 8830 + }, + { + "epoch": 7.437946992006731, + "grad_norm": 1.128046989440918, + "learning_rate": 0.0002, + "loss": 1.0038, + "step": 8840 + }, + { + "epoch": 7.446360959192259, + "grad_norm": 1.5780670642852783, + "learning_rate": 0.0002, + "loss": 1.0377, + "step": 8850 + }, + { + "epoch": 7.454774926377787, + "grad_norm": 1.3571979999542236, + "learning_rate": 0.0002, + "loss": 1.0584, + "step": 8860 + }, + { + "epoch": 7.4631888935633155, + "grad_norm": 1.2764537334442139, + "learning_rate": 0.0002, + "loss": 1.0141, + "step": 8870 + }, + { + "epoch": 7.471602860748843, + "grad_norm": 1.3429038524627686, + "learning_rate": 0.0002, + "loss": 0.9982, + "step": 8880 + }, + { + "epoch": 7.480016827934371, + "grad_norm": 1.3288369178771973, + "learning_rate": 0.0002, + "loss": 0.9671, + "step": 8890 + }, + { + "epoch": 7.488430795119899, + "grad_norm": 1.360141396522522, + "learning_rate": 0.0002, + "loss": 0.9461, + "step": 8900 + }, + { + "epoch": 7.496844762305427, + "grad_norm": 1.31229829788208, + "learning_rate": 0.0002, + "loss": 1.0278, + "step": 8910 + }, + { + "epoch": 7.505258729490955, + "grad_norm": 1.530605435371399, + "learning_rate": 0.0002, + "loss": 0.9945, + "step": 8920 + }, + { + "epoch": 7.513672696676483, + "grad_norm": 1.2880185842514038, + "learning_rate": 0.0002, + "loss": 1.0442, + "step": 8930 + }, + { + "epoch": 7.5220866638620105, + "grad_norm": 1.3219470977783203, + "learning_rate": 0.0002, + "loss": 0.9859, + "step": 8940 + }, + { + "epoch": 7.530500631047539, + "grad_norm": 1.565633773803711, + "learning_rate": 0.0002, + "loss": 1.0664, + "step": 8950 + }, + { + "epoch": 7.538914598233067, + "grad_norm": 1.4392317533493042, + "learning_rate": 0.0002, + "loss": 1.0089, + "step": 8960 + }, + { + "epoch": 7.547328565418595, + "grad_norm": 1.4557991027832031, + "learning_rate": 0.0002, + "loss": 1.0214, + "step": 8970 + }, + { + "epoch": 7.555742532604123, + "grad_norm": 1.3411110639572144, + "learning_rate": 0.0002, + "loss": 1.0247, + "step": 8980 + }, + { + "epoch": 7.564156499789651, + "grad_norm": 1.333378791809082, + "learning_rate": 0.0002, + "loss": 1.0738, + "step": 8990 + }, + { + "epoch": 7.572570466975179, + "grad_norm": 1.4422006607055664, + "learning_rate": 0.0002, + "loss": 1.0429, + "step": 9000 + }, + { + "epoch": 7.580984434160706, + "grad_norm": 1.2519633769989014, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 9010 + }, + { + "epoch": 7.589398401346235, + "grad_norm": 1.3628246784210205, + "learning_rate": 0.0002, + "loss": 1.0028, + "step": 9020 + }, + { + "epoch": 7.597812368531763, + "grad_norm": 1.35457181930542, + "learning_rate": 0.0002, + "loss": 0.9883, + "step": 9030 + }, + { + "epoch": 7.606226335717291, + "grad_norm": 1.4441956281661987, + "learning_rate": 0.0002, + "loss": 0.9929, + "step": 9040 + }, + { + "epoch": 7.614640302902819, + "grad_norm": 1.3812335729599, + "learning_rate": 0.0002, + "loss": 0.9987, + "step": 9050 + }, + { + "epoch": 7.623054270088347, + "grad_norm": 1.3576860427856445, + "learning_rate": 0.0002, + "loss": 0.9692, + "step": 9060 + }, + { + "epoch": 7.631468237273874, + "grad_norm": 1.350433588027954, + "learning_rate": 0.0002, + "loss": 1.0259, + "step": 9070 + }, + { + "epoch": 7.639882204459402, + "grad_norm": 1.3413814306259155, + "learning_rate": 0.0002, + "loss": 1.0292, + "step": 9080 + }, + { + "epoch": 7.6482961716449305, + "grad_norm": 1.2727786302566528, + "learning_rate": 0.0002, + "loss": 1.016, + "step": 9090 + }, + { + "epoch": 7.656710138830459, + "grad_norm": 1.1601275205612183, + "learning_rate": 0.0002, + "loss": 1.0046, + "step": 9100 + }, + { + "epoch": 7.665124106015987, + "grad_norm": 1.5492266416549683, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 9110 + }, + { + "epoch": 7.673538073201515, + "grad_norm": 1.4239033460617065, + "learning_rate": 0.0002, + "loss": 1.0174, + "step": 9120 + }, + { + "epoch": 7.681952040387042, + "grad_norm": 1.4212028980255127, + "learning_rate": 0.0002, + "loss": 0.9972, + "step": 9130 + }, + { + "epoch": 7.69036600757257, + "grad_norm": 1.116467833518982, + "learning_rate": 0.0002, + "loss": 1.0802, + "step": 9140 + }, + { + "epoch": 7.698779974758098, + "grad_norm": 1.299910545349121, + "learning_rate": 0.0002, + "loss": 1.0311, + "step": 9150 + }, + { + "epoch": 7.707193941943626, + "grad_norm": 1.404690146446228, + "learning_rate": 0.0002, + "loss": 1.0262, + "step": 9160 + }, + { + "epoch": 7.7156079091291545, + "grad_norm": 1.383244276046753, + "learning_rate": 0.0002, + "loss": 0.9633, + "step": 9170 + }, + { + "epoch": 7.724021876314683, + "grad_norm": 1.5001360177993774, + "learning_rate": 0.0002, + "loss": 1.0563, + "step": 9180 + }, + { + "epoch": 7.732435843500211, + "grad_norm": 1.4455186128616333, + "learning_rate": 0.0002, + "loss": 1.0731, + "step": 9190 + }, + { + "epoch": 7.740849810685738, + "grad_norm": 1.294964075088501, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 9200 + }, + { + "epoch": 7.749263777871266, + "grad_norm": 1.31305730342865, + "learning_rate": 0.0002, + "loss": 0.9649, + "step": 9210 + }, + { + "epoch": 7.757677745056794, + "grad_norm": 1.3849674463272095, + "learning_rate": 0.0002, + "loss": 0.9883, + "step": 9220 + }, + { + "epoch": 7.766091712242322, + "grad_norm": 1.6689352989196777, + "learning_rate": 0.0002, + "loss": 1.0219, + "step": 9230 + }, + { + "epoch": 7.77450567942785, + "grad_norm": 1.416099190711975, + "learning_rate": 0.0002, + "loss": 1.03, + "step": 9240 + }, + { + "epoch": 7.7829196466133785, + "grad_norm": 1.5212045907974243, + "learning_rate": 0.0002, + "loss": 1.0429, + "step": 9250 + }, + { + "epoch": 7.791333613798907, + "grad_norm": 1.3623390197753906, + "learning_rate": 0.0002, + "loss": 1.0607, + "step": 9260 + }, + { + "epoch": 7.799747580984434, + "grad_norm": 1.304148554801941, + "learning_rate": 0.0002, + "loss": 1.0469, + "step": 9270 + }, + { + "epoch": 7.808161548169962, + "grad_norm": 1.3833202123641968, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 9280 + }, + { + "epoch": 7.81657551535549, + "grad_norm": 1.3440886735916138, + "learning_rate": 0.0002, + "loss": 1.0122, + "step": 9290 + }, + { + "epoch": 7.824989482541018, + "grad_norm": 1.2798155546188354, + "learning_rate": 0.0002, + "loss": 1.0268, + "step": 9300 + }, + { + "epoch": 7.833403449726546, + "grad_norm": 1.3755156993865967, + "learning_rate": 0.0002, + "loss": 1.0521, + "step": 9310 + }, + { + "epoch": 7.841817416912074, + "grad_norm": 1.3145397901535034, + "learning_rate": 0.0002, + "loss": 1.0571, + "step": 9320 + }, + { + "epoch": 7.850231384097602, + "grad_norm": 1.6102794408798218, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 9330 + }, + { + "epoch": 7.85864535128313, + "grad_norm": 1.3959331512451172, + "learning_rate": 0.0002, + "loss": 1.1878, + "step": 9340 + }, + { + "epoch": 7.867059318468658, + "grad_norm": 1.4965628385543823, + "learning_rate": 0.0002, + "loss": 1.05, + "step": 9350 + }, + { + "epoch": 7.875473285654186, + "grad_norm": 1.194201946258545, + "learning_rate": 0.0002, + "loss": 1.085, + "step": 9360 + }, + { + "epoch": 7.883887252839714, + "grad_norm": 1.4831446409225464, + "learning_rate": 0.0002, + "loss": 1.0712, + "step": 9370 + }, + { + "epoch": 7.892301220025242, + "grad_norm": 1.3473302125930786, + "learning_rate": 0.0002, + "loss": 1.0568, + "step": 9380 + }, + { + "epoch": 7.9007151872107695, + "grad_norm": 1.4373382329940796, + "learning_rate": 0.0002, + "loss": 1.0172, + "step": 9390 + }, + { + "epoch": 7.909129154396298, + "grad_norm": 1.4341524839401245, + "learning_rate": 0.0002, + "loss": 0.9892, + "step": 9400 + }, + { + "epoch": 7.917543121581826, + "grad_norm": 1.3210171461105347, + "learning_rate": 0.0002, + "loss": 1.0428, + "step": 9410 + }, + { + "epoch": 7.925957088767354, + "grad_norm": 1.2708462476730347, + "learning_rate": 0.0002, + "loss": 1.0543, + "step": 9420 + }, + { + "epoch": 7.934371055952882, + "grad_norm": 1.4132758378982544, + "learning_rate": 0.0002, + "loss": 1.0789, + "step": 9430 + }, + { + "epoch": 7.94278502313841, + "grad_norm": 1.5193610191345215, + "learning_rate": 0.0002, + "loss": 1.095, + "step": 9440 + }, + { + "epoch": 7.951198990323938, + "grad_norm": 1.427832841873169, + "learning_rate": 0.0002, + "loss": 0.967, + "step": 9450 + }, + { + "epoch": 7.959612957509465, + "grad_norm": 1.380478024482727, + "learning_rate": 0.0002, + "loss": 1.0052, + "step": 9460 + }, + { + "epoch": 7.9680269246949935, + "grad_norm": 1.3083926439285278, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 9470 + }, + { + "epoch": 7.976440891880522, + "grad_norm": 1.3049120903015137, + "learning_rate": 0.0002, + "loss": 1.0883, + "step": 9480 + }, + { + "epoch": 7.98485485906605, + "grad_norm": 1.42048978805542, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 9490 + }, + { + "epoch": 7.993268826251578, + "grad_norm": 1.2492578029632568, + "learning_rate": 0.0002, + "loss": 1.094, + "step": 9500 + }, + { + "epoch": 7.996634413125789, + "eval_loss": 2.3033790588378906, + "eval_runtime": 37.8581, + "eval_samples_per_second": 13.603, + "eval_steps_per_second": 1.717, + "step": 9504 + } + ], + "logging_steps": 10, + "max_steps": 9504, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.3982383968210125e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1502d478cfbb1424f707352d007b740bde5e373 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-9504/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df2b79d3acefeedef5a0229881de39ec68ef9b40046a60d7976a49f7e6b3b936 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1502d478cfbb1424f707352d007b740bde5e373 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df2b79d3acefeedef5a0229881de39ec68ef9b40046a60d7976a49f7e6b3b936 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a80dc982f56f7b26632463941f9b4334525b3c9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 0.9995793016407236, "step": 1188, "epoch_duration": 1650.0460257530212, "total_accumulated_duration": 1650.0460257530212, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.56, "grad_norm": 0.5458821654319763, "learning_rate": 0.0002, "epoch": 0.008413967185527976, "step": 10}, {"loss": 2.3235, "grad_norm": 0.7293308973312378, "learning_rate": 0.0002, "epoch": 0.016827934371055953, "step": 20}, {"loss": 2.0815, "grad_norm": 0.47792306542396545, "learning_rate": 0.0002, "epoch": 0.02524190155658393, "step": 30}, {"loss": 1.9718, "grad_norm": 0.5944402813911438, "learning_rate": 0.0002, "epoch": 0.033655868742111905, "step": 40}, {"loss": 1.8848, "grad_norm": 0.5415359735488892, "learning_rate": 0.0002, "epoch": 0.04206983592763988, "step": 50}, {"loss": 1.8953, "grad_norm": 0.535713791847229, "learning_rate": 0.0002, "epoch": 0.05048380311316786, "step": 60}, {"loss": 1.937, "grad_norm": 0.5184146761894226, "learning_rate": 0.0002, "epoch": 0.058897770298695834, "step": 70}, {"loss": 1.8396, "grad_norm": 0.458926796913147, "learning_rate": 0.0002, "epoch": 0.06731173748422381, "step": 80}, {"loss": 1.8677, "grad_norm": 0.4780142307281494, "learning_rate": 0.0002, "epoch": 0.07572570466975179, "step": 90}, {"loss": 1.8593, "grad_norm": 0.79965740442276, "learning_rate": 0.0002, "epoch": 0.08413967185527976, "step": 100}, {"loss": 1.9081, "grad_norm": 0.4498862028121948, "learning_rate": 0.0002, "epoch": 0.09255363904080774, "step": 110}, {"loss": 1.8503, "grad_norm": 0.39338430762290955, "learning_rate": 0.0002, "epoch": 0.10096760622633572, "step": 120}, {"loss": 1.8637, "grad_norm": 0.9588953852653503, "learning_rate": 0.0002, "epoch": 0.10938157341186369, "step": 130}, {"loss": 1.8676, "grad_norm": 0.41675639152526855, "learning_rate": 0.0002, "epoch": 0.11779554059739167, "step": 140}, {"loss": 1.8904, "grad_norm": 0.44519832730293274, "learning_rate": 0.0002, "epoch": 0.12620950778291964, "step": 150}, {"loss": 1.798, "grad_norm": 0.4176260530948639, "learning_rate": 0.0002, "epoch": 0.13462347496844762, "step": 160}, {"loss": 1.8398, "grad_norm": 0.35840365290641785, "learning_rate": 0.0002, "epoch": 0.1430374421539756, "step": 170}, {"loss": 1.8666, "grad_norm": 0.3794495463371277, "learning_rate": 0.0002, "epoch": 0.15145140933950357, "step": 180}, {"loss": 1.8111, "grad_norm": 0.4563522934913635, "learning_rate": 0.0002, "epoch": 0.15986537652503155, "step": 190}, {"loss": 1.8893, "grad_norm": 0.37057486176490784, "learning_rate": 0.0002, "epoch": 0.16827934371055953, "step": 200}, {"loss": 1.7995, "grad_norm": 0.44081518054008484, "learning_rate": 0.0002, "epoch": 0.1766933108960875, "step": 210}, {"loss": 1.9048, "grad_norm": 0.46078577637672424, "learning_rate": 0.0002, "epoch": 0.18510727808161548, "step": 220}, {"loss": 1.8403, "grad_norm": 0.36132094264030457, "learning_rate": 0.0002, "epoch": 0.19352124526714345, "step": 230}, {"loss": 1.8827, "grad_norm": 0.3747289180755615, "learning_rate": 0.0002, "epoch": 0.20193521245267143, "step": 240}, {"loss": 1.8382, "grad_norm": 0.3540179133415222, "learning_rate": 0.0002, "epoch": 0.2103491796381994, "step": 250}, {"loss": 1.8196, "grad_norm": 0.3461375832557678, "learning_rate": 0.0002, "epoch": 0.21876314682372738, "step": 260}, {"loss": 1.8509, "grad_norm": 0.3436960279941559, "learning_rate": 0.0002, "epoch": 0.22717711400925536, "step": 270}, {"loss": 1.8285, "grad_norm": 0.35403719544410706, "learning_rate": 0.0002, "epoch": 0.23559108119478334, "step": 280}, {"loss": 1.8369, "grad_norm": 0.37142616510391235, "learning_rate": 0.0002, "epoch": 0.2440050483803113, "step": 290}, {"loss": 1.8044, "grad_norm": 0.3307955861091614, "learning_rate": 0.0002, "epoch": 0.2524190155658393, "step": 300}, {"loss": 1.817, "grad_norm": 0.32855314016342163, "learning_rate": 0.0002, "epoch": 0.2608329827513673, "step": 310}, {"loss": 1.7803, "grad_norm": 0.3299003839492798, "learning_rate": 0.0002, "epoch": 0.26924694993689524, "step": 320}, {"loss": 1.8129, "grad_norm": 0.44311287999153137, "learning_rate": 0.0002, "epoch": 0.27766091712242325, "step": 330}, {"loss": 1.8232, "grad_norm": 0.32989758253097534, "learning_rate": 0.0002, "epoch": 0.2860748843079512, "step": 340}, {"loss": 1.7716, "grad_norm": 0.34400200843811035, "learning_rate": 0.0002, "epoch": 0.2944888514934792, "step": 350}, {"loss": 1.7619, "grad_norm": 0.36286211013793945, "learning_rate": 0.0002, "epoch": 0.30290281867900715, "step": 360}, {"loss": 1.8025, "grad_norm": 0.406827837228775, "learning_rate": 0.0002, "epoch": 0.31131678586453515, "step": 370}, {"loss": 1.7515, "grad_norm": 0.36299195885658264, "learning_rate": 0.0002, "epoch": 0.3197307530500631, "step": 380}, {"loss": 1.837, "grad_norm": 0.3477257192134857, "learning_rate": 0.0002, "epoch": 0.3281447202355911, "step": 390}, {"loss": 1.7767, "grad_norm": 0.3730369210243225, "learning_rate": 0.0002, "epoch": 0.33655868742111905, "step": 400}, {"loss": 1.7747, "grad_norm": 0.4644559919834137, "learning_rate": 0.0002, "epoch": 0.34497265460664706, "step": 410}, {"loss": 1.7538, "grad_norm": 0.406576544046402, "learning_rate": 0.0002, "epoch": 0.353386621792175, "step": 420}, {"loss": 1.7501, "grad_norm": 0.3612699508666992, "learning_rate": 0.0002, "epoch": 0.361800588977703, "step": 430}, {"loss": 1.7473, "grad_norm": 0.3243742287158966, "learning_rate": 0.0002, "epoch": 0.37021455616323096, "step": 440}, {"loss": 1.8851, "grad_norm": 0.36671221256256104, "learning_rate": 0.0002, "epoch": 0.37862852334875896, "step": 450}, {"loss": 1.8853, "grad_norm": 0.3565002381801605, "learning_rate": 0.0002, "epoch": 0.3870424905342869, "step": 460}, {"loss": 1.8923, "grad_norm": 0.34630221128463745, "learning_rate": 0.0002, "epoch": 0.3954564577198149, "step": 470}, {"loss": 1.8234, "grad_norm": 0.3353537321090698, "learning_rate": 0.0002, "epoch": 0.40387042490534286, "step": 480}, {"loss": 1.7135, "grad_norm": 0.4015921950340271, "learning_rate": 0.0002, "epoch": 0.41228439209087087, "step": 490}, {"loss": 1.7815, "grad_norm": 0.5489419102668762, "learning_rate": 0.0002, "epoch": 0.4206983592763988, "step": 500}, {"loss": 1.7903, "grad_norm": 0.4193589985370636, "learning_rate": 0.0002, "epoch": 0.4291123264619268, "step": 510}, {"loss": 1.8416, "grad_norm": 0.3418922424316406, "learning_rate": 0.0002, "epoch": 0.43752629364745477, "step": 520}, {"loss": 1.7982, "grad_norm": 0.32668185234069824, "learning_rate": 0.0002, "epoch": 0.44594026083298277, "step": 530}, {"loss": 1.7501, "grad_norm": 0.3094325661659241, "learning_rate": 0.0002, "epoch": 0.4543542280185107, "step": 540}, {"loss": 1.7438, "grad_norm": 0.3743017315864563, "learning_rate": 0.0002, "epoch": 0.4627681952040387, "step": 550}, {"loss": 1.8451, "grad_norm": 0.3295630216598511, "learning_rate": 0.0002, "epoch": 0.47118216238956667, "step": 560}, {"loss": 1.7529, "grad_norm": 1.6124513149261475, "learning_rate": 0.0002, "epoch": 0.4795961295750947, "step": 570}, {"loss": 1.8028, "grad_norm": 0.3245585858821869, "learning_rate": 0.0002, "epoch": 0.4880100967606226, "step": 580}, {"loss": 1.7976, "grad_norm": 0.3332934081554413, "learning_rate": 0.0002, "epoch": 0.49642406394615063, "step": 590}, {"loss": 1.7912, "grad_norm": 0.3836138844490051, "learning_rate": 0.0002, "epoch": 0.5048380311316786, "step": 600}, {"loss": 1.8347, "grad_norm": 0.32953888177871704, "learning_rate": 0.0002, "epoch": 0.5132519983172066, "step": 610}, {"loss": 1.7729, "grad_norm": 0.36291512846946716, "learning_rate": 0.0002, "epoch": 0.5216659655027346, "step": 620}, {"loss": 1.7758, "grad_norm": 0.3237783908843994, "learning_rate": 0.0002, "epoch": 0.5300799326882625, "step": 630}, {"loss": 1.8352, "grad_norm": 0.38882696628570557, "learning_rate": 0.0002, "epoch": 0.5384938998737905, "step": 640}, {"loss": 1.8624, "grad_norm": 0.37821972370147705, "learning_rate": 0.0002, "epoch": 0.5469078670593185, "step": 650}, {"loss": 1.8075, "grad_norm": 0.3556285500526428, "learning_rate": 0.0002, "epoch": 0.5553218342448465, "step": 660}, {"loss": 1.778, "grad_norm": 0.347499281167984, "learning_rate": 0.0002, "epoch": 0.5637358014303744, "step": 670}, {"loss": 1.8066, "grad_norm": 0.3176489472389221, "learning_rate": 0.0002, "epoch": 0.5721497686159024, "step": 680}, {"loss": 1.7257, "grad_norm": 0.30220088362693787, "learning_rate": 0.0002, "epoch": 0.5805637358014304, "step": 690}, {"loss": 1.8415, "grad_norm": 0.3711601793766022, "learning_rate": 0.0002, "epoch": 0.5889777029869584, "step": 700}, {"loss": 1.7906, "grad_norm": 0.3311759829521179, "learning_rate": 0.0002, "epoch": 0.5973916701724863, "step": 710}, {"loss": 1.7712, "grad_norm": 0.34824270009994507, "learning_rate": 0.0002, "epoch": 0.6058056373580143, "step": 720}, {"loss": 1.7954, "grad_norm": 0.29668381810188293, "learning_rate": 0.0002, "epoch": 0.6142196045435423, "step": 730}, {"loss": 1.8321, "grad_norm": 0.36087489128112793, "learning_rate": 0.0002, "epoch": 0.6226335717290703, "step": 740}, {"loss": 1.7956, "grad_norm": 0.31590089201927185, "learning_rate": 0.0002, "epoch": 0.6310475389145982, "step": 750}, {"loss": 1.7343, "grad_norm": 0.37632957100868225, "learning_rate": 0.0002, "epoch": 0.6394615061001262, "step": 760}, {"loss": 1.8499, "grad_norm": 0.3360748589038849, "learning_rate": 0.0002, "epoch": 0.6478754732856542, "step": 770}, {"loss": 1.8076, "grad_norm": 0.3420640528202057, "learning_rate": 0.0002, "epoch": 0.6562894404711822, "step": 780}, {"loss": 1.8353, "grad_norm": 0.5734959244728088, "learning_rate": 0.0002, "epoch": 0.6647034076567101, "step": 790}, {"loss": 1.7746, "grad_norm": 0.36440837383270264, "learning_rate": 0.0002, "epoch": 0.6731173748422381, "step": 800}, {"loss": 1.7532, "grad_norm": 0.3179708421230316, "learning_rate": 0.0002, "epoch": 0.6815313420277661, "step": 810}, {"loss": 1.7815, "grad_norm": 0.34122881293296814, "learning_rate": 0.0002, "epoch": 0.6899453092132941, "step": 820}, {"loss": 1.8167, "grad_norm": 0.31886112689971924, "learning_rate": 0.0002, "epoch": 0.698359276398822, "step": 830}, {"loss": 1.7505, "grad_norm": 0.31782326102256775, "learning_rate": 0.0002, "epoch": 0.70677324358435, "step": 840}, {"loss": 1.7588, "grad_norm": 0.36052989959716797, "learning_rate": 0.0002, "epoch": 0.715187210769878, "step": 850}, {"loss": 1.7891, "grad_norm": 0.28946155309677124, "learning_rate": 0.0002, "epoch": 0.723601177955406, "step": 860}, {"loss": 1.7923, "grad_norm": 0.3095663785934448, "learning_rate": 0.0002, "epoch": 0.7320151451409339, "step": 870}, {"loss": 1.785, "grad_norm": 0.3317491412162781, "learning_rate": 0.0002, "epoch": 0.7404291123264619, "step": 880}, {"loss": 1.7709, "grad_norm": 0.31324660778045654, "learning_rate": 0.0002, "epoch": 0.7488430795119899, "step": 890}, {"loss": 1.8753, "grad_norm": 0.3290475606918335, "learning_rate": 0.0002, "epoch": 0.7572570466975179, "step": 900}, {"loss": 1.7679, "grad_norm": 0.35690343379974365, "learning_rate": 0.0002, "epoch": 0.7656710138830458, "step": 910}, {"loss": 1.826, "grad_norm": 0.39558273553848267, "learning_rate": 0.0002, "epoch": 0.7740849810685738, "step": 920}, {"loss": 1.8722, "grad_norm": 0.34254348278045654, "learning_rate": 0.0002, "epoch": 0.7824989482541018, "step": 930}, {"loss": 1.7603, "grad_norm": 0.3560165464878082, "learning_rate": 0.0002, "epoch": 0.7909129154396298, "step": 940}, {"loss": 1.7992, "grad_norm": 0.30693164467811584, "learning_rate": 0.0002, "epoch": 0.7993268826251577, "step": 950}, {"loss": 1.8029, "grad_norm": 0.3394823372364044, "learning_rate": 0.0002, "epoch": 0.8077408498106857, "step": 960}, {"loss": 1.8105, "grad_norm": 0.3741514980792999, "learning_rate": 0.0002, "epoch": 0.8161548169962137, "step": 970}, {"loss": 1.7849, "grad_norm": 0.3655228316783905, "learning_rate": 0.0002, "epoch": 0.8245687841817417, "step": 980}, {"loss": 1.8449, "grad_norm": 0.3586033880710602, "learning_rate": 0.0002, "epoch": 0.8329827513672696, "step": 990}, {"loss": 1.7033, "grad_norm": 0.3459678888320923, "learning_rate": 0.0002, "epoch": 0.8413967185527976, "step": 1000}, {"loss": 1.8498, "grad_norm": 0.3184349834918976, "learning_rate": 0.0002, "epoch": 0.8498106857383256, "step": 1010}, {"loss": 1.7632, "grad_norm": 0.3099786043167114, "learning_rate": 0.0002, "epoch": 0.8582246529238536, "step": 1020}, {"loss": 1.8067, "grad_norm": 0.30300915241241455, "learning_rate": 0.0002, "epoch": 0.8666386201093815, "step": 1030}, {"loss": 1.7923, "grad_norm": 0.3128705620765686, "learning_rate": 0.0002, "epoch": 0.8750525872949095, "step": 1040}, {"loss": 1.8252, "grad_norm": 0.3336263597011566, "learning_rate": 0.0002, "epoch": 0.8834665544804375, "step": 1050}, {"loss": 1.8375, "grad_norm": 0.3801328241825104, "learning_rate": 0.0002, "epoch": 0.8918805216659655, "step": 1060}, {"loss": 1.7757, "grad_norm": 0.3122096359729767, "learning_rate": 0.0002, "epoch": 0.9002944888514934, "step": 1070}, {"loss": 1.8251, "grad_norm": 0.35990869998931885, "learning_rate": 0.0002, "epoch": 0.9087084560370214, "step": 1080}, {"loss": 1.7343, "grad_norm": 0.3321819305419922, "learning_rate": 0.0002, "epoch": 0.9171224232225494, "step": 1090}, {"loss": 1.7595, "grad_norm": 0.4202139377593994, "learning_rate": 0.0002, "epoch": 0.9255363904080774, "step": 1100}, {"loss": 1.8056, "grad_norm": 0.32559722661972046, "learning_rate": 0.0002, "epoch": 0.9339503575936053, "step": 1110}, {"loss": 1.812, "grad_norm": 0.3098459839820862, "learning_rate": 0.0002, "epoch": 0.9423643247791333, "step": 1120}, {"loss": 1.8252, "grad_norm": 0.33917108178138733, "learning_rate": 0.0002, "epoch": 0.9507782919646613, "step": 1130}, {"loss": 1.7709, "grad_norm": 0.4055837094783783, "learning_rate": 0.0002, "epoch": 0.9591922591501894, "step": 1140}, {"loss": 1.8259, "grad_norm": 0.32508623600006104, "learning_rate": 0.0002, "epoch": 0.9676062263357172, "step": 1150}, {"loss": 1.782, "grad_norm": 0.30150601267814636, "learning_rate": 0.0002, "epoch": 0.9760201935212452, "step": 1160}, {"loss": 1.8291, "grad_norm": 0.3042563199996948, "learning_rate": 0.0002, "epoch": 0.9844341607067733, "step": 1170}, {"loss": 1.7847, "grad_norm": 0.33254584670066833, "learning_rate": 0.0002, "epoch": 0.9928481278923013, "step": 1180}]} +{"epoch": 2.0, "step": 2377, "epoch_duration": 1658.4773199558258, "total_accumulated_duration": 3308.523345708847, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-1188", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.56, "grad_norm": 0.5458821654319763, "learning_rate": 0.0002, "epoch": 0.008413967185527976, "step": 10}, {"loss": 2.3235, "grad_norm": 0.7293308973312378, "learning_rate": 0.0002, "epoch": 0.016827934371055953, "step": 20}, {"loss": 2.0815, "grad_norm": 0.47792306542396545, "learning_rate": 0.0002, "epoch": 0.02524190155658393, "step": 30}, {"loss": 1.9718, "grad_norm": 0.5944402813911438, "learning_rate": 0.0002, "epoch": 0.033655868742111905, "step": 40}, {"loss": 1.8848, "grad_norm": 0.5415359735488892, "learning_rate": 0.0002, "epoch": 0.04206983592763988, "step": 50}, {"loss": 1.8953, "grad_norm": 0.535713791847229, "learning_rate": 0.0002, "epoch": 0.05048380311316786, "step": 60}, {"loss": 1.937, "grad_norm": 0.5184146761894226, "learning_rate": 0.0002, "epoch": 0.058897770298695834, "step": 70}, {"loss": 1.8396, "grad_norm": 0.458926796913147, "learning_rate": 0.0002, "epoch": 0.06731173748422381, "step": 80}, {"loss": 1.8677, "grad_norm": 0.4780142307281494, "learning_rate": 0.0002, "epoch": 0.07572570466975179, "step": 90}, {"loss": 1.8593, "grad_norm": 0.79965740442276, "learning_rate": 0.0002, "epoch": 0.08413967185527976, "step": 100}, {"loss": 1.9081, "grad_norm": 0.4498862028121948, "learning_rate": 0.0002, "epoch": 0.09255363904080774, "step": 110}, {"loss": 1.8503, "grad_norm": 0.39338430762290955, "learning_rate": 0.0002, "epoch": 0.10096760622633572, "step": 120}, {"loss": 1.8637, "grad_norm": 0.9588953852653503, "learning_rate": 0.0002, "epoch": 0.10938157341186369, "step": 130}, {"loss": 1.8676, "grad_norm": 0.41675639152526855, "learning_rate": 0.0002, "epoch": 0.11779554059739167, "step": 140}, {"loss": 1.8904, "grad_norm": 0.44519832730293274, "learning_rate": 0.0002, "epoch": 0.12620950778291964, "step": 150}, {"loss": 1.798, "grad_norm": 0.4176260530948639, "learning_rate": 0.0002, "epoch": 0.13462347496844762, "step": 160}, {"loss": 1.8398, "grad_norm": 0.35840365290641785, "learning_rate": 0.0002, "epoch": 0.1430374421539756, "step": 170}, {"loss": 1.8666, "grad_norm": 0.3794495463371277, "learning_rate": 0.0002, "epoch": 0.15145140933950357, "step": 180}, {"loss": 1.8111, "grad_norm": 0.4563522934913635, "learning_rate": 0.0002, "epoch": 0.15986537652503155, "step": 190}, {"loss": 1.8893, "grad_norm": 0.37057486176490784, "learning_rate": 0.0002, "epoch": 0.16827934371055953, "step": 200}, {"loss": 1.7995, "grad_norm": 0.44081518054008484, "learning_rate": 0.0002, "epoch": 0.1766933108960875, "step": 210}, {"loss": 1.9048, "grad_norm": 0.46078577637672424, "learning_rate": 0.0002, "epoch": 0.18510727808161548, "step": 220}, {"loss": 1.8403, "grad_norm": 0.36132094264030457, "learning_rate": 0.0002, "epoch": 0.19352124526714345, "step": 230}, {"loss": 1.8827, "grad_norm": 0.3747289180755615, "learning_rate": 0.0002, "epoch": 0.20193521245267143, "step": 240}, {"loss": 1.8382, "grad_norm": 0.3540179133415222, "learning_rate": 0.0002, "epoch": 0.2103491796381994, "step": 250}, {"loss": 1.8196, "grad_norm": 0.3461375832557678, "learning_rate": 0.0002, "epoch": 0.21876314682372738, "step": 260}, {"loss": 1.8509, "grad_norm": 0.3436960279941559, "learning_rate": 0.0002, "epoch": 0.22717711400925536, "step": 270}, {"loss": 1.8285, "grad_norm": 0.35403719544410706, "learning_rate": 0.0002, "epoch": 0.23559108119478334, "step": 280}, {"loss": 1.8369, "grad_norm": 0.37142616510391235, "learning_rate": 0.0002, "epoch": 0.2440050483803113, "step": 290}, {"loss": 1.8044, "grad_norm": 0.3307955861091614, "learning_rate": 0.0002, "epoch": 0.2524190155658393, "step": 300}, {"loss": 1.817, "grad_norm": 0.32855314016342163, "learning_rate": 0.0002, "epoch": 0.2608329827513673, "step": 310}, {"loss": 1.7803, "grad_norm": 0.3299003839492798, "learning_rate": 0.0002, "epoch": 0.26924694993689524, "step": 320}, {"loss": 1.8129, "grad_norm": 0.44311287999153137, "learning_rate": 0.0002, "epoch": 0.27766091712242325, "step": 330}, {"loss": 1.8232, "grad_norm": 0.32989758253097534, "learning_rate": 0.0002, "epoch": 0.2860748843079512, "step": 340}, {"loss": 1.7716, "grad_norm": 0.34400200843811035, "learning_rate": 0.0002, "epoch": 0.2944888514934792, "step": 350}, {"loss": 1.7619, "grad_norm": 0.36286211013793945, "learning_rate": 0.0002, "epoch": 0.30290281867900715, "step": 360}, {"loss": 1.8025, "grad_norm": 0.406827837228775, "learning_rate": 0.0002, "epoch": 0.31131678586453515, "step": 370}, {"loss": 1.7515, "grad_norm": 0.36299195885658264, "learning_rate": 0.0002, "epoch": 0.3197307530500631, "step": 380}, {"loss": 1.837, "grad_norm": 0.3477257192134857, "learning_rate": 0.0002, "epoch": 0.3281447202355911, "step": 390}, {"loss": 1.7767, "grad_norm": 0.3730369210243225, "learning_rate": 0.0002, "epoch": 0.33655868742111905, "step": 400}, {"loss": 1.7747, "grad_norm": 0.4644559919834137, "learning_rate": 0.0002, "epoch": 0.34497265460664706, "step": 410}, {"loss": 1.7538, "grad_norm": 0.406576544046402, "learning_rate": 0.0002, "epoch": 0.353386621792175, "step": 420}, {"loss": 1.7501, "grad_norm": 0.3612699508666992, "learning_rate": 0.0002, "epoch": 0.361800588977703, "step": 430}, {"loss": 1.7473, "grad_norm": 0.3243742287158966, "learning_rate": 0.0002, "epoch": 0.37021455616323096, "step": 440}, {"loss": 1.8851, "grad_norm": 0.36671221256256104, "learning_rate": 0.0002, "epoch": 0.37862852334875896, "step": 450}, {"loss": 1.8853, "grad_norm": 0.3565002381801605, "learning_rate": 0.0002, "epoch": 0.3870424905342869, "step": 460}, {"loss": 1.8923, "grad_norm": 0.34630221128463745, "learning_rate": 0.0002, "epoch": 0.3954564577198149, "step": 470}, {"loss": 1.8234, "grad_norm": 0.3353537321090698, "learning_rate": 0.0002, "epoch": 0.40387042490534286, "step": 480}, {"loss": 1.7135, "grad_norm": 0.4015921950340271, "learning_rate": 0.0002, "epoch": 0.41228439209087087, "step": 490}, {"loss": 1.7815, "grad_norm": 0.5489419102668762, "learning_rate": 0.0002, "epoch": 0.4206983592763988, "step": 500}, {"loss": 1.7903, "grad_norm": 0.4193589985370636, "learning_rate": 0.0002, "epoch": 0.4291123264619268, "step": 510}, {"loss": 1.8416, "grad_norm": 0.3418922424316406, "learning_rate": 0.0002, "epoch": 0.43752629364745477, "step": 520}, {"loss": 1.7982, "grad_norm": 0.32668185234069824, "learning_rate": 0.0002, "epoch": 0.44594026083298277, "step": 530}, {"loss": 1.7501, "grad_norm": 0.3094325661659241, "learning_rate": 0.0002, "epoch": 0.4543542280185107, "step": 540}, {"loss": 1.7438, "grad_norm": 0.3743017315864563, "learning_rate": 0.0002, "epoch": 0.4627681952040387, "step": 550}, {"loss": 1.8451, "grad_norm": 0.3295630216598511, "learning_rate": 0.0002, "epoch": 0.47118216238956667, "step": 560}, {"loss": 1.7529, "grad_norm": 1.6124513149261475, "learning_rate": 0.0002, "epoch": 0.4795961295750947, "step": 570}, {"loss": 1.8028, "grad_norm": 0.3245585858821869, "learning_rate": 0.0002, "epoch": 0.4880100967606226, "step": 580}, {"loss": 1.7976, "grad_norm": 0.3332934081554413, "learning_rate": 0.0002, "epoch": 0.49642406394615063, "step": 590}, {"loss": 1.7912, "grad_norm": 0.3836138844490051, "learning_rate": 0.0002, "epoch": 0.5048380311316786, "step": 600}, {"loss": 1.8347, "grad_norm": 0.32953888177871704, "learning_rate": 0.0002, "epoch": 0.5132519983172066, "step": 610}, {"loss": 1.7729, "grad_norm": 0.36291512846946716, "learning_rate": 0.0002, "epoch": 0.5216659655027346, "step": 620}, {"loss": 1.7758, "grad_norm": 0.3237783908843994, "learning_rate": 0.0002, "epoch": 0.5300799326882625, "step": 630}, {"loss": 1.8352, "grad_norm": 0.38882696628570557, "learning_rate": 0.0002, "epoch": 0.5384938998737905, "step": 640}, {"loss": 1.8624, "grad_norm": 0.37821972370147705, "learning_rate": 0.0002, "epoch": 0.5469078670593185, "step": 650}, {"loss": 1.8075, "grad_norm": 0.3556285500526428, "learning_rate": 0.0002, "epoch": 0.5553218342448465, "step": 660}, {"loss": 1.778, "grad_norm": 0.347499281167984, "learning_rate": 0.0002, "epoch": 0.5637358014303744, "step": 670}, {"loss": 1.8066, "grad_norm": 0.3176489472389221, "learning_rate": 0.0002, "epoch": 0.5721497686159024, "step": 680}, {"loss": 1.7257, "grad_norm": 0.30220088362693787, "learning_rate": 0.0002, "epoch": 0.5805637358014304, "step": 690}, {"loss": 1.8415, "grad_norm": 0.3711601793766022, "learning_rate": 0.0002, "epoch": 0.5889777029869584, "step": 700}, {"loss": 1.7906, "grad_norm": 0.3311759829521179, "learning_rate": 0.0002, "epoch": 0.5973916701724863, "step": 710}, {"loss": 1.7712, "grad_norm": 0.34824270009994507, "learning_rate": 0.0002, "epoch": 0.6058056373580143, "step": 720}, {"loss": 1.7954, "grad_norm": 0.29668381810188293, "learning_rate": 0.0002, "epoch": 0.6142196045435423, "step": 730}, {"loss": 1.8321, "grad_norm": 0.36087489128112793, "learning_rate": 0.0002, "epoch": 0.6226335717290703, "step": 740}, {"loss": 1.7956, "grad_norm": 0.31590089201927185, "learning_rate": 0.0002, "epoch": 0.6310475389145982, "step": 750}, {"loss": 1.7343, "grad_norm": 0.37632957100868225, "learning_rate": 0.0002, "epoch": 0.6394615061001262, "step": 760}, {"loss": 1.8499, "grad_norm": 0.3360748589038849, "learning_rate": 0.0002, "epoch": 0.6478754732856542, "step": 770}, {"loss": 1.8076, "grad_norm": 0.3420640528202057, "learning_rate": 0.0002, "epoch": 0.6562894404711822, "step": 780}, {"loss": 1.8353, "grad_norm": 0.5734959244728088, "learning_rate": 0.0002, "epoch": 0.6647034076567101, "step": 790}, {"loss": 1.7746, "grad_norm": 0.36440837383270264, "learning_rate": 0.0002, "epoch": 0.6731173748422381, "step": 800}, {"loss": 1.7532, "grad_norm": 0.3179708421230316, "learning_rate": 0.0002, "epoch": 0.6815313420277661, "step": 810}, {"loss": 1.7815, "grad_norm": 0.34122881293296814, "learning_rate": 0.0002, "epoch": 0.6899453092132941, "step": 820}, {"loss": 1.8167, "grad_norm": 0.31886112689971924, "learning_rate": 0.0002, "epoch": 0.698359276398822, "step": 830}, {"loss": 1.7505, "grad_norm": 0.31782326102256775, "learning_rate": 0.0002, "epoch": 0.70677324358435, "step": 840}, {"loss": 1.7588, "grad_norm": 0.36052989959716797, "learning_rate": 0.0002, "epoch": 0.715187210769878, "step": 850}, {"loss": 1.7891, "grad_norm": 0.28946155309677124, "learning_rate": 0.0002, "epoch": 0.723601177955406, "step": 860}, {"loss": 1.7923, "grad_norm": 0.3095663785934448, "learning_rate": 0.0002, "epoch": 0.7320151451409339, "step": 870}, {"loss": 1.785, "grad_norm": 0.3317491412162781, "learning_rate": 0.0002, "epoch": 0.7404291123264619, "step": 880}, {"loss": 1.7709, "grad_norm": 0.31324660778045654, "learning_rate": 0.0002, "epoch": 0.7488430795119899, "step": 890}, {"loss": 1.8753, "grad_norm": 0.3290475606918335, "learning_rate": 0.0002, "epoch": 0.7572570466975179, "step": 900}, {"loss": 1.7679, "grad_norm": 0.35690343379974365, "learning_rate": 0.0002, "epoch": 0.7656710138830458, "step": 910}, {"loss": 1.826, "grad_norm": 0.39558273553848267, "learning_rate": 0.0002, "epoch": 0.7740849810685738, "step": 920}, {"loss": 1.8722, "grad_norm": 0.34254348278045654, "learning_rate": 0.0002, "epoch": 0.7824989482541018, "step": 930}, {"loss": 1.7603, "grad_norm": 0.3560165464878082, "learning_rate": 0.0002, "epoch": 0.7909129154396298, "step": 940}, {"loss": 1.7992, "grad_norm": 0.30693164467811584, "learning_rate": 0.0002, "epoch": 0.7993268826251577, "step": 950}, {"loss": 1.8029, "grad_norm": 0.3394823372364044, "learning_rate": 0.0002, "epoch": 0.8077408498106857, "step": 960}, {"loss": 1.8105, "grad_norm": 0.3741514980792999, "learning_rate": 0.0002, "epoch": 0.8161548169962137, "step": 970}, {"loss": 1.7849, "grad_norm": 0.3655228316783905, "learning_rate": 0.0002, "epoch": 0.8245687841817417, "step": 980}, {"loss": 1.8449, "grad_norm": 0.3586033880710602, "learning_rate": 0.0002, "epoch": 0.8329827513672696, "step": 990}, {"loss": 1.7033, "grad_norm": 0.3459678888320923, "learning_rate": 0.0002, "epoch": 0.8413967185527976, "step": 1000}, {"loss": 1.8498, "grad_norm": 0.3184349834918976, "learning_rate": 0.0002, "epoch": 0.8498106857383256, "step": 1010}, {"loss": 1.7632, "grad_norm": 0.3099786043167114, "learning_rate": 0.0002, "epoch": 0.8582246529238536, "step": 1020}, {"loss": 1.8067, "grad_norm": 0.30300915241241455, "learning_rate": 0.0002, "epoch": 0.8666386201093815, "step": 1030}, {"loss": 1.7923, "grad_norm": 0.3128705620765686, "learning_rate": 0.0002, "epoch": 0.8750525872949095, "step": 1040}, {"loss": 1.8252, "grad_norm": 0.3336263597011566, "learning_rate": 0.0002, "epoch": 0.8834665544804375, "step": 1050}, {"loss": 1.8375, "grad_norm": 0.3801328241825104, "learning_rate": 0.0002, "epoch": 0.8918805216659655, "step": 1060}, {"loss": 1.7757, "grad_norm": 0.3122096359729767, "learning_rate": 0.0002, "epoch": 0.9002944888514934, "step": 1070}, {"loss": 1.8251, "grad_norm": 0.35990869998931885, "learning_rate": 0.0002, "epoch": 0.9087084560370214, "step": 1080}, {"loss": 1.7343, "grad_norm": 0.3321819305419922, "learning_rate": 0.0002, "epoch": 0.9171224232225494, "step": 1090}, {"loss": 1.7595, "grad_norm": 0.4202139377593994, "learning_rate": 0.0002, "epoch": 0.9255363904080774, "step": 1100}, {"loss": 1.8056, "grad_norm": 0.32559722661972046, "learning_rate": 0.0002, "epoch": 0.9339503575936053, "step": 1110}, {"loss": 1.812, "grad_norm": 0.3098459839820862, "learning_rate": 0.0002, "epoch": 0.9423643247791333, "step": 1120}, {"loss": 1.8252, "grad_norm": 0.33917108178138733, "learning_rate": 0.0002, "epoch": 0.9507782919646613, "step": 1130}, {"loss": 1.7709, "grad_norm": 0.4055837094783783, "learning_rate": 0.0002, "epoch": 0.9591922591501894, "step": 1140}, {"loss": 1.8259, "grad_norm": 0.32508623600006104, "learning_rate": 0.0002, "epoch": 0.9676062263357172, "step": 1150}, {"loss": 1.782, "grad_norm": 0.30150601267814636, "learning_rate": 0.0002, "epoch": 0.9760201935212452, "step": 1160}, {"loss": 1.8291, "grad_norm": 0.3042563199996948, "learning_rate": 0.0002, "epoch": 0.9844341607067733, "step": 1170}, {"loss": 1.7847, "grad_norm": 0.33254584670066833, "learning_rate": 0.0002, "epoch": 0.9928481278923013, "step": 1180}, {"eval_loss": 1.8077726364135742, "eval_runtime": 38.4359, "eval_samples_per_second": 13.399, "eval_steps_per_second": 1.691, "epoch": 0.9995793016407236, "step": 1188}, {"loss": 1.7414, "grad_norm": 0.35073035955429077, "learning_rate": 0.0002, "epoch": 1.0012620950778293, "step": 1190}, {"loss": 1.7483, "grad_norm": 0.3217269778251648, "learning_rate": 0.0002, "epoch": 1.0096760622633572, "step": 1200}, {"loss": 1.7517, "grad_norm": 0.3635033369064331, "learning_rate": 0.0002, "epoch": 1.018090029448885, "step": 1210}, {"loss": 1.6949, "grad_norm": 0.32468414306640625, "learning_rate": 0.0002, "epoch": 1.0265039966344132, "step": 1220}, {"loss": 1.711, "grad_norm": 0.3307163417339325, "learning_rate": 0.0002, "epoch": 1.034917963819941, "step": 1230}, {"loss": 1.7881, "grad_norm": 0.34381359815597534, "learning_rate": 0.0002, "epoch": 1.0433319310054692, "step": 1240}, {"loss": 1.612, "grad_norm": 0.35874804854393005, "learning_rate": 0.0002, "epoch": 1.051745898190997, "step": 1250}, {"loss": 1.7314, "grad_norm": 0.3615919351577759, "learning_rate": 0.0002, "epoch": 1.060159865376525, "step": 1260}, {"loss": 1.7517, "grad_norm": 0.32835808396339417, "learning_rate": 0.0002, "epoch": 1.068573832562053, "step": 1270}, {"loss": 1.7193, "grad_norm": 0.3876388370990753, "learning_rate": 0.0002, "epoch": 1.076987799747581, "step": 1280}, {"loss": 1.7442, "grad_norm": 0.39895930886268616, "learning_rate": 0.0002, "epoch": 1.0854017669331089, "step": 1290}, {"loss": 1.6601, "grad_norm": 0.39081698656082153, "learning_rate": 0.0002, "epoch": 1.093815734118637, "step": 1300}, {"loss": 1.7623, "grad_norm": 0.39974215626716614, "learning_rate": 0.0002, "epoch": 1.1022297013041649, "step": 1310}, {"loss": 1.7506, "grad_norm": 0.3887332081794739, "learning_rate": 0.0002, "epoch": 1.110643668489693, "step": 1320}, {"loss": 1.7381, "grad_norm": 0.36216408014297485, "learning_rate": 0.0002, "epoch": 1.1190576356752209, "step": 1330}, {"loss": 1.762, "grad_norm": 0.36979028582572937, "learning_rate": 0.0002, "epoch": 1.1274716028607488, "step": 1340}, {"loss": 1.7515, "grad_norm": 0.34052133560180664, "learning_rate": 0.0002, "epoch": 1.1358855700462769, "step": 1350}, {"loss": 1.7513, "grad_norm": 0.3467716574668884, "learning_rate": 0.0002, "epoch": 1.1442995372318048, "step": 1360}, {"loss": 1.7086, "grad_norm": 0.35528799891471863, "learning_rate": 0.0002, "epoch": 1.1527135044173327, "step": 1370}, {"loss": 1.794, "grad_norm": 0.36282262206077576, "learning_rate": 0.0002, "epoch": 1.1611274716028608, "step": 1380}, {"loss": 1.7731, "grad_norm": 0.37355899810791016, "learning_rate": 0.0002, "epoch": 1.1695414387883887, "step": 1390}, {"loss": 1.7483, "grad_norm": 0.37292736768722534, "learning_rate": 0.0002, "epoch": 1.1779554059739168, "step": 1400}, {"loss": 1.6916, "grad_norm": 0.5892812013626099, "learning_rate": 0.0002, "epoch": 1.1863693731594447, "step": 1410}, {"loss": 1.7302, "grad_norm": 0.3712292015552521, "learning_rate": 0.0002, "epoch": 1.1947833403449726, "step": 1420}, {"loss": 1.7709, "grad_norm": 0.3349577486515045, "learning_rate": 0.0002, "epoch": 1.2031973075305007, "step": 1430}, {"loss": 1.7412, "grad_norm": 0.32591062784194946, "learning_rate": 0.0002, "epoch": 1.2116112747160286, "step": 1440}, {"loss": 1.7406, "grad_norm": 0.3840635418891907, "learning_rate": 0.0002, "epoch": 1.2200252419015567, "step": 1450}, {"loss": 1.7276, "grad_norm": 0.37238365411758423, "learning_rate": 0.0002, "epoch": 1.2284392090870846, "step": 1460}, {"loss": 1.7052, "grad_norm": 0.3731217682361603, "learning_rate": 0.0002, "epoch": 1.2368531762726125, "step": 1470}, {"loss": 1.7255, "grad_norm": 0.3318967819213867, "learning_rate": 0.0002, "epoch": 1.2452671434581406, "step": 1480}, {"loss": 1.7463, "grad_norm": 0.3784034848213196, "learning_rate": 0.0002, "epoch": 1.2536811106436685, "step": 1490}, {"loss": 1.6862, "grad_norm": 0.3541383147239685, "learning_rate": 0.0002, "epoch": 1.2620950778291964, "step": 1500}, {"loss": 1.8394, "grad_norm": 0.35312485694885254, "learning_rate": 0.0002, "epoch": 1.2705090450147245, "step": 1510}, {"loss": 1.7029, "grad_norm": 0.35272929072380066, "learning_rate": 0.0002, "epoch": 1.2789230122002524, "step": 1520}, {"loss": 1.7016, "grad_norm": 0.40988272428512573, "learning_rate": 0.0002, "epoch": 1.2873369793857803, "step": 1530}, {"loss": 1.6912, "grad_norm": 0.3543946146965027, "learning_rate": 0.0002, "epoch": 1.2957509465713084, "step": 1540}, {"loss": 1.6757, "grad_norm": 0.35639145970344543, "learning_rate": 0.0002, "epoch": 1.3041649137568363, "step": 1550}, {"loss": 1.6814, "grad_norm": 0.3290826678276062, "learning_rate": 0.0002, "epoch": 1.3125788809423642, "step": 1560}, {"loss": 1.7369, "grad_norm": 0.39264336228370667, "learning_rate": 0.0002, "epoch": 1.3209928481278923, "step": 1570}, {"loss": 1.6804, "grad_norm": 0.5390415191650391, "learning_rate": 0.0002, "epoch": 1.3294068153134202, "step": 1580}, {"loss": 1.708, "grad_norm": 0.5188116431236267, "learning_rate": 0.0002, "epoch": 1.3378207824989483, "step": 1590}, {"loss": 1.6763, "grad_norm": 0.37445148825645447, "learning_rate": 0.0002, "epoch": 1.3462347496844762, "step": 1600}, {"loss": 1.7386, "grad_norm": 0.3296085298061371, "learning_rate": 0.0002, "epoch": 1.3546487168700043, "step": 1610}, {"loss": 1.8107, "grad_norm": 0.39879581332206726, "learning_rate": 0.0002, "epoch": 1.3630626840555322, "step": 1620}, {"loss": 1.6744, "grad_norm": 0.36092764139175415, "learning_rate": 0.0002, "epoch": 1.37147665124106, "step": 1630}, {"loss": 1.7144, "grad_norm": 0.37011823058128357, "learning_rate": 0.0002, "epoch": 1.3798906184265882, "step": 1640}, {"loss": 1.7396, "grad_norm": 0.40863534808158875, "learning_rate": 0.0002, "epoch": 1.3883045856121161, "step": 1650}, {"loss": 1.7901, "grad_norm": 0.337001770734787, "learning_rate": 0.0002, "epoch": 1.396718552797644, "step": 1660}, {"loss": 1.7044, "grad_norm": 0.35596707463264465, "learning_rate": 0.0002, "epoch": 1.4051325199831721, "step": 1670}, {"loss": 1.7717, "grad_norm": 0.3857671916484833, "learning_rate": 0.0002, "epoch": 1.4135464871687, "step": 1680}, {"loss": 1.7015, "grad_norm": 0.419502317905426, "learning_rate": 0.0002, "epoch": 1.421960454354228, "step": 1690}, {"loss": 1.7261, "grad_norm": 0.35459452867507935, "learning_rate": 0.0002, "epoch": 1.430374421539756, "step": 1700}, {"loss": 1.7361, "grad_norm": 0.37246978282928467, "learning_rate": 0.0002, "epoch": 1.438788388725284, "step": 1710}, {"loss": 1.6762, "grad_norm": 0.33091893792152405, "learning_rate": 0.0002, "epoch": 1.4472023559108118, "step": 1720}, {"loss": 1.7044, "grad_norm": 0.37029674649238586, "learning_rate": 0.0002, "epoch": 1.45561632309634, "step": 1730}, {"loss": 1.7117, "grad_norm": 0.374025821685791, "learning_rate": 0.0002, "epoch": 1.4640302902818678, "step": 1740}, {"loss": 1.7549, "grad_norm": 0.3416315019130707, "learning_rate": 0.0002, "epoch": 1.472444257467396, "step": 1750}, {"loss": 1.7093, "grad_norm": 0.36502841114997864, "learning_rate": 0.0002, "epoch": 1.4808582246529238, "step": 1760}, {"loss": 1.6597, "grad_norm": 0.35458803176879883, "learning_rate": 0.0002, "epoch": 1.489272191838452, "step": 1770}, {"loss": 1.675, "grad_norm": 0.4462839663028717, "learning_rate": 0.0002, "epoch": 1.4976861590239798, "step": 1780}, {"loss": 1.7267, "grad_norm": 0.34836092591285706, "learning_rate": 0.0002, "epoch": 1.5061001262095077, "step": 1790}, {"loss": 1.7295, "grad_norm": 0.3445749282836914, "learning_rate": 0.0002, "epoch": 1.5145140933950358, "step": 1800}, {"loss": 1.7386, "grad_norm": 0.36012160778045654, "learning_rate": 0.0002, "epoch": 1.5229280605805637, "step": 1810}, {"loss": 1.6594, "grad_norm": 0.4052616059780121, "learning_rate": 0.0002, "epoch": 1.5313420277660916, "step": 1820}, {"loss": 1.72, "grad_norm": 0.3966905474662781, "learning_rate": 0.0002, "epoch": 1.5397559949516197, "step": 1830}, {"loss": 1.7595, "grad_norm": 0.35028719902038574, "learning_rate": 0.0002, "epoch": 1.5481699621371476, "step": 1840}, {"loss": 1.6829, "grad_norm": 0.3936742842197418, "learning_rate": 0.0002, "epoch": 1.5565839293226755, "step": 1850}, {"loss": 1.7579, "grad_norm": 0.34473296999931335, "learning_rate": 0.0002, "epoch": 1.5649978965082036, "step": 1860}, {"loss": 1.7207, "grad_norm": 0.4328365623950958, "learning_rate": 0.0002, "epoch": 1.5734118636937318, "step": 1870}, {"loss": 1.7098, "grad_norm": 0.3566315472126007, "learning_rate": 0.0002, "epoch": 1.5818258308792594, "step": 1880}, {"loss": 1.6095, "grad_norm": 0.3301256597042084, "learning_rate": 0.0002, "epoch": 1.5902397980647875, "step": 1890}, {"loss": 1.748, "grad_norm": 0.3743041455745697, "learning_rate": 0.0002, "epoch": 1.5986537652503157, "step": 1900}, {"loss": 1.7259, "grad_norm": 0.3735344707965851, "learning_rate": 0.0002, "epoch": 1.6070677324358436, "step": 1910}, {"loss": 1.7445, "grad_norm": 0.42191144824028015, "learning_rate": 0.0002, "epoch": 1.6154816996213714, "step": 1920}, {"loss": 1.6978, "grad_norm": 0.3787207305431366, "learning_rate": 0.0002, "epoch": 1.6238956668068996, "step": 1930}, {"loss": 1.6893, "grad_norm": 0.35647350549697876, "learning_rate": 0.0002, "epoch": 1.6323096339924275, "step": 1940}, {"loss": 1.7825, "grad_norm": 0.39791446924209595, "learning_rate": 0.0002, "epoch": 1.6407236011779553, "step": 1950}, {"loss": 1.7293, "grad_norm": 0.37341275811195374, "learning_rate": 0.0002, "epoch": 1.6491375683634835, "step": 1960}, {"loss": 1.6781, "grad_norm": 0.3722686469554901, "learning_rate": 0.0002, "epoch": 1.6575515355490114, "step": 1970}, {"loss": 1.6383, "grad_norm": 0.37467387318611145, "learning_rate": 0.0002, "epoch": 1.6659655027345392, "step": 1980}, {"loss": 1.7439, "grad_norm": 0.37109461426734924, "learning_rate": 0.0002, "epoch": 1.6743794699200674, "step": 1990}, {"loss": 1.7206, "grad_norm": 0.4008837044239044, "learning_rate": 0.0002, "epoch": 1.6827934371055953, "step": 2000}, {"loss": 1.7604, "grad_norm": 0.3316999673843384, "learning_rate": 0.0002, "epoch": 1.6912074042911232, "step": 2010}, {"loss": 1.7325, "grad_norm": 0.3683805465698242, "learning_rate": 0.0002, "epoch": 1.6996213714766513, "step": 2020}, {"loss": 1.7451, "grad_norm": 0.4163658320903778, "learning_rate": 0.0002, "epoch": 1.7080353386621794, "step": 2030}, {"loss": 1.741, "grad_norm": 0.4245431125164032, "learning_rate": 0.0002, "epoch": 1.716449305847707, "step": 2040}, {"loss": 1.7184, "grad_norm": 0.36732038855552673, "learning_rate": 0.0002, "epoch": 1.7248632730332352, "step": 2050}, {"loss": 1.7031, "grad_norm": 0.34981656074523926, "learning_rate": 0.0002, "epoch": 1.7332772402187633, "step": 2060}, {"loss": 1.7545, "grad_norm": 0.38588812947273254, "learning_rate": 0.0002, "epoch": 1.7416912074042912, "step": 2070}, {"loss": 1.7728, "grad_norm": 0.39914557337760925, "learning_rate": 0.0002, "epoch": 1.750105174589819, "step": 2080}, {"loss": 1.7049, "grad_norm": 0.36068692803382874, "learning_rate": 0.0002, "epoch": 1.7585191417753472, "step": 2090}, {"loss": 1.7537, "grad_norm": 0.3983287215232849, "learning_rate": 0.0002, "epoch": 1.766933108960875, "step": 2100}, {"loss": 1.7016, "grad_norm": 0.45008400082588196, "learning_rate": 0.0002, "epoch": 1.775347076146403, "step": 2110}, {"loss": 1.7163, "grad_norm": 0.3618052303791046, "learning_rate": 0.0002, "epoch": 1.783761043331931, "step": 2120}, {"loss": 1.7335, "grad_norm": 0.38745400309562683, "learning_rate": 0.0002, "epoch": 1.792175010517459, "step": 2130}, {"loss": 1.7387, "grad_norm": 0.3413826525211334, "learning_rate": 0.0002, "epoch": 1.8005889777029869, "step": 2140}, {"loss": 1.7414, "grad_norm": 0.35983747243881226, "learning_rate": 0.0002, "epoch": 1.809002944888515, "step": 2150}, {"loss": 1.7892, "grad_norm": 0.40926849842071533, "learning_rate": 0.0002, "epoch": 1.8174169120740429, "step": 2160}, {"loss": 1.6823, "grad_norm": 0.3543093800544739, "learning_rate": 0.0002, "epoch": 1.8258308792595708, "step": 2170}, {"loss": 1.7812, "grad_norm": 0.42690935730934143, "learning_rate": 0.0002, "epoch": 1.8342448464450989, "step": 2180}, {"loss": 1.7471, "grad_norm": 0.40282756090164185, "learning_rate": 0.0002, "epoch": 1.842658813630627, "step": 2190}, {"loss": 1.7411, "grad_norm": 0.36568400263786316, "learning_rate": 0.0002, "epoch": 1.8510727808161547, "step": 2200}, {"loss": 1.7024, "grad_norm": 0.43159013986587524, "learning_rate": 0.0002, "epoch": 1.8594867480016828, "step": 2210}, {"loss": 1.7298, "grad_norm": 0.3554118573665619, "learning_rate": 0.0002, "epoch": 1.867900715187211, "step": 2220}, {"loss": 1.7157, "grad_norm": 0.43349072337150574, "learning_rate": 0.0002, "epoch": 1.8763146823727388, "step": 2230}, {"loss": 1.7302, "grad_norm": 0.36486536264419556, "learning_rate": 0.0002, "epoch": 1.8847286495582667, "step": 2240}, {"loss": 1.6901, "grad_norm": 0.39260047674179077, "learning_rate": 0.0002, "epoch": 1.8931426167437948, "step": 2250}, {"loss": 1.6691, "grad_norm": 0.3741776943206787, "learning_rate": 0.0002, "epoch": 1.9015565839293227, "step": 2260}, {"loss": 1.6931, "grad_norm": 0.3961946964263916, "learning_rate": 0.0002, "epoch": 1.9099705511148506, "step": 2270}, {"loss": 1.737, "grad_norm": 0.3659731149673462, "learning_rate": 0.0002, "epoch": 1.9183845183003787, "step": 2280}, {"loss": 1.7342, "grad_norm": 0.34744107723236084, "learning_rate": 0.0002, "epoch": 1.9267984854859066, "step": 2290}, {"loss": 1.7162, "grad_norm": 0.3607442378997803, "learning_rate": 0.0002, "epoch": 1.9352124526714345, "step": 2300}, {"loss": 1.6673, "grad_norm": 0.331464558839798, "learning_rate": 0.0002, "epoch": 1.9436264198569626, "step": 2310}, {"loss": 1.7101, "grad_norm": 0.3904414474964142, "learning_rate": 0.0002, "epoch": 1.9520403870424905, "step": 2320}, {"loss": 1.7327, "grad_norm": 0.37584832310676575, "learning_rate": 0.0002, "epoch": 1.9604543542280184, "step": 2330}, {"loss": 1.7586, "grad_norm": 0.3698684275150299, "learning_rate": 0.0002, "epoch": 1.9688683214135465, "step": 2340}, {"loss": 1.7764, "grad_norm": 0.40571412444114685, "learning_rate": 0.0002, "epoch": 1.9772822885990746, "step": 2350}, {"loss": 1.744, "grad_norm": 0.40059587359428406, "learning_rate": 0.0002, "epoch": 1.9856962557846023, "step": 2360}, {"loss": 1.7033, "grad_norm": 0.4168248474597931, "learning_rate": 0.0002, "epoch": 1.9941102229701304, "step": 2370}]} +{"epoch": 2.9995793016407237, "step": 3565, "epoch_duration": 1456.9628908634186, "total_accumulated_duration": 4765.486236572266, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.56, "grad_norm": 0.5458821654319763, "learning_rate": 0.0002, "epoch": 0.008413967185527976, "step": 10}, {"loss": 2.3235, "grad_norm": 0.7293308973312378, "learning_rate": 0.0002, "epoch": 0.016827934371055953, "step": 20}, {"loss": 2.0815, "grad_norm": 0.47792306542396545, "learning_rate": 0.0002, "epoch": 0.02524190155658393, "step": 30}, {"loss": 1.9718, "grad_norm": 0.5944402813911438, "learning_rate": 0.0002, "epoch": 0.033655868742111905, "step": 40}, {"loss": 1.8848, "grad_norm": 0.5415359735488892, "learning_rate": 0.0002, "epoch": 0.04206983592763988, "step": 50}, {"loss": 1.8953, "grad_norm": 0.535713791847229, "learning_rate": 0.0002, "epoch": 0.05048380311316786, "step": 60}, {"loss": 1.937, "grad_norm": 0.5184146761894226, "learning_rate": 0.0002, "epoch": 0.058897770298695834, "step": 70}, {"loss": 1.8396, "grad_norm": 0.458926796913147, "learning_rate": 0.0002, "epoch": 0.06731173748422381, "step": 80}, {"loss": 1.8677, "grad_norm": 0.4780142307281494, "learning_rate": 0.0002, "epoch": 0.07572570466975179, "step": 90}, {"loss": 1.8593, "grad_norm": 0.79965740442276, "learning_rate": 0.0002, "epoch": 0.08413967185527976, "step": 100}, {"loss": 1.9081, "grad_norm": 0.4498862028121948, "learning_rate": 0.0002, "epoch": 0.09255363904080774, "step": 110}, {"loss": 1.8503, "grad_norm": 0.39338430762290955, "learning_rate": 0.0002, "epoch": 0.10096760622633572, "step": 120}, {"loss": 1.8637, "grad_norm": 0.9588953852653503, "learning_rate": 0.0002, "epoch": 0.10938157341186369, "step": 130}, {"loss": 1.8676, "grad_norm": 0.41675639152526855, "learning_rate": 0.0002, "epoch": 0.11779554059739167, "step": 140}, {"loss": 1.8904, "grad_norm": 0.44519832730293274, "learning_rate": 0.0002, "epoch": 0.12620950778291964, "step": 150}, {"loss": 1.798, "grad_norm": 0.4176260530948639, "learning_rate": 0.0002, "epoch": 0.13462347496844762, "step": 160}, {"loss": 1.8398, "grad_norm": 0.35840365290641785, "learning_rate": 0.0002, "epoch": 0.1430374421539756, "step": 170}, {"loss": 1.8666, "grad_norm": 0.3794495463371277, "learning_rate": 0.0002, "epoch": 0.15145140933950357, "step": 180}, {"loss": 1.8111, "grad_norm": 0.4563522934913635, "learning_rate": 0.0002, "epoch": 0.15986537652503155, "step": 190}, {"loss": 1.8893, "grad_norm": 0.37057486176490784, "learning_rate": 0.0002, "epoch": 0.16827934371055953, "step": 200}, {"loss": 1.7995, "grad_norm": 0.44081518054008484, "learning_rate": 0.0002, "epoch": 0.1766933108960875, "step": 210}, {"loss": 1.9048, "grad_norm": 0.46078577637672424, "learning_rate": 0.0002, "epoch": 0.18510727808161548, "step": 220}, {"loss": 1.8403, "grad_norm": 0.36132094264030457, "learning_rate": 0.0002, "epoch": 0.19352124526714345, "step": 230}, {"loss": 1.8827, "grad_norm": 0.3747289180755615, "learning_rate": 0.0002, "epoch": 0.20193521245267143, "step": 240}, {"loss": 1.8382, "grad_norm": 0.3540179133415222, "learning_rate": 0.0002, "epoch": 0.2103491796381994, "step": 250}, {"loss": 1.8196, "grad_norm": 0.3461375832557678, "learning_rate": 0.0002, "epoch": 0.21876314682372738, "step": 260}, {"loss": 1.8509, "grad_norm": 0.3436960279941559, "learning_rate": 0.0002, "epoch": 0.22717711400925536, "step": 270}, {"loss": 1.8285, "grad_norm": 0.35403719544410706, "learning_rate": 0.0002, "epoch": 0.23559108119478334, "step": 280}, {"loss": 1.8369, "grad_norm": 0.37142616510391235, "learning_rate": 0.0002, "epoch": 0.2440050483803113, "step": 290}, {"loss": 1.8044, "grad_norm": 0.3307955861091614, "learning_rate": 0.0002, "epoch": 0.2524190155658393, "step": 300}, {"loss": 1.817, "grad_norm": 0.32855314016342163, "learning_rate": 0.0002, "epoch": 0.2608329827513673, "step": 310}, {"loss": 1.7803, "grad_norm": 0.3299003839492798, "learning_rate": 0.0002, "epoch": 0.26924694993689524, "step": 320}, {"loss": 1.8129, "grad_norm": 0.44311287999153137, "learning_rate": 0.0002, "epoch": 0.27766091712242325, "step": 330}, {"loss": 1.8232, "grad_norm": 0.32989758253097534, "learning_rate": 0.0002, "epoch": 0.2860748843079512, "step": 340}, {"loss": 1.7716, "grad_norm": 0.34400200843811035, "learning_rate": 0.0002, "epoch": 0.2944888514934792, "step": 350}, {"loss": 1.7619, "grad_norm": 0.36286211013793945, "learning_rate": 0.0002, "epoch": 0.30290281867900715, "step": 360}, {"loss": 1.8025, "grad_norm": 0.406827837228775, "learning_rate": 0.0002, "epoch": 0.31131678586453515, "step": 370}, {"loss": 1.7515, "grad_norm": 0.36299195885658264, "learning_rate": 0.0002, "epoch": 0.3197307530500631, "step": 380}, {"loss": 1.837, "grad_norm": 0.3477257192134857, "learning_rate": 0.0002, "epoch": 0.3281447202355911, "step": 390}, {"loss": 1.7767, "grad_norm": 0.3730369210243225, "learning_rate": 0.0002, "epoch": 0.33655868742111905, "step": 400}, {"loss": 1.7747, "grad_norm": 0.4644559919834137, "learning_rate": 0.0002, "epoch": 0.34497265460664706, "step": 410}, {"loss": 1.7538, "grad_norm": 0.406576544046402, "learning_rate": 0.0002, "epoch": 0.353386621792175, "step": 420}, {"loss": 1.7501, "grad_norm": 0.3612699508666992, "learning_rate": 0.0002, "epoch": 0.361800588977703, "step": 430}, {"loss": 1.7473, "grad_norm": 0.3243742287158966, "learning_rate": 0.0002, "epoch": 0.37021455616323096, "step": 440}, {"loss": 1.8851, "grad_norm": 0.36671221256256104, "learning_rate": 0.0002, "epoch": 0.37862852334875896, "step": 450}, {"loss": 1.8853, "grad_norm": 0.3565002381801605, "learning_rate": 0.0002, "epoch": 0.3870424905342869, "step": 460}, {"loss": 1.8923, "grad_norm": 0.34630221128463745, "learning_rate": 0.0002, "epoch": 0.3954564577198149, "step": 470}, {"loss": 1.8234, "grad_norm": 0.3353537321090698, "learning_rate": 0.0002, "epoch": 0.40387042490534286, "step": 480}, {"loss": 1.7135, "grad_norm": 0.4015921950340271, "learning_rate": 0.0002, "epoch": 0.41228439209087087, "step": 490}, {"loss": 1.7815, "grad_norm": 0.5489419102668762, "learning_rate": 0.0002, "epoch": 0.4206983592763988, "step": 500}, {"loss": 1.7903, "grad_norm": 0.4193589985370636, "learning_rate": 0.0002, "epoch": 0.4291123264619268, "step": 510}, {"loss": 1.8416, "grad_norm": 0.3418922424316406, "learning_rate": 0.0002, "epoch": 0.43752629364745477, "step": 520}, {"loss": 1.7982, "grad_norm": 0.32668185234069824, "learning_rate": 0.0002, "epoch": 0.44594026083298277, "step": 530}, {"loss": 1.7501, "grad_norm": 0.3094325661659241, "learning_rate": 0.0002, "epoch": 0.4543542280185107, "step": 540}, {"loss": 1.7438, "grad_norm": 0.3743017315864563, "learning_rate": 0.0002, "epoch": 0.4627681952040387, "step": 550}, {"loss": 1.8451, "grad_norm": 0.3295630216598511, "learning_rate": 0.0002, "epoch": 0.47118216238956667, "step": 560}, {"loss": 1.7529, "grad_norm": 1.6124513149261475, "learning_rate": 0.0002, "epoch": 0.4795961295750947, "step": 570}, {"loss": 1.8028, "grad_norm": 0.3245585858821869, "learning_rate": 0.0002, "epoch": 0.4880100967606226, "step": 580}, {"loss": 1.7976, "grad_norm": 0.3332934081554413, "learning_rate": 0.0002, "epoch": 0.49642406394615063, "step": 590}, {"loss": 1.7912, "grad_norm": 0.3836138844490051, "learning_rate": 0.0002, "epoch": 0.5048380311316786, "step": 600}, {"loss": 1.8347, "grad_norm": 0.32953888177871704, "learning_rate": 0.0002, "epoch": 0.5132519983172066, "step": 610}, {"loss": 1.7729, "grad_norm": 0.36291512846946716, "learning_rate": 0.0002, "epoch": 0.5216659655027346, "step": 620}, {"loss": 1.7758, "grad_norm": 0.3237783908843994, "learning_rate": 0.0002, "epoch": 0.5300799326882625, "step": 630}, {"loss": 1.8352, "grad_norm": 0.38882696628570557, "learning_rate": 0.0002, "epoch": 0.5384938998737905, "step": 640}, {"loss": 1.8624, "grad_norm": 0.37821972370147705, "learning_rate": 0.0002, "epoch": 0.5469078670593185, "step": 650}, {"loss": 1.8075, "grad_norm": 0.3556285500526428, "learning_rate": 0.0002, "epoch": 0.5553218342448465, "step": 660}, {"loss": 1.778, "grad_norm": 0.347499281167984, "learning_rate": 0.0002, "epoch": 0.5637358014303744, "step": 670}, {"loss": 1.8066, "grad_norm": 0.3176489472389221, "learning_rate": 0.0002, "epoch": 0.5721497686159024, "step": 680}, {"loss": 1.7257, "grad_norm": 0.30220088362693787, "learning_rate": 0.0002, "epoch": 0.5805637358014304, "step": 690}, {"loss": 1.8415, "grad_norm": 0.3711601793766022, "learning_rate": 0.0002, "epoch": 0.5889777029869584, "step": 700}, {"loss": 1.7906, "grad_norm": 0.3311759829521179, "learning_rate": 0.0002, "epoch": 0.5973916701724863, "step": 710}, {"loss": 1.7712, "grad_norm": 0.34824270009994507, "learning_rate": 0.0002, "epoch": 0.6058056373580143, "step": 720}, {"loss": 1.7954, "grad_norm": 0.29668381810188293, "learning_rate": 0.0002, "epoch": 0.6142196045435423, "step": 730}, {"loss": 1.8321, "grad_norm": 0.36087489128112793, "learning_rate": 0.0002, "epoch": 0.6226335717290703, "step": 740}, {"loss": 1.7956, "grad_norm": 0.31590089201927185, "learning_rate": 0.0002, "epoch": 0.6310475389145982, "step": 750}, {"loss": 1.7343, "grad_norm": 0.37632957100868225, "learning_rate": 0.0002, "epoch": 0.6394615061001262, "step": 760}, {"loss": 1.8499, "grad_norm": 0.3360748589038849, "learning_rate": 0.0002, "epoch": 0.6478754732856542, "step": 770}, {"loss": 1.8076, "grad_norm": 0.3420640528202057, "learning_rate": 0.0002, "epoch": 0.6562894404711822, "step": 780}, {"loss": 1.8353, "grad_norm": 0.5734959244728088, "learning_rate": 0.0002, "epoch": 0.6647034076567101, "step": 790}, {"loss": 1.7746, "grad_norm": 0.36440837383270264, "learning_rate": 0.0002, "epoch": 0.6731173748422381, "step": 800}, {"loss": 1.7532, "grad_norm": 0.3179708421230316, "learning_rate": 0.0002, "epoch": 0.6815313420277661, "step": 810}, {"loss": 1.7815, "grad_norm": 0.34122881293296814, "learning_rate": 0.0002, "epoch": 0.6899453092132941, "step": 820}, {"loss": 1.8167, "grad_norm": 0.31886112689971924, "learning_rate": 0.0002, "epoch": 0.698359276398822, "step": 830}, {"loss": 1.7505, "grad_norm": 0.31782326102256775, "learning_rate": 0.0002, "epoch": 0.70677324358435, "step": 840}, {"loss": 1.7588, "grad_norm": 0.36052989959716797, "learning_rate": 0.0002, "epoch": 0.715187210769878, "step": 850}, {"loss": 1.7891, "grad_norm": 0.28946155309677124, "learning_rate": 0.0002, "epoch": 0.723601177955406, "step": 860}, {"loss": 1.7923, "grad_norm": 0.3095663785934448, "learning_rate": 0.0002, "epoch": 0.7320151451409339, "step": 870}, {"loss": 1.785, "grad_norm": 0.3317491412162781, "learning_rate": 0.0002, "epoch": 0.7404291123264619, "step": 880}, {"loss": 1.7709, "grad_norm": 0.31324660778045654, "learning_rate": 0.0002, "epoch": 0.7488430795119899, "step": 890}, {"loss": 1.8753, "grad_norm": 0.3290475606918335, "learning_rate": 0.0002, "epoch": 0.7572570466975179, "step": 900}, {"loss": 1.7679, "grad_norm": 0.35690343379974365, "learning_rate": 0.0002, "epoch": 0.7656710138830458, "step": 910}, {"loss": 1.826, "grad_norm": 0.39558273553848267, "learning_rate": 0.0002, "epoch": 0.7740849810685738, "step": 920}, {"loss": 1.8722, "grad_norm": 0.34254348278045654, "learning_rate": 0.0002, "epoch": 0.7824989482541018, "step": 930}, {"loss": 1.7603, "grad_norm": 0.3560165464878082, "learning_rate": 0.0002, "epoch": 0.7909129154396298, "step": 940}, {"loss": 1.7992, "grad_norm": 0.30693164467811584, "learning_rate": 0.0002, "epoch": 0.7993268826251577, "step": 950}, {"loss": 1.8029, "grad_norm": 0.3394823372364044, "learning_rate": 0.0002, "epoch": 0.8077408498106857, "step": 960}, {"loss": 1.8105, "grad_norm": 0.3741514980792999, "learning_rate": 0.0002, "epoch": 0.8161548169962137, "step": 970}, {"loss": 1.7849, "grad_norm": 0.3655228316783905, "learning_rate": 0.0002, "epoch": 0.8245687841817417, "step": 980}, {"loss": 1.8449, "grad_norm": 0.3586033880710602, "learning_rate": 0.0002, "epoch": 0.8329827513672696, "step": 990}, {"loss": 1.7033, "grad_norm": 0.3459678888320923, "learning_rate": 0.0002, "epoch": 0.8413967185527976, "step": 1000}, {"loss": 1.8498, "grad_norm": 0.3184349834918976, "learning_rate": 0.0002, "epoch": 0.8498106857383256, "step": 1010}, {"loss": 1.7632, "grad_norm": 0.3099786043167114, "learning_rate": 0.0002, "epoch": 0.8582246529238536, "step": 1020}, {"loss": 1.8067, "grad_norm": 0.30300915241241455, "learning_rate": 0.0002, "epoch": 0.8666386201093815, "step": 1030}, {"loss": 1.7923, "grad_norm": 0.3128705620765686, "learning_rate": 0.0002, "epoch": 0.8750525872949095, "step": 1040}, {"loss": 1.8252, "grad_norm": 0.3336263597011566, "learning_rate": 0.0002, "epoch": 0.8834665544804375, "step": 1050}, {"loss": 1.8375, "grad_norm": 0.3801328241825104, "learning_rate": 0.0002, "epoch": 0.8918805216659655, "step": 1060}, {"loss": 1.7757, "grad_norm": 0.3122096359729767, "learning_rate": 0.0002, "epoch": 0.9002944888514934, "step": 1070}, {"loss": 1.8251, "grad_norm": 0.35990869998931885, "learning_rate": 0.0002, "epoch": 0.9087084560370214, "step": 1080}, {"loss": 1.7343, "grad_norm": 0.3321819305419922, "learning_rate": 0.0002, "epoch": 0.9171224232225494, "step": 1090}, {"loss": 1.7595, "grad_norm": 0.4202139377593994, "learning_rate": 0.0002, "epoch": 0.9255363904080774, "step": 1100}, {"loss": 1.8056, "grad_norm": 0.32559722661972046, "learning_rate": 0.0002, "epoch": 0.9339503575936053, "step": 1110}, {"loss": 1.812, "grad_norm": 0.3098459839820862, "learning_rate": 0.0002, "epoch": 0.9423643247791333, "step": 1120}, {"loss": 1.8252, "grad_norm": 0.33917108178138733, "learning_rate": 0.0002, "epoch": 0.9507782919646613, "step": 1130}, {"loss": 1.7709, "grad_norm": 0.4055837094783783, "learning_rate": 0.0002, "epoch": 0.9591922591501894, "step": 1140}, {"loss": 1.8259, "grad_norm": 0.32508623600006104, "learning_rate": 0.0002, "epoch": 0.9676062263357172, "step": 1150}, {"loss": 1.782, "grad_norm": 0.30150601267814636, "learning_rate": 0.0002, "epoch": 0.9760201935212452, "step": 1160}, {"loss": 1.8291, "grad_norm": 0.3042563199996948, "learning_rate": 0.0002, "epoch": 0.9844341607067733, "step": 1170}, {"loss": 1.7847, "grad_norm": 0.33254584670066833, "learning_rate": 0.0002, "epoch": 0.9928481278923013, "step": 1180}, {"eval_loss": 1.8077726364135742, "eval_runtime": 38.4359, "eval_samples_per_second": 13.399, "eval_steps_per_second": 1.691, "epoch": 0.9995793016407236, "step": 1188}, {"loss": 1.7414, "grad_norm": 0.35073035955429077, "learning_rate": 0.0002, "epoch": 1.0012620950778293, "step": 1190}, {"loss": 1.7483, "grad_norm": 0.3217269778251648, "learning_rate": 0.0002, "epoch": 1.0096760622633572, "step": 1200}, {"loss": 1.7517, "grad_norm": 0.3635033369064331, "learning_rate": 0.0002, "epoch": 1.018090029448885, "step": 1210}, {"loss": 1.6949, "grad_norm": 0.32468414306640625, "learning_rate": 0.0002, "epoch": 1.0265039966344132, "step": 1220}, {"loss": 1.711, "grad_norm": 0.3307163417339325, "learning_rate": 0.0002, "epoch": 1.034917963819941, "step": 1230}, {"loss": 1.7881, "grad_norm": 0.34381359815597534, "learning_rate": 0.0002, "epoch": 1.0433319310054692, "step": 1240}, {"loss": 1.612, "grad_norm": 0.35874804854393005, "learning_rate": 0.0002, "epoch": 1.051745898190997, "step": 1250}, {"loss": 1.7314, "grad_norm": 0.3615919351577759, "learning_rate": 0.0002, "epoch": 1.060159865376525, "step": 1260}, {"loss": 1.7517, "grad_norm": 0.32835808396339417, "learning_rate": 0.0002, "epoch": 1.068573832562053, "step": 1270}, {"loss": 1.7193, "grad_norm": 0.3876388370990753, "learning_rate": 0.0002, "epoch": 1.076987799747581, "step": 1280}, {"loss": 1.7442, "grad_norm": 0.39895930886268616, "learning_rate": 0.0002, "epoch": 1.0854017669331089, "step": 1290}, {"loss": 1.6601, "grad_norm": 0.39081698656082153, "learning_rate": 0.0002, "epoch": 1.093815734118637, "step": 1300}, {"loss": 1.7623, "grad_norm": 0.39974215626716614, "learning_rate": 0.0002, "epoch": 1.1022297013041649, "step": 1310}, {"loss": 1.7506, "grad_norm": 0.3887332081794739, "learning_rate": 0.0002, "epoch": 1.110643668489693, "step": 1320}, {"loss": 1.7381, "grad_norm": 0.36216408014297485, "learning_rate": 0.0002, "epoch": 1.1190576356752209, "step": 1330}, {"loss": 1.762, "grad_norm": 0.36979028582572937, "learning_rate": 0.0002, "epoch": 1.1274716028607488, "step": 1340}, {"loss": 1.7515, "grad_norm": 0.34052133560180664, "learning_rate": 0.0002, "epoch": 1.1358855700462769, "step": 1350}, {"loss": 1.7513, "grad_norm": 0.3467716574668884, "learning_rate": 0.0002, "epoch": 1.1442995372318048, "step": 1360}, {"loss": 1.7086, "grad_norm": 0.35528799891471863, "learning_rate": 0.0002, "epoch": 1.1527135044173327, "step": 1370}, {"loss": 1.794, "grad_norm": 0.36282262206077576, "learning_rate": 0.0002, "epoch": 1.1611274716028608, "step": 1380}, {"loss": 1.7731, "grad_norm": 0.37355899810791016, "learning_rate": 0.0002, "epoch": 1.1695414387883887, "step": 1390}, {"loss": 1.7483, "grad_norm": 0.37292736768722534, "learning_rate": 0.0002, "epoch": 1.1779554059739168, "step": 1400}, {"loss": 1.6916, "grad_norm": 0.5892812013626099, "learning_rate": 0.0002, "epoch": 1.1863693731594447, "step": 1410}, {"loss": 1.7302, "grad_norm": 0.3712292015552521, "learning_rate": 0.0002, "epoch": 1.1947833403449726, "step": 1420}, {"loss": 1.7709, "grad_norm": 0.3349577486515045, "learning_rate": 0.0002, "epoch": 1.2031973075305007, "step": 1430}, {"loss": 1.7412, "grad_norm": 0.32591062784194946, "learning_rate": 0.0002, "epoch": 1.2116112747160286, "step": 1440}, {"loss": 1.7406, "grad_norm": 0.3840635418891907, "learning_rate": 0.0002, "epoch": 1.2200252419015567, "step": 1450}, {"loss": 1.7276, "grad_norm": 0.37238365411758423, "learning_rate": 0.0002, "epoch": 1.2284392090870846, "step": 1460}, {"loss": 1.7052, "grad_norm": 0.3731217682361603, "learning_rate": 0.0002, "epoch": 1.2368531762726125, "step": 1470}, {"loss": 1.7255, "grad_norm": 0.3318967819213867, "learning_rate": 0.0002, "epoch": 1.2452671434581406, "step": 1480}, {"loss": 1.7463, "grad_norm": 0.3784034848213196, "learning_rate": 0.0002, "epoch": 1.2536811106436685, "step": 1490}, {"loss": 1.6862, "grad_norm": 0.3541383147239685, "learning_rate": 0.0002, "epoch": 1.2620950778291964, "step": 1500}, {"loss": 1.8394, "grad_norm": 0.35312485694885254, "learning_rate": 0.0002, "epoch": 1.2705090450147245, "step": 1510}, {"loss": 1.7029, "grad_norm": 0.35272929072380066, "learning_rate": 0.0002, "epoch": 1.2789230122002524, "step": 1520}, {"loss": 1.7016, "grad_norm": 0.40988272428512573, "learning_rate": 0.0002, "epoch": 1.2873369793857803, "step": 1530}, {"loss": 1.6912, "grad_norm": 0.3543946146965027, "learning_rate": 0.0002, "epoch": 1.2957509465713084, "step": 1540}, {"loss": 1.6757, "grad_norm": 0.35639145970344543, "learning_rate": 0.0002, "epoch": 1.3041649137568363, "step": 1550}, {"loss": 1.6814, "grad_norm": 0.3290826678276062, "learning_rate": 0.0002, "epoch": 1.3125788809423642, "step": 1560}, {"loss": 1.7369, "grad_norm": 0.39264336228370667, "learning_rate": 0.0002, "epoch": 1.3209928481278923, "step": 1570}, {"loss": 1.6804, "grad_norm": 0.5390415191650391, "learning_rate": 0.0002, "epoch": 1.3294068153134202, "step": 1580}, {"loss": 1.708, "grad_norm": 0.5188116431236267, "learning_rate": 0.0002, "epoch": 1.3378207824989483, "step": 1590}, {"loss": 1.6763, "grad_norm": 0.37445148825645447, "learning_rate": 0.0002, "epoch": 1.3462347496844762, "step": 1600}, {"loss": 1.7386, "grad_norm": 0.3296085298061371, "learning_rate": 0.0002, "epoch": 1.3546487168700043, "step": 1610}, {"loss": 1.8107, "grad_norm": 0.39879581332206726, "learning_rate": 0.0002, "epoch": 1.3630626840555322, "step": 1620}, {"loss": 1.6744, "grad_norm": 0.36092764139175415, "learning_rate": 0.0002, "epoch": 1.37147665124106, "step": 1630}, {"loss": 1.7144, "grad_norm": 0.37011823058128357, "learning_rate": 0.0002, "epoch": 1.3798906184265882, "step": 1640}, {"loss": 1.7396, "grad_norm": 0.40863534808158875, "learning_rate": 0.0002, "epoch": 1.3883045856121161, "step": 1650}, {"loss": 1.7901, "grad_norm": 0.337001770734787, "learning_rate": 0.0002, "epoch": 1.396718552797644, "step": 1660}, {"loss": 1.7044, "grad_norm": 0.35596707463264465, "learning_rate": 0.0002, "epoch": 1.4051325199831721, "step": 1670}, {"loss": 1.7717, "grad_norm": 0.3857671916484833, "learning_rate": 0.0002, "epoch": 1.4135464871687, "step": 1680}, {"loss": 1.7015, "grad_norm": 0.419502317905426, "learning_rate": 0.0002, "epoch": 1.421960454354228, "step": 1690}, {"loss": 1.7261, "grad_norm": 0.35459452867507935, "learning_rate": 0.0002, "epoch": 1.430374421539756, "step": 1700}, {"loss": 1.7361, "grad_norm": 0.37246978282928467, "learning_rate": 0.0002, "epoch": 1.438788388725284, "step": 1710}, {"loss": 1.6762, "grad_norm": 0.33091893792152405, "learning_rate": 0.0002, "epoch": 1.4472023559108118, "step": 1720}, {"loss": 1.7044, "grad_norm": 0.37029674649238586, "learning_rate": 0.0002, "epoch": 1.45561632309634, "step": 1730}, {"loss": 1.7117, "grad_norm": 0.374025821685791, "learning_rate": 0.0002, "epoch": 1.4640302902818678, "step": 1740}, {"loss": 1.7549, "grad_norm": 0.3416315019130707, "learning_rate": 0.0002, "epoch": 1.472444257467396, "step": 1750}, {"loss": 1.7093, "grad_norm": 0.36502841114997864, "learning_rate": 0.0002, "epoch": 1.4808582246529238, "step": 1760}, {"loss": 1.6597, "grad_norm": 0.35458803176879883, "learning_rate": 0.0002, "epoch": 1.489272191838452, "step": 1770}, {"loss": 1.675, "grad_norm": 0.4462839663028717, "learning_rate": 0.0002, "epoch": 1.4976861590239798, "step": 1780}, {"loss": 1.7267, "grad_norm": 0.34836092591285706, "learning_rate": 0.0002, "epoch": 1.5061001262095077, "step": 1790}, {"loss": 1.7295, "grad_norm": 0.3445749282836914, "learning_rate": 0.0002, "epoch": 1.5145140933950358, "step": 1800}, {"loss": 1.7386, "grad_norm": 0.36012160778045654, "learning_rate": 0.0002, "epoch": 1.5229280605805637, "step": 1810}, {"loss": 1.6594, "grad_norm": 0.4052616059780121, "learning_rate": 0.0002, "epoch": 1.5313420277660916, "step": 1820}, {"loss": 1.72, "grad_norm": 0.3966905474662781, "learning_rate": 0.0002, "epoch": 1.5397559949516197, "step": 1830}, {"loss": 1.7595, "grad_norm": 0.35028719902038574, "learning_rate": 0.0002, "epoch": 1.5481699621371476, "step": 1840}, {"loss": 1.6829, "grad_norm": 0.3936742842197418, "learning_rate": 0.0002, "epoch": 1.5565839293226755, "step": 1850}, {"loss": 1.7579, "grad_norm": 0.34473296999931335, "learning_rate": 0.0002, "epoch": 1.5649978965082036, "step": 1860}, {"loss": 1.7207, "grad_norm": 0.4328365623950958, "learning_rate": 0.0002, "epoch": 1.5734118636937318, "step": 1870}, {"loss": 1.7098, "grad_norm": 0.3566315472126007, "learning_rate": 0.0002, "epoch": 1.5818258308792594, "step": 1880}, {"loss": 1.6095, "grad_norm": 0.3301256597042084, "learning_rate": 0.0002, "epoch": 1.5902397980647875, "step": 1890}, {"loss": 1.748, "grad_norm": 0.3743041455745697, "learning_rate": 0.0002, "epoch": 1.5986537652503157, "step": 1900}, {"loss": 1.7259, "grad_norm": 0.3735344707965851, "learning_rate": 0.0002, "epoch": 1.6070677324358436, "step": 1910}, {"loss": 1.7445, "grad_norm": 0.42191144824028015, "learning_rate": 0.0002, "epoch": 1.6154816996213714, "step": 1920}, {"loss": 1.6978, "grad_norm": 0.3787207305431366, "learning_rate": 0.0002, "epoch": 1.6238956668068996, "step": 1930}, {"loss": 1.6893, "grad_norm": 0.35647350549697876, "learning_rate": 0.0002, "epoch": 1.6323096339924275, "step": 1940}, {"loss": 1.7825, "grad_norm": 0.39791446924209595, "learning_rate": 0.0002, "epoch": 1.6407236011779553, "step": 1950}, {"loss": 1.7293, "grad_norm": 0.37341275811195374, "learning_rate": 0.0002, "epoch": 1.6491375683634835, "step": 1960}, {"loss": 1.6781, "grad_norm": 0.3722686469554901, "learning_rate": 0.0002, "epoch": 1.6575515355490114, "step": 1970}, {"loss": 1.6383, "grad_norm": 0.37467387318611145, "learning_rate": 0.0002, "epoch": 1.6659655027345392, "step": 1980}, {"loss": 1.7439, "grad_norm": 0.37109461426734924, "learning_rate": 0.0002, "epoch": 1.6743794699200674, "step": 1990}, {"loss": 1.7206, "grad_norm": 0.4008837044239044, "learning_rate": 0.0002, "epoch": 1.6827934371055953, "step": 2000}, {"loss": 1.7604, "grad_norm": 0.3316999673843384, "learning_rate": 0.0002, "epoch": 1.6912074042911232, "step": 2010}, {"loss": 1.7325, "grad_norm": 0.3683805465698242, "learning_rate": 0.0002, "epoch": 1.6996213714766513, "step": 2020}, {"loss": 1.7451, "grad_norm": 0.4163658320903778, "learning_rate": 0.0002, "epoch": 1.7080353386621794, "step": 2030}, {"loss": 1.741, "grad_norm": 0.4245431125164032, "learning_rate": 0.0002, "epoch": 1.716449305847707, "step": 2040}, {"loss": 1.7184, "grad_norm": 0.36732038855552673, "learning_rate": 0.0002, "epoch": 1.7248632730332352, "step": 2050}, {"loss": 1.7031, "grad_norm": 0.34981656074523926, "learning_rate": 0.0002, "epoch": 1.7332772402187633, "step": 2060}, {"loss": 1.7545, "grad_norm": 0.38588812947273254, "learning_rate": 0.0002, "epoch": 1.7416912074042912, "step": 2070}, {"loss": 1.7728, "grad_norm": 0.39914557337760925, "learning_rate": 0.0002, "epoch": 1.750105174589819, "step": 2080}, {"loss": 1.7049, "grad_norm": 0.36068692803382874, "learning_rate": 0.0002, "epoch": 1.7585191417753472, "step": 2090}, {"loss": 1.7537, "grad_norm": 0.3983287215232849, "learning_rate": 0.0002, "epoch": 1.766933108960875, "step": 2100}, {"loss": 1.7016, "grad_norm": 0.45008400082588196, "learning_rate": 0.0002, "epoch": 1.775347076146403, "step": 2110}, {"loss": 1.7163, "grad_norm": 0.3618052303791046, "learning_rate": 0.0002, "epoch": 1.783761043331931, "step": 2120}, {"loss": 1.7335, "grad_norm": 0.38745400309562683, "learning_rate": 0.0002, "epoch": 1.792175010517459, "step": 2130}, {"loss": 1.7387, "grad_norm": 0.3413826525211334, "learning_rate": 0.0002, "epoch": 1.8005889777029869, "step": 2140}, {"loss": 1.7414, "grad_norm": 0.35983747243881226, "learning_rate": 0.0002, "epoch": 1.809002944888515, "step": 2150}, {"loss": 1.7892, "grad_norm": 0.40926849842071533, "learning_rate": 0.0002, "epoch": 1.8174169120740429, "step": 2160}, {"loss": 1.6823, "grad_norm": 0.3543093800544739, "learning_rate": 0.0002, "epoch": 1.8258308792595708, "step": 2170}, {"loss": 1.7812, "grad_norm": 0.42690935730934143, "learning_rate": 0.0002, "epoch": 1.8342448464450989, "step": 2180}, {"loss": 1.7471, "grad_norm": 0.40282756090164185, "learning_rate": 0.0002, "epoch": 1.842658813630627, "step": 2190}, {"loss": 1.7411, "grad_norm": 0.36568400263786316, "learning_rate": 0.0002, "epoch": 1.8510727808161547, "step": 2200}, {"loss": 1.7024, "grad_norm": 0.43159013986587524, "learning_rate": 0.0002, "epoch": 1.8594867480016828, "step": 2210}, {"loss": 1.7298, "grad_norm": 0.3554118573665619, "learning_rate": 0.0002, "epoch": 1.867900715187211, "step": 2220}, {"loss": 1.7157, "grad_norm": 0.43349072337150574, "learning_rate": 0.0002, "epoch": 1.8763146823727388, "step": 2230}, {"loss": 1.7302, "grad_norm": 0.36486536264419556, "learning_rate": 0.0002, "epoch": 1.8847286495582667, "step": 2240}, {"loss": 1.6901, "grad_norm": 0.39260047674179077, "learning_rate": 0.0002, "epoch": 1.8931426167437948, "step": 2250}, {"loss": 1.6691, "grad_norm": 0.3741776943206787, "learning_rate": 0.0002, "epoch": 1.9015565839293227, "step": 2260}, {"loss": 1.6931, "grad_norm": 0.3961946964263916, "learning_rate": 0.0002, "epoch": 1.9099705511148506, "step": 2270}, {"loss": 1.737, "grad_norm": 0.3659731149673462, "learning_rate": 0.0002, "epoch": 1.9183845183003787, "step": 2280}, {"loss": 1.7342, "grad_norm": 0.34744107723236084, "learning_rate": 0.0002, "epoch": 1.9267984854859066, "step": 2290}, {"loss": 1.7162, "grad_norm": 0.3607442378997803, "learning_rate": 0.0002, "epoch": 1.9352124526714345, "step": 2300}, {"loss": 1.6673, "grad_norm": 0.331464558839798, "learning_rate": 0.0002, "epoch": 1.9436264198569626, "step": 2310}, {"loss": 1.7101, "grad_norm": 0.3904414474964142, "learning_rate": 0.0002, "epoch": 1.9520403870424905, "step": 2320}, {"loss": 1.7327, "grad_norm": 0.37584832310676575, "learning_rate": 0.0002, "epoch": 1.9604543542280184, "step": 2330}, {"loss": 1.7586, "grad_norm": 0.3698684275150299, "learning_rate": 0.0002, "epoch": 1.9688683214135465, "step": 2340}, {"loss": 1.7764, "grad_norm": 0.40571412444114685, "learning_rate": 0.0002, "epoch": 1.9772822885990746, "step": 2350}, {"loss": 1.744, "grad_norm": 0.40059587359428406, "learning_rate": 0.0002, "epoch": 1.9856962557846023, "step": 2360}, {"loss": 1.7033, "grad_norm": 0.4168248474597931, "learning_rate": 0.0002, "epoch": 1.9941102229701304, "step": 2370}, {"eval_loss": 1.8055059909820557, "eval_runtime": 38.422, "eval_samples_per_second": 13.404, "eval_steps_per_second": 1.692, "epoch": 2.0, "step": 2377}, {"loss": 1.7673, "grad_norm": 0.35205352306365967, "learning_rate": 0.0002, "epoch": 2.0025241901556585, "step": 2380}, {"loss": 1.6556, "grad_norm": 0.3979377746582031, "learning_rate": 0.0002, "epoch": 2.010938157341186, "step": 2390}, {"loss": 1.6421, "grad_norm": 0.396491676568985, "learning_rate": 0.0002, "epoch": 2.0193521245267143, "step": 2400}, {"loss": 1.6847, "grad_norm": 0.44712209701538086, "learning_rate": 0.0002, "epoch": 2.0277660917122424, "step": 2410}, {"loss": 1.6877, "grad_norm": 0.4454420208930969, "learning_rate": 0.0002, "epoch": 2.03618005889777, "step": 2420}, {"loss": 1.6635, "grad_norm": 0.4170038402080536, "learning_rate": 0.0002, "epoch": 2.044594026083298, "step": 2430}, {"loss": 1.6512, "grad_norm": 0.4309595227241516, "learning_rate": 0.0002, "epoch": 2.0530079932688263, "step": 2440}, {"loss": 1.6223, "grad_norm": 0.4241602122783661, "learning_rate": 0.0002, "epoch": 2.0614219604543544, "step": 2450}, {"loss": 1.6162, "grad_norm": 0.4370540678501129, "learning_rate": 0.0002, "epoch": 2.069835927639882, "step": 2460}, {"loss": 1.6354, "grad_norm": 0.43985554575920105, "learning_rate": 0.0002, "epoch": 2.0782498948254102, "step": 2470}, {"loss": 1.6954, "grad_norm": 0.4158105254173279, "learning_rate": 0.0002, "epoch": 2.0866638620109383, "step": 2480}, {"loss": 1.6114, "grad_norm": 0.441549152135849, "learning_rate": 0.0002, "epoch": 2.095077829196466, "step": 2490}, {"loss": 1.5485, "grad_norm": 0.385718435049057, "learning_rate": 0.0002, "epoch": 2.103491796381994, "step": 2500}, {"loss": 1.5894, "grad_norm": 0.43146514892578125, "learning_rate": 0.0002, "epoch": 2.1119057635675222, "step": 2510}, {"loss": 1.6414, "grad_norm": 0.41663315892219543, "learning_rate": 0.0002, "epoch": 2.12031973075305, "step": 2520}, {"loss": 1.6527, "grad_norm": 0.4410698115825653, "learning_rate": 0.0002, "epoch": 2.128733697938578, "step": 2530}, {"loss": 1.6124, "grad_norm": 0.4472278952598572, "learning_rate": 0.0002, "epoch": 2.137147665124106, "step": 2540}, {"loss": 1.6257, "grad_norm": 0.3879167437553406, "learning_rate": 0.0002, "epoch": 2.145561632309634, "step": 2550}, {"loss": 1.6682, "grad_norm": 0.4212203025817871, "learning_rate": 0.0002, "epoch": 2.153975599495162, "step": 2560}, {"loss": 1.6036, "grad_norm": 0.42841723561286926, "learning_rate": 0.0002, "epoch": 2.16238956668069, "step": 2570}, {"loss": 1.5962, "grad_norm": 0.39272481203079224, "learning_rate": 0.0002, "epoch": 2.1708035338662177, "step": 2580}, {"loss": 1.681, "grad_norm": 0.4075261354446411, "learning_rate": 0.0002, "epoch": 2.179217501051746, "step": 2590}, {"loss": 1.6601, "grad_norm": 0.5358437895774841, "learning_rate": 0.0002, "epoch": 2.187631468237274, "step": 2600}, {"loss": 1.6423, "grad_norm": 0.4738350212574005, "learning_rate": 0.0002, "epoch": 2.1960454354228016, "step": 2610}, {"loss": 1.6386, "grad_norm": 0.446789026260376, "learning_rate": 0.0002, "epoch": 2.2044594026083297, "step": 2620}, {"loss": 1.6246, "grad_norm": 0.4615374505519867, "learning_rate": 0.0002, "epoch": 2.212873369793858, "step": 2630}, {"loss": 1.6205, "grad_norm": 0.46901994943618774, "learning_rate": 0.0002, "epoch": 2.221287336979386, "step": 2640}, {"loss": 1.6774, "grad_norm": 0.46267789602279663, "learning_rate": 0.0002, "epoch": 2.2297013041649136, "step": 2650}, {"loss": 1.6584, "grad_norm": 0.4383080005645752, "learning_rate": 0.0002, "epoch": 2.2381152713504417, "step": 2660}, {"loss": 1.5745, "grad_norm": 0.4070609509944916, "learning_rate": 0.0002, "epoch": 2.24652923853597, "step": 2670}, {"loss": 1.6125, "grad_norm": 0.4572339951992035, "learning_rate": 0.0002, "epoch": 2.2549432057214975, "step": 2680}, {"loss": 1.5671, "grad_norm": 0.393265038728714, "learning_rate": 0.0002, "epoch": 2.2633571729070256, "step": 2690}, {"loss": 1.6239, "grad_norm": 0.46144717931747437, "learning_rate": 0.0002, "epoch": 2.2717711400925538, "step": 2700}, {"loss": 1.5992, "grad_norm": 0.45077767968177795, "learning_rate": 0.0002, "epoch": 2.2801851072780814, "step": 2710}, {"loss": 1.6261, "grad_norm": 0.5697639584541321, "learning_rate": 0.0002, "epoch": 2.2885990744636096, "step": 2720}, {"loss": 1.6192, "grad_norm": 0.4855510890483856, "learning_rate": 0.0002, "epoch": 2.2970130416491377, "step": 2730}, {"loss": 1.7419, "grad_norm": 0.4440622627735138, "learning_rate": 0.0002, "epoch": 2.3054270088346653, "step": 2740}, {"loss": 1.6496, "grad_norm": 0.3904096782207489, "learning_rate": 0.0002, "epoch": 2.3138409760201935, "step": 2750}, {"loss": 1.5888, "grad_norm": 0.5225510597229004, "learning_rate": 0.0002, "epoch": 2.3222549432057216, "step": 2760}, {"loss": 1.6082, "grad_norm": 0.44866397976875305, "learning_rate": 0.0002, "epoch": 2.3306689103912497, "step": 2770}, {"loss": 1.6087, "grad_norm": 0.5167056322097778, "learning_rate": 0.0002, "epoch": 2.3390828775767774, "step": 2780}, {"loss": 1.6136, "grad_norm": 0.45913267135620117, "learning_rate": 0.0002, "epoch": 2.3474968447623055, "step": 2790}, {"loss": 1.6564, "grad_norm": 0.45787590742111206, "learning_rate": 0.0002, "epoch": 2.3559108119478336, "step": 2800}, {"loss": 1.6868, "grad_norm": 0.4633352756500244, "learning_rate": 0.0002, "epoch": 2.3643247791333613, "step": 2810}, {"loss": 1.6316, "grad_norm": 0.46390071511268616, "learning_rate": 0.0002, "epoch": 2.3727387463188894, "step": 2820}, {"loss": 1.6039, "grad_norm": 0.4261005222797394, "learning_rate": 0.0002, "epoch": 2.3811527135044175, "step": 2830}, {"loss": 1.6364, "grad_norm": 0.4283634424209595, "learning_rate": 0.0002, "epoch": 2.389566680689945, "step": 2840}, {"loss": 1.6382, "grad_norm": 0.4955291450023651, "learning_rate": 0.0002, "epoch": 2.3979806478754733, "step": 2850}, {"loss": 1.6173, "grad_norm": 0.4740189015865326, "learning_rate": 0.0002, "epoch": 2.4063946150610014, "step": 2860}, {"loss": 1.6403, "grad_norm": 0.4222276508808136, "learning_rate": 0.0002, "epoch": 2.414808582246529, "step": 2870}, {"loss": 1.5602, "grad_norm": 0.4982149004936218, "learning_rate": 0.0002, "epoch": 2.423222549432057, "step": 2880}, {"loss": 1.6313, "grad_norm": 0.5217409133911133, "learning_rate": 0.0002, "epoch": 2.4316365166175853, "step": 2890}, {"loss": 1.5804, "grad_norm": 0.4555884897708893, "learning_rate": 0.0002, "epoch": 2.4400504838031134, "step": 2900}, {"loss": 1.6189, "grad_norm": 0.43178579211235046, "learning_rate": 0.0002, "epoch": 2.448464450988641, "step": 2910}, {"loss": 1.6824, "grad_norm": 0.4788478910923004, "learning_rate": 0.0002, "epoch": 2.456878418174169, "step": 2920}, {"loss": 1.6829, "grad_norm": 0.43689873814582825, "learning_rate": 0.0002, "epoch": 2.465292385359697, "step": 2930}, {"loss": 1.6196, "grad_norm": 0.5115197896957397, "learning_rate": 0.0002, "epoch": 2.473706352545225, "step": 2940}, {"loss": 1.689, "grad_norm": 0.5290159583091736, "learning_rate": 0.0002, "epoch": 2.482120319730753, "step": 2950}, {"loss": 1.6499, "grad_norm": 0.46042463183403015, "learning_rate": 0.0002, "epoch": 2.490534286916281, "step": 2960}, {"loss": 1.6664, "grad_norm": 0.4359915852546692, "learning_rate": 0.0002, "epoch": 2.498948254101809, "step": 2970}, {"loss": 1.5812, "grad_norm": 0.46352964639663696, "learning_rate": 0.0002, "epoch": 2.507362221287337, "step": 2980}, {"loss": 1.6501, "grad_norm": 0.5324268341064453, "learning_rate": 0.0002, "epoch": 2.515776188472865, "step": 2990}, {"loss": 1.6115, "grad_norm": 0.5929607152938843, "learning_rate": 0.0002, "epoch": 2.5241901556583928, "step": 3000}, {"loss": 1.6772, "grad_norm": 0.4811333417892456, "learning_rate": 0.0002, "epoch": 2.532604122843921, "step": 3010}, {"loss": 1.7023, "grad_norm": 0.4662701487541199, "learning_rate": 0.0002, "epoch": 2.541018090029449, "step": 3020}, {"loss": 1.5426, "grad_norm": 0.4582270681858063, "learning_rate": 0.0002, "epoch": 2.549432057214977, "step": 3030}, {"loss": 1.6737, "grad_norm": 0.4679982662200928, "learning_rate": 0.0002, "epoch": 2.557846024400505, "step": 3040}, {"loss": 1.5442, "grad_norm": 0.4380294680595398, "learning_rate": 0.0002, "epoch": 2.566259991586033, "step": 3050}, {"loss": 1.6055, "grad_norm": 0.44295763969421387, "learning_rate": 0.0002, "epoch": 2.5746739587715606, "step": 3060}, {"loss": 1.5775, "grad_norm": 0.5131027698516846, "learning_rate": 0.0002, "epoch": 2.5830879259570887, "step": 3070}, {"loss": 1.546, "grad_norm": 0.47567516565322876, "learning_rate": 0.0002, "epoch": 2.591501893142617, "step": 3080}, {"loss": 1.5671, "grad_norm": 0.49002596735954285, "learning_rate": 0.0002, "epoch": 2.599915860328145, "step": 3090}, {"loss": 1.5445, "grad_norm": 0.44856327772140503, "learning_rate": 0.0002, "epoch": 2.6083298275136726, "step": 3100}, {"loss": 1.5797, "grad_norm": 0.4480142593383789, "learning_rate": 0.0002, "epoch": 2.6167437946992007, "step": 3110}, {"loss": 1.7132, "grad_norm": 0.4317494034767151, "learning_rate": 0.0002, "epoch": 2.6251577618847284, "step": 3120}, {"loss": 1.6321, "grad_norm": 0.42580848932266235, "learning_rate": 0.0002, "epoch": 2.6335717290702565, "step": 3130}, {"loss": 1.6483, "grad_norm": 0.4516814947128296, "learning_rate": 0.0002, "epoch": 2.6419856962557846, "step": 3140}, {"loss": 1.695, "grad_norm": 0.4438435733318329, "learning_rate": 0.0002, "epoch": 2.6503996634413127, "step": 3150}, {"loss": 1.6938, "grad_norm": 0.4385356307029724, "learning_rate": 0.0002, "epoch": 2.6588136306268404, "step": 3160}, {"loss": 1.6139, "grad_norm": 0.5064112544059753, "learning_rate": 0.0002, "epoch": 2.6672275978123685, "step": 3170}, {"loss": 1.7189, "grad_norm": 0.49163177609443665, "learning_rate": 0.0002, "epoch": 2.6756415649978966, "step": 3180}, {"loss": 1.7323, "grad_norm": 0.49339258670806885, "learning_rate": 0.0002, "epoch": 2.6840555321834243, "step": 3190}, {"loss": 1.6508, "grad_norm": 0.440950870513916, "learning_rate": 0.0002, "epoch": 2.6924694993689524, "step": 3200}, {"loss": 1.6305, "grad_norm": 0.4283970594406128, "learning_rate": 0.0002, "epoch": 2.7008834665544805, "step": 3210}, {"loss": 1.5935, "grad_norm": 0.43875712156295776, "learning_rate": 0.0002, "epoch": 2.7092974337400086, "step": 3220}, {"loss": 1.6129, "grad_norm": 0.49332964420318604, "learning_rate": 0.0002, "epoch": 2.7177114009255363, "step": 3230}, {"loss": 1.642, "grad_norm": 0.5225692391395569, "learning_rate": 0.0002, "epoch": 2.7261253681110644, "step": 3240}, {"loss": 1.6759, "grad_norm": 0.4856489300727844, "learning_rate": 0.0002, "epoch": 2.734539335296592, "step": 3250}, {"loss": 1.6463, "grad_norm": 0.46918296813964844, "learning_rate": 0.0002, "epoch": 2.74295330248212, "step": 3260}, {"loss": 1.6819, "grad_norm": 0.4802931249141693, "learning_rate": 0.0002, "epoch": 2.7513672696676483, "step": 3270}, {"loss": 1.6246, "grad_norm": 0.4485355615615845, "learning_rate": 0.0002, "epoch": 2.7597812368531764, "step": 3280}, {"loss": 1.6251, "grad_norm": 0.43944594264030457, "learning_rate": 0.0002, "epoch": 2.768195204038704, "step": 3290}, {"loss": 1.6501, "grad_norm": 0.46847742795944214, "learning_rate": 0.0002, "epoch": 2.7766091712242322, "step": 3300}, {"loss": 1.5969, "grad_norm": 0.4816027879714966, "learning_rate": 0.0002, "epoch": 2.7850231384097603, "step": 3310}, {"loss": 1.6293, "grad_norm": 0.453960120677948, "learning_rate": 0.0002, "epoch": 2.793437105595288, "step": 3320}, {"loss": 1.6429, "grad_norm": 0.4816017150878906, "learning_rate": 0.0002, "epoch": 2.801851072780816, "step": 3330}, {"loss": 1.6683, "grad_norm": 0.4461034834384918, "learning_rate": 0.0002, "epoch": 2.8102650399663442, "step": 3340}, {"loss": 1.7048, "grad_norm": 0.48821821808815, "learning_rate": 0.0002, "epoch": 2.8186790071518724, "step": 3350}, {"loss": 1.6076, "grad_norm": 0.4574853777885437, "learning_rate": 0.0002, "epoch": 2.8270929743374, "step": 3360}, {"loss": 1.6651, "grad_norm": 0.42062026262283325, "learning_rate": 0.0002, "epoch": 2.835506941522928, "step": 3370}, {"loss": 1.624, "grad_norm": 0.4499834477901459, "learning_rate": 0.0002, "epoch": 2.843920908708456, "step": 3380}, {"loss": 1.621, "grad_norm": 0.4780360758304596, "learning_rate": 0.0002, "epoch": 2.852334875893984, "step": 3390}, {"loss": 1.5882, "grad_norm": 0.45422887802124023, "learning_rate": 0.0002, "epoch": 2.860748843079512, "step": 3400}, {"loss": 1.6028, "grad_norm": 0.4590015709400177, "learning_rate": 0.0002, "epoch": 2.86916281026504, "step": 3410}, {"loss": 1.6746, "grad_norm": 0.45689624547958374, "learning_rate": 0.0002, "epoch": 2.877576777450568, "step": 3420}, {"loss": 1.6326, "grad_norm": 0.46953922510147095, "learning_rate": 0.0002, "epoch": 2.885990744636096, "step": 3430}, {"loss": 1.6015, "grad_norm": 0.4791966378688812, "learning_rate": 0.0002, "epoch": 2.8944047118216236, "step": 3440}, {"loss": 1.694, "grad_norm": 0.4842296242713928, "learning_rate": 0.0002, "epoch": 2.9028186790071517, "step": 3450}, {"loss": 1.6326, "grad_norm": 0.47219768166542053, "learning_rate": 0.0002, "epoch": 2.91123264619268, "step": 3460}, {"loss": 1.6486, "grad_norm": 0.4622127115726471, "learning_rate": 0.0002, "epoch": 2.919646613378208, "step": 3470}, {"loss": 1.6485, "grad_norm": 0.46832820773124695, "learning_rate": 0.0002, "epoch": 2.9280605805637356, "step": 3480}, {"loss": 1.6366, "grad_norm": 0.44582483172416687, "learning_rate": 0.0002, "epoch": 2.9364745477492638, "step": 3490}, {"loss": 1.6859, "grad_norm": 0.4987219274044037, "learning_rate": 0.0002, "epoch": 2.944888514934792, "step": 3500}, {"loss": 1.5991, "grad_norm": 0.43750956654548645, "learning_rate": 0.0002, "epoch": 2.9533024821203195, "step": 3510}, {"loss": 1.6236, "grad_norm": 0.49962925910949707, "learning_rate": 0.0002, "epoch": 2.9617164493058477, "step": 3520}, {"loss": 1.5859, "grad_norm": 0.5189590454101562, "learning_rate": 0.0002, "epoch": 2.9701304164913758, "step": 3530}, {"loss": 1.6688, "grad_norm": 0.391317754983902, "learning_rate": 0.0002, "epoch": 2.978544383676904, "step": 3540}, {"loss": 1.5884, "grad_norm": 0.44934695959091187, "learning_rate": 0.0002, "epoch": 2.9869583508624316, "step": 3550}, {"loss": 1.5688, "grad_norm": 0.4740142226219177, "learning_rate": 0.0002, "epoch": 2.9953723180479597, "step": 3560}]} +{"epoch": 4.0, "step": 4754, "epoch_duration": 1272.4875209331512, "total_accumulated_duration": 6037.973757505417, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.56, "grad_norm": 0.5458821654319763, "learning_rate": 0.0002, "epoch": 0.008413967185527976, "step": 10}, {"loss": 2.3235, "grad_norm": 0.7293308973312378, "learning_rate": 0.0002, "epoch": 0.016827934371055953, "step": 20}, {"loss": 2.0815, "grad_norm": 0.47792306542396545, "learning_rate": 0.0002, "epoch": 0.02524190155658393, "step": 30}, {"loss": 1.9718, "grad_norm": 0.5944402813911438, "learning_rate": 0.0002, "epoch": 0.033655868742111905, "step": 40}, {"loss": 1.8848, "grad_norm": 0.5415359735488892, "learning_rate": 0.0002, "epoch": 0.04206983592763988, "step": 50}, {"loss": 1.8953, "grad_norm": 0.535713791847229, "learning_rate": 0.0002, "epoch": 0.05048380311316786, "step": 60}, {"loss": 1.937, "grad_norm": 0.5184146761894226, "learning_rate": 0.0002, "epoch": 0.058897770298695834, "step": 70}, {"loss": 1.8396, "grad_norm": 0.458926796913147, "learning_rate": 0.0002, "epoch": 0.06731173748422381, "step": 80}, {"loss": 1.8677, "grad_norm": 0.4780142307281494, "learning_rate": 0.0002, "epoch": 0.07572570466975179, "step": 90}, {"loss": 1.8593, "grad_norm": 0.79965740442276, "learning_rate": 0.0002, "epoch": 0.08413967185527976, "step": 100}, {"loss": 1.9081, "grad_norm": 0.4498862028121948, "learning_rate": 0.0002, "epoch": 0.09255363904080774, "step": 110}, {"loss": 1.8503, "grad_norm": 0.39338430762290955, "learning_rate": 0.0002, "epoch": 0.10096760622633572, "step": 120}, {"loss": 1.8637, "grad_norm": 0.9588953852653503, "learning_rate": 0.0002, "epoch": 0.10938157341186369, "step": 130}, {"loss": 1.8676, "grad_norm": 0.41675639152526855, "learning_rate": 0.0002, "epoch": 0.11779554059739167, "step": 140}, {"loss": 1.8904, "grad_norm": 0.44519832730293274, "learning_rate": 0.0002, "epoch": 0.12620950778291964, "step": 150}, {"loss": 1.798, "grad_norm": 0.4176260530948639, "learning_rate": 0.0002, "epoch": 0.13462347496844762, "step": 160}, {"loss": 1.8398, "grad_norm": 0.35840365290641785, "learning_rate": 0.0002, "epoch": 0.1430374421539756, "step": 170}, {"loss": 1.8666, "grad_norm": 0.3794495463371277, "learning_rate": 0.0002, "epoch": 0.15145140933950357, "step": 180}, {"loss": 1.8111, "grad_norm": 0.4563522934913635, "learning_rate": 0.0002, "epoch": 0.15986537652503155, "step": 190}, {"loss": 1.8893, "grad_norm": 0.37057486176490784, "learning_rate": 0.0002, "epoch": 0.16827934371055953, "step": 200}, {"loss": 1.7995, "grad_norm": 0.44081518054008484, "learning_rate": 0.0002, "epoch": 0.1766933108960875, "step": 210}, {"loss": 1.9048, "grad_norm": 0.46078577637672424, "learning_rate": 0.0002, "epoch": 0.18510727808161548, "step": 220}, {"loss": 1.8403, "grad_norm": 0.36132094264030457, "learning_rate": 0.0002, "epoch": 0.19352124526714345, "step": 230}, {"loss": 1.8827, "grad_norm": 0.3747289180755615, "learning_rate": 0.0002, "epoch": 0.20193521245267143, "step": 240}, {"loss": 1.8382, "grad_norm": 0.3540179133415222, "learning_rate": 0.0002, "epoch": 0.2103491796381994, "step": 250}, {"loss": 1.8196, "grad_norm": 0.3461375832557678, "learning_rate": 0.0002, "epoch": 0.21876314682372738, "step": 260}, {"loss": 1.8509, "grad_norm": 0.3436960279941559, "learning_rate": 0.0002, "epoch": 0.22717711400925536, "step": 270}, {"loss": 1.8285, "grad_norm": 0.35403719544410706, "learning_rate": 0.0002, "epoch": 0.23559108119478334, "step": 280}, {"loss": 1.8369, "grad_norm": 0.37142616510391235, "learning_rate": 0.0002, "epoch": 0.2440050483803113, "step": 290}, {"loss": 1.8044, "grad_norm": 0.3307955861091614, "learning_rate": 0.0002, "epoch": 0.2524190155658393, "step": 300}, {"loss": 1.817, "grad_norm": 0.32855314016342163, "learning_rate": 0.0002, "epoch": 0.2608329827513673, "step": 310}, {"loss": 1.7803, "grad_norm": 0.3299003839492798, "learning_rate": 0.0002, "epoch": 0.26924694993689524, "step": 320}, {"loss": 1.8129, "grad_norm": 0.44311287999153137, "learning_rate": 0.0002, "epoch": 0.27766091712242325, "step": 330}, {"loss": 1.8232, "grad_norm": 0.32989758253097534, "learning_rate": 0.0002, "epoch": 0.2860748843079512, "step": 340}, {"loss": 1.7716, "grad_norm": 0.34400200843811035, "learning_rate": 0.0002, "epoch": 0.2944888514934792, "step": 350}, {"loss": 1.7619, "grad_norm": 0.36286211013793945, "learning_rate": 0.0002, "epoch": 0.30290281867900715, "step": 360}, {"loss": 1.8025, "grad_norm": 0.406827837228775, "learning_rate": 0.0002, "epoch": 0.31131678586453515, "step": 370}, {"loss": 1.7515, "grad_norm": 0.36299195885658264, "learning_rate": 0.0002, "epoch": 0.3197307530500631, "step": 380}, {"loss": 1.837, "grad_norm": 0.3477257192134857, "learning_rate": 0.0002, "epoch": 0.3281447202355911, "step": 390}, {"loss": 1.7767, "grad_norm": 0.3730369210243225, "learning_rate": 0.0002, "epoch": 0.33655868742111905, "step": 400}, {"loss": 1.7747, "grad_norm": 0.4644559919834137, "learning_rate": 0.0002, "epoch": 0.34497265460664706, "step": 410}, {"loss": 1.7538, "grad_norm": 0.406576544046402, "learning_rate": 0.0002, "epoch": 0.353386621792175, "step": 420}, {"loss": 1.7501, "grad_norm": 0.3612699508666992, "learning_rate": 0.0002, "epoch": 0.361800588977703, "step": 430}, {"loss": 1.7473, "grad_norm": 0.3243742287158966, "learning_rate": 0.0002, "epoch": 0.37021455616323096, "step": 440}, {"loss": 1.8851, "grad_norm": 0.36671221256256104, "learning_rate": 0.0002, "epoch": 0.37862852334875896, "step": 450}, {"loss": 1.8853, "grad_norm": 0.3565002381801605, "learning_rate": 0.0002, "epoch": 0.3870424905342869, "step": 460}, {"loss": 1.8923, "grad_norm": 0.34630221128463745, "learning_rate": 0.0002, "epoch": 0.3954564577198149, "step": 470}, {"loss": 1.8234, "grad_norm": 0.3353537321090698, "learning_rate": 0.0002, "epoch": 0.40387042490534286, "step": 480}, {"loss": 1.7135, "grad_norm": 0.4015921950340271, "learning_rate": 0.0002, "epoch": 0.41228439209087087, "step": 490}, {"loss": 1.7815, "grad_norm": 0.5489419102668762, "learning_rate": 0.0002, "epoch": 0.4206983592763988, "step": 500}, {"loss": 1.7903, "grad_norm": 0.4193589985370636, "learning_rate": 0.0002, "epoch": 0.4291123264619268, "step": 510}, {"loss": 1.8416, "grad_norm": 0.3418922424316406, "learning_rate": 0.0002, "epoch": 0.43752629364745477, "step": 520}, {"loss": 1.7982, "grad_norm": 0.32668185234069824, "learning_rate": 0.0002, "epoch": 0.44594026083298277, "step": 530}, {"loss": 1.7501, "grad_norm": 0.3094325661659241, "learning_rate": 0.0002, "epoch": 0.4543542280185107, "step": 540}, {"loss": 1.7438, "grad_norm": 0.3743017315864563, "learning_rate": 0.0002, "epoch": 0.4627681952040387, "step": 550}, {"loss": 1.8451, "grad_norm": 0.3295630216598511, "learning_rate": 0.0002, "epoch": 0.47118216238956667, "step": 560}, {"loss": 1.7529, "grad_norm": 1.6124513149261475, "learning_rate": 0.0002, "epoch": 0.4795961295750947, "step": 570}, {"loss": 1.8028, "grad_norm": 0.3245585858821869, "learning_rate": 0.0002, "epoch": 0.4880100967606226, "step": 580}, {"loss": 1.7976, "grad_norm": 0.3332934081554413, "learning_rate": 0.0002, "epoch": 0.49642406394615063, "step": 590}, {"loss": 1.7912, "grad_norm": 0.3836138844490051, "learning_rate": 0.0002, "epoch": 0.5048380311316786, "step": 600}, {"loss": 1.8347, "grad_norm": 0.32953888177871704, "learning_rate": 0.0002, "epoch": 0.5132519983172066, "step": 610}, {"loss": 1.7729, "grad_norm": 0.36291512846946716, "learning_rate": 0.0002, "epoch": 0.5216659655027346, "step": 620}, {"loss": 1.7758, "grad_norm": 0.3237783908843994, "learning_rate": 0.0002, "epoch": 0.5300799326882625, "step": 630}, {"loss": 1.8352, "grad_norm": 0.38882696628570557, "learning_rate": 0.0002, "epoch": 0.5384938998737905, "step": 640}, {"loss": 1.8624, "grad_norm": 0.37821972370147705, "learning_rate": 0.0002, "epoch": 0.5469078670593185, "step": 650}, {"loss": 1.8075, "grad_norm": 0.3556285500526428, "learning_rate": 0.0002, "epoch": 0.5553218342448465, "step": 660}, {"loss": 1.778, "grad_norm": 0.347499281167984, "learning_rate": 0.0002, "epoch": 0.5637358014303744, "step": 670}, {"loss": 1.8066, "grad_norm": 0.3176489472389221, "learning_rate": 0.0002, "epoch": 0.5721497686159024, "step": 680}, {"loss": 1.7257, "grad_norm": 0.30220088362693787, "learning_rate": 0.0002, "epoch": 0.5805637358014304, "step": 690}, {"loss": 1.8415, "grad_norm": 0.3711601793766022, "learning_rate": 0.0002, "epoch": 0.5889777029869584, "step": 700}, {"loss": 1.7906, "grad_norm": 0.3311759829521179, "learning_rate": 0.0002, "epoch": 0.5973916701724863, "step": 710}, {"loss": 1.7712, "grad_norm": 0.34824270009994507, "learning_rate": 0.0002, "epoch": 0.6058056373580143, "step": 720}, {"loss": 1.7954, "grad_norm": 0.29668381810188293, "learning_rate": 0.0002, "epoch": 0.6142196045435423, "step": 730}, {"loss": 1.8321, "grad_norm": 0.36087489128112793, "learning_rate": 0.0002, "epoch": 0.6226335717290703, "step": 740}, {"loss": 1.7956, "grad_norm": 0.31590089201927185, "learning_rate": 0.0002, "epoch": 0.6310475389145982, "step": 750}, {"loss": 1.7343, "grad_norm": 0.37632957100868225, "learning_rate": 0.0002, "epoch": 0.6394615061001262, "step": 760}, {"loss": 1.8499, "grad_norm": 0.3360748589038849, "learning_rate": 0.0002, "epoch": 0.6478754732856542, "step": 770}, {"loss": 1.8076, "grad_norm": 0.3420640528202057, "learning_rate": 0.0002, "epoch": 0.6562894404711822, "step": 780}, {"loss": 1.8353, "grad_norm": 0.5734959244728088, "learning_rate": 0.0002, "epoch": 0.6647034076567101, "step": 790}, {"loss": 1.7746, "grad_norm": 0.36440837383270264, "learning_rate": 0.0002, "epoch": 0.6731173748422381, "step": 800}, {"loss": 1.7532, "grad_norm": 0.3179708421230316, "learning_rate": 0.0002, "epoch": 0.6815313420277661, "step": 810}, {"loss": 1.7815, "grad_norm": 0.34122881293296814, "learning_rate": 0.0002, "epoch": 0.6899453092132941, "step": 820}, {"loss": 1.8167, "grad_norm": 0.31886112689971924, "learning_rate": 0.0002, "epoch": 0.698359276398822, "step": 830}, {"loss": 1.7505, "grad_norm": 0.31782326102256775, "learning_rate": 0.0002, "epoch": 0.70677324358435, "step": 840}, {"loss": 1.7588, "grad_norm": 0.36052989959716797, "learning_rate": 0.0002, "epoch": 0.715187210769878, "step": 850}, {"loss": 1.7891, "grad_norm": 0.28946155309677124, "learning_rate": 0.0002, "epoch": 0.723601177955406, "step": 860}, {"loss": 1.7923, "grad_norm": 0.3095663785934448, "learning_rate": 0.0002, "epoch": 0.7320151451409339, "step": 870}, {"loss": 1.785, "grad_norm": 0.3317491412162781, "learning_rate": 0.0002, "epoch": 0.7404291123264619, "step": 880}, {"loss": 1.7709, "grad_norm": 0.31324660778045654, "learning_rate": 0.0002, "epoch": 0.7488430795119899, "step": 890}, {"loss": 1.8753, "grad_norm": 0.3290475606918335, "learning_rate": 0.0002, "epoch": 0.7572570466975179, "step": 900}, {"loss": 1.7679, "grad_norm": 0.35690343379974365, "learning_rate": 0.0002, "epoch": 0.7656710138830458, "step": 910}, {"loss": 1.826, "grad_norm": 0.39558273553848267, "learning_rate": 0.0002, "epoch": 0.7740849810685738, "step": 920}, {"loss": 1.8722, "grad_norm": 0.34254348278045654, "learning_rate": 0.0002, "epoch": 0.7824989482541018, "step": 930}, {"loss": 1.7603, "grad_norm": 0.3560165464878082, "learning_rate": 0.0002, "epoch": 0.7909129154396298, "step": 940}, {"loss": 1.7992, "grad_norm": 0.30693164467811584, "learning_rate": 0.0002, "epoch": 0.7993268826251577, "step": 950}, {"loss": 1.8029, "grad_norm": 0.3394823372364044, "learning_rate": 0.0002, "epoch": 0.8077408498106857, "step": 960}, {"loss": 1.8105, "grad_norm": 0.3741514980792999, "learning_rate": 0.0002, "epoch": 0.8161548169962137, "step": 970}, {"loss": 1.7849, "grad_norm": 0.3655228316783905, "learning_rate": 0.0002, "epoch": 0.8245687841817417, "step": 980}, {"loss": 1.8449, "grad_norm": 0.3586033880710602, "learning_rate": 0.0002, "epoch": 0.8329827513672696, "step": 990}, {"loss": 1.7033, "grad_norm": 0.3459678888320923, "learning_rate": 0.0002, "epoch": 0.8413967185527976, "step": 1000}, {"loss": 1.8498, "grad_norm": 0.3184349834918976, "learning_rate": 0.0002, "epoch": 0.8498106857383256, "step": 1010}, {"loss": 1.7632, "grad_norm": 0.3099786043167114, "learning_rate": 0.0002, "epoch": 0.8582246529238536, "step": 1020}, {"loss": 1.8067, "grad_norm": 0.30300915241241455, "learning_rate": 0.0002, "epoch": 0.8666386201093815, "step": 1030}, {"loss": 1.7923, "grad_norm": 0.3128705620765686, "learning_rate": 0.0002, "epoch": 0.8750525872949095, "step": 1040}, {"loss": 1.8252, "grad_norm": 0.3336263597011566, "learning_rate": 0.0002, "epoch": 0.8834665544804375, "step": 1050}, {"loss": 1.8375, "grad_norm": 0.3801328241825104, "learning_rate": 0.0002, "epoch": 0.8918805216659655, "step": 1060}, {"loss": 1.7757, "grad_norm": 0.3122096359729767, "learning_rate": 0.0002, "epoch": 0.9002944888514934, "step": 1070}, {"loss": 1.8251, "grad_norm": 0.35990869998931885, "learning_rate": 0.0002, "epoch": 0.9087084560370214, "step": 1080}, {"loss": 1.7343, "grad_norm": 0.3321819305419922, "learning_rate": 0.0002, "epoch": 0.9171224232225494, "step": 1090}, {"loss": 1.7595, "grad_norm": 0.4202139377593994, "learning_rate": 0.0002, "epoch": 0.9255363904080774, "step": 1100}, {"loss": 1.8056, "grad_norm": 0.32559722661972046, "learning_rate": 0.0002, "epoch": 0.9339503575936053, "step": 1110}, {"loss": 1.812, "grad_norm": 0.3098459839820862, "learning_rate": 0.0002, "epoch": 0.9423643247791333, "step": 1120}, {"loss": 1.8252, "grad_norm": 0.33917108178138733, "learning_rate": 0.0002, "epoch": 0.9507782919646613, "step": 1130}, {"loss": 1.7709, "grad_norm": 0.4055837094783783, "learning_rate": 0.0002, "epoch": 0.9591922591501894, "step": 1140}, {"loss": 1.8259, "grad_norm": 0.32508623600006104, "learning_rate": 0.0002, "epoch": 0.9676062263357172, "step": 1150}, {"loss": 1.782, "grad_norm": 0.30150601267814636, "learning_rate": 0.0002, "epoch": 0.9760201935212452, "step": 1160}, {"loss": 1.8291, "grad_norm": 0.3042563199996948, "learning_rate": 0.0002, "epoch": 0.9844341607067733, "step": 1170}, {"loss": 1.7847, "grad_norm": 0.33254584670066833, "learning_rate": 0.0002, "epoch": 0.9928481278923013, "step": 1180}, {"eval_loss": 1.8077726364135742, "eval_runtime": 38.4359, "eval_samples_per_second": 13.399, "eval_steps_per_second": 1.691, "epoch": 0.9995793016407236, "step": 1188}, {"loss": 1.7414, "grad_norm": 0.35073035955429077, "learning_rate": 0.0002, "epoch": 1.0012620950778293, "step": 1190}, {"loss": 1.7483, "grad_norm": 0.3217269778251648, "learning_rate": 0.0002, "epoch": 1.0096760622633572, "step": 1200}, {"loss": 1.7517, "grad_norm": 0.3635033369064331, "learning_rate": 0.0002, "epoch": 1.018090029448885, "step": 1210}, {"loss": 1.6949, "grad_norm": 0.32468414306640625, "learning_rate": 0.0002, "epoch": 1.0265039966344132, "step": 1220}, {"loss": 1.711, "grad_norm": 0.3307163417339325, "learning_rate": 0.0002, "epoch": 1.034917963819941, "step": 1230}, {"loss": 1.7881, "grad_norm": 0.34381359815597534, "learning_rate": 0.0002, "epoch": 1.0433319310054692, "step": 1240}, {"loss": 1.612, "grad_norm": 0.35874804854393005, "learning_rate": 0.0002, "epoch": 1.051745898190997, "step": 1250}, {"loss": 1.7314, "grad_norm": 0.3615919351577759, "learning_rate": 0.0002, "epoch": 1.060159865376525, "step": 1260}, {"loss": 1.7517, "grad_norm": 0.32835808396339417, "learning_rate": 0.0002, "epoch": 1.068573832562053, "step": 1270}, {"loss": 1.7193, "grad_norm": 0.3876388370990753, "learning_rate": 0.0002, "epoch": 1.076987799747581, "step": 1280}, {"loss": 1.7442, "grad_norm": 0.39895930886268616, "learning_rate": 0.0002, "epoch": 1.0854017669331089, "step": 1290}, {"loss": 1.6601, "grad_norm": 0.39081698656082153, "learning_rate": 0.0002, "epoch": 1.093815734118637, "step": 1300}, {"loss": 1.7623, "grad_norm": 0.39974215626716614, "learning_rate": 0.0002, "epoch": 1.1022297013041649, "step": 1310}, {"loss": 1.7506, "grad_norm": 0.3887332081794739, "learning_rate": 0.0002, "epoch": 1.110643668489693, "step": 1320}, {"loss": 1.7381, "grad_norm": 0.36216408014297485, "learning_rate": 0.0002, "epoch": 1.1190576356752209, "step": 1330}, {"loss": 1.762, "grad_norm": 0.36979028582572937, "learning_rate": 0.0002, "epoch": 1.1274716028607488, "step": 1340}, {"loss": 1.7515, "grad_norm": 0.34052133560180664, "learning_rate": 0.0002, "epoch": 1.1358855700462769, "step": 1350}, {"loss": 1.7513, "grad_norm": 0.3467716574668884, "learning_rate": 0.0002, "epoch": 1.1442995372318048, "step": 1360}, {"loss": 1.7086, "grad_norm": 0.35528799891471863, "learning_rate": 0.0002, "epoch": 1.1527135044173327, "step": 1370}, {"loss": 1.794, "grad_norm": 0.36282262206077576, "learning_rate": 0.0002, "epoch": 1.1611274716028608, "step": 1380}, {"loss": 1.7731, "grad_norm": 0.37355899810791016, "learning_rate": 0.0002, "epoch": 1.1695414387883887, "step": 1390}, {"loss": 1.7483, "grad_norm": 0.37292736768722534, "learning_rate": 0.0002, "epoch": 1.1779554059739168, "step": 1400}, {"loss": 1.6916, "grad_norm": 0.5892812013626099, "learning_rate": 0.0002, "epoch": 1.1863693731594447, "step": 1410}, {"loss": 1.7302, "grad_norm": 0.3712292015552521, "learning_rate": 0.0002, "epoch": 1.1947833403449726, "step": 1420}, {"loss": 1.7709, "grad_norm": 0.3349577486515045, "learning_rate": 0.0002, "epoch": 1.2031973075305007, "step": 1430}, {"loss": 1.7412, "grad_norm": 0.32591062784194946, "learning_rate": 0.0002, "epoch": 1.2116112747160286, "step": 1440}, {"loss": 1.7406, "grad_norm": 0.3840635418891907, "learning_rate": 0.0002, "epoch": 1.2200252419015567, "step": 1450}, {"loss": 1.7276, "grad_norm": 0.37238365411758423, "learning_rate": 0.0002, "epoch": 1.2284392090870846, "step": 1460}, {"loss": 1.7052, "grad_norm": 0.3731217682361603, "learning_rate": 0.0002, "epoch": 1.2368531762726125, "step": 1470}, {"loss": 1.7255, "grad_norm": 0.3318967819213867, "learning_rate": 0.0002, "epoch": 1.2452671434581406, "step": 1480}, {"loss": 1.7463, "grad_norm": 0.3784034848213196, "learning_rate": 0.0002, "epoch": 1.2536811106436685, "step": 1490}, {"loss": 1.6862, "grad_norm": 0.3541383147239685, "learning_rate": 0.0002, "epoch": 1.2620950778291964, "step": 1500}, {"loss": 1.8394, "grad_norm": 0.35312485694885254, "learning_rate": 0.0002, "epoch": 1.2705090450147245, "step": 1510}, {"loss": 1.7029, "grad_norm": 0.35272929072380066, "learning_rate": 0.0002, "epoch": 1.2789230122002524, "step": 1520}, {"loss": 1.7016, "grad_norm": 0.40988272428512573, "learning_rate": 0.0002, "epoch": 1.2873369793857803, "step": 1530}, {"loss": 1.6912, "grad_norm": 0.3543946146965027, "learning_rate": 0.0002, "epoch": 1.2957509465713084, "step": 1540}, {"loss": 1.6757, "grad_norm": 0.35639145970344543, "learning_rate": 0.0002, "epoch": 1.3041649137568363, "step": 1550}, {"loss": 1.6814, "grad_norm": 0.3290826678276062, "learning_rate": 0.0002, "epoch": 1.3125788809423642, "step": 1560}, {"loss": 1.7369, "grad_norm": 0.39264336228370667, "learning_rate": 0.0002, "epoch": 1.3209928481278923, "step": 1570}, {"loss": 1.6804, "grad_norm": 0.5390415191650391, "learning_rate": 0.0002, "epoch": 1.3294068153134202, "step": 1580}, {"loss": 1.708, "grad_norm": 0.5188116431236267, "learning_rate": 0.0002, "epoch": 1.3378207824989483, "step": 1590}, {"loss": 1.6763, "grad_norm": 0.37445148825645447, "learning_rate": 0.0002, "epoch": 1.3462347496844762, "step": 1600}, {"loss": 1.7386, "grad_norm": 0.3296085298061371, "learning_rate": 0.0002, "epoch": 1.3546487168700043, "step": 1610}, {"loss": 1.8107, "grad_norm": 0.39879581332206726, "learning_rate": 0.0002, "epoch": 1.3630626840555322, "step": 1620}, {"loss": 1.6744, "grad_norm": 0.36092764139175415, "learning_rate": 0.0002, "epoch": 1.37147665124106, "step": 1630}, {"loss": 1.7144, "grad_norm": 0.37011823058128357, "learning_rate": 0.0002, "epoch": 1.3798906184265882, "step": 1640}, {"loss": 1.7396, "grad_norm": 0.40863534808158875, "learning_rate": 0.0002, "epoch": 1.3883045856121161, "step": 1650}, {"loss": 1.7901, "grad_norm": 0.337001770734787, "learning_rate": 0.0002, "epoch": 1.396718552797644, "step": 1660}, {"loss": 1.7044, "grad_norm": 0.35596707463264465, "learning_rate": 0.0002, "epoch": 1.4051325199831721, "step": 1670}, {"loss": 1.7717, "grad_norm": 0.3857671916484833, "learning_rate": 0.0002, "epoch": 1.4135464871687, "step": 1680}, {"loss": 1.7015, "grad_norm": 0.419502317905426, "learning_rate": 0.0002, "epoch": 1.421960454354228, "step": 1690}, {"loss": 1.7261, "grad_norm": 0.35459452867507935, "learning_rate": 0.0002, "epoch": 1.430374421539756, "step": 1700}, {"loss": 1.7361, "grad_norm": 0.37246978282928467, "learning_rate": 0.0002, "epoch": 1.438788388725284, "step": 1710}, {"loss": 1.6762, "grad_norm": 0.33091893792152405, "learning_rate": 0.0002, "epoch": 1.4472023559108118, "step": 1720}, {"loss": 1.7044, "grad_norm": 0.37029674649238586, "learning_rate": 0.0002, "epoch": 1.45561632309634, "step": 1730}, {"loss": 1.7117, "grad_norm": 0.374025821685791, "learning_rate": 0.0002, "epoch": 1.4640302902818678, "step": 1740}, {"loss": 1.7549, "grad_norm": 0.3416315019130707, "learning_rate": 0.0002, "epoch": 1.472444257467396, "step": 1750}, {"loss": 1.7093, "grad_norm": 0.36502841114997864, "learning_rate": 0.0002, "epoch": 1.4808582246529238, "step": 1760}, {"loss": 1.6597, "grad_norm": 0.35458803176879883, "learning_rate": 0.0002, "epoch": 1.489272191838452, "step": 1770}, {"loss": 1.675, "grad_norm": 0.4462839663028717, "learning_rate": 0.0002, "epoch": 1.4976861590239798, "step": 1780}, {"loss": 1.7267, "grad_norm": 0.34836092591285706, "learning_rate": 0.0002, "epoch": 1.5061001262095077, "step": 1790}, {"loss": 1.7295, "grad_norm": 0.3445749282836914, "learning_rate": 0.0002, "epoch": 1.5145140933950358, "step": 1800}, {"loss": 1.7386, "grad_norm": 0.36012160778045654, "learning_rate": 0.0002, "epoch": 1.5229280605805637, "step": 1810}, {"loss": 1.6594, "grad_norm": 0.4052616059780121, "learning_rate": 0.0002, "epoch": 1.5313420277660916, "step": 1820}, {"loss": 1.72, "grad_norm": 0.3966905474662781, "learning_rate": 0.0002, "epoch": 1.5397559949516197, "step": 1830}, {"loss": 1.7595, "grad_norm": 0.35028719902038574, "learning_rate": 0.0002, "epoch": 1.5481699621371476, "step": 1840}, {"loss": 1.6829, "grad_norm": 0.3936742842197418, "learning_rate": 0.0002, "epoch": 1.5565839293226755, "step": 1850}, {"loss": 1.7579, "grad_norm": 0.34473296999931335, "learning_rate": 0.0002, "epoch": 1.5649978965082036, "step": 1860}, {"loss": 1.7207, "grad_norm": 0.4328365623950958, "learning_rate": 0.0002, "epoch": 1.5734118636937318, "step": 1870}, {"loss": 1.7098, "grad_norm": 0.3566315472126007, "learning_rate": 0.0002, "epoch": 1.5818258308792594, "step": 1880}, {"loss": 1.6095, "grad_norm": 0.3301256597042084, "learning_rate": 0.0002, "epoch": 1.5902397980647875, "step": 1890}, {"loss": 1.748, "grad_norm": 0.3743041455745697, "learning_rate": 0.0002, "epoch": 1.5986537652503157, "step": 1900}, {"loss": 1.7259, "grad_norm": 0.3735344707965851, "learning_rate": 0.0002, "epoch": 1.6070677324358436, "step": 1910}, {"loss": 1.7445, "grad_norm": 0.42191144824028015, "learning_rate": 0.0002, "epoch": 1.6154816996213714, "step": 1920}, {"loss": 1.6978, "grad_norm": 0.3787207305431366, "learning_rate": 0.0002, "epoch": 1.6238956668068996, "step": 1930}, {"loss": 1.6893, "grad_norm": 0.35647350549697876, "learning_rate": 0.0002, "epoch": 1.6323096339924275, "step": 1940}, {"loss": 1.7825, "grad_norm": 0.39791446924209595, "learning_rate": 0.0002, "epoch": 1.6407236011779553, "step": 1950}, {"loss": 1.7293, "grad_norm": 0.37341275811195374, "learning_rate": 0.0002, "epoch": 1.6491375683634835, "step": 1960}, {"loss": 1.6781, "grad_norm": 0.3722686469554901, "learning_rate": 0.0002, "epoch": 1.6575515355490114, "step": 1970}, {"loss": 1.6383, "grad_norm": 0.37467387318611145, "learning_rate": 0.0002, "epoch": 1.6659655027345392, "step": 1980}, {"loss": 1.7439, "grad_norm": 0.37109461426734924, "learning_rate": 0.0002, "epoch": 1.6743794699200674, "step": 1990}, {"loss": 1.7206, "grad_norm": 0.4008837044239044, "learning_rate": 0.0002, "epoch": 1.6827934371055953, "step": 2000}, {"loss": 1.7604, "grad_norm": 0.3316999673843384, "learning_rate": 0.0002, "epoch": 1.6912074042911232, "step": 2010}, {"loss": 1.7325, "grad_norm": 0.3683805465698242, "learning_rate": 0.0002, "epoch": 1.6996213714766513, "step": 2020}, {"loss": 1.7451, "grad_norm": 0.4163658320903778, "learning_rate": 0.0002, "epoch": 1.7080353386621794, "step": 2030}, {"loss": 1.741, "grad_norm": 0.4245431125164032, "learning_rate": 0.0002, "epoch": 1.716449305847707, "step": 2040}, {"loss": 1.7184, "grad_norm": 0.36732038855552673, "learning_rate": 0.0002, "epoch": 1.7248632730332352, "step": 2050}, {"loss": 1.7031, "grad_norm": 0.34981656074523926, "learning_rate": 0.0002, "epoch": 1.7332772402187633, "step": 2060}, {"loss": 1.7545, "grad_norm": 0.38588812947273254, "learning_rate": 0.0002, "epoch": 1.7416912074042912, "step": 2070}, {"loss": 1.7728, "grad_norm": 0.39914557337760925, "learning_rate": 0.0002, "epoch": 1.750105174589819, "step": 2080}, {"loss": 1.7049, "grad_norm": 0.36068692803382874, "learning_rate": 0.0002, "epoch": 1.7585191417753472, "step": 2090}, {"loss": 1.7537, "grad_norm": 0.3983287215232849, "learning_rate": 0.0002, "epoch": 1.766933108960875, "step": 2100}, {"loss": 1.7016, "grad_norm": 0.45008400082588196, "learning_rate": 0.0002, "epoch": 1.775347076146403, "step": 2110}, {"loss": 1.7163, "grad_norm": 0.3618052303791046, "learning_rate": 0.0002, "epoch": 1.783761043331931, "step": 2120}, {"loss": 1.7335, "grad_norm": 0.38745400309562683, "learning_rate": 0.0002, "epoch": 1.792175010517459, "step": 2130}, {"loss": 1.7387, "grad_norm": 0.3413826525211334, "learning_rate": 0.0002, "epoch": 1.8005889777029869, "step": 2140}, {"loss": 1.7414, "grad_norm": 0.35983747243881226, "learning_rate": 0.0002, "epoch": 1.809002944888515, "step": 2150}, {"loss": 1.7892, "grad_norm": 0.40926849842071533, "learning_rate": 0.0002, "epoch": 1.8174169120740429, "step": 2160}, {"loss": 1.6823, "grad_norm": 0.3543093800544739, "learning_rate": 0.0002, "epoch": 1.8258308792595708, "step": 2170}, {"loss": 1.7812, "grad_norm": 0.42690935730934143, "learning_rate": 0.0002, "epoch": 1.8342448464450989, "step": 2180}, {"loss": 1.7471, "grad_norm": 0.40282756090164185, "learning_rate": 0.0002, "epoch": 1.842658813630627, "step": 2190}, {"loss": 1.7411, "grad_norm": 0.36568400263786316, "learning_rate": 0.0002, "epoch": 1.8510727808161547, "step": 2200}, {"loss": 1.7024, "grad_norm": 0.43159013986587524, "learning_rate": 0.0002, "epoch": 1.8594867480016828, "step": 2210}, {"loss": 1.7298, "grad_norm": 0.3554118573665619, "learning_rate": 0.0002, "epoch": 1.867900715187211, "step": 2220}, {"loss": 1.7157, "grad_norm": 0.43349072337150574, "learning_rate": 0.0002, "epoch": 1.8763146823727388, "step": 2230}, {"loss": 1.7302, "grad_norm": 0.36486536264419556, "learning_rate": 0.0002, "epoch": 1.8847286495582667, "step": 2240}, {"loss": 1.6901, "grad_norm": 0.39260047674179077, "learning_rate": 0.0002, "epoch": 1.8931426167437948, "step": 2250}, {"loss": 1.6691, "grad_norm": 0.3741776943206787, "learning_rate": 0.0002, "epoch": 1.9015565839293227, "step": 2260}, {"loss": 1.6931, "grad_norm": 0.3961946964263916, "learning_rate": 0.0002, "epoch": 1.9099705511148506, "step": 2270}, {"loss": 1.737, "grad_norm": 0.3659731149673462, "learning_rate": 0.0002, "epoch": 1.9183845183003787, "step": 2280}, {"loss": 1.7342, "grad_norm": 0.34744107723236084, "learning_rate": 0.0002, "epoch": 1.9267984854859066, "step": 2290}, {"loss": 1.7162, "grad_norm": 0.3607442378997803, "learning_rate": 0.0002, "epoch": 1.9352124526714345, "step": 2300}, {"loss": 1.6673, "grad_norm": 0.331464558839798, "learning_rate": 0.0002, "epoch": 1.9436264198569626, "step": 2310}, {"loss": 1.7101, "grad_norm": 0.3904414474964142, "learning_rate": 0.0002, "epoch": 1.9520403870424905, "step": 2320}, {"loss": 1.7327, "grad_norm": 0.37584832310676575, "learning_rate": 0.0002, "epoch": 1.9604543542280184, "step": 2330}, {"loss": 1.7586, "grad_norm": 0.3698684275150299, "learning_rate": 0.0002, "epoch": 1.9688683214135465, "step": 2340}, {"loss": 1.7764, "grad_norm": 0.40571412444114685, "learning_rate": 0.0002, "epoch": 1.9772822885990746, "step": 2350}, {"loss": 1.744, "grad_norm": 0.40059587359428406, "learning_rate": 0.0002, "epoch": 1.9856962557846023, "step": 2360}, {"loss": 1.7033, "grad_norm": 0.4168248474597931, "learning_rate": 0.0002, "epoch": 1.9941102229701304, "step": 2370}, {"eval_loss": 1.8055059909820557, "eval_runtime": 38.422, "eval_samples_per_second": 13.404, "eval_steps_per_second": 1.692, "epoch": 2.0, "step": 2377}, {"loss": 1.7673, "grad_norm": 0.35205352306365967, "learning_rate": 0.0002, "epoch": 2.0025241901556585, "step": 2380}, {"loss": 1.6556, "grad_norm": 0.3979377746582031, "learning_rate": 0.0002, "epoch": 2.010938157341186, "step": 2390}, {"loss": 1.6421, "grad_norm": 0.396491676568985, "learning_rate": 0.0002, "epoch": 2.0193521245267143, "step": 2400}, {"loss": 1.6847, "grad_norm": 0.44712209701538086, "learning_rate": 0.0002, "epoch": 2.0277660917122424, "step": 2410}, {"loss": 1.6877, "grad_norm": 0.4454420208930969, "learning_rate": 0.0002, "epoch": 2.03618005889777, "step": 2420}, {"loss": 1.6635, "grad_norm": 0.4170038402080536, "learning_rate": 0.0002, "epoch": 2.044594026083298, "step": 2430}, {"loss": 1.6512, "grad_norm": 0.4309595227241516, "learning_rate": 0.0002, "epoch": 2.0530079932688263, "step": 2440}, {"loss": 1.6223, "grad_norm": 0.4241602122783661, "learning_rate": 0.0002, "epoch": 2.0614219604543544, "step": 2450}, {"loss": 1.6162, "grad_norm": 0.4370540678501129, "learning_rate": 0.0002, "epoch": 2.069835927639882, "step": 2460}, {"loss": 1.6354, "grad_norm": 0.43985554575920105, "learning_rate": 0.0002, "epoch": 2.0782498948254102, "step": 2470}, {"loss": 1.6954, "grad_norm": 0.4158105254173279, "learning_rate": 0.0002, "epoch": 2.0866638620109383, "step": 2480}, {"loss": 1.6114, "grad_norm": 0.441549152135849, "learning_rate": 0.0002, "epoch": 2.095077829196466, "step": 2490}, {"loss": 1.5485, "grad_norm": 0.385718435049057, "learning_rate": 0.0002, "epoch": 2.103491796381994, "step": 2500}, {"loss": 1.5894, "grad_norm": 0.43146514892578125, "learning_rate": 0.0002, "epoch": 2.1119057635675222, "step": 2510}, {"loss": 1.6414, "grad_norm": 0.41663315892219543, "learning_rate": 0.0002, "epoch": 2.12031973075305, "step": 2520}, {"loss": 1.6527, "grad_norm": 0.4410698115825653, "learning_rate": 0.0002, "epoch": 2.128733697938578, "step": 2530}, {"loss": 1.6124, "grad_norm": 0.4472278952598572, "learning_rate": 0.0002, "epoch": 2.137147665124106, "step": 2540}, {"loss": 1.6257, "grad_norm": 0.3879167437553406, "learning_rate": 0.0002, "epoch": 2.145561632309634, "step": 2550}, {"loss": 1.6682, "grad_norm": 0.4212203025817871, "learning_rate": 0.0002, "epoch": 2.153975599495162, "step": 2560}, {"loss": 1.6036, "grad_norm": 0.42841723561286926, "learning_rate": 0.0002, "epoch": 2.16238956668069, "step": 2570}, {"loss": 1.5962, "grad_norm": 0.39272481203079224, "learning_rate": 0.0002, "epoch": 2.1708035338662177, "step": 2580}, {"loss": 1.681, "grad_norm": 0.4075261354446411, "learning_rate": 0.0002, "epoch": 2.179217501051746, "step": 2590}, {"loss": 1.6601, "grad_norm": 0.5358437895774841, "learning_rate": 0.0002, "epoch": 2.187631468237274, "step": 2600}, {"loss": 1.6423, "grad_norm": 0.4738350212574005, "learning_rate": 0.0002, "epoch": 2.1960454354228016, "step": 2610}, {"loss": 1.6386, "grad_norm": 0.446789026260376, "learning_rate": 0.0002, "epoch": 2.2044594026083297, "step": 2620}, {"loss": 1.6246, "grad_norm": 0.4615374505519867, "learning_rate": 0.0002, "epoch": 2.212873369793858, "step": 2630}, {"loss": 1.6205, "grad_norm": 0.46901994943618774, "learning_rate": 0.0002, "epoch": 2.221287336979386, "step": 2640}, {"loss": 1.6774, "grad_norm": 0.46267789602279663, "learning_rate": 0.0002, "epoch": 2.2297013041649136, "step": 2650}, {"loss": 1.6584, "grad_norm": 0.4383080005645752, "learning_rate": 0.0002, "epoch": 2.2381152713504417, "step": 2660}, {"loss": 1.5745, "grad_norm": 0.4070609509944916, "learning_rate": 0.0002, "epoch": 2.24652923853597, "step": 2670}, {"loss": 1.6125, "grad_norm": 0.4572339951992035, "learning_rate": 0.0002, "epoch": 2.2549432057214975, "step": 2680}, {"loss": 1.5671, "grad_norm": 0.393265038728714, "learning_rate": 0.0002, "epoch": 2.2633571729070256, "step": 2690}, {"loss": 1.6239, "grad_norm": 0.46144717931747437, "learning_rate": 0.0002, "epoch": 2.2717711400925538, "step": 2700}, {"loss": 1.5992, "grad_norm": 0.45077767968177795, "learning_rate": 0.0002, "epoch": 2.2801851072780814, "step": 2710}, {"loss": 1.6261, "grad_norm": 0.5697639584541321, "learning_rate": 0.0002, "epoch": 2.2885990744636096, "step": 2720}, {"loss": 1.6192, "grad_norm": 0.4855510890483856, "learning_rate": 0.0002, "epoch": 2.2970130416491377, "step": 2730}, {"loss": 1.7419, "grad_norm": 0.4440622627735138, "learning_rate": 0.0002, "epoch": 2.3054270088346653, "step": 2740}, {"loss": 1.6496, "grad_norm": 0.3904096782207489, "learning_rate": 0.0002, "epoch": 2.3138409760201935, "step": 2750}, {"loss": 1.5888, "grad_norm": 0.5225510597229004, "learning_rate": 0.0002, "epoch": 2.3222549432057216, "step": 2760}, {"loss": 1.6082, "grad_norm": 0.44866397976875305, "learning_rate": 0.0002, "epoch": 2.3306689103912497, "step": 2770}, {"loss": 1.6087, "grad_norm": 0.5167056322097778, "learning_rate": 0.0002, "epoch": 2.3390828775767774, "step": 2780}, {"loss": 1.6136, "grad_norm": 0.45913267135620117, "learning_rate": 0.0002, "epoch": 2.3474968447623055, "step": 2790}, {"loss": 1.6564, "grad_norm": 0.45787590742111206, "learning_rate": 0.0002, "epoch": 2.3559108119478336, "step": 2800}, {"loss": 1.6868, "grad_norm": 0.4633352756500244, "learning_rate": 0.0002, "epoch": 2.3643247791333613, "step": 2810}, {"loss": 1.6316, "grad_norm": 0.46390071511268616, "learning_rate": 0.0002, "epoch": 2.3727387463188894, "step": 2820}, {"loss": 1.6039, "grad_norm": 0.4261005222797394, "learning_rate": 0.0002, "epoch": 2.3811527135044175, "step": 2830}, {"loss": 1.6364, "grad_norm": 0.4283634424209595, "learning_rate": 0.0002, "epoch": 2.389566680689945, "step": 2840}, {"loss": 1.6382, "grad_norm": 0.4955291450023651, "learning_rate": 0.0002, "epoch": 2.3979806478754733, "step": 2850}, {"loss": 1.6173, "grad_norm": 0.4740189015865326, "learning_rate": 0.0002, "epoch": 2.4063946150610014, "step": 2860}, {"loss": 1.6403, "grad_norm": 0.4222276508808136, "learning_rate": 0.0002, "epoch": 2.414808582246529, "step": 2870}, {"loss": 1.5602, "grad_norm": 0.4982149004936218, "learning_rate": 0.0002, "epoch": 2.423222549432057, "step": 2880}, {"loss": 1.6313, "grad_norm": 0.5217409133911133, "learning_rate": 0.0002, "epoch": 2.4316365166175853, "step": 2890}, {"loss": 1.5804, "grad_norm": 0.4555884897708893, "learning_rate": 0.0002, "epoch": 2.4400504838031134, "step": 2900}, {"loss": 1.6189, "grad_norm": 0.43178579211235046, "learning_rate": 0.0002, "epoch": 2.448464450988641, "step": 2910}, {"loss": 1.6824, "grad_norm": 0.4788478910923004, "learning_rate": 0.0002, "epoch": 2.456878418174169, "step": 2920}, {"loss": 1.6829, "grad_norm": 0.43689873814582825, "learning_rate": 0.0002, "epoch": 2.465292385359697, "step": 2930}, {"loss": 1.6196, "grad_norm": 0.5115197896957397, "learning_rate": 0.0002, "epoch": 2.473706352545225, "step": 2940}, {"loss": 1.689, "grad_norm": 0.5290159583091736, "learning_rate": 0.0002, "epoch": 2.482120319730753, "step": 2950}, {"loss": 1.6499, "grad_norm": 0.46042463183403015, "learning_rate": 0.0002, "epoch": 2.490534286916281, "step": 2960}, {"loss": 1.6664, "grad_norm": 0.4359915852546692, "learning_rate": 0.0002, "epoch": 2.498948254101809, "step": 2970}, {"loss": 1.5812, "grad_norm": 0.46352964639663696, "learning_rate": 0.0002, "epoch": 2.507362221287337, "step": 2980}, {"loss": 1.6501, "grad_norm": 0.5324268341064453, "learning_rate": 0.0002, "epoch": 2.515776188472865, "step": 2990}, {"loss": 1.6115, "grad_norm": 0.5929607152938843, "learning_rate": 0.0002, "epoch": 2.5241901556583928, "step": 3000}, {"loss": 1.6772, "grad_norm": 0.4811333417892456, "learning_rate": 0.0002, "epoch": 2.532604122843921, "step": 3010}, {"loss": 1.7023, "grad_norm": 0.4662701487541199, "learning_rate": 0.0002, "epoch": 2.541018090029449, "step": 3020}, {"loss": 1.5426, "grad_norm": 0.4582270681858063, "learning_rate": 0.0002, "epoch": 2.549432057214977, "step": 3030}, {"loss": 1.6737, "grad_norm": 0.4679982662200928, "learning_rate": 0.0002, "epoch": 2.557846024400505, "step": 3040}, {"loss": 1.5442, "grad_norm": 0.4380294680595398, "learning_rate": 0.0002, "epoch": 2.566259991586033, "step": 3050}, {"loss": 1.6055, "grad_norm": 0.44295763969421387, "learning_rate": 0.0002, "epoch": 2.5746739587715606, "step": 3060}, {"loss": 1.5775, "grad_norm": 0.5131027698516846, "learning_rate": 0.0002, "epoch": 2.5830879259570887, "step": 3070}, {"loss": 1.546, "grad_norm": 0.47567516565322876, "learning_rate": 0.0002, "epoch": 2.591501893142617, "step": 3080}, {"loss": 1.5671, "grad_norm": 0.49002596735954285, "learning_rate": 0.0002, "epoch": 2.599915860328145, "step": 3090}, {"loss": 1.5445, "grad_norm": 0.44856327772140503, "learning_rate": 0.0002, "epoch": 2.6083298275136726, "step": 3100}, {"loss": 1.5797, "grad_norm": 0.4480142593383789, "learning_rate": 0.0002, "epoch": 2.6167437946992007, "step": 3110}, {"loss": 1.7132, "grad_norm": 0.4317494034767151, "learning_rate": 0.0002, "epoch": 2.6251577618847284, "step": 3120}, {"loss": 1.6321, "grad_norm": 0.42580848932266235, "learning_rate": 0.0002, "epoch": 2.6335717290702565, "step": 3130}, {"loss": 1.6483, "grad_norm": 0.4516814947128296, "learning_rate": 0.0002, "epoch": 2.6419856962557846, "step": 3140}, {"loss": 1.695, "grad_norm": 0.4438435733318329, "learning_rate": 0.0002, "epoch": 2.6503996634413127, "step": 3150}, {"loss": 1.6938, "grad_norm": 0.4385356307029724, "learning_rate": 0.0002, "epoch": 2.6588136306268404, "step": 3160}, {"loss": 1.6139, "grad_norm": 0.5064112544059753, "learning_rate": 0.0002, "epoch": 2.6672275978123685, "step": 3170}, {"loss": 1.7189, "grad_norm": 0.49163177609443665, "learning_rate": 0.0002, "epoch": 2.6756415649978966, "step": 3180}, {"loss": 1.7323, "grad_norm": 0.49339258670806885, "learning_rate": 0.0002, "epoch": 2.6840555321834243, "step": 3190}, {"loss": 1.6508, "grad_norm": 0.440950870513916, "learning_rate": 0.0002, "epoch": 2.6924694993689524, "step": 3200}, {"loss": 1.6305, "grad_norm": 0.4283970594406128, "learning_rate": 0.0002, "epoch": 2.7008834665544805, "step": 3210}, {"loss": 1.5935, "grad_norm": 0.43875712156295776, "learning_rate": 0.0002, "epoch": 2.7092974337400086, "step": 3220}, {"loss": 1.6129, "grad_norm": 0.49332964420318604, "learning_rate": 0.0002, "epoch": 2.7177114009255363, "step": 3230}, {"loss": 1.642, "grad_norm": 0.5225692391395569, "learning_rate": 0.0002, "epoch": 2.7261253681110644, "step": 3240}, {"loss": 1.6759, "grad_norm": 0.4856489300727844, "learning_rate": 0.0002, "epoch": 2.734539335296592, "step": 3250}, {"loss": 1.6463, "grad_norm": 0.46918296813964844, "learning_rate": 0.0002, "epoch": 2.74295330248212, "step": 3260}, {"loss": 1.6819, "grad_norm": 0.4802931249141693, "learning_rate": 0.0002, "epoch": 2.7513672696676483, "step": 3270}, {"loss": 1.6246, "grad_norm": 0.4485355615615845, "learning_rate": 0.0002, "epoch": 2.7597812368531764, "step": 3280}, {"loss": 1.6251, "grad_norm": 0.43944594264030457, "learning_rate": 0.0002, "epoch": 2.768195204038704, "step": 3290}, {"loss": 1.6501, "grad_norm": 0.46847742795944214, "learning_rate": 0.0002, "epoch": 2.7766091712242322, "step": 3300}, {"loss": 1.5969, "grad_norm": 0.4816027879714966, "learning_rate": 0.0002, "epoch": 2.7850231384097603, "step": 3310}, {"loss": 1.6293, "grad_norm": 0.453960120677948, "learning_rate": 0.0002, "epoch": 2.793437105595288, "step": 3320}, {"loss": 1.6429, "grad_norm": 0.4816017150878906, "learning_rate": 0.0002, "epoch": 2.801851072780816, "step": 3330}, {"loss": 1.6683, "grad_norm": 0.4461034834384918, "learning_rate": 0.0002, "epoch": 2.8102650399663442, "step": 3340}, {"loss": 1.7048, "grad_norm": 0.48821821808815, "learning_rate": 0.0002, "epoch": 2.8186790071518724, "step": 3350}, {"loss": 1.6076, "grad_norm": 0.4574853777885437, "learning_rate": 0.0002, "epoch": 2.8270929743374, "step": 3360}, {"loss": 1.6651, "grad_norm": 0.42062026262283325, "learning_rate": 0.0002, "epoch": 2.835506941522928, "step": 3370}, {"loss": 1.624, "grad_norm": 0.4499834477901459, "learning_rate": 0.0002, "epoch": 2.843920908708456, "step": 3380}, {"loss": 1.621, "grad_norm": 0.4780360758304596, "learning_rate": 0.0002, "epoch": 2.852334875893984, "step": 3390}, {"loss": 1.5882, "grad_norm": 0.45422887802124023, "learning_rate": 0.0002, "epoch": 2.860748843079512, "step": 3400}, {"loss": 1.6028, "grad_norm": 0.4590015709400177, "learning_rate": 0.0002, "epoch": 2.86916281026504, "step": 3410}, {"loss": 1.6746, "grad_norm": 0.45689624547958374, "learning_rate": 0.0002, "epoch": 2.877576777450568, "step": 3420}, {"loss": 1.6326, "grad_norm": 0.46953922510147095, "learning_rate": 0.0002, "epoch": 2.885990744636096, "step": 3430}, {"loss": 1.6015, "grad_norm": 0.4791966378688812, "learning_rate": 0.0002, "epoch": 2.8944047118216236, "step": 3440}, {"loss": 1.694, "grad_norm": 0.4842296242713928, "learning_rate": 0.0002, "epoch": 2.9028186790071517, "step": 3450}, {"loss": 1.6326, "grad_norm": 0.47219768166542053, "learning_rate": 0.0002, "epoch": 2.91123264619268, "step": 3460}, {"loss": 1.6486, "grad_norm": 0.4622127115726471, "learning_rate": 0.0002, "epoch": 2.919646613378208, "step": 3470}, {"loss": 1.6485, "grad_norm": 0.46832820773124695, "learning_rate": 0.0002, "epoch": 2.9280605805637356, "step": 3480}, {"loss": 1.6366, "grad_norm": 0.44582483172416687, "learning_rate": 0.0002, "epoch": 2.9364745477492638, "step": 3490}, {"loss": 1.6859, "grad_norm": 0.4987219274044037, "learning_rate": 0.0002, "epoch": 2.944888514934792, "step": 3500}, {"loss": 1.5991, "grad_norm": 0.43750956654548645, "learning_rate": 0.0002, "epoch": 2.9533024821203195, "step": 3510}, {"loss": 1.6236, "grad_norm": 0.49962925910949707, "learning_rate": 0.0002, "epoch": 2.9617164493058477, "step": 3520}, {"loss": 1.5859, "grad_norm": 0.5189590454101562, "learning_rate": 0.0002, "epoch": 2.9701304164913758, "step": 3530}, {"loss": 1.6688, "grad_norm": 0.391317754983902, "learning_rate": 0.0002, "epoch": 2.978544383676904, "step": 3540}, {"loss": 1.5884, "grad_norm": 0.44934695959091187, "learning_rate": 0.0002, "epoch": 2.9869583508624316, "step": 3550}, {"loss": 1.5688, "grad_norm": 0.4740142226219177, "learning_rate": 0.0002, "epoch": 2.9953723180479597, "step": 3560}, {"eval_loss": 1.8266887664794922, "eval_runtime": 37.9445, "eval_samples_per_second": 13.572, "eval_steps_per_second": 1.713, "epoch": 2.9995793016407237, "step": 3565}, {"loss": 1.5939, "grad_norm": 0.4523724615573883, "learning_rate": 0.0002, "epoch": 3.003786285233488, "step": 3570}, {"loss": 1.526, "grad_norm": 0.5261380076408386, "learning_rate": 0.0002, "epoch": 3.0122002524190155, "step": 3580}, {"loss": 1.4946, "grad_norm": 0.48664888739585876, "learning_rate": 0.0002, "epoch": 3.0206142196045436, "step": 3590}, {"loss": 1.5193, "grad_norm": 0.5070882439613342, "learning_rate": 0.0002, "epoch": 3.0290281867900717, "step": 3600}, {"loss": 1.5316, "grad_norm": 0.5816011428833008, "learning_rate": 0.0002, "epoch": 3.0374421539755994, "step": 3610}, {"loss": 1.5682, "grad_norm": 0.6610211730003357, "learning_rate": 0.0002, "epoch": 3.0458561211611275, "step": 3620}, {"loss": 1.5699, "grad_norm": 0.5257703065872192, "learning_rate": 0.0002, "epoch": 3.0542700883466556, "step": 3630}, {"loss": 1.4438, "grad_norm": 0.5574390888214111, "learning_rate": 0.0002, "epoch": 3.0626840555321833, "step": 3640}, {"loss": 1.547, "grad_norm": 0.5682297348976135, "learning_rate": 0.0002, "epoch": 3.0710980227177114, "step": 3650}, {"loss": 1.5743, "grad_norm": 0.5798383355140686, "learning_rate": 0.0002, "epoch": 3.0795119899032395, "step": 3660}, {"loss": 1.4339, "grad_norm": 0.5458289980888367, "learning_rate": 0.0002, "epoch": 3.087925957088767, "step": 3670}, {"loss": 1.46, "grad_norm": 0.5599102973937988, "learning_rate": 0.0002, "epoch": 3.0963399242742953, "step": 3680}, {"loss": 1.4589, "grad_norm": 0.5023021697998047, "learning_rate": 0.0002, "epoch": 3.1047538914598234, "step": 3690}, {"loss": 1.5114, "grad_norm": 0.5448206067085266, "learning_rate": 0.0002, "epoch": 3.113167858645351, "step": 3700}, {"loss": 1.4692, "grad_norm": 0.5760458707809448, "learning_rate": 0.0002, "epoch": 3.121581825830879, "step": 3710}, {"loss": 1.4789, "grad_norm": 0.6018968224525452, "learning_rate": 0.0002, "epoch": 3.1299957930164073, "step": 3720}, {"loss": 1.5518, "grad_norm": 0.5767101049423218, "learning_rate": 0.0002, "epoch": 3.1384097602019354, "step": 3730}, {"loss": 1.5032, "grad_norm": 0.5333963632583618, "learning_rate": 0.0002, "epoch": 3.146823727387463, "step": 3740}, {"loss": 1.4812, "grad_norm": 0.5918396711349487, "learning_rate": 0.0002, "epoch": 3.155237694572991, "step": 3750}, {"loss": 1.4618, "grad_norm": 0.5931203365325928, "learning_rate": 0.0002, "epoch": 3.1636516617585193, "step": 3760}, {"loss": 1.5592, "grad_norm": 0.6562168598175049, "learning_rate": 0.0002, "epoch": 3.172065628944047, "step": 3770}, {"loss": 1.4932, "grad_norm": 0.5820156335830688, "learning_rate": 0.0002, "epoch": 3.180479596129575, "step": 3780}, {"loss": 1.4523, "grad_norm": 0.5784737467765808, "learning_rate": 0.0002, "epoch": 3.188893563315103, "step": 3790}, {"loss": 1.498, "grad_norm": 0.5506529808044434, "learning_rate": 0.0002, "epoch": 3.197307530500631, "step": 3800}, {"loss": 1.4819, "grad_norm": 0.6101595163345337, "learning_rate": 0.0002, "epoch": 3.205721497686159, "step": 3810}, {"loss": 1.5185, "grad_norm": 0.5597806572914124, "learning_rate": 0.0002, "epoch": 3.214135464871687, "step": 3820}, {"loss": 1.5664, "grad_norm": 0.5641011595726013, "learning_rate": 0.0002, "epoch": 3.222549432057215, "step": 3830}, {"loss": 1.4702, "grad_norm": 0.5892080068588257, "learning_rate": 0.0002, "epoch": 3.230963399242743, "step": 3840}, {"loss": 1.4194, "grad_norm": 0.6034760475158691, "learning_rate": 0.0002, "epoch": 3.239377366428271, "step": 3850}, {"loss": 1.5499, "grad_norm": 0.5112439393997192, "learning_rate": 0.0002, "epoch": 3.247791333613799, "step": 3860}, {"loss": 1.5132, "grad_norm": 0.56565922498703, "learning_rate": 0.0002, "epoch": 3.256205300799327, "step": 3870}, {"loss": 1.4892, "grad_norm": 0.6155247092247009, "learning_rate": 0.0002, "epoch": 3.264619267984855, "step": 3880}, {"loss": 1.5118, "grad_norm": 0.6064623594284058, "learning_rate": 0.0002, "epoch": 3.273033235170383, "step": 3890}, {"loss": 1.5236, "grad_norm": 0.6313768029212952, "learning_rate": 0.0002, "epoch": 3.2814472023559107, "step": 3900}, {"loss": 1.5551, "grad_norm": 0.5903939008712769, "learning_rate": 0.0002, "epoch": 3.289861169541439, "step": 3910}, {"loss": 1.5703, "grad_norm": 0.5770667195320129, "learning_rate": 0.0002, "epoch": 3.298275136726967, "step": 3920}, {"loss": 1.5159, "grad_norm": 0.5785196423530579, "learning_rate": 0.0002, "epoch": 3.3066891039124946, "step": 3930}, {"loss": 1.5277, "grad_norm": 0.6468310356140137, "learning_rate": 0.0002, "epoch": 3.3151030710980227, "step": 3940}, {"loss": 1.6002, "grad_norm": 0.6200279593467712, "learning_rate": 0.0002, "epoch": 3.323517038283551, "step": 3950}, {"loss": 1.5264, "grad_norm": 0.5779302716255188, "learning_rate": 0.0002, "epoch": 3.3319310054690785, "step": 3960}, {"loss": 1.4861, "grad_norm": 0.5463796854019165, "learning_rate": 0.0002, "epoch": 3.3403449726546066, "step": 3970}, {"loss": 1.541, "grad_norm": 0.6117855906486511, "learning_rate": 0.0002, "epoch": 3.3487589398401347, "step": 3980}, {"loss": 1.5566, "grad_norm": 0.5554766058921814, "learning_rate": 0.0002, "epoch": 3.357172907025663, "step": 3990}, {"loss": 1.5004, "grad_norm": 0.6012870073318481, "learning_rate": 0.0002, "epoch": 3.3655868742111905, "step": 4000}, {"loss": 1.473, "grad_norm": 0.5443974137306213, "learning_rate": 0.0002, "epoch": 3.3740008413967186, "step": 4010}, {"loss": 1.5139, "grad_norm": 0.6636057496070862, "learning_rate": 0.0002, "epoch": 3.3824148085822463, "step": 4020}, {"loss": 1.5141, "grad_norm": 0.5801246166229248, "learning_rate": 0.0002, "epoch": 3.3908287757677744, "step": 4030}, {"loss": 1.5026, "grad_norm": 0.5668839812278748, "learning_rate": 0.0002, "epoch": 3.3992427429533025, "step": 4040}, {"loss": 1.523, "grad_norm": 0.7763481736183167, "learning_rate": 0.0002, "epoch": 3.4076567101388306, "step": 4050}, {"loss": 1.4932, "grad_norm": 0.6675992608070374, "learning_rate": 0.0002, "epoch": 3.4160706773243583, "step": 4060}, {"loss": 1.4959, "grad_norm": 0.6290077567100525, "learning_rate": 0.0002, "epoch": 3.4244846445098864, "step": 4070}, {"loss": 1.5766, "grad_norm": 0.6040239930152893, "learning_rate": 0.0002, "epoch": 3.4328986116954145, "step": 4080}, {"loss": 1.5711, "grad_norm": 0.6237877607345581, "learning_rate": 0.0002, "epoch": 3.441312578880942, "step": 4090}, {"loss": 1.4961, "grad_norm": 0.5343508124351501, "learning_rate": 0.0002, "epoch": 3.4497265460664703, "step": 4100}, {"loss": 1.5123, "grad_norm": 0.6817412972450256, "learning_rate": 0.0002, "epoch": 3.4581405132519984, "step": 4110}, {"loss": 1.5377, "grad_norm": 0.7115170359611511, "learning_rate": 0.0002, "epoch": 3.466554480437526, "step": 4120}, {"loss": 1.5275, "grad_norm": 0.6127332448959351, "learning_rate": 0.0002, "epoch": 3.4749684476230542, "step": 4130}, {"loss": 1.557, "grad_norm": 0.5745994448661804, "learning_rate": 0.0002, "epoch": 3.4833824148085824, "step": 4140}, {"loss": 1.4873, "grad_norm": 0.6248795390129089, "learning_rate": 0.0002, "epoch": 3.49179638199411, "step": 4150}, {"loss": 1.4885, "grad_norm": 0.5821124911308289, "learning_rate": 0.0002, "epoch": 3.500210349179638, "step": 4160}, {"loss": 1.4937, "grad_norm": 0.561416506767273, "learning_rate": 0.0002, "epoch": 3.5086243163651663, "step": 4170}, {"loss": 1.5453, "grad_norm": 0.5848962664604187, "learning_rate": 0.0002, "epoch": 3.5170382835506944, "step": 4180}, {"loss": 1.5892, "grad_norm": 0.5335569977760315, "learning_rate": 0.0002, "epoch": 3.525452250736222, "step": 4190}, {"loss": 1.5152, "grad_norm": 0.547964870929718, "learning_rate": 0.0002, "epoch": 3.53386621792175, "step": 4200}, {"loss": 1.4887, "grad_norm": 0.6157727241516113, "learning_rate": 0.0002, "epoch": 3.542280185107278, "step": 4210}, {"loss": 1.5484, "grad_norm": 0.6163121461868286, "learning_rate": 0.0002, "epoch": 3.550694152292806, "step": 4220}, {"loss": 1.5833, "grad_norm": 0.5844616293907166, "learning_rate": 0.0002, "epoch": 3.559108119478334, "step": 4230}, {"loss": 1.5305, "grad_norm": 0.7104926109313965, "learning_rate": 0.0002, "epoch": 3.567522086663862, "step": 4240}, {"loss": 1.5161, "grad_norm": 0.5055213570594788, "learning_rate": 0.0002, "epoch": 3.57593605384939, "step": 4250}, {"loss": 1.482, "grad_norm": 0.611676812171936, "learning_rate": 0.0002, "epoch": 3.584350021034918, "step": 4260}, {"loss": 1.5048, "grad_norm": 0.6326440572738647, "learning_rate": 0.0002, "epoch": 3.592763988220446, "step": 4270}, {"loss": 1.5122, "grad_norm": 0.6290925741195679, "learning_rate": 0.0002, "epoch": 3.6011779554059737, "step": 4280}, {"loss": 1.5654, "grad_norm": 0.5691978931427002, "learning_rate": 0.0002, "epoch": 3.609591922591502, "step": 4290}, {"loss": 1.4854, "grad_norm": 0.6071329116821289, "learning_rate": 0.0002, "epoch": 3.61800588977703, "step": 4300}, {"loss": 1.5336, "grad_norm": 0.606573224067688, "learning_rate": 0.0002, "epoch": 3.626419856962558, "step": 4310}, {"loss": 1.6437, "grad_norm": 0.5515419244766235, "learning_rate": 0.0002, "epoch": 3.6348338241480858, "step": 4320}, {"loss": 1.498, "grad_norm": 0.5964660048484802, "learning_rate": 0.0002, "epoch": 3.643247791333614, "step": 4330}, {"loss": 1.544, "grad_norm": 0.5774146914482117, "learning_rate": 0.0002, "epoch": 3.6516617585191415, "step": 4340}, {"loss": 1.5566, "grad_norm": 0.5732731223106384, "learning_rate": 0.0002, "epoch": 3.6600757257046697, "step": 4350}, {"loss": 1.5682, "grad_norm": 0.7354163527488708, "learning_rate": 0.0002, "epoch": 3.6684896928901978, "step": 4360}, {"loss": 1.5225, "grad_norm": 0.6220902800559998, "learning_rate": 0.0002, "epoch": 3.676903660075726, "step": 4370}, {"loss": 1.4838, "grad_norm": 0.6053991317749023, "learning_rate": 0.0002, "epoch": 3.6853176272612536, "step": 4380}, {"loss": 1.5161, "grad_norm": 0.67010897397995, "learning_rate": 0.0002, "epoch": 3.6937315944467817, "step": 4390}, {"loss": 1.5381, "grad_norm": 0.6139186024665833, "learning_rate": 0.0002, "epoch": 3.70214556163231, "step": 4400}, {"loss": 1.5088, "grad_norm": 0.5433071851730347, "learning_rate": 0.0002, "epoch": 3.7105595288178375, "step": 4410}, {"loss": 1.5337, "grad_norm": 0.5453870296478271, "learning_rate": 0.0002, "epoch": 3.7189734960033656, "step": 4420}, {"loss": 1.4549, "grad_norm": 0.6401727199554443, "learning_rate": 0.0002, "epoch": 3.7273874631888937, "step": 4430}, {"loss": 1.503, "grad_norm": 0.6049367189407349, "learning_rate": 0.0002, "epoch": 3.735801430374422, "step": 4440}, {"loss": 1.5268, "grad_norm": 0.5740529298782349, "learning_rate": 0.0002, "epoch": 3.7442153975599495, "step": 4450}, {"loss": 1.5183, "grad_norm": 0.6521880626678467, "learning_rate": 0.0002, "epoch": 3.7526293647454776, "step": 4460}, {"loss": 1.5741, "grad_norm": 0.7096368074417114, "learning_rate": 0.0002, "epoch": 3.7610433319310053, "step": 4470}, {"loss": 1.5786, "grad_norm": 0.5886474251747131, "learning_rate": 0.0002, "epoch": 3.7694572991165334, "step": 4480}, {"loss": 1.5887, "grad_norm": 0.5821043252944946, "learning_rate": 0.0002, "epoch": 3.7778712663020615, "step": 4490}, {"loss": 1.5777, "grad_norm": 0.628892183303833, "learning_rate": 0.0002, "epoch": 3.7862852334875896, "step": 4500}, {"loss": 1.4708, "grad_norm": 0.5962669849395752, "learning_rate": 0.0002, "epoch": 3.7946992006731173, "step": 4510}, {"loss": 1.5267, "grad_norm": 0.6635549068450928, "learning_rate": 0.0002, "epoch": 3.8031131678586454, "step": 4520}, {"loss": 1.5058, "grad_norm": 0.6010760068893433, "learning_rate": 0.0002, "epoch": 3.811527135044173, "step": 4530}, {"loss": 1.6228, "grad_norm": 0.6322658658027649, "learning_rate": 0.0002, "epoch": 3.819941102229701, "step": 4540}, {"loss": 1.5029, "grad_norm": 0.5893137454986572, "learning_rate": 0.0002, "epoch": 3.8283550694152293, "step": 4550}, {"loss": 1.5435, "grad_norm": 0.7829602360725403, "learning_rate": 0.0002, "epoch": 3.8367690366007574, "step": 4560}, {"loss": 1.5453, "grad_norm": 0.6190396547317505, "learning_rate": 0.0002, "epoch": 3.845183003786285, "step": 4570}, {"loss": 1.5292, "grad_norm": 0.6662813425064087, "learning_rate": 0.0002, "epoch": 3.853596970971813, "step": 4580}, {"loss": 1.5065, "grad_norm": 0.5809855461120605, "learning_rate": 0.0002, "epoch": 3.8620109381573413, "step": 4590}, {"loss": 1.5041, "grad_norm": 0.5779069662094116, "learning_rate": 0.0002, "epoch": 3.870424905342869, "step": 4600}, {"loss": 1.498, "grad_norm": 0.5603038668632507, "learning_rate": 0.0002, "epoch": 3.878838872528397, "step": 4610}, {"loss": 1.5372, "grad_norm": 0.6274181008338928, "learning_rate": 0.0002, "epoch": 3.887252839713925, "step": 4620}, {"loss": 1.4996, "grad_norm": 0.6810959577560425, "learning_rate": 0.0002, "epoch": 3.8956668068994533, "step": 4630}, {"loss": 1.4956, "grad_norm": 0.5647315979003906, "learning_rate": 0.0002, "epoch": 3.904080774084981, "step": 4640}, {"loss": 1.5424, "grad_norm": 0.6830295324325562, "learning_rate": 0.0002, "epoch": 3.912494741270509, "step": 4650}, {"loss": 1.535, "grad_norm": 0.652565598487854, "learning_rate": 0.0002, "epoch": 3.920908708456037, "step": 4660}, {"loss": 1.4772, "grad_norm": 0.5806284546852112, "learning_rate": 0.0002, "epoch": 3.929322675641565, "step": 4670}, {"loss": 1.5812, "grad_norm": 0.6825073957443237, "learning_rate": 0.0002, "epoch": 3.937736642827093, "step": 4680}, {"loss": 1.5516, "grad_norm": 0.6149451732635498, "learning_rate": 0.0002, "epoch": 3.946150610012621, "step": 4690}, {"loss": 1.5608, "grad_norm": 0.6152557134628296, "learning_rate": 0.0002, "epoch": 3.954564577198149, "step": 4700}, {"loss": 1.4897, "grad_norm": 0.6239011883735657, "learning_rate": 0.0002, "epoch": 3.962978544383677, "step": 4710}, {"loss": 1.538, "grad_norm": 0.6485443115234375, "learning_rate": 0.0002, "epoch": 3.971392511569205, "step": 4720}, {"loss": 1.5226, "grad_norm": 0.6449228525161743, "learning_rate": 0.0002, "epoch": 3.9798064787547327, "step": 4730}, {"loss": 1.5087, "grad_norm": 0.6526407599449158, "learning_rate": 0.0002, "epoch": 3.988220445940261, "step": 4740}, {"loss": 1.5026, "grad_norm": 0.6277706027030945, "learning_rate": 0.0002, "epoch": 3.996634413125789, "step": 4750}]} +{"epoch": 4.999579301640724, "step": 5942, "epoch_duration": 1272.9212999343872, "total_accumulated_duration": 7310.895057439804, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.56, "grad_norm": 0.5458821654319763, "learning_rate": 0.0002, "epoch": 0.008413967185527976, "step": 10}, {"loss": 2.3235, "grad_norm": 0.7293308973312378, "learning_rate": 0.0002, "epoch": 0.016827934371055953, "step": 20}, {"loss": 2.0815, "grad_norm": 0.47792306542396545, "learning_rate": 0.0002, "epoch": 0.02524190155658393, "step": 30}, {"loss": 1.9718, "grad_norm": 0.5944402813911438, "learning_rate": 0.0002, "epoch": 0.033655868742111905, "step": 40}, {"loss": 1.8848, "grad_norm": 0.5415359735488892, "learning_rate": 0.0002, "epoch": 0.04206983592763988, "step": 50}, {"loss": 1.8953, "grad_norm": 0.535713791847229, "learning_rate": 0.0002, "epoch": 0.05048380311316786, "step": 60}, {"loss": 1.937, "grad_norm": 0.5184146761894226, "learning_rate": 0.0002, "epoch": 0.058897770298695834, "step": 70}, {"loss": 1.8396, "grad_norm": 0.458926796913147, "learning_rate": 0.0002, "epoch": 0.06731173748422381, "step": 80}, {"loss": 1.8677, "grad_norm": 0.4780142307281494, "learning_rate": 0.0002, "epoch": 0.07572570466975179, "step": 90}, {"loss": 1.8593, "grad_norm": 0.79965740442276, "learning_rate": 0.0002, "epoch": 0.08413967185527976, "step": 100}, {"loss": 1.9081, "grad_norm": 0.4498862028121948, "learning_rate": 0.0002, "epoch": 0.09255363904080774, "step": 110}, {"loss": 1.8503, "grad_norm": 0.39338430762290955, "learning_rate": 0.0002, "epoch": 0.10096760622633572, "step": 120}, {"loss": 1.8637, "grad_norm": 0.9588953852653503, "learning_rate": 0.0002, "epoch": 0.10938157341186369, "step": 130}, {"loss": 1.8676, "grad_norm": 0.41675639152526855, "learning_rate": 0.0002, "epoch": 0.11779554059739167, "step": 140}, {"loss": 1.8904, "grad_norm": 0.44519832730293274, "learning_rate": 0.0002, "epoch": 0.12620950778291964, "step": 150}, {"loss": 1.798, "grad_norm": 0.4176260530948639, "learning_rate": 0.0002, "epoch": 0.13462347496844762, "step": 160}, {"loss": 1.8398, "grad_norm": 0.35840365290641785, "learning_rate": 0.0002, "epoch": 0.1430374421539756, "step": 170}, {"loss": 1.8666, "grad_norm": 0.3794495463371277, "learning_rate": 0.0002, "epoch": 0.15145140933950357, "step": 180}, {"loss": 1.8111, "grad_norm": 0.4563522934913635, "learning_rate": 0.0002, "epoch": 0.15986537652503155, "step": 190}, {"loss": 1.8893, "grad_norm": 0.37057486176490784, "learning_rate": 0.0002, "epoch": 0.16827934371055953, "step": 200}, {"loss": 1.7995, "grad_norm": 0.44081518054008484, "learning_rate": 0.0002, "epoch": 0.1766933108960875, "step": 210}, {"loss": 1.9048, "grad_norm": 0.46078577637672424, "learning_rate": 0.0002, "epoch": 0.18510727808161548, "step": 220}, {"loss": 1.8403, "grad_norm": 0.36132094264030457, "learning_rate": 0.0002, "epoch": 0.19352124526714345, "step": 230}, {"loss": 1.8827, "grad_norm": 0.3747289180755615, "learning_rate": 0.0002, "epoch": 0.20193521245267143, "step": 240}, {"loss": 1.8382, "grad_norm": 0.3540179133415222, "learning_rate": 0.0002, "epoch": 0.2103491796381994, "step": 250}, {"loss": 1.8196, "grad_norm": 0.3461375832557678, "learning_rate": 0.0002, "epoch": 0.21876314682372738, "step": 260}, {"loss": 1.8509, "grad_norm": 0.3436960279941559, "learning_rate": 0.0002, "epoch": 0.22717711400925536, "step": 270}, {"loss": 1.8285, "grad_norm": 0.35403719544410706, "learning_rate": 0.0002, "epoch": 0.23559108119478334, "step": 280}, {"loss": 1.8369, "grad_norm": 0.37142616510391235, "learning_rate": 0.0002, "epoch": 0.2440050483803113, "step": 290}, {"loss": 1.8044, "grad_norm": 0.3307955861091614, "learning_rate": 0.0002, "epoch": 0.2524190155658393, "step": 300}, {"loss": 1.817, "grad_norm": 0.32855314016342163, "learning_rate": 0.0002, "epoch": 0.2608329827513673, "step": 310}, {"loss": 1.7803, "grad_norm": 0.3299003839492798, "learning_rate": 0.0002, "epoch": 0.26924694993689524, "step": 320}, {"loss": 1.8129, "grad_norm": 0.44311287999153137, "learning_rate": 0.0002, "epoch": 0.27766091712242325, "step": 330}, {"loss": 1.8232, "grad_norm": 0.32989758253097534, "learning_rate": 0.0002, "epoch": 0.2860748843079512, "step": 340}, {"loss": 1.7716, "grad_norm": 0.34400200843811035, "learning_rate": 0.0002, "epoch": 0.2944888514934792, "step": 350}, {"loss": 1.7619, "grad_norm": 0.36286211013793945, "learning_rate": 0.0002, "epoch": 0.30290281867900715, "step": 360}, {"loss": 1.8025, "grad_norm": 0.406827837228775, "learning_rate": 0.0002, "epoch": 0.31131678586453515, "step": 370}, {"loss": 1.7515, "grad_norm": 0.36299195885658264, "learning_rate": 0.0002, "epoch": 0.3197307530500631, "step": 380}, {"loss": 1.837, "grad_norm": 0.3477257192134857, "learning_rate": 0.0002, "epoch": 0.3281447202355911, "step": 390}, {"loss": 1.7767, "grad_norm": 0.3730369210243225, "learning_rate": 0.0002, "epoch": 0.33655868742111905, "step": 400}, {"loss": 1.7747, "grad_norm": 0.4644559919834137, "learning_rate": 0.0002, "epoch": 0.34497265460664706, "step": 410}, {"loss": 1.7538, "grad_norm": 0.406576544046402, "learning_rate": 0.0002, "epoch": 0.353386621792175, "step": 420}, {"loss": 1.7501, "grad_norm": 0.3612699508666992, "learning_rate": 0.0002, "epoch": 0.361800588977703, "step": 430}, {"loss": 1.7473, "grad_norm": 0.3243742287158966, "learning_rate": 0.0002, "epoch": 0.37021455616323096, "step": 440}, {"loss": 1.8851, "grad_norm": 0.36671221256256104, "learning_rate": 0.0002, "epoch": 0.37862852334875896, "step": 450}, {"loss": 1.8853, "grad_norm": 0.3565002381801605, "learning_rate": 0.0002, "epoch": 0.3870424905342869, "step": 460}, {"loss": 1.8923, "grad_norm": 0.34630221128463745, "learning_rate": 0.0002, "epoch": 0.3954564577198149, "step": 470}, {"loss": 1.8234, "grad_norm": 0.3353537321090698, "learning_rate": 0.0002, "epoch": 0.40387042490534286, "step": 480}, {"loss": 1.7135, "grad_norm": 0.4015921950340271, "learning_rate": 0.0002, "epoch": 0.41228439209087087, "step": 490}, {"loss": 1.7815, "grad_norm": 0.5489419102668762, "learning_rate": 0.0002, "epoch": 0.4206983592763988, "step": 500}, {"loss": 1.7903, "grad_norm": 0.4193589985370636, "learning_rate": 0.0002, "epoch": 0.4291123264619268, "step": 510}, {"loss": 1.8416, "grad_norm": 0.3418922424316406, "learning_rate": 0.0002, "epoch": 0.43752629364745477, "step": 520}, {"loss": 1.7982, "grad_norm": 0.32668185234069824, "learning_rate": 0.0002, "epoch": 0.44594026083298277, "step": 530}, {"loss": 1.7501, "grad_norm": 0.3094325661659241, "learning_rate": 0.0002, "epoch": 0.4543542280185107, "step": 540}, {"loss": 1.7438, "grad_norm": 0.3743017315864563, "learning_rate": 0.0002, "epoch": 0.4627681952040387, "step": 550}, {"loss": 1.8451, "grad_norm": 0.3295630216598511, "learning_rate": 0.0002, "epoch": 0.47118216238956667, "step": 560}, {"loss": 1.7529, "grad_norm": 1.6124513149261475, "learning_rate": 0.0002, "epoch": 0.4795961295750947, "step": 570}, {"loss": 1.8028, "grad_norm": 0.3245585858821869, "learning_rate": 0.0002, "epoch": 0.4880100967606226, "step": 580}, {"loss": 1.7976, "grad_norm": 0.3332934081554413, "learning_rate": 0.0002, "epoch": 0.49642406394615063, "step": 590}, {"loss": 1.7912, "grad_norm": 0.3836138844490051, "learning_rate": 0.0002, "epoch": 0.5048380311316786, "step": 600}, {"loss": 1.8347, "grad_norm": 0.32953888177871704, "learning_rate": 0.0002, "epoch": 0.5132519983172066, "step": 610}, {"loss": 1.7729, "grad_norm": 0.36291512846946716, "learning_rate": 0.0002, "epoch": 0.5216659655027346, "step": 620}, {"loss": 1.7758, "grad_norm": 0.3237783908843994, "learning_rate": 0.0002, "epoch": 0.5300799326882625, "step": 630}, {"loss": 1.8352, "grad_norm": 0.38882696628570557, "learning_rate": 0.0002, "epoch": 0.5384938998737905, "step": 640}, {"loss": 1.8624, "grad_norm": 0.37821972370147705, "learning_rate": 0.0002, "epoch": 0.5469078670593185, "step": 650}, {"loss": 1.8075, "grad_norm": 0.3556285500526428, "learning_rate": 0.0002, "epoch": 0.5553218342448465, "step": 660}, {"loss": 1.778, "grad_norm": 0.347499281167984, "learning_rate": 0.0002, "epoch": 0.5637358014303744, "step": 670}, {"loss": 1.8066, "grad_norm": 0.3176489472389221, "learning_rate": 0.0002, "epoch": 0.5721497686159024, "step": 680}, {"loss": 1.7257, "grad_norm": 0.30220088362693787, "learning_rate": 0.0002, "epoch": 0.5805637358014304, "step": 690}, {"loss": 1.8415, "grad_norm": 0.3711601793766022, "learning_rate": 0.0002, "epoch": 0.5889777029869584, "step": 700}, {"loss": 1.7906, "grad_norm": 0.3311759829521179, "learning_rate": 0.0002, "epoch": 0.5973916701724863, "step": 710}, {"loss": 1.7712, "grad_norm": 0.34824270009994507, "learning_rate": 0.0002, "epoch": 0.6058056373580143, "step": 720}, {"loss": 1.7954, "grad_norm": 0.29668381810188293, "learning_rate": 0.0002, "epoch": 0.6142196045435423, "step": 730}, {"loss": 1.8321, "grad_norm": 0.36087489128112793, "learning_rate": 0.0002, "epoch": 0.6226335717290703, "step": 740}, {"loss": 1.7956, "grad_norm": 0.31590089201927185, "learning_rate": 0.0002, "epoch": 0.6310475389145982, "step": 750}, {"loss": 1.7343, "grad_norm": 0.37632957100868225, "learning_rate": 0.0002, "epoch": 0.6394615061001262, "step": 760}, {"loss": 1.8499, "grad_norm": 0.3360748589038849, "learning_rate": 0.0002, "epoch": 0.6478754732856542, "step": 770}, {"loss": 1.8076, "grad_norm": 0.3420640528202057, "learning_rate": 0.0002, "epoch": 0.6562894404711822, "step": 780}, {"loss": 1.8353, "grad_norm": 0.5734959244728088, "learning_rate": 0.0002, "epoch": 0.6647034076567101, "step": 790}, {"loss": 1.7746, "grad_norm": 0.36440837383270264, "learning_rate": 0.0002, "epoch": 0.6731173748422381, "step": 800}, {"loss": 1.7532, "grad_norm": 0.3179708421230316, "learning_rate": 0.0002, "epoch": 0.6815313420277661, "step": 810}, {"loss": 1.7815, "grad_norm": 0.34122881293296814, "learning_rate": 0.0002, "epoch": 0.6899453092132941, "step": 820}, {"loss": 1.8167, "grad_norm": 0.31886112689971924, "learning_rate": 0.0002, "epoch": 0.698359276398822, "step": 830}, {"loss": 1.7505, "grad_norm": 0.31782326102256775, "learning_rate": 0.0002, "epoch": 0.70677324358435, "step": 840}, {"loss": 1.7588, "grad_norm": 0.36052989959716797, "learning_rate": 0.0002, "epoch": 0.715187210769878, "step": 850}, {"loss": 1.7891, "grad_norm": 0.28946155309677124, "learning_rate": 0.0002, "epoch": 0.723601177955406, "step": 860}, {"loss": 1.7923, "grad_norm": 0.3095663785934448, "learning_rate": 0.0002, "epoch": 0.7320151451409339, "step": 870}, {"loss": 1.785, "grad_norm": 0.3317491412162781, "learning_rate": 0.0002, "epoch": 0.7404291123264619, "step": 880}, {"loss": 1.7709, "grad_norm": 0.31324660778045654, "learning_rate": 0.0002, "epoch": 0.7488430795119899, "step": 890}, {"loss": 1.8753, "grad_norm": 0.3290475606918335, "learning_rate": 0.0002, "epoch": 0.7572570466975179, "step": 900}, {"loss": 1.7679, "grad_norm": 0.35690343379974365, "learning_rate": 0.0002, "epoch": 0.7656710138830458, "step": 910}, {"loss": 1.826, "grad_norm": 0.39558273553848267, "learning_rate": 0.0002, "epoch": 0.7740849810685738, "step": 920}, {"loss": 1.8722, "grad_norm": 0.34254348278045654, "learning_rate": 0.0002, "epoch": 0.7824989482541018, "step": 930}, {"loss": 1.7603, "grad_norm": 0.3560165464878082, "learning_rate": 0.0002, "epoch": 0.7909129154396298, "step": 940}, {"loss": 1.7992, "grad_norm": 0.30693164467811584, "learning_rate": 0.0002, "epoch": 0.7993268826251577, "step": 950}, {"loss": 1.8029, "grad_norm": 0.3394823372364044, "learning_rate": 0.0002, "epoch": 0.8077408498106857, "step": 960}, {"loss": 1.8105, "grad_norm": 0.3741514980792999, "learning_rate": 0.0002, "epoch": 0.8161548169962137, "step": 970}, {"loss": 1.7849, "grad_norm": 0.3655228316783905, "learning_rate": 0.0002, "epoch": 0.8245687841817417, "step": 980}, {"loss": 1.8449, "grad_norm": 0.3586033880710602, "learning_rate": 0.0002, "epoch": 0.8329827513672696, "step": 990}, {"loss": 1.7033, "grad_norm": 0.3459678888320923, "learning_rate": 0.0002, "epoch": 0.8413967185527976, "step": 1000}, {"loss": 1.8498, "grad_norm": 0.3184349834918976, "learning_rate": 0.0002, "epoch": 0.8498106857383256, "step": 1010}, {"loss": 1.7632, "grad_norm": 0.3099786043167114, "learning_rate": 0.0002, "epoch": 0.8582246529238536, "step": 1020}, {"loss": 1.8067, "grad_norm": 0.30300915241241455, "learning_rate": 0.0002, "epoch": 0.8666386201093815, "step": 1030}, {"loss": 1.7923, "grad_norm": 0.3128705620765686, "learning_rate": 0.0002, "epoch": 0.8750525872949095, "step": 1040}, {"loss": 1.8252, "grad_norm": 0.3336263597011566, "learning_rate": 0.0002, "epoch": 0.8834665544804375, "step": 1050}, {"loss": 1.8375, "grad_norm": 0.3801328241825104, "learning_rate": 0.0002, "epoch": 0.8918805216659655, "step": 1060}, {"loss": 1.7757, "grad_norm": 0.3122096359729767, "learning_rate": 0.0002, "epoch": 0.9002944888514934, "step": 1070}, {"loss": 1.8251, "grad_norm": 0.35990869998931885, "learning_rate": 0.0002, "epoch": 0.9087084560370214, "step": 1080}, {"loss": 1.7343, "grad_norm": 0.3321819305419922, "learning_rate": 0.0002, "epoch": 0.9171224232225494, "step": 1090}, {"loss": 1.7595, "grad_norm": 0.4202139377593994, "learning_rate": 0.0002, "epoch": 0.9255363904080774, "step": 1100}, {"loss": 1.8056, "grad_norm": 0.32559722661972046, "learning_rate": 0.0002, "epoch": 0.9339503575936053, "step": 1110}, {"loss": 1.812, "grad_norm": 0.3098459839820862, "learning_rate": 0.0002, "epoch": 0.9423643247791333, "step": 1120}, {"loss": 1.8252, "grad_norm": 0.33917108178138733, "learning_rate": 0.0002, "epoch": 0.9507782919646613, "step": 1130}, {"loss": 1.7709, "grad_norm": 0.4055837094783783, "learning_rate": 0.0002, "epoch": 0.9591922591501894, "step": 1140}, {"loss": 1.8259, "grad_norm": 0.32508623600006104, "learning_rate": 0.0002, "epoch": 0.9676062263357172, "step": 1150}, {"loss": 1.782, "grad_norm": 0.30150601267814636, "learning_rate": 0.0002, "epoch": 0.9760201935212452, "step": 1160}, {"loss": 1.8291, "grad_norm": 0.3042563199996948, "learning_rate": 0.0002, "epoch": 0.9844341607067733, "step": 1170}, {"loss": 1.7847, "grad_norm": 0.33254584670066833, "learning_rate": 0.0002, "epoch": 0.9928481278923013, "step": 1180}, {"eval_loss": 1.8077726364135742, "eval_runtime": 38.4359, "eval_samples_per_second": 13.399, "eval_steps_per_second": 1.691, "epoch": 0.9995793016407236, "step": 1188}, {"loss": 1.7414, "grad_norm": 0.35073035955429077, "learning_rate": 0.0002, "epoch": 1.0012620950778293, "step": 1190}, {"loss": 1.7483, "grad_norm": 0.3217269778251648, "learning_rate": 0.0002, "epoch": 1.0096760622633572, "step": 1200}, {"loss": 1.7517, "grad_norm": 0.3635033369064331, "learning_rate": 0.0002, "epoch": 1.018090029448885, "step": 1210}, {"loss": 1.6949, "grad_norm": 0.32468414306640625, "learning_rate": 0.0002, "epoch": 1.0265039966344132, "step": 1220}, {"loss": 1.711, "grad_norm": 0.3307163417339325, "learning_rate": 0.0002, "epoch": 1.034917963819941, "step": 1230}, {"loss": 1.7881, "grad_norm": 0.34381359815597534, "learning_rate": 0.0002, "epoch": 1.0433319310054692, "step": 1240}, {"loss": 1.612, "grad_norm": 0.35874804854393005, "learning_rate": 0.0002, "epoch": 1.051745898190997, "step": 1250}, {"loss": 1.7314, "grad_norm": 0.3615919351577759, "learning_rate": 0.0002, "epoch": 1.060159865376525, "step": 1260}, {"loss": 1.7517, "grad_norm": 0.32835808396339417, "learning_rate": 0.0002, "epoch": 1.068573832562053, "step": 1270}, {"loss": 1.7193, "grad_norm": 0.3876388370990753, "learning_rate": 0.0002, "epoch": 1.076987799747581, "step": 1280}, {"loss": 1.7442, "grad_norm": 0.39895930886268616, "learning_rate": 0.0002, "epoch": 1.0854017669331089, "step": 1290}, {"loss": 1.6601, "grad_norm": 0.39081698656082153, "learning_rate": 0.0002, "epoch": 1.093815734118637, "step": 1300}, {"loss": 1.7623, "grad_norm": 0.39974215626716614, "learning_rate": 0.0002, "epoch": 1.1022297013041649, "step": 1310}, {"loss": 1.7506, "grad_norm": 0.3887332081794739, "learning_rate": 0.0002, "epoch": 1.110643668489693, "step": 1320}, {"loss": 1.7381, "grad_norm": 0.36216408014297485, "learning_rate": 0.0002, "epoch": 1.1190576356752209, "step": 1330}, {"loss": 1.762, "grad_norm": 0.36979028582572937, "learning_rate": 0.0002, "epoch": 1.1274716028607488, "step": 1340}, {"loss": 1.7515, "grad_norm": 0.34052133560180664, "learning_rate": 0.0002, "epoch": 1.1358855700462769, "step": 1350}, {"loss": 1.7513, "grad_norm": 0.3467716574668884, "learning_rate": 0.0002, "epoch": 1.1442995372318048, "step": 1360}, {"loss": 1.7086, "grad_norm": 0.35528799891471863, "learning_rate": 0.0002, "epoch": 1.1527135044173327, "step": 1370}, {"loss": 1.794, "grad_norm": 0.36282262206077576, "learning_rate": 0.0002, "epoch": 1.1611274716028608, "step": 1380}, {"loss": 1.7731, "grad_norm": 0.37355899810791016, "learning_rate": 0.0002, "epoch": 1.1695414387883887, "step": 1390}, {"loss": 1.7483, "grad_norm": 0.37292736768722534, "learning_rate": 0.0002, "epoch": 1.1779554059739168, "step": 1400}, {"loss": 1.6916, "grad_norm": 0.5892812013626099, "learning_rate": 0.0002, "epoch": 1.1863693731594447, "step": 1410}, {"loss": 1.7302, "grad_norm": 0.3712292015552521, "learning_rate": 0.0002, "epoch": 1.1947833403449726, "step": 1420}, {"loss": 1.7709, "grad_norm": 0.3349577486515045, "learning_rate": 0.0002, "epoch": 1.2031973075305007, "step": 1430}, {"loss": 1.7412, "grad_norm": 0.32591062784194946, "learning_rate": 0.0002, "epoch": 1.2116112747160286, "step": 1440}, {"loss": 1.7406, "grad_norm": 0.3840635418891907, "learning_rate": 0.0002, "epoch": 1.2200252419015567, "step": 1450}, {"loss": 1.7276, "grad_norm": 0.37238365411758423, "learning_rate": 0.0002, "epoch": 1.2284392090870846, "step": 1460}, {"loss": 1.7052, "grad_norm": 0.3731217682361603, "learning_rate": 0.0002, "epoch": 1.2368531762726125, "step": 1470}, {"loss": 1.7255, "grad_norm": 0.3318967819213867, "learning_rate": 0.0002, "epoch": 1.2452671434581406, "step": 1480}, {"loss": 1.7463, "grad_norm": 0.3784034848213196, "learning_rate": 0.0002, "epoch": 1.2536811106436685, "step": 1490}, {"loss": 1.6862, "grad_norm": 0.3541383147239685, "learning_rate": 0.0002, "epoch": 1.2620950778291964, "step": 1500}, {"loss": 1.8394, "grad_norm": 0.35312485694885254, "learning_rate": 0.0002, "epoch": 1.2705090450147245, "step": 1510}, {"loss": 1.7029, "grad_norm": 0.35272929072380066, "learning_rate": 0.0002, "epoch": 1.2789230122002524, "step": 1520}, {"loss": 1.7016, "grad_norm": 0.40988272428512573, "learning_rate": 0.0002, "epoch": 1.2873369793857803, "step": 1530}, {"loss": 1.6912, "grad_norm": 0.3543946146965027, "learning_rate": 0.0002, "epoch": 1.2957509465713084, "step": 1540}, {"loss": 1.6757, "grad_norm": 0.35639145970344543, "learning_rate": 0.0002, "epoch": 1.3041649137568363, "step": 1550}, {"loss": 1.6814, "grad_norm": 0.3290826678276062, "learning_rate": 0.0002, "epoch": 1.3125788809423642, "step": 1560}, {"loss": 1.7369, "grad_norm": 0.39264336228370667, "learning_rate": 0.0002, "epoch": 1.3209928481278923, "step": 1570}, {"loss": 1.6804, "grad_norm": 0.5390415191650391, "learning_rate": 0.0002, "epoch": 1.3294068153134202, "step": 1580}, {"loss": 1.708, "grad_norm": 0.5188116431236267, "learning_rate": 0.0002, "epoch": 1.3378207824989483, "step": 1590}, {"loss": 1.6763, "grad_norm": 0.37445148825645447, "learning_rate": 0.0002, "epoch": 1.3462347496844762, "step": 1600}, {"loss": 1.7386, "grad_norm": 0.3296085298061371, "learning_rate": 0.0002, "epoch": 1.3546487168700043, "step": 1610}, {"loss": 1.8107, "grad_norm": 0.39879581332206726, "learning_rate": 0.0002, "epoch": 1.3630626840555322, "step": 1620}, {"loss": 1.6744, "grad_norm": 0.36092764139175415, "learning_rate": 0.0002, "epoch": 1.37147665124106, "step": 1630}, {"loss": 1.7144, "grad_norm": 0.37011823058128357, "learning_rate": 0.0002, "epoch": 1.3798906184265882, "step": 1640}, {"loss": 1.7396, "grad_norm": 0.40863534808158875, "learning_rate": 0.0002, "epoch": 1.3883045856121161, "step": 1650}, {"loss": 1.7901, "grad_norm": 0.337001770734787, "learning_rate": 0.0002, "epoch": 1.396718552797644, "step": 1660}, {"loss": 1.7044, "grad_norm": 0.35596707463264465, "learning_rate": 0.0002, "epoch": 1.4051325199831721, "step": 1670}, {"loss": 1.7717, "grad_norm": 0.3857671916484833, "learning_rate": 0.0002, "epoch": 1.4135464871687, "step": 1680}, {"loss": 1.7015, "grad_norm": 0.419502317905426, "learning_rate": 0.0002, "epoch": 1.421960454354228, "step": 1690}, {"loss": 1.7261, "grad_norm": 0.35459452867507935, "learning_rate": 0.0002, "epoch": 1.430374421539756, "step": 1700}, {"loss": 1.7361, "grad_norm": 0.37246978282928467, "learning_rate": 0.0002, "epoch": 1.438788388725284, "step": 1710}, {"loss": 1.6762, "grad_norm": 0.33091893792152405, "learning_rate": 0.0002, "epoch": 1.4472023559108118, "step": 1720}, {"loss": 1.7044, "grad_norm": 0.37029674649238586, "learning_rate": 0.0002, "epoch": 1.45561632309634, "step": 1730}, {"loss": 1.7117, "grad_norm": 0.374025821685791, "learning_rate": 0.0002, "epoch": 1.4640302902818678, "step": 1740}, {"loss": 1.7549, "grad_norm": 0.3416315019130707, "learning_rate": 0.0002, "epoch": 1.472444257467396, "step": 1750}, {"loss": 1.7093, "grad_norm": 0.36502841114997864, "learning_rate": 0.0002, "epoch": 1.4808582246529238, "step": 1760}, {"loss": 1.6597, "grad_norm": 0.35458803176879883, "learning_rate": 0.0002, "epoch": 1.489272191838452, "step": 1770}, {"loss": 1.675, "grad_norm": 0.4462839663028717, "learning_rate": 0.0002, "epoch": 1.4976861590239798, "step": 1780}, {"loss": 1.7267, "grad_norm": 0.34836092591285706, "learning_rate": 0.0002, "epoch": 1.5061001262095077, "step": 1790}, {"loss": 1.7295, "grad_norm": 0.3445749282836914, "learning_rate": 0.0002, "epoch": 1.5145140933950358, "step": 1800}, {"loss": 1.7386, "grad_norm": 0.36012160778045654, "learning_rate": 0.0002, "epoch": 1.5229280605805637, "step": 1810}, {"loss": 1.6594, "grad_norm": 0.4052616059780121, "learning_rate": 0.0002, "epoch": 1.5313420277660916, "step": 1820}, {"loss": 1.72, "grad_norm": 0.3966905474662781, "learning_rate": 0.0002, "epoch": 1.5397559949516197, "step": 1830}, {"loss": 1.7595, "grad_norm": 0.35028719902038574, "learning_rate": 0.0002, "epoch": 1.5481699621371476, "step": 1840}, {"loss": 1.6829, "grad_norm": 0.3936742842197418, "learning_rate": 0.0002, "epoch": 1.5565839293226755, "step": 1850}, {"loss": 1.7579, "grad_norm": 0.34473296999931335, "learning_rate": 0.0002, "epoch": 1.5649978965082036, "step": 1860}, {"loss": 1.7207, "grad_norm": 0.4328365623950958, "learning_rate": 0.0002, "epoch": 1.5734118636937318, "step": 1870}, {"loss": 1.7098, "grad_norm": 0.3566315472126007, "learning_rate": 0.0002, "epoch": 1.5818258308792594, "step": 1880}, {"loss": 1.6095, "grad_norm": 0.3301256597042084, "learning_rate": 0.0002, "epoch": 1.5902397980647875, "step": 1890}, {"loss": 1.748, "grad_norm": 0.3743041455745697, "learning_rate": 0.0002, "epoch": 1.5986537652503157, "step": 1900}, {"loss": 1.7259, "grad_norm": 0.3735344707965851, "learning_rate": 0.0002, "epoch": 1.6070677324358436, "step": 1910}, {"loss": 1.7445, "grad_norm": 0.42191144824028015, "learning_rate": 0.0002, "epoch": 1.6154816996213714, "step": 1920}, {"loss": 1.6978, "grad_norm": 0.3787207305431366, "learning_rate": 0.0002, "epoch": 1.6238956668068996, "step": 1930}, {"loss": 1.6893, "grad_norm": 0.35647350549697876, "learning_rate": 0.0002, "epoch": 1.6323096339924275, "step": 1940}, {"loss": 1.7825, "grad_norm": 0.39791446924209595, "learning_rate": 0.0002, "epoch": 1.6407236011779553, "step": 1950}, {"loss": 1.7293, "grad_norm": 0.37341275811195374, "learning_rate": 0.0002, "epoch": 1.6491375683634835, "step": 1960}, {"loss": 1.6781, "grad_norm": 0.3722686469554901, "learning_rate": 0.0002, "epoch": 1.6575515355490114, "step": 1970}, {"loss": 1.6383, "grad_norm": 0.37467387318611145, "learning_rate": 0.0002, "epoch": 1.6659655027345392, "step": 1980}, {"loss": 1.7439, "grad_norm": 0.37109461426734924, "learning_rate": 0.0002, "epoch": 1.6743794699200674, "step": 1990}, {"loss": 1.7206, "grad_norm": 0.4008837044239044, "learning_rate": 0.0002, "epoch": 1.6827934371055953, "step": 2000}, {"loss": 1.7604, "grad_norm": 0.3316999673843384, "learning_rate": 0.0002, "epoch": 1.6912074042911232, "step": 2010}, {"loss": 1.7325, "grad_norm": 0.3683805465698242, "learning_rate": 0.0002, "epoch": 1.6996213714766513, "step": 2020}, {"loss": 1.7451, "grad_norm": 0.4163658320903778, "learning_rate": 0.0002, "epoch": 1.7080353386621794, "step": 2030}, {"loss": 1.741, "grad_norm": 0.4245431125164032, "learning_rate": 0.0002, "epoch": 1.716449305847707, "step": 2040}, {"loss": 1.7184, "grad_norm": 0.36732038855552673, "learning_rate": 0.0002, "epoch": 1.7248632730332352, "step": 2050}, {"loss": 1.7031, "grad_norm": 0.34981656074523926, "learning_rate": 0.0002, "epoch": 1.7332772402187633, "step": 2060}, {"loss": 1.7545, "grad_norm": 0.38588812947273254, "learning_rate": 0.0002, "epoch": 1.7416912074042912, "step": 2070}, {"loss": 1.7728, "grad_norm": 0.39914557337760925, "learning_rate": 0.0002, "epoch": 1.750105174589819, "step": 2080}, {"loss": 1.7049, "grad_norm": 0.36068692803382874, "learning_rate": 0.0002, "epoch": 1.7585191417753472, "step": 2090}, {"loss": 1.7537, "grad_norm": 0.3983287215232849, "learning_rate": 0.0002, "epoch": 1.766933108960875, "step": 2100}, {"loss": 1.7016, "grad_norm": 0.45008400082588196, "learning_rate": 0.0002, "epoch": 1.775347076146403, "step": 2110}, {"loss": 1.7163, "grad_norm": 0.3618052303791046, "learning_rate": 0.0002, "epoch": 1.783761043331931, "step": 2120}, {"loss": 1.7335, "grad_norm": 0.38745400309562683, "learning_rate": 0.0002, "epoch": 1.792175010517459, "step": 2130}, {"loss": 1.7387, "grad_norm": 0.3413826525211334, "learning_rate": 0.0002, "epoch": 1.8005889777029869, "step": 2140}, {"loss": 1.7414, "grad_norm": 0.35983747243881226, "learning_rate": 0.0002, "epoch": 1.809002944888515, "step": 2150}, {"loss": 1.7892, "grad_norm": 0.40926849842071533, "learning_rate": 0.0002, "epoch": 1.8174169120740429, "step": 2160}, {"loss": 1.6823, "grad_norm": 0.3543093800544739, "learning_rate": 0.0002, "epoch": 1.8258308792595708, "step": 2170}, {"loss": 1.7812, "grad_norm": 0.42690935730934143, "learning_rate": 0.0002, "epoch": 1.8342448464450989, "step": 2180}, {"loss": 1.7471, "grad_norm": 0.40282756090164185, "learning_rate": 0.0002, "epoch": 1.842658813630627, "step": 2190}, {"loss": 1.7411, "grad_norm": 0.36568400263786316, "learning_rate": 0.0002, "epoch": 1.8510727808161547, "step": 2200}, {"loss": 1.7024, "grad_norm": 0.43159013986587524, "learning_rate": 0.0002, "epoch": 1.8594867480016828, "step": 2210}, {"loss": 1.7298, "grad_norm": 0.3554118573665619, "learning_rate": 0.0002, "epoch": 1.867900715187211, "step": 2220}, {"loss": 1.7157, "grad_norm": 0.43349072337150574, "learning_rate": 0.0002, "epoch": 1.8763146823727388, "step": 2230}, {"loss": 1.7302, "grad_norm": 0.36486536264419556, "learning_rate": 0.0002, "epoch": 1.8847286495582667, "step": 2240}, {"loss": 1.6901, "grad_norm": 0.39260047674179077, "learning_rate": 0.0002, "epoch": 1.8931426167437948, "step": 2250}, {"loss": 1.6691, "grad_norm": 0.3741776943206787, "learning_rate": 0.0002, "epoch": 1.9015565839293227, "step": 2260}, {"loss": 1.6931, "grad_norm": 0.3961946964263916, "learning_rate": 0.0002, "epoch": 1.9099705511148506, "step": 2270}, {"loss": 1.737, "grad_norm": 0.3659731149673462, "learning_rate": 0.0002, "epoch": 1.9183845183003787, "step": 2280}, {"loss": 1.7342, "grad_norm": 0.34744107723236084, "learning_rate": 0.0002, "epoch": 1.9267984854859066, "step": 2290}, {"loss": 1.7162, "grad_norm": 0.3607442378997803, "learning_rate": 0.0002, "epoch": 1.9352124526714345, "step": 2300}, {"loss": 1.6673, "grad_norm": 0.331464558839798, "learning_rate": 0.0002, "epoch": 1.9436264198569626, "step": 2310}, {"loss": 1.7101, "grad_norm": 0.3904414474964142, "learning_rate": 0.0002, "epoch": 1.9520403870424905, "step": 2320}, {"loss": 1.7327, "grad_norm": 0.37584832310676575, "learning_rate": 0.0002, "epoch": 1.9604543542280184, "step": 2330}, {"loss": 1.7586, "grad_norm": 0.3698684275150299, "learning_rate": 0.0002, "epoch": 1.9688683214135465, "step": 2340}, {"loss": 1.7764, "grad_norm": 0.40571412444114685, "learning_rate": 0.0002, "epoch": 1.9772822885990746, "step": 2350}, {"loss": 1.744, "grad_norm": 0.40059587359428406, "learning_rate": 0.0002, "epoch": 1.9856962557846023, "step": 2360}, {"loss": 1.7033, "grad_norm": 0.4168248474597931, "learning_rate": 0.0002, "epoch": 1.9941102229701304, "step": 2370}, {"eval_loss": 1.8055059909820557, "eval_runtime": 38.422, "eval_samples_per_second": 13.404, "eval_steps_per_second": 1.692, "epoch": 2.0, "step": 2377}, {"loss": 1.7673, "grad_norm": 0.35205352306365967, "learning_rate": 0.0002, "epoch": 2.0025241901556585, "step": 2380}, {"loss": 1.6556, "grad_norm": 0.3979377746582031, "learning_rate": 0.0002, "epoch": 2.010938157341186, "step": 2390}, {"loss": 1.6421, "grad_norm": 0.396491676568985, "learning_rate": 0.0002, "epoch": 2.0193521245267143, "step": 2400}, {"loss": 1.6847, "grad_norm": 0.44712209701538086, "learning_rate": 0.0002, "epoch": 2.0277660917122424, "step": 2410}, {"loss": 1.6877, "grad_norm": 0.4454420208930969, "learning_rate": 0.0002, "epoch": 2.03618005889777, "step": 2420}, {"loss": 1.6635, "grad_norm": 0.4170038402080536, "learning_rate": 0.0002, "epoch": 2.044594026083298, "step": 2430}, {"loss": 1.6512, "grad_norm": 0.4309595227241516, "learning_rate": 0.0002, "epoch": 2.0530079932688263, "step": 2440}, {"loss": 1.6223, "grad_norm": 0.4241602122783661, "learning_rate": 0.0002, "epoch": 2.0614219604543544, "step": 2450}, {"loss": 1.6162, "grad_norm": 0.4370540678501129, "learning_rate": 0.0002, "epoch": 2.069835927639882, "step": 2460}, {"loss": 1.6354, "grad_norm": 0.43985554575920105, "learning_rate": 0.0002, "epoch": 2.0782498948254102, "step": 2470}, {"loss": 1.6954, "grad_norm": 0.4158105254173279, "learning_rate": 0.0002, "epoch": 2.0866638620109383, "step": 2480}, {"loss": 1.6114, "grad_norm": 0.441549152135849, "learning_rate": 0.0002, "epoch": 2.095077829196466, "step": 2490}, {"loss": 1.5485, "grad_norm": 0.385718435049057, "learning_rate": 0.0002, "epoch": 2.103491796381994, "step": 2500}, {"loss": 1.5894, "grad_norm": 0.43146514892578125, "learning_rate": 0.0002, "epoch": 2.1119057635675222, "step": 2510}, {"loss": 1.6414, "grad_norm": 0.41663315892219543, "learning_rate": 0.0002, "epoch": 2.12031973075305, "step": 2520}, {"loss": 1.6527, "grad_norm": 0.4410698115825653, "learning_rate": 0.0002, "epoch": 2.128733697938578, "step": 2530}, {"loss": 1.6124, "grad_norm": 0.4472278952598572, "learning_rate": 0.0002, "epoch": 2.137147665124106, "step": 2540}, {"loss": 1.6257, "grad_norm": 0.3879167437553406, "learning_rate": 0.0002, "epoch": 2.145561632309634, "step": 2550}, {"loss": 1.6682, "grad_norm": 0.4212203025817871, "learning_rate": 0.0002, "epoch": 2.153975599495162, "step": 2560}, {"loss": 1.6036, "grad_norm": 0.42841723561286926, "learning_rate": 0.0002, "epoch": 2.16238956668069, "step": 2570}, {"loss": 1.5962, "grad_norm": 0.39272481203079224, "learning_rate": 0.0002, "epoch": 2.1708035338662177, "step": 2580}, {"loss": 1.681, "grad_norm": 0.4075261354446411, "learning_rate": 0.0002, "epoch": 2.179217501051746, "step": 2590}, {"loss": 1.6601, "grad_norm": 0.5358437895774841, "learning_rate": 0.0002, "epoch": 2.187631468237274, "step": 2600}, {"loss": 1.6423, "grad_norm": 0.4738350212574005, "learning_rate": 0.0002, "epoch": 2.1960454354228016, "step": 2610}, {"loss": 1.6386, "grad_norm": 0.446789026260376, "learning_rate": 0.0002, "epoch": 2.2044594026083297, "step": 2620}, {"loss": 1.6246, "grad_norm": 0.4615374505519867, "learning_rate": 0.0002, "epoch": 2.212873369793858, "step": 2630}, {"loss": 1.6205, "grad_norm": 0.46901994943618774, "learning_rate": 0.0002, "epoch": 2.221287336979386, "step": 2640}, {"loss": 1.6774, "grad_norm": 0.46267789602279663, "learning_rate": 0.0002, "epoch": 2.2297013041649136, "step": 2650}, {"loss": 1.6584, "grad_norm": 0.4383080005645752, "learning_rate": 0.0002, "epoch": 2.2381152713504417, "step": 2660}, {"loss": 1.5745, "grad_norm": 0.4070609509944916, "learning_rate": 0.0002, "epoch": 2.24652923853597, "step": 2670}, {"loss": 1.6125, "grad_norm": 0.4572339951992035, "learning_rate": 0.0002, "epoch": 2.2549432057214975, "step": 2680}, {"loss": 1.5671, "grad_norm": 0.393265038728714, "learning_rate": 0.0002, "epoch": 2.2633571729070256, "step": 2690}, {"loss": 1.6239, "grad_norm": 0.46144717931747437, "learning_rate": 0.0002, "epoch": 2.2717711400925538, "step": 2700}, {"loss": 1.5992, "grad_norm": 0.45077767968177795, "learning_rate": 0.0002, "epoch": 2.2801851072780814, "step": 2710}, {"loss": 1.6261, "grad_norm": 0.5697639584541321, "learning_rate": 0.0002, "epoch": 2.2885990744636096, "step": 2720}, {"loss": 1.6192, "grad_norm": 0.4855510890483856, "learning_rate": 0.0002, "epoch": 2.2970130416491377, "step": 2730}, {"loss": 1.7419, "grad_norm": 0.4440622627735138, "learning_rate": 0.0002, "epoch": 2.3054270088346653, "step": 2740}, {"loss": 1.6496, "grad_norm": 0.3904096782207489, "learning_rate": 0.0002, "epoch": 2.3138409760201935, "step": 2750}, {"loss": 1.5888, "grad_norm": 0.5225510597229004, "learning_rate": 0.0002, "epoch": 2.3222549432057216, "step": 2760}, {"loss": 1.6082, "grad_norm": 0.44866397976875305, "learning_rate": 0.0002, "epoch": 2.3306689103912497, "step": 2770}, {"loss": 1.6087, "grad_norm": 0.5167056322097778, "learning_rate": 0.0002, "epoch": 2.3390828775767774, "step": 2780}, {"loss": 1.6136, "grad_norm": 0.45913267135620117, "learning_rate": 0.0002, "epoch": 2.3474968447623055, "step": 2790}, {"loss": 1.6564, "grad_norm": 0.45787590742111206, "learning_rate": 0.0002, "epoch": 2.3559108119478336, "step": 2800}, {"loss": 1.6868, "grad_norm": 0.4633352756500244, "learning_rate": 0.0002, "epoch": 2.3643247791333613, "step": 2810}, {"loss": 1.6316, "grad_norm": 0.46390071511268616, "learning_rate": 0.0002, "epoch": 2.3727387463188894, "step": 2820}, {"loss": 1.6039, "grad_norm": 0.4261005222797394, "learning_rate": 0.0002, "epoch": 2.3811527135044175, "step": 2830}, {"loss": 1.6364, "grad_norm": 0.4283634424209595, "learning_rate": 0.0002, "epoch": 2.389566680689945, "step": 2840}, {"loss": 1.6382, "grad_norm": 0.4955291450023651, "learning_rate": 0.0002, "epoch": 2.3979806478754733, "step": 2850}, {"loss": 1.6173, "grad_norm": 0.4740189015865326, "learning_rate": 0.0002, "epoch": 2.4063946150610014, "step": 2860}, {"loss": 1.6403, "grad_norm": 0.4222276508808136, "learning_rate": 0.0002, "epoch": 2.414808582246529, "step": 2870}, {"loss": 1.5602, "grad_norm": 0.4982149004936218, "learning_rate": 0.0002, "epoch": 2.423222549432057, "step": 2880}, {"loss": 1.6313, "grad_norm": 0.5217409133911133, "learning_rate": 0.0002, "epoch": 2.4316365166175853, "step": 2890}, {"loss": 1.5804, "grad_norm": 0.4555884897708893, "learning_rate": 0.0002, "epoch": 2.4400504838031134, "step": 2900}, {"loss": 1.6189, "grad_norm": 0.43178579211235046, "learning_rate": 0.0002, "epoch": 2.448464450988641, "step": 2910}, {"loss": 1.6824, "grad_norm": 0.4788478910923004, "learning_rate": 0.0002, "epoch": 2.456878418174169, "step": 2920}, {"loss": 1.6829, "grad_norm": 0.43689873814582825, "learning_rate": 0.0002, "epoch": 2.465292385359697, "step": 2930}, {"loss": 1.6196, "grad_norm": 0.5115197896957397, "learning_rate": 0.0002, "epoch": 2.473706352545225, "step": 2940}, {"loss": 1.689, "grad_norm": 0.5290159583091736, "learning_rate": 0.0002, "epoch": 2.482120319730753, "step": 2950}, {"loss": 1.6499, "grad_norm": 0.46042463183403015, "learning_rate": 0.0002, "epoch": 2.490534286916281, "step": 2960}, {"loss": 1.6664, "grad_norm": 0.4359915852546692, "learning_rate": 0.0002, "epoch": 2.498948254101809, "step": 2970}, {"loss": 1.5812, "grad_norm": 0.46352964639663696, "learning_rate": 0.0002, "epoch": 2.507362221287337, "step": 2980}, {"loss": 1.6501, "grad_norm": 0.5324268341064453, "learning_rate": 0.0002, "epoch": 2.515776188472865, "step": 2990}, {"loss": 1.6115, "grad_norm": 0.5929607152938843, "learning_rate": 0.0002, "epoch": 2.5241901556583928, "step": 3000}, {"loss": 1.6772, "grad_norm": 0.4811333417892456, "learning_rate": 0.0002, "epoch": 2.532604122843921, "step": 3010}, {"loss": 1.7023, "grad_norm": 0.4662701487541199, "learning_rate": 0.0002, "epoch": 2.541018090029449, "step": 3020}, {"loss": 1.5426, "grad_norm": 0.4582270681858063, "learning_rate": 0.0002, "epoch": 2.549432057214977, "step": 3030}, {"loss": 1.6737, "grad_norm": 0.4679982662200928, "learning_rate": 0.0002, "epoch": 2.557846024400505, "step": 3040}, {"loss": 1.5442, "grad_norm": 0.4380294680595398, "learning_rate": 0.0002, "epoch": 2.566259991586033, "step": 3050}, {"loss": 1.6055, "grad_norm": 0.44295763969421387, "learning_rate": 0.0002, "epoch": 2.5746739587715606, "step": 3060}, {"loss": 1.5775, "grad_norm": 0.5131027698516846, "learning_rate": 0.0002, "epoch": 2.5830879259570887, "step": 3070}, {"loss": 1.546, "grad_norm": 0.47567516565322876, "learning_rate": 0.0002, "epoch": 2.591501893142617, "step": 3080}, {"loss": 1.5671, "grad_norm": 0.49002596735954285, "learning_rate": 0.0002, "epoch": 2.599915860328145, "step": 3090}, {"loss": 1.5445, "grad_norm": 0.44856327772140503, "learning_rate": 0.0002, "epoch": 2.6083298275136726, "step": 3100}, {"loss": 1.5797, "grad_norm": 0.4480142593383789, "learning_rate": 0.0002, "epoch": 2.6167437946992007, "step": 3110}, {"loss": 1.7132, "grad_norm": 0.4317494034767151, "learning_rate": 0.0002, "epoch": 2.6251577618847284, "step": 3120}, {"loss": 1.6321, "grad_norm": 0.42580848932266235, "learning_rate": 0.0002, "epoch": 2.6335717290702565, "step": 3130}, {"loss": 1.6483, "grad_norm": 0.4516814947128296, "learning_rate": 0.0002, "epoch": 2.6419856962557846, "step": 3140}, {"loss": 1.695, "grad_norm": 0.4438435733318329, "learning_rate": 0.0002, "epoch": 2.6503996634413127, "step": 3150}, {"loss": 1.6938, "grad_norm": 0.4385356307029724, "learning_rate": 0.0002, "epoch": 2.6588136306268404, "step": 3160}, {"loss": 1.6139, "grad_norm": 0.5064112544059753, "learning_rate": 0.0002, "epoch": 2.6672275978123685, "step": 3170}, {"loss": 1.7189, "grad_norm": 0.49163177609443665, "learning_rate": 0.0002, "epoch": 2.6756415649978966, "step": 3180}, {"loss": 1.7323, "grad_norm": 0.49339258670806885, "learning_rate": 0.0002, "epoch": 2.6840555321834243, "step": 3190}, {"loss": 1.6508, "grad_norm": 0.440950870513916, "learning_rate": 0.0002, "epoch": 2.6924694993689524, "step": 3200}, {"loss": 1.6305, "grad_norm": 0.4283970594406128, "learning_rate": 0.0002, "epoch": 2.7008834665544805, "step": 3210}, {"loss": 1.5935, "grad_norm": 0.43875712156295776, "learning_rate": 0.0002, "epoch": 2.7092974337400086, "step": 3220}, {"loss": 1.6129, "grad_norm": 0.49332964420318604, "learning_rate": 0.0002, "epoch": 2.7177114009255363, "step": 3230}, {"loss": 1.642, "grad_norm": 0.5225692391395569, "learning_rate": 0.0002, "epoch": 2.7261253681110644, "step": 3240}, {"loss": 1.6759, "grad_norm": 0.4856489300727844, "learning_rate": 0.0002, "epoch": 2.734539335296592, "step": 3250}, {"loss": 1.6463, "grad_norm": 0.46918296813964844, "learning_rate": 0.0002, "epoch": 2.74295330248212, "step": 3260}, {"loss": 1.6819, "grad_norm": 0.4802931249141693, "learning_rate": 0.0002, "epoch": 2.7513672696676483, "step": 3270}, {"loss": 1.6246, "grad_norm": 0.4485355615615845, "learning_rate": 0.0002, "epoch": 2.7597812368531764, "step": 3280}, {"loss": 1.6251, "grad_norm": 0.43944594264030457, "learning_rate": 0.0002, "epoch": 2.768195204038704, "step": 3290}, {"loss": 1.6501, "grad_norm": 0.46847742795944214, "learning_rate": 0.0002, "epoch": 2.7766091712242322, "step": 3300}, {"loss": 1.5969, "grad_norm": 0.4816027879714966, "learning_rate": 0.0002, "epoch": 2.7850231384097603, "step": 3310}, {"loss": 1.6293, "grad_norm": 0.453960120677948, "learning_rate": 0.0002, "epoch": 2.793437105595288, "step": 3320}, {"loss": 1.6429, "grad_norm": 0.4816017150878906, "learning_rate": 0.0002, "epoch": 2.801851072780816, "step": 3330}, {"loss": 1.6683, "grad_norm": 0.4461034834384918, "learning_rate": 0.0002, "epoch": 2.8102650399663442, "step": 3340}, {"loss": 1.7048, "grad_norm": 0.48821821808815, "learning_rate": 0.0002, "epoch": 2.8186790071518724, "step": 3350}, {"loss": 1.6076, "grad_norm": 0.4574853777885437, "learning_rate": 0.0002, "epoch": 2.8270929743374, "step": 3360}, {"loss": 1.6651, "grad_norm": 0.42062026262283325, "learning_rate": 0.0002, "epoch": 2.835506941522928, "step": 3370}, {"loss": 1.624, "grad_norm": 0.4499834477901459, "learning_rate": 0.0002, "epoch": 2.843920908708456, "step": 3380}, {"loss": 1.621, "grad_norm": 0.4780360758304596, "learning_rate": 0.0002, "epoch": 2.852334875893984, "step": 3390}, {"loss": 1.5882, "grad_norm": 0.45422887802124023, "learning_rate": 0.0002, "epoch": 2.860748843079512, "step": 3400}, {"loss": 1.6028, "grad_norm": 0.4590015709400177, "learning_rate": 0.0002, "epoch": 2.86916281026504, "step": 3410}, {"loss": 1.6746, "grad_norm": 0.45689624547958374, "learning_rate": 0.0002, "epoch": 2.877576777450568, "step": 3420}, {"loss": 1.6326, "grad_norm": 0.46953922510147095, "learning_rate": 0.0002, "epoch": 2.885990744636096, "step": 3430}, {"loss": 1.6015, "grad_norm": 0.4791966378688812, "learning_rate": 0.0002, "epoch": 2.8944047118216236, "step": 3440}, {"loss": 1.694, "grad_norm": 0.4842296242713928, "learning_rate": 0.0002, "epoch": 2.9028186790071517, "step": 3450}, {"loss": 1.6326, "grad_norm": 0.47219768166542053, "learning_rate": 0.0002, "epoch": 2.91123264619268, "step": 3460}, {"loss": 1.6486, "grad_norm": 0.4622127115726471, "learning_rate": 0.0002, "epoch": 2.919646613378208, "step": 3470}, {"loss": 1.6485, "grad_norm": 0.46832820773124695, "learning_rate": 0.0002, "epoch": 2.9280605805637356, "step": 3480}, {"loss": 1.6366, "grad_norm": 0.44582483172416687, "learning_rate": 0.0002, "epoch": 2.9364745477492638, "step": 3490}, {"loss": 1.6859, "grad_norm": 0.4987219274044037, "learning_rate": 0.0002, "epoch": 2.944888514934792, "step": 3500}, {"loss": 1.5991, "grad_norm": 0.43750956654548645, "learning_rate": 0.0002, "epoch": 2.9533024821203195, "step": 3510}, {"loss": 1.6236, "grad_norm": 0.49962925910949707, "learning_rate": 0.0002, "epoch": 2.9617164493058477, "step": 3520}, {"loss": 1.5859, "grad_norm": 0.5189590454101562, "learning_rate": 0.0002, "epoch": 2.9701304164913758, "step": 3530}, {"loss": 1.6688, "grad_norm": 0.391317754983902, "learning_rate": 0.0002, "epoch": 2.978544383676904, "step": 3540}, {"loss": 1.5884, "grad_norm": 0.44934695959091187, "learning_rate": 0.0002, "epoch": 2.9869583508624316, "step": 3550}, {"loss": 1.5688, "grad_norm": 0.4740142226219177, "learning_rate": 0.0002, "epoch": 2.9953723180479597, "step": 3560}, {"eval_loss": 1.8266887664794922, "eval_runtime": 37.9445, "eval_samples_per_second": 13.572, "eval_steps_per_second": 1.713, "epoch": 2.9995793016407237, "step": 3565}, {"loss": 1.5939, "grad_norm": 0.4523724615573883, "learning_rate": 0.0002, "epoch": 3.003786285233488, "step": 3570}, {"loss": 1.526, "grad_norm": 0.5261380076408386, "learning_rate": 0.0002, "epoch": 3.0122002524190155, "step": 3580}, {"loss": 1.4946, "grad_norm": 0.48664888739585876, "learning_rate": 0.0002, "epoch": 3.0206142196045436, "step": 3590}, {"loss": 1.5193, "grad_norm": 0.5070882439613342, "learning_rate": 0.0002, "epoch": 3.0290281867900717, "step": 3600}, {"loss": 1.5316, "grad_norm": 0.5816011428833008, "learning_rate": 0.0002, "epoch": 3.0374421539755994, "step": 3610}, {"loss": 1.5682, "grad_norm": 0.6610211730003357, "learning_rate": 0.0002, "epoch": 3.0458561211611275, "step": 3620}, {"loss": 1.5699, "grad_norm": 0.5257703065872192, "learning_rate": 0.0002, "epoch": 3.0542700883466556, "step": 3630}, {"loss": 1.4438, "grad_norm": 0.5574390888214111, "learning_rate": 0.0002, "epoch": 3.0626840555321833, "step": 3640}, {"loss": 1.547, "grad_norm": 0.5682297348976135, "learning_rate": 0.0002, "epoch": 3.0710980227177114, "step": 3650}, {"loss": 1.5743, "grad_norm": 0.5798383355140686, "learning_rate": 0.0002, "epoch": 3.0795119899032395, "step": 3660}, {"loss": 1.4339, "grad_norm": 0.5458289980888367, "learning_rate": 0.0002, "epoch": 3.087925957088767, "step": 3670}, {"loss": 1.46, "grad_norm": 0.5599102973937988, "learning_rate": 0.0002, "epoch": 3.0963399242742953, "step": 3680}, {"loss": 1.4589, "grad_norm": 0.5023021697998047, "learning_rate": 0.0002, "epoch": 3.1047538914598234, "step": 3690}, {"loss": 1.5114, "grad_norm": 0.5448206067085266, "learning_rate": 0.0002, "epoch": 3.113167858645351, "step": 3700}, {"loss": 1.4692, "grad_norm": 0.5760458707809448, "learning_rate": 0.0002, "epoch": 3.121581825830879, "step": 3710}, {"loss": 1.4789, "grad_norm": 0.6018968224525452, "learning_rate": 0.0002, "epoch": 3.1299957930164073, "step": 3720}, {"loss": 1.5518, "grad_norm": 0.5767101049423218, "learning_rate": 0.0002, "epoch": 3.1384097602019354, "step": 3730}, {"loss": 1.5032, "grad_norm": 0.5333963632583618, "learning_rate": 0.0002, "epoch": 3.146823727387463, "step": 3740}, {"loss": 1.4812, "grad_norm": 0.5918396711349487, "learning_rate": 0.0002, "epoch": 3.155237694572991, "step": 3750}, {"loss": 1.4618, "grad_norm": 0.5931203365325928, "learning_rate": 0.0002, "epoch": 3.1636516617585193, "step": 3760}, {"loss": 1.5592, "grad_norm": 0.6562168598175049, "learning_rate": 0.0002, "epoch": 3.172065628944047, "step": 3770}, {"loss": 1.4932, "grad_norm": 0.5820156335830688, "learning_rate": 0.0002, "epoch": 3.180479596129575, "step": 3780}, {"loss": 1.4523, "grad_norm": 0.5784737467765808, "learning_rate": 0.0002, "epoch": 3.188893563315103, "step": 3790}, {"loss": 1.498, "grad_norm": 0.5506529808044434, "learning_rate": 0.0002, "epoch": 3.197307530500631, "step": 3800}, {"loss": 1.4819, "grad_norm": 0.6101595163345337, "learning_rate": 0.0002, "epoch": 3.205721497686159, "step": 3810}, {"loss": 1.5185, "grad_norm": 0.5597806572914124, "learning_rate": 0.0002, "epoch": 3.214135464871687, "step": 3820}, {"loss": 1.5664, "grad_norm": 0.5641011595726013, "learning_rate": 0.0002, "epoch": 3.222549432057215, "step": 3830}, {"loss": 1.4702, "grad_norm": 0.5892080068588257, "learning_rate": 0.0002, "epoch": 3.230963399242743, "step": 3840}, {"loss": 1.4194, "grad_norm": 0.6034760475158691, "learning_rate": 0.0002, "epoch": 3.239377366428271, "step": 3850}, {"loss": 1.5499, "grad_norm": 0.5112439393997192, "learning_rate": 0.0002, "epoch": 3.247791333613799, "step": 3860}, {"loss": 1.5132, "grad_norm": 0.56565922498703, "learning_rate": 0.0002, "epoch": 3.256205300799327, "step": 3870}, {"loss": 1.4892, "grad_norm": 0.6155247092247009, "learning_rate": 0.0002, "epoch": 3.264619267984855, "step": 3880}, {"loss": 1.5118, "grad_norm": 0.6064623594284058, "learning_rate": 0.0002, "epoch": 3.273033235170383, "step": 3890}, {"loss": 1.5236, "grad_norm": 0.6313768029212952, "learning_rate": 0.0002, "epoch": 3.2814472023559107, "step": 3900}, {"loss": 1.5551, "grad_norm": 0.5903939008712769, "learning_rate": 0.0002, "epoch": 3.289861169541439, "step": 3910}, {"loss": 1.5703, "grad_norm": 0.5770667195320129, "learning_rate": 0.0002, "epoch": 3.298275136726967, "step": 3920}, {"loss": 1.5159, "grad_norm": 0.5785196423530579, "learning_rate": 0.0002, "epoch": 3.3066891039124946, "step": 3930}, {"loss": 1.5277, "grad_norm": 0.6468310356140137, "learning_rate": 0.0002, "epoch": 3.3151030710980227, "step": 3940}, {"loss": 1.6002, "grad_norm": 0.6200279593467712, "learning_rate": 0.0002, "epoch": 3.323517038283551, "step": 3950}, {"loss": 1.5264, "grad_norm": 0.5779302716255188, "learning_rate": 0.0002, "epoch": 3.3319310054690785, "step": 3960}, {"loss": 1.4861, "grad_norm": 0.5463796854019165, "learning_rate": 0.0002, "epoch": 3.3403449726546066, "step": 3970}, {"loss": 1.541, "grad_norm": 0.6117855906486511, "learning_rate": 0.0002, "epoch": 3.3487589398401347, "step": 3980}, {"loss": 1.5566, "grad_norm": 0.5554766058921814, "learning_rate": 0.0002, "epoch": 3.357172907025663, "step": 3990}, {"loss": 1.5004, "grad_norm": 0.6012870073318481, "learning_rate": 0.0002, "epoch": 3.3655868742111905, "step": 4000}, {"loss": 1.473, "grad_norm": 0.5443974137306213, "learning_rate": 0.0002, "epoch": 3.3740008413967186, "step": 4010}, {"loss": 1.5139, "grad_norm": 0.6636057496070862, "learning_rate": 0.0002, "epoch": 3.3824148085822463, "step": 4020}, {"loss": 1.5141, "grad_norm": 0.5801246166229248, "learning_rate": 0.0002, "epoch": 3.3908287757677744, "step": 4030}, {"loss": 1.5026, "grad_norm": 0.5668839812278748, "learning_rate": 0.0002, "epoch": 3.3992427429533025, "step": 4040}, {"loss": 1.523, "grad_norm": 0.7763481736183167, "learning_rate": 0.0002, "epoch": 3.4076567101388306, "step": 4050}, {"loss": 1.4932, "grad_norm": 0.6675992608070374, "learning_rate": 0.0002, "epoch": 3.4160706773243583, "step": 4060}, {"loss": 1.4959, "grad_norm": 0.6290077567100525, "learning_rate": 0.0002, "epoch": 3.4244846445098864, "step": 4070}, {"loss": 1.5766, "grad_norm": 0.6040239930152893, "learning_rate": 0.0002, "epoch": 3.4328986116954145, "step": 4080}, {"loss": 1.5711, "grad_norm": 0.6237877607345581, "learning_rate": 0.0002, "epoch": 3.441312578880942, "step": 4090}, {"loss": 1.4961, "grad_norm": 0.5343508124351501, "learning_rate": 0.0002, "epoch": 3.4497265460664703, "step": 4100}, {"loss": 1.5123, "grad_norm": 0.6817412972450256, "learning_rate": 0.0002, "epoch": 3.4581405132519984, "step": 4110}, {"loss": 1.5377, "grad_norm": 0.7115170359611511, "learning_rate": 0.0002, "epoch": 3.466554480437526, "step": 4120}, {"loss": 1.5275, "grad_norm": 0.6127332448959351, "learning_rate": 0.0002, "epoch": 3.4749684476230542, "step": 4130}, {"loss": 1.557, "grad_norm": 0.5745994448661804, "learning_rate": 0.0002, "epoch": 3.4833824148085824, "step": 4140}, {"loss": 1.4873, "grad_norm": 0.6248795390129089, "learning_rate": 0.0002, "epoch": 3.49179638199411, "step": 4150}, {"loss": 1.4885, "grad_norm": 0.5821124911308289, "learning_rate": 0.0002, "epoch": 3.500210349179638, "step": 4160}, {"loss": 1.4937, "grad_norm": 0.561416506767273, "learning_rate": 0.0002, "epoch": 3.5086243163651663, "step": 4170}, {"loss": 1.5453, "grad_norm": 0.5848962664604187, "learning_rate": 0.0002, "epoch": 3.5170382835506944, "step": 4180}, {"loss": 1.5892, "grad_norm": 0.5335569977760315, "learning_rate": 0.0002, "epoch": 3.525452250736222, "step": 4190}, {"loss": 1.5152, "grad_norm": 0.547964870929718, "learning_rate": 0.0002, "epoch": 3.53386621792175, "step": 4200}, {"loss": 1.4887, "grad_norm": 0.6157727241516113, "learning_rate": 0.0002, "epoch": 3.542280185107278, "step": 4210}, {"loss": 1.5484, "grad_norm": 0.6163121461868286, "learning_rate": 0.0002, "epoch": 3.550694152292806, "step": 4220}, {"loss": 1.5833, "grad_norm": 0.5844616293907166, "learning_rate": 0.0002, "epoch": 3.559108119478334, "step": 4230}, {"loss": 1.5305, "grad_norm": 0.7104926109313965, "learning_rate": 0.0002, "epoch": 3.567522086663862, "step": 4240}, {"loss": 1.5161, "grad_norm": 0.5055213570594788, "learning_rate": 0.0002, "epoch": 3.57593605384939, "step": 4250}, {"loss": 1.482, "grad_norm": 0.611676812171936, "learning_rate": 0.0002, "epoch": 3.584350021034918, "step": 4260}, {"loss": 1.5048, "grad_norm": 0.6326440572738647, "learning_rate": 0.0002, "epoch": 3.592763988220446, "step": 4270}, {"loss": 1.5122, "grad_norm": 0.6290925741195679, "learning_rate": 0.0002, "epoch": 3.6011779554059737, "step": 4280}, {"loss": 1.5654, "grad_norm": 0.5691978931427002, "learning_rate": 0.0002, "epoch": 3.609591922591502, "step": 4290}, {"loss": 1.4854, "grad_norm": 0.6071329116821289, "learning_rate": 0.0002, "epoch": 3.61800588977703, "step": 4300}, {"loss": 1.5336, "grad_norm": 0.606573224067688, "learning_rate": 0.0002, "epoch": 3.626419856962558, "step": 4310}, {"loss": 1.6437, "grad_norm": 0.5515419244766235, "learning_rate": 0.0002, "epoch": 3.6348338241480858, "step": 4320}, {"loss": 1.498, "grad_norm": 0.5964660048484802, "learning_rate": 0.0002, "epoch": 3.643247791333614, "step": 4330}, {"loss": 1.544, "grad_norm": 0.5774146914482117, "learning_rate": 0.0002, "epoch": 3.6516617585191415, "step": 4340}, {"loss": 1.5566, "grad_norm": 0.5732731223106384, "learning_rate": 0.0002, "epoch": 3.6600757257046697, "step": 4350}, {"loss": 1.5682, "grad_norm": 0.7354163527488708, "learning_rate": 0.0002, "epoch": 3.6684896928901978, "step": 4360}, {"loss": 1.5225, "grad_norm": 0.6220902800559998, "learning_rate": 0.0002, "epoch": 3.676903660075726, "step": 4370}, {"loss": 1.4838, "grad_norm": 0.6053991317749023, "learning_rate": 0.0002, "epoch": 3.6853176272612536, "step": 4380}, {"loss": 1.5161, "grad_norm": 0.67010897397995, "learning_rate": 0.0002, "epoch": 3.6937315944467817, "step": 4390}, {"loss": 1.5381, "grad_norm": 0.6139186024665833, "learning_rate": 0.0002, "epoch": 3.70214556163231, "step": 4400}, {"loss": 1.5088, "grad_norm": 0.5433071851730347, "learning_rate": 0.0002, "epoch": 3.7105595288178375, "step": 4410}, {"loss": 1.5337, "grad_norm": 0.5453870296478271, "learning_rate": 0.0002, "epoch": 3.7189734960033656, "step": 4420}, {"loss": 1.4549, "grad_norm": 0.6401727199554443, "learning_rate": 0.0002, "epoch": 3.7273874631888937, "step": 4430}, {"loss": 1.503, "grad_norm": 0.6049367189407349, "learning_rate": 0.0002, "epoch": 3.735801430374422, "step": 4440}, {"loss": 1.5268, "grad_norm": 0.5740529298782349, "learning_rate": 0.0002, "epoch": 3.7442153975599495, "step": 4450}, {"loss": 1.5183, "grad_norm": 0.6521880626678467, "learning_rate": 0.0002, "epoch": 3.7526293647454776, "step": 4460}, {"loss": 1.5741, "grad_norm": 0.7096368074417114, "learning_rate": 0.0002, "epoch": 3.7610433319310053, "step": 4470}, {"loss": 1.5786, "grad_norm": 0.5886474251747131, "learning_rate": 0.0002, "epoch": 3.7694572991165334, "step": 4480}, {"loss": 1.5887, "grad_norm": 0.5821043252944946, "learning_rate": 0.0002, "epoch": 3.7778712663020615, "step": 4490}, {"loss": 1.5777, "grad_norm": 0.628892183303833, "learning_rate": 0.0002, "epoch": 3.7862852334875896, "step": 4500}, {"loss": 1.4708, "grad_norm": 0.5962669849395752, "learning_rate": 0.0002, "epoch": 3.7946992006731173, "step": 4510}, {"loss": 1.5267, "grad_norm": 0.6635549068450928, "learning_rate": 0.0002, "epoch": 3.8031131678586454, "step": 4520}, {"loss": 1.5058, "grad_norm": 0.6010760068893433, "learning_rate": 0.0002, "epoch": 3.811527135044173, "step": 4530}, {"loss": 1.6228, "grad_norm": 0.6322658658027649, "learning_rate": 0.0002, "epoch": 3.819941102229701, "step": 4540}, {"loss": 1.5029, "grad_norm": 0.5893137454986572, "learning_rate": 0.0002, "epoch": 3.8283550694152293, "step": 4550}, {"loss": 1.5435, "grad_norm": 0.7829602360725403, "learning_rate": 0.0002, "epoch": 3.8367690366007574, "step": 4560}, {"loss": 1.5453, "grad_norm": 0.6190396547317505, "learning_rate": 0.0002, "epoch": 3.845183003786285, "step": 4570}, {"loss": 1.5292, "grad_norm": 0.6662813425064087, "learning_rate": 0.0002, "epoch": 3.853596970971813, "step": 4580}, {"loss": 1.5065, "grad_norm": 0.5809855461120605, "learning_rate": 0.0002, "epoch": 3.8620109381573413, "step": 4590}, {"loss": 1.5041, "grad_norm": 0.5779069662094116, "learning_rate": 0.0002, "epoch": 3.870424905342869, "step": 4600}, {"loss": 1.498, "grad_norm": 0.5603038668632507, "learning_rate": 0.0002, "epoch": 3.878838872528397, "step": 4610}, {"loss": 1.5372, "grad_norm": 0.6274181008338928, "learning_rate": 0.0002, "epoch": 3.887252839713925, "step": 4620}, {"loss": 1.4996, "grad_norm": 0.6810959577560425, "learning_rate": 0.0002, "epoch": 3.8956668068994533, "step": 4630}, {"loss": 1.4956, "grad_norm": 0.5647315979003906, "learning_rate": 0.0002, "epoch": 3.904080774084981, "step": 4640}, {"loss": 1.5424, "grad_norm": 0.6830295324325562, "learning_rate": 0.0002, "epoch": 3.912494741270509, "step": 4650}, {"loss": 1.535, "grad_norm": 0.652565598487854, "learning_rate": 0.0002, "epoch": 3.920908708456037, "step": 4660}, {"loss": 1.4772, "grad_norm": 0.5806284546852112, "learning_rate": 0.0002, "epoch": 3.929322675641565, "step": 4670}, {"loss": 1.5812, "grad_norm": 0.6825073957443237, "learning_rate": 0.0002, "epoch": 3.937736642827093, "step": 4680}, {"loss": 1.5516, "grad_norm": 0.6149451732635498, "learning_rate": 0.0002, "epoch": 3.946150610012621, "step": 4690}, {"loss": 1.5608, "grad_norm": 0.6152557134628296, "learning_rate": 0.0002, "epoch": 3.954564577198149, "step": 4700}, {"loss": 1.4897, "grad_norm": 0.6239011883735657, "learning_rate": 0.0002, "epoch": 3.962978544383677, "step": 4710}, {"loss": 1.538, "grad_norm": 0.6485443115234375, "learning_rate": 0.0002, "epoch": 3.971392511569205, "step": 4720}, {"loss": 1.5226, "grad_norm": 0.6449228525161743, "learning_rate": 0.0002, "epoch": 3.9798064787547327, "step": 4730}, {"loss": 1.5087, "grad_norm": 0.6526407599449158, "learning_rate": 0.0002, "epoch": 3.988220445940261, "step": 4740}, {"loss": 1.5026, "grad_norm": 0.6277706027030945, "learning_rate": 0.0002, "epoch": 3.996634413125789, "step": 4750}, {"eval_loss": 1.871641755104065, "eval_runtime": 37.9637, "eval_samples_per_second": 13.566, "eval_steps_per_second": 1.712, "epoch": 4.0, "step": 4754}, {"loss": 1.4744, "grad_norm": 0.6994837522506714, "learning_rate": 0.0002, "epoch": 4.005048380311317, "step": 4760}, {"loss": 1.4433, "grad_norm": 0.8728373050689697, "learning_rate": 0.0002, "epoch": 4.013462347496845, "step": 4770}, {"loss": 1.3329, "grad_norm": 0.688679575920105, "learning_rate": 0.0002, "epoch": 4.021876314682372, "step": 4780}, {"loss": 1.3999, "grad_norm": 0.6313387155532837, "learning_rate": 0.0002, "epoch": 4.0302902818679005, "step": 4790}, {"loss": 1.3346, "grad_norm": 0.6577984690666199, "learning_rate": 0.0002, "epoch": 4.038704249053429, "step": 4800}, {"loss": 1.3403, "grad_norm": 0.7938185930252075, "learning_rate": 0.0002, "epoch": 4.047118216238957, "step": 4810}, {"loss": 1.3716, "grad_norm": 0.760399580001831, "learning_rate": 0.0002, "epoch": 4.055532183424485, "step": 4820}, {"loss": 1.4321, "grad_norm": 0.7329602241516113, "learning_rate": 0.0002, "epoch": 4.063946150610013, "step": 4830}, {"loss": 1.4133, "grad_norm": 0.7778576016426086, "learning_rate": 0.0002, "epoch": 4.07236011779554, "step": 4840}, {"loss": 1.4372, "grad_norm": 0.8235865235328674, "learning_rate": 0.0002, "epoch": 4.080774084981068, "step": 4850}, {"loss": 1.3719, "grad_norm": 0.7743754386901855, "learning_rate": 0.0002, "epoch": 4.089188052166596, "step": 4860}, {"loss": 1.3787, "grad_norm": 0.8145367503166199, "learning_rate": 0.0002, "epoch": 4.0976020193521245, "step": 4870}, {"loss": 1.356, "grad_norm": 0.8517307639122009, "learning_rate": 0.0002, "epoch": 4.106015986537653, "step": 4880}, {"loss": 1.4191, "grad_norm": 0.8208953142166138, "learning_rate": 0.0002, "epoch": 4.114429953723181, "step": 4890}, {"loss": 1.3189, "grad_norm": 0.8437790870666504, "learning_rate": 0.0002, "epoch": 4.122843920908709, "step": 4900}, {"loss": 1.3987, "grad_norm": 0.716672420501709, "learning_rate": 0.0002, "epoch": 4.131257888094236, "step": 4910}, {"loss": 1.4392, "grad_norm": 0.7656235098838806, "learning_rate": 0.0002, "epoch": 4.139671855279764, "step": 4920}, {"loss": 1.3408, "grad_norm": 0.7209306955337524, "learning_rate": 0.0002, "epoch": 4.148085822465292, "step": 4930}, {"loss": 1.3639, "grad_norm": 0.7731267809867859, "learning_rate": 0.0002, "epoch": 4.1564997896508205, "step": 4940}, {"loss": 1.4151, "grad_norm": 0.7477553486824036, "learning_rate": 0.0002, "epoch": 4.164913756836349, "step": 4950}, {"loss": 1.3485, "grad_norm": 0.7372981309890747, "learning_rate": 0.0002, "epoch": 4.173327724021877, "step": 4960}, {"loss": 1.3901, "grad_norm": 0.6582154035568237, "learning_rate": 0.0002, "epoch": 4.181741691207404, "step": 4970}, {"loss": 1.3343, "grad_norm": 0.7003206610679626, "learning_rate": 0.0002, "epoch": 4.190155658392932, "step": 4980}, {"loss": 1.4098, "grad_norm": 0.735223650932312, "learning_rate": 0.0002, "epoch": 4.19856962557846, "step": 4990}, {"loss": 1.3564, "grad_norm": 0.7832302451133728, "learning_rate": 0.0002, "epoch": 4.206983592763988, "step": 5000}, {"loss": 1.3622, "grad_norm": 0.8819546103477478, "learning_rate": 0.0002, "epoch": 4.215397559949516, "step": 5010}, {"loss": 1.4438, "grad_norm": 0.9325336813926697, "learning_rate": 0.0002, "epoch": 4.2238115271350445, "step": 5020}, {"loss": 1.3886, "grad_norm": 0.7007517218589783, "learning_rate": 0.0002, "epoch": 4.232225494320572, "step": 5030}, {"loss": 1.3683, "grad_norm": 0.7118321061134338, "learning_rate": 0.0002, "epoch": 4.2406394615061, "step": 5040}, {"loss": 1.2365, "grad_norm": 0.6578946709632874, "learning_rate": 0.0002, "epoch": 4.249053428691628, "step": 5050}, {"loss": 1.3696, "grad_norm": 0.9438983798027039, "learning_rate": 0.0002, "epoch": 4.257467395877156, "step": 5060}, {"loss": 1.3868, "grad_norm": 0.703037679195404, "learning_rate": 0.0002, "epoch": 4.265881363062684, "step": 5070}, {"loss": 1.3687, "grad_norm": 0.7286025285720825, "learning_rate": 0.0002, "epoch": 4.274295330248212, "step": 5080}, {"loss": 1.3605, "grad_norm": 0.750689685344696, "learning_rate": 0.0002, "epoch": 4.28270929743374, "step": 5090}, {"loss": 1.5089, "grad_norm": 0.869753360748291, "learning_rate": 0.0002, "epoch": 4.291123264619268, "step": 5100}, {"loss": 1.4128, "grad_norm": 0.8712980151176453, "learning_rate": 0.0002, "epoch": 4.299537231804796, "step": 5110}, {"loss": 1.3977, "grad_norm": 0.690263569355011, "learning_rate": 0.0002, "epoch": 4.307951198990324, "step": 5120}, {"loss": 1.4088, "grad_norm": 0.7114760279655457, "learning_rate": 0.0002, "epoch": 4.316365166175852, "step": 5130}, {"loss": 1.363, "grad_norm": 0.7588112354278564, "learning_rate": 0.0002, "epoch": 4.32477913336138, "step": 5140}, {"loss": 1.4408, "grad_norm": 0.7556202411651611, "learning_rate": 0.0002, "epoch": 4.333193100546908, "step": 5150}, {"loss": 1.4203, "grad_norm": 0.8357610702514648, "learning_rate": 0.0002, "epoch": 4.341607067732435, "step": 5160}, {"loss": 1.3348, "grad_norm": 0.8054035902023315, "learning_rate": 0.0002, "epoch": 4.3500210349179635, "step": 5170}, {"loss": 1.3109, "grad_norm": 0.7637107968330383, "learning_rate": 0.0002, "epoch": 4.358435002103492, "step": 5180}, {"loss": 1.3744, "grad_norm": 0.757481038570404, "learning_rate": 0.0002, "epoch": 4.36684896928902, "step": 5190}, {"loss": 1.3622, "grad_norm": 0.7185863852500916, "learning_rate": 0.0002, "epoch": 4.375262936474548, "step": 5200}, {"loss": 1.3896, "grad_norm": 0.7326455116271973, "learning_rate": 0.0002, "epoch": 4.383676903660076, "step": 5210}, {"loss": 1.4098, "grad_norm": 0.7980523109436035, "learning_rate": 0.0002, "epoch": 4.392090870845603, "step": 5220}, {"loss": 1.3783, "grad_norm": 0.8526999354362488, "learning_rate": 0.0002, "epoch": 4.400504838031131, "step": 5230}, {"loss": 1.4022, "grad_norm": 0.7012337446212769, "learning_rate": 0.0002, "epoch": 4.4089188052166595, "step": 5240}, {"loss": 1.3552, "grad_norm": 0.8217827677726746, "learning_rate": 0.0002, "epoch": 4.417332772402188, "step": 5250}, {"loss": 1.3482, "grad_norm": 0.7141005396842957, "learning_rate": 0.0002, "epoch": 4.425746739587716, "step": 5260}, {"loss": 1.3699, "grad_norm": 0.7094302177429199, "learning_rate": 0.0002, "epoch": 4.434160706773244, "step": 5270}, {"loss": 1.3527, "grad_norm": 0.7234613299369812, "learning_rate": 0.0002, "epoch": 4.442574673958772, "step": 5280}, {"loss": 1.4769, "grad_norm": 0.7530457973480225, "learning_rate": 0.0002, "epoch": 4.450988641144299, "step": 5290}, {"loss": 1.3944, "grad_norm": 0.7300912141799927, "learning_rate": 0.0002, "epoch": 4.459402608329827, "step": 5300}, {"loss": 1.3844, "grad_norm": 0.825443685054779, "learning_rate": 0.0002, "epoch": 4.467816575515355, "step": 5310}, {"loss": 1.3648, "grad_norm": 0.7559658885002136, "learning_rate": 0.0002, "epoch": 4.4762305427008835, "step": 5320}, {"loss": 1.4364, "grad_norm": 0.8817561268806458, "learning_rate": 0.0002, "epoch": 4.484644509886412, "step": 5330}, {"loss": 1.3618, "grad_norm": 0.8203575611114502, "learning_rate": 0.0002, "epoch": 4.49305847707194, "step": 5340}, {"loss": 1.3996, "grad_norm": 0.7677690982818604, "learning_rate": 0.0002, "epoch": 4.501472444257468, "step": 5350}, {"loss": 1.4142, "grad_norm": 0.657085120677948, "learning_rate": 0.0002, "epoch": 4.509886411442995, "step": 5360}, {"loss": 1.3722, "grad_norm": 0.7939504384994507, "learning_rate": 0.0002, "epoch": 4.518300378628523, "step": 5370}, {"loss": 1.4361, "grad_norm": 0.6971889138221741, "learning_rate": 0.0002, "epoch": 4.526714345814051, "step": 5380}, {"loss": 1.3637, "grad_norm": 0.6984175443649292, "learning_rate": 0.0002, "epoch": 4.535128312999579, "step": 5390}, {"loss": 1.341, "grad_norm": 0.8504858613014221, "learning_rate": 0.0002, "epoch": 4.5435422801851075, "step": 5400}, {"loss": 1.4026, "grad_norm": 0.9134073853492737, "learning_rate": 0.0002, "epoch": 4.551956247370635, "step": 5410}, {"loss": 1.4375, "grad_norm": 0.7765598893165588, "learning_rate": 0.0002, "epoch": 4.560370214556163, "step": 5420}, {"loss": 1.4832, "grad_norm": 0.6991009712219238, "learning_rate": 0.0002, "epoch": 4.568784181741691, "step": 5430}, {"loss": 1.4021, "grad_norm": 0.8393039107322693, "learning_rate": 0.0002, "epoch": 4.577198148927219, "step": 5440}, {"loss": 1.3976, "grad_norm": 0.7685918211936951, "learning_rate": 0.0002, "epoch": 4.585612116112747, "step": 5450}, {"loss": 1.3883, "grad_norm": 0.7135679721832275, "learning_rate": 0.0002, "epoch": 4.594026083298275, "step": 5460}, {"loss": 1.4083, "grad_norm": 0.6728870868682861, "learning_rate": 0.0002, "epoch": 4.6024400504838034, "step": 5470}, {"loss": 1.3698, "grad_norm": 0.7139479517936707, "learning_rate": 0.0002, "epoch": 4.610854017669331, "step": 5480}, {"loss": 1.3498, "grad_norm": 0.8476598858833313, "learning_rate": 0.0002, "epoch": 4.619267984854859, "step": 5490}, {"loss": 1.3389, "grad_norm": 0.8034361004829407, "learning_rate": 0.0002, "epoch": 4.627681952040387, "step": 5500}, {"loss": 1.4179, "grad_norm": 0.7452183961868286, "learning_rate": 0.0002, "epoch": 4.636095919225915, "step": 5510}, {"loss": 1.4031, "grad_norm": 0.8394148945808411, "learning_rate": 0.0002, "epoch": 4.644509886411443, "step": 5520}, {"loss": 1.4561, "grad_norm": 0.7480153441429138, "learning_rate": 0.0002, "epoch": 4.652923853596971, "step": 5530}, {"loss": 1.378, "grad_norm": 0.7781714797019958, "learning_rate": 0.0002, "epoch": 4.661337820782499, "step": 5540}, {"loss": 1.3924, "grad_norm": 1.0058213472366333, "learning_rate": 0.0002, "epoch": 4.669751787968027, "step": 5550}, {"loss": 1.4198, "grad_norm": 0.7403179407119751, "learning_rate": 0.0002, "epoch": 4.678165755153555, "step": 5560}, {"loss": 1.4328, "grad_norm": 0.7270476818084717, "learning_rate": 0.0002, "epoch": 4.686579722339083, "step": 5570}, {"loss": 1.378, "grad_norm": 0.760877788066864, "learning_rate": 0.0002, "epoch": 4.694993689524611, "step": 5580}, {"loss": 1.387, "grad_norm": 0.8097004890441895, "learning_rate": 0.0002, "epoch": 4.703407656710139, "step": 5590}, {"loss": 1.3661, "grad_norm": 0.9096523523330688, "learning_rate": 0.0002, "epoch": 4.711821623895667, "step": 5600}, {"loss": 1.4012, "grad_norm": 0.7262444496154785, "learning_rate": 0.0002, "epoch": 4.720235591081195, "step": 5610}, {"loss": 1.422, "grad_norm": 0.8207762837409973, "learning_rate": 0.0002, "epoch": 4.7286495582667225, "step": 5620}, {"loss": 1.4017, "grad_norm": 0.8089601993560791, "learning_rate": 0.0002, "epoch": 4.737063525452251, "step": 5630}, {"loss": 1.3675, "grad_norm": 0.7609543800354004, "learning_rate": 0.0002, "epoch": 4.745477492637779, "step": 5640}, {"loss": 1.4085, "grad_norm": 0.7273501753807068, "learning_rate": 0.0002, "epoch": 4.753891459823307, "step": 5650}, {"loss": 1.3849, "grad_norm": 0.7800219058990479, "learning_rate": 0.0002, "epoch": 4.762305427008835, "step": 5660}, {"loss": 1.4319, "grad_norm": 0.8558377623558044, "learning_rate": 0.0002, "epoch": 4.770719394194362, "step": 5670}, {"loss": 1.3831, "grad_norm": 0.7131547927856445, "learning_rate": 0.0002, "epoch": 4.77913336137989, "step": 5680}, {"loss": 1.407, "grad_norm": 0.7651025056838989, "learning_rate": 0.0002, "epoch": 4.787547328565418, "step": 5690}, {"loss": 1.3882, "grad_norm": 0.8129976391792297, "learning_rate": 0.0002, "epoch": 4.7959612957509465, "step": 5700}, {"loss": 1.4347, "grad_norm": 0.8019895553588867, "learning_rate": 0.0002, "epoch": 4.804375262936475, "step": 5710}, {"loss": 1.3961, "grad_norm": 0.7692018151283264, "learning_rate": 0.0002, "epoch": 4.812789230122003, "step": 5720}, {"loss": 1.419, "grad_norm": 0.6893943548202515, "learning_rate": 0.0002, "epoch": 4.821203197307531, "step": 5730}, {"loss": 1.4453, "grad_norm": 0.6881810426712036, "learning_rate": 0.0002, "epoch": 4.829617164493058, "step": 5740}, {"loss": 1.4775, "grad_norm": 0.7838267683982849, "learning_rate": 0.0002, "epoch": 4.838031131678586, "step": 5750}, {"loss": 1.3857, "grad_norm": 0.727799117565155, "learning_rate": 0.0002, "epoch": 4.846445098864114, "step": 5760}, {"loss": 1.4685, "grad_norm": 0.7458277344703674, "learning_rate": 0.0002, "epoch": 4.8548590660496425, "step": 5770}, {"loss": 1.4426, "grad_norm": 0.903802216053009, "learning_rate": 0.0002, "epoch": 4.863273033235171, "step": 5780}, {"loss": 1.451, "grad_norm": 0.7983472347259521, "learning_rate": 0.0002, "epoch": 4.871687000420699, "step": 5790}, {"loss": 1.4534, "grad_norm": 0.6894361972808838, "learning_rate": 0.0002, "epoch": 4.880100967606227, "step": 5800}, {"loss": 1.4486, "grad_norm": 0.7499409317970276, "learning_rate": 0.0002, "epoch": 4.888514934791754, "step": 5810}, {"loss": 1.4253, "grad_norm": 0.7362820506095886, "learning_rate": 0.0002, "epoch": 4.896928901977282, "step": 5820}, {"loss": 1.3763, "grad_norm": 0.8341619968414307, "learning_rate": 0.0002, "epoch": 4.90534286916281, "step": 5830}, {"loss": 1.3748, "grad_norm": 0.9604470133781433, "learning_rate": 0.0002, "epoch": 4.913756836348338, "step": 5840}, {"loss": 1.3658, "grad_norm": 0.8916844129562378, "learning_rate": 0.0002, "epoch": 4.9221708035338665, "step": 5850}, {"loss": 1.363, "grad_norm": 0.8519647121429443, "learning_rate": 0.0002, "epoch": 4.930584770719394, "step": 5860}, {"loss": 1.424, "grad_norm": 0.7946906089782715, "learning_rate": 0.0002, "epoch": 4.938998737904922, "step": 5870}, {"loss": 1.4071, "grad_norm": 0.7843789458274841, "learning_rate": 0.0002, "epoch": 4.94741270509045, "step": 5880}, {"loss": 1.4021, "grad_norm": 0.707618772983551, "learning_rate": 0.0002, "epoch": 4.955826672275978, "step": 5890}, {"loss": 1.502, "grad_norm": 0.7704206109046936, "learning_rate": 0.0002, "epoch": 4.964240639461506, "step": 5900}, {"loss": 1.4456, "grad_norm": 0.7160256505012512, "learning_rate": 0.0002, "epoch": 4.972654606647034, "step": 5910}, {"loss": 1.3874, "grad_norm": 0.7020420432090759, "learning_rate": 0.0002, "epoch": 4.981068573832562, "step": 5920}, {"loss": 1.4037, "grad_norm": 0.7576286792755127, "learning_rate": 0.0002, "epoch": 4.98948254101809, "step": 5930}, {"loss": 1.414, "grad_norm": 0.8573036789894104, "learning_rate": 0.0002, "epoch": 4.997896508203618, "step": 5940}]} +{"epoch": 6.0, "step": 7131, "epoch_duration": 1254.0791096687317, "total_accumulated_duration": 8564.974167108536, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.56, "grad_norm": 0.5458821654319763, "learning_rate": 0.0002, "epoch": 0.008413967185527976, "step": 10}, {"loss": 2.3235, "grad_norm": 0.7293308973312378, "learning_rate": 0.0002, "epoch": 0.016827934371055953, "step": 20}, {"loss": 2.0815, "grad_norm": 0.47792306542396545, "learning_rate": 0.0002, "epoch": 0.02524190155658393, "step": 30}, {"loss": 1.9718, "grad_norm": 0.5944402813911438, "learning_rate": 0.0002, "epoch": 0.033655868742111905, "step": 40}, {"loss": 1.8848, "grad_norm": 0.5415359735488892, "learning_rate": 0.0002, "epoch": 0.04206983592763988, "step": 50}, {"loss": 1.8953, "grad_norm": 0.535713791847229, "learning_rate": 0.0002, "epoch": 0.05048380311316786, "step": 60}, {"loss": 1.937, "grad_norm": 0.5184146761894226, "learning_rate": 0.0002, "epoch": 0.058897770298695834, "step": 70}, {"loss": 1.8396, "grad_norm": 0.458926796913147, "learning_rate": 0.0002, "epoch": 0.06731173748422381, "step": 80}, {"loss": 1.8677, "grad_norm": 0.4780142307281494, "learning_rate": 0.0002, "epoch": 0.07572570466975179, "step": 90}, {"loss": 1.8593, "grad_norm": 0.79965740442276, "learning_rate": 0.0002, "epoch": 0.08413967185527976, "step": 100}, {"loss": 1.9081, "grad_norm": 0.4498862028121948, "learning_rate": 0.0002, "epoch": 0.09255363904080774, "step": 110}, {"loss": 1.8503, "grad_norm": 0.39338430762290955, "learning_rate": 0.0002, "epoch": 0.10096760622633572, "step": 120}, {"loss": 1.8637, "grad_norm": 0.9588953852653503, "learning_rate": 0.0002, "epoch": 0.10938157341186369, "step": 130}, {"loss": 1.8676, "grad_norm": 0.41675639152526855, "learning_rate": 0.0002, "epoch": 0.11779554059739167, "step": 140}, {"loss": 1.8904, "grad_norm": 0.44519832730293274, "learning_rate": 0.0002, "epoch": 0.12620950778291964, "step": 150}, {"loss": 1.798, "grad_norm": 0.4176260530948639, "learning_rate": 0.0002, "epoch": 0.13462347496844762, "step": 160}, {"loss": 1.8398, "grad_norm": 0.35840365290641785, "learning_rate": 0.0002, "epoch": 0.1430374421539756, "step": 170}, {"loss": 1.8666, "grad_norm": 0.3794495463371277, "learning_rate": 0.0002, "epoch": 0.15145140933950357, "step": 180}, {"loss": 1.8111, "grad_norm": 0.4563522934913635, "learning_rate": 0.0002, "epoch": 0.15986537652503155, "step": 190}, {"loss": 1.8893, "grad_norm": 0.37057486176490784, "learning_rate": 0.0002, "epoch": 0.16827934371055953, "step": 200}, {"loss": 1.7995, "grad_norm": 0.44081518054008484, "learning_rate": 0.0002, "epoch": 0.1766933108960875, "step": 210}, {"loss": 1.9048, "grad_norm": 0.46078577637672424, "learning_rate": 0.0002, "epoch": 0.18510727808161548, "step": 220}, {"loss": 1.8403, "grad_norm": 0.36132094264030457, "learning_rate": 0.0002, "epoch": 0.19352124526714345, "step": 230}, {"loss": 1.8827, "grad_norm": 0.3747289180755615, "learning_rate": 0.0002, "epoch": 0.20193521245267143, "step": 240}, {"loss": 1.8382, "grad_norm": 0.3540179133415222, "learning_rate": 0.0002, "epoch": 0.2103491796381994, "step": 250}, {"loss": 1.8196, "grad_norm": 0.3461375832557678, "learning_rate": 0.0002, "epoch": 0.21876314682372738, "step": 260}, {"loss": 1.8509, "grad_norm": 0.3436960279941559, "learning_rate": 0.0002, "epoch": 0.22717711400925536, "step": 270}, {"loss": 1.8285, "grad_norm": 0.35403719544410706, "learning_rate": 0.0002, "epoch": 0.23559108119478334, "step": 280}, {"loss": 1.8369, "grad_norm": 0.37142616510391235, "learning_rate": 0.0002, "epoch": 0.2440050483803113, "step": 290}, {"loss": 1.8044, "grad_norm": 0.3307955861091614, "learning_rate": 0.0002, "epoch": 0.2524190155658393, "step": 300}, {"loss": 1.817, "grad_norm": 0.32855314016342163, "learning_rate": 0.0002, "epoch": 0.2608329827513673, "step": 310}, {"loss": 1.7803, "grad_norm": 0.3299003839492798, "learning_rate": 0.0002, "epoch": 0.26924694993689524, "step": 320}, {"loss": 1.8129, "grad_norm": 0.44311287999153137, "learning_rate": 0.0002, "epoch": 0.27766091712242325, "step": 330}, {"loss": 1.8232, "grad_norm": 0.32989758253097534, "learning_rate": 0.0002, "epoch": 0.2860748843079512, "step": 340}, {"loss": 1.7716, "grad_norm": 0.34400200843811035, "learning_rate": 0.0002, "epoch": 0.2944888514934792, "step": 350}, {"loss": 1.7619, "grad_norm": 0.36286211013793945, "learning_rate": 0.0002, "epoch": 0.30290281867900715, "step": 360}, {"loss": 1.8025, "grad_norm": 0.406827837228775, "learning_rate": 0.0002, "epoch": 0.31131678586453515, "step": 370}, {"loss": 1.7515, "grad_norm": 0.36299195885658264, "learning_rate": 0.0002, "epoch": 0.3197307530500631, "step": 380}, {"loss": 1.837, "grad_norm": 0.3477257192134857, "learning_rate": 0.0002, "epoch": 0.3281447202355911, "step": 390}, {"loss": 1.7767, "grad_norm": 0.3730369210243225, "learning_rate": 0.0002, "epoch": 0.33655868742111905, "step": 400}, {"loss": 1.7747, "grad_norm": 0.4644559919834137, "learning_rate": 0.0002, "epoch": 0.34497265460664706, "step": 410}, {"loss": 1.7538, "grad_norm": 0.406576544046402, "learning_rate": 0.0002, "epoch": 0.353386621792175, "step": 420}, {"loss": 1.7501, "grad_norm": 0.3612699508666992, "learning_rate": 0.0002, "epoch": 0.361800588977703, "step": 430}, {"loss": 1.7473, "grad_norm": 0.3243742287158966, "learning_rate": 0.0002, "epoch": 0.37021455616323096, "step": 440}, {"loss": 1.8851, "grad_norm": 0.36671221256256104, "learning_rate": 0.0002, "epoch": 0.37862852334875896, "step": 450}, {"loss": 1.8853, "grad_norm": 0.3565002381801605, "learning_rate": 0.0002, "epoch": 0.3870424905342869, "step": 460}, {"loss": 1.8923, "grad_norm": 0.34630221128463745, "learning_rate": 0.0002, "epoch": 0.3954564577198149, "step": 470}, {"loss": 1.8234, "grad_norm": 0.3353537321090698, "learning_rate": 0.0002, "epoch": 0.40387042490534286, "step": 480}, {"loss": 1.7135, "grad_norm": 0.4015921950340271, "learning_rate": 0.0002, "epoch": 0.41228439209087087, "step": 490}, {"loss": 1.7815, "grad_norm": 0.5489419102668762, "learning_rate": 0.0002, "epoch": 0.4206983592763988, "step": 500}, {"loss": 1.7903, "grad_norm": 0.4193589985370636, "learning_rate": 0.0002, "epoch": 0.4291123264619268, "step": 510}, {"loss": 1.8416, "grad_norm": 0.3418922424316406, "learning_rate": 0.0002, "epoch": 0.43752629364745477, "step": 520}, {"loss": 1.7982, "grad_norm": 0.32668185234069824, "learning_rate": 0.0002, "epoch": 0.44594026083298277, "step": 530}, {"loss": 1.7501, "grad_norm": 0.3094325661659241, "learning_rate": 0.0002, "epoch": 0.4543542280185107, "step": 540}, {"loss": 1.7438, "grad_norm": 0.3743017315864563, "learning_rate": 0.0002, "epoch": 0.4627681952040387, "step": 550}, {"loss": 1.8451, "grad_norm": 0.3295630216598511, "learning_rate": 0.0002, "epoch": 0.47118216238956667, "step": 560}, {"loss": 1.7529, "grad_norm": 1.6124513149261475, "learning_rate": 0.0002, "epoch": 0.4795961295750947, "step": 570}, {"loss": 1.8028, "grad_norm": 0.3245585858821869, "learning_rate": 0.0002, "epoch": 0.4880100967606226, "step": 580}, {"loss": 1.7976, "grad_norm": 0.3332934081554413, "learning_rate": 0.0002, "epoch": 0.49642406394615063, "step": 590}, {"loss": 1.7912, "grad_norm": 0.3836138844490051, "learning_rate": 0.0002, "epoch": 0.5048380311316786, "step": 600}, {"loss": 1.8347, "grad_norm": 0.32953888177871704, "learning_rate": 0.0002, "epoch": 0.5132519983172066, "step": 610}, {"loss": 1.7729, "grad_norm": 0.36291512846946716, "learning_rate": 0.0002, "epoch": 0.5216659655027346, "step": 620}, {"loss": 1.7758, "grad_norm": 0.3237783908843994, "learning_rate": 0.0002, "epoch": 0.5300799326882625, "step": 630}, {"loss": 1.8352, "grad_norm": 0.38882696628570557, "learning_rate": 0.0002, "epoch": 0.5384938998737905, "step": 640}, {"loss": 1.8624, "grad_norm": 0.37821972370147705, "learning_rate": 0.0002, "epoch": 0.5469078670593185, "step": 650}, {"loss": 1.8075, "grad_norm": 0.3556285500526428, "learning_rate": 0.0002, "epoch": 0.5553218342448465, "step": 660}, {"loss": 1.778, "grad_norm": 0.347499281167984, "learning_rate": 0.0002, "epoch": 0.5637358014303744, "step": 670}, {"loss": 1.8066, "grad_norm": 0.3176489472389221, "learning_rate": 0.0002, "epoch": 0.5721497686159024, "step": 680}, {"loss": 1.7257, "grad_norm": 0.30220088362693787, "learning_rate": 0.0002, "epoch": 0.5805637358014304, "step": 690}, {"loss": 1.8415, "grad_norm": 0.3711601793766022, "learning_rate": 0.0002, "epoch": 0.5889777029869584, "step": 700}, {"loss": 1.7906, "grad_norm": 0.3311759829521179, "learning_rate": 0.0002, "epoch": 0.5973916701724863, "step": 710}, {"loss": 1.7712, "grad_norm": 0.34824270009994507, "learning_rate": 0.0002, "epoch": 0.6058056373580143, "step": 720}, {"loss": 1.7954, "grad_norm": 0.29668381810188293, "learning_rate": 0.0002, "epoch": 0.6142196045435423, "step": 730}, {"loss": 1.8321, "grad_norm": 0.36087489128112793, "learning_rate": 0.0002, "epoch": 0.6226335717290703, "step": 740}, {"loss": 1.7956, "grad_norm": 0.31590089201927185, "learning_rate": 0.0002, "epoch": 0.6310475389145982, "step": 750}, {"loss": 1.7343, "grad_norm": 0.37632957100868225, "learning_rate": 0.0002, "epoch": 0.6394615061001262, "step": 760}, {"loss": 1.8499, "grad_norm": 0.3360748589038849, "learning_rate": 0.0002, "epoch": 0.6478754732856542, "step": 770}, {"loss": 1.8076, "grad_norm": 0.3420640528202057, "learning_rate": 0.0002, "epoch": 0.6562894404711822, "step": 780}, {"loss": 1.8353, "grad_norm": 0.5734959244728088, "learning_rate": 0.0002, "epoch": 0.6647034076567101, "step": 790}, {"loss": 1.7746, "grad_norm": 0.36440837383270264, "learning_rate": 0.0002, "epoch": 0.6731173748422381, "step": 800}, {"loss": 1.7532, "grad_norm": 0.3179708421230316, "learning_rate": 0.0002, "epoch": 0.6815313420277661, "step": 810}, {"loss": 1.7815, "grad_norm": 0.34122881293296814, "learning_rate": 0.0002, "epoch": 0.6899453092132941, "step": 820}, {"loss": 1.8167, "grad_norm": 0.31886112689971924, "learning_rate": 0.0002, "epoch": 0.698359276398822, "step": 830}, {"loss": 1.7505, "grad_norm": 0.31782326102256775, "learning_rate": 0.0002, "epoch": 0.70677324358435, "step": 840}, {"loss": 1.7588, "grad_norm": 0.36052989959716797, "learning_rate": 0.0002, "epoch": 0.715187210769878, "step": 850}, {"loss": 1.7891, "grad_norm": 0.28946155309677124, "learning_rate": 0.0002, "epoch": 0.723601177955406, "step": 860}, {"loss": 1.7923, "grad_norm": 0.3095663785934448, "learning_rate": 0.0002, "epoch": 0.7320151451409339, "step": 870}, {"loss": 1.785, "grad_norm": 0.3317491412162781, "learning_rate": 0.0002, "epoch": 0.7404291123264619, "step": 880}, {"loss": 1.7709, "grad_norm": 0.31324660778045654, "learning_rate": 0.0002, "epoch": 0.7488430795119899, "step": 890}, {"loss": 1.8753, "grad_norm": 0.3290475606918335, "learning_rate": 0.0002, "epoch": 0.7572570466975179, "step": 900}, {"loss": 1.7679, "grad_norm": 0.35690343379974365, "learning_rate": 0.0002, "epoch": 0.7656710138830458, "step": 910}, {"loss": 1.826, "grad_norm": 0.39558273553848267, "learning_rate": 0.0002, "epoch": 0.7740849810685738, "step": 920}, {"loss": 1.8722, "grad_norm": 0.34254348278045654, "learning_rate": 0.0002, "epoch": 0.7824989482541018, "step": 930}, {"loss": 1.7603, "grad_norm": 0.3560165464878082, "learning_rate": 0.0002, "epoch": 0.7909129154396298, "step": 940}, {"loss": 1.7992, "grad_norm": 0.30693164467811584, "learning_rate": 0.0002, "epoch": 0.7993268826251577, "step": 950}, {"loss": 1.8029, "grad_norm": 0.3394823372364044, "learning_rate": 0.0002, "epoch": 0.8077408498106857, "step": 960}, {"loss": 1.8105, "grad_norm": 0.3741514980792999, "learning_rate": 0.0002, "epoch": 0.8161548169962137, "step": 970}, {"loss": 1.7849, "grad_norm": 0.3655228316783905, "learning_rate": 0.0002, "epoch": 0.8245687841817417, "step": 980}, {"loss": 1.8449, "grad_norm": 0.3586033880710602, "learning_rate": 0.0002, "epoch": 0.8329827513672696, "step": 990}, {"loss": 1.7033, "grad_norm": 0.3459678888320923, "learning_rate": 0.0002, "epoch": 0.8413967185527976, "step": 1000}, {"loss": 1.8498, "grad_norm": 0.3184349834918976, "learning_rate": 0.0002, "epoch": 0.8498106857383256, "step": 1010}, {"loss": 1.7632, "grad_norm": 0.3099786043167114, "learning_rate": 0.0002, "epoch": 0.8582246529238536, "step": 1020}, {"loss": 1.8067, "grad_norm": 0.30300915241241455, "learning_rate": 0.0002, "epoch": 0.8666386201093815, "step": 1030}, {"loss": 1.7923, "grad_norm": 0.3128705620765686, "learning_rate": 0.0002, "epoch": 0.8750525872949095, "step": 1040}, {"loss": 1.8252, "grad_norm": 0.3336263597011566, "learning_rate": 0.0002, "epoch": 0.8834665544804375, "step": 1050}, {"loss": 1.8375, "grad_norm": 0.3801328241825104, "learning_rate": 0.0002, "epoch": 0.8918805216659655, "step": 1060}, {"loss": 1.7757, "grad_norm": 0.3122096359729767, "learning_rate": 0.0002, "epoch": 0.9002944888514934, "step": 1070}, {"loss": 1.8251, "grad_norm": 0.35990869998931885, "learning_rate": 0.0002, "epoch": 0.9087084560370214, "step": 1080}, {"loss": 1.7343, "grad_norm": 0.3321819305419922, "learning_rate": 0.0002, "epoch": 0.9171224232225494, "step": 1090}, {"loss": 1.7595, "grad_norm": 0.4202139377593994, "learning_rate": 0.0002, "epoch": 0.9255363904080774, "step": 1100}, {"loss": 1.8056, "grad_norm": 0.32559722661972046, "learning_rate": 0.0002, "epoch": 0.9339503575936053, "step": 1110}, {"loss": 1.812, "grad_norm": 0.3098459839820862, "learning_rate": 0.0002, "epoch": 0.9423643247791333, "step": 1120}, {"loss": 1.8252, "grad_norm": 0.33917108178138733, "learning_rate": 0.0002, "epoch": 0.9507782919646613, "step": 1130}, {"loss": 1.7709, "grad_norm": 0.4055837094783783, "learning_rate": 0.0002, "epoch": 0.9591922591501894, "step": 1140}, {"loss": 1.8259, "grad_norm": 0.32508623600006104, "learning_rate": 0.0002, "epoch": 0.9676062263357172, "step": 1150}, {"loss": 1.782, "grad_norm": 0.30150601267814636, "learning_rate": 0.0002, "epoch": 0.9760201935212452, "step": 1160}, {"loss": 1.8291, "grad_norm": 0.3042563199996948, "learning_rate": 0.0002, "epoch": 0.9844341607067733, "step": 1170}, {"loss": 1.7847, "grad_norm": 0.33254584670066833, "learning_rate": 0.0002, "epoch": 0.9928481278923013, "step": 1180}, {"eval_loss": 1.8077726364135742, "eval_runtime": 38.4359, "eval_samples_per_second": 13.399, "eval_steps_per_second": 1.691, "epoch": 0.9995793016407236, "step": 1188}, {"loss": 1.7414, "grad_norm": 0.35073035955429077, "learning_rate": 0.0002, "epoch": 1.0012620950778293, "step": 1190}, {"loss": 1.7483, "grad_norm": 0.3217269778251648, "learning_rate": 0.0002, "epoch": 1.0096760622633572, "step": 1200}, {"loss": 1.7517, "grad_norm": 0.3635033369064331, "learning_rate": 0.0002, "epoch": 1.018090029448885, "step": 1210}, {"loss": 1.6949, "grad_norm": 0.32468414306640625, "learning_rate": 0.0002, "epoch": 1.0265039966344132, "step": 1220}, {"loss": 1.711, "grad_norm": 0.3307163417339325, "learning_rate": 0.0002, "epoch": 1.034917963819941, "step": 1230}, {"loss": 1.7881, "grad_norm": 0.34381359815597534, "learning_rate": 0.0002, "epoch": 1.0433319310054692, "step": 1240}, {"loss": 1.612, "grad_norm": 0.35874804854393005, "learning_rate": 0.0002, "epoch": 1.051745898190997, "step": 1250}, {"loss": 1.7314, "grad_norm": 0.3615919351577759, "learning_rate": 0.0002, "epoch": 1.060159865376525, "step": 1260}, {"loss": 1.7517, "grad_norm": 0.32835808396339417, "learning_rate": 0.0002, "epoch": 1.068573832562053, "step": 1270}, {"loss": 1.7193, "grad_norm": 0.3876388370990753, "learning_rate": 0.0002, "epoch": 1.076987799747581, "step": 1280}, {"loss": 1.7442, "grad_norm": 0.39895930886268616, "learning_rate": 0.0002, "epoch": 1.0854017669331089, "step": 1290}, {"loss": 1.6601, "grad_norm": 0.39081698656082153, "learning_rate": 0.0002, "epoch": 1.093815734118637, "step": 1300}, {"loss": 1.7623, "grad_norm": 0.39974215626716614, "learning_rate": 0.0002, "epoch": 1.1022297013041649, "step": 1310}, {"loss": 1.7506, "grad_norm": 0.3887332081794739, "learning_rate": 0.0002, "epoch": 1.110643668489693, "step": 1320}, {"loss": 1.7381, "grad_norm": 0.36216408014297485, "learning_rate": 0.0002, "epoch": 1.1190576356752209, "step": 1330}, {"loss": 1.762, "grad_norm": 0.36979028582572937, "learning_rate": 0.0002, "epoch": 1.1274716028607488, "step": 1340}, {"loss": 1.7515, "grad_norm": 0.34052133560180664, "learning_rate": 0.0002, "epoch": 1.1358855700462769, "step": 1350}, {"loss": 1.7513, "grad_norm": 0.3467716574668884, "learning_rate": 0.0002, "epoch": 1.1442995372318048, "step": 1360}, {"loss": 1.7086, "grad_norm": 0.35528799891471863, "learning_rate": 0.0002, "epoch": 1.1527135044173327, "step": 1370}, {"loss": 1.794, "grad_norm": 0.36282262206077576, "learning_rate": 0.0002, "epoch": 1.1611274716028608, "step": 1380}, {"loss": 1.7731, "grad_norm": 0.37355899810791016, "learning_rate": 0.0002, "epoch": 1.1695414387883887, "step": 1390}, {"loss": 1.7483, "grad_norm": 0.37292736768722534, "learning_rate": 0.0002, "epoch": 1.1779554059739168, "step": 1400}, {"loss": 1.6916, "grad_norm": 0.5892812013626099, "learning_rate": 0.0002, "epoch": 1.1863693731594447, "step": 1410}, {"loss": 1.7302, "grad_norm": 0.3712292015552521, "learning_rate": 0.0002, "epoch": 1.1947833403449726, "step": 1420}, {"loss": 1.7709, "grad_norm": 0.3349577486515045, "learning_rate": 0.0002, "epoch": 1.2031973075305007, "step": 1430}, {"loss": 1.7412, "grad_norm": 0.32591062784194946, "learning_rate": 0.0002, "epoch": 1.2116112747160286, "step": 1440}, {"loss": 1.7406, "grad_norm": 0.3840635418891907, "learning_rate": 0.0002, "epoch": 1.2200252419015567, "step": 1450}, {"loss": 1.7276, "grad_norm": 0.37238365411758423, "learning_rate": 0.0002, "epoch": 1.2284392090870846, "step": 1460}, {"loss": 1.7052, "grad_norm": 0.3731217682361603, "learning_rate": 0.0002, "epoch": 1.2368531762726125, "step": 1470}, {"loss": 1.7255, "grad_norm": 0.3318967819213867, "learning_rate": 0.0002, "epoch": 1.2452671434581406, "step": 1480}, {"loss": 1.7463, "grad_norm": 0.3784034848213196, "learning_rate": 0.0002, "epoch": 1.2536811106436685, "step": 1490}, {"loss": 1.6862, "grad_norm": 0.3541383147239685, "learning_rate": 0.0002, "epoch": 1.2620950778291964, "step": 1500}, {"loss": 1.8394, "grad_norm": 0.35312485694885254, "learning_rate": 0.0002, "epoch": 1.2705090450147245, "step": 1510}, {"loss": 1.7029, "grad_norm": 0.35272929072380066, "learning_rate": 0.0002, "epoch": 1.2789230122002524, "step": 1520}, {"loss": 1.7016, "grad_norm": 0.40988272428512573, "learning_rate": 0.0002, "epoch": 1.2873369793857803, "step": 1530}, {"loss": 1.6912, "grad_norm": 0.3543946146965027, "learning_rate": 0.0002, "epoch": 1.2957509465713084, "step": 1540}, {"loss": 1.6757, "grad_norm": 0.35639145970344543, "learning_rate": 0.0002, "epoch": 1.3041649137568363, "step": 1550}, {"loss": 1.6814, "grad_norm": 0.3290826678276062, "learning_rate": 0.0002, "epoch": 1.3125788809423642, "step": 1560}, {"loss": 1.7369, "grad_norm": 0.39264336228370667, "learning_rate": 0.0002, "epoch": 1.3209928481278923, "step": 1570}, {"loss": 1.6804, "grad_norm": 0.5390415191650391, "learning_rate": 0.0002, "epoch": 1.3294068153134202, "step": 1580}, {"loss": 1.708, "grad_norm": 0.5188116431236267, "learning_rate": 0.0002, "epoch": 1.3378207824989483, "step": 1590}, {"loss": 1.6763, "grad_norm": 0.37445148825645447, "learning_rate": 0.0002, "epoch": 1.3462347496844762, "step": 1600}, {"loss": 1.7386, "grad_norm": 0.3296085298061371, "learning_rate": 0.0002, "epoch": 1.3546487168700043, "step": 1610}, {"loss": 1.8107, "grad_norm": 0.39879581332206726, "learning_rate": 0.0002, "epoch": 1.3630626840555322, "step": 1620}, {"loss": 1.6744, "grad_norm": 0.36092764139175415, "learning_rate": 0.0002, "epoch": 1.37147665124106, "step": 1630}, {"loss": 1.7144, "grad_norm": 0.37011823058128357, "learning_rate": 0.0002, "epoch": 1.3798906184265882, "step": 1640}, {"loss": 1.7396, "grad_norm": 0.40863534808158875, "learning_rate": 0.0002, "epoch": 1.3883045856121161, "step": 1650}, {"loss": 1.7901, "grad_norm": 0.337001770734787, "learning_rate": 0.0002, "epoch": 1.396718552797644, "step": 1660}, {"loss": 1.7044, "grad_norm": 0.35596707463264465, "learning_rate": 0.0002, "epoch": 1.4051325199831721, "step": 1670}, {"loss": 1.7717, "grad_norm": 0.3857671916484833, "learning_rate": 0.0002, "epoch": 1.4135464871687, "step": 1680}, {"loss": 1.7015, "grad_norm": 0.419502317905426, "learning_rate": 0.0002, "epoch": 1.421960454354228, "step": 1690}, {"loss": 1.7261, "grad_norm": 0.35459452867507935, "learning_rate": 0.0002, "epoch": 1.430374421539756, "step": 1700}, {"loss": 1.7361, "grad_norm": 0.37246978282928467, "learning_rate": 0.0002, "epoch": 1.438788388725284, "step": 1710}, {"loss": 1.6762, "grad_norm": 0.33091893792152405, "learning_rate": 0.0002, "epoch": 1.4472023559108118, "step": 1720}, {"loss": 1.7044, "grad_norm": 0.37029674649238586, "learning_rate": 0.0002, "epoch": 1.45561632309634, "step": 1730}, {"loss": 1.7117, "grad_norm": 0.374025821685791, "learning_rate": 0.0002, "epoch": 1.4640302902818678, "step": 1740}, {"loss": 1.7549, "grad_norm": 0.3416315019130707, "learning_rate": 0.0002, "epoch": 1.472444257467396, "step": 1750}, {"loss": 1.7093, "grad_norm": 0.36502841114997864, "learning_rate": 0.0002, "epoch": 1.4808582246529238, "step": 1760}, {"loss": 1.6597, "grad_norm": 0.35458803176879883, "learning_rate": 0.0002, "epoch": 1.489272191838452, "step": 1770}, {"loss": 1.675, "grad_norm": 0.4462839663028717, "learning_rate": 0.0002, "epoch": 1.4976861590239798, "step": 1780}, {"loss": 1.7267, "grad_norm": 0.34836092591285706, "learning_rate": 0.0002, "epoch": 1.5061001262095077, "step": 1790}, {"loss": 1.7295, "grad_norm": 0.3445749282836914, "learning_rate": 0.0002, "epoch": 1.5145140933950358, "step": 1800}, {"loss": 1.7386, "grad_norm": 0.36012160778045654, "learning_rate": 0.0002, "epoch": 1.5229280605805637, "step": 1810}, {"loss": 1.6594, "grad_norm": 0.4052616059780121, "learning_rate": 0.0002, "epoch": 1.5313420277660916, "step": 1820}, {"loss": 1.72, "grad_norm": 0.3966905474662781, "learning_rate": 0.0002, "epoch": 1.5397559949516197, "step": 1830}, {"loss": 1.7595, "grad_norm": 0.35028719902038574, "learning_rate": 0.0002, "epoch": 1.5481699621371476, "step": 1840}, {"loss": 1.6829, "grad_norm": 0.3936742842197418, "learning_rate": 0.0002, "epoch": 1.5565839293226755, "step": 1850}, {"loss": 1.7579, "grad_norm": 0.34473296999931335, "learning_rate": 0.0002, "epoch": 1.5649978965082036, "step": 1860}, {"loss": 1.7207, "grad_norm": 0.4328365623950958, "learning_rate": 0.0002, "epoch": 1.5734118636937318, "step": 1870}, {"loss": 1.7098, "grad_norm": 0.3566315472126007, "learning_rate": 0.0002, "epoch": 1.5818258308792594, "step": 1880}, {"loss": 1.6095, "grad_norm": 0.3301256597042084, "learning_rate": 0.0002, "epoch": 1.5902397980647875, "step": 1890}, {"loss": 1.748, "grad_norm": 0.3743041455745697, "learning_rate": 0.0002, "epoch": 1.5986537652503157, "step": 1900}, {"loss": 1.7259, "grad_norm": 0.3735344707965851, "learning_rate": 0.0002, "epoch": 1.6070677324358436, "step": 1910}, {"loss": 1.7445, "grad_norm": 0.42191144824028015, "learning_rate": 0.0002, "epoch": 1.6154816996213714, "step": 1920}, {"loss": 1.6978, "grad_norm": 0.3787207305431366, "learning_rate": 0.0002, "epoch": 1.6238956668068996, "step": 1930}, {"loss": 1.6893, "grad_norm": 0.35647350549697876, "learning_rate": 0.0002, "epoch": 1.6323096339924275, "step": 1940}, {"loss": 1.7825, "grad_norm": 0.39791446924209595, "learning_rate": 0.0002, "epoch": 1.6407236011779553, "step": 1950}, {"loss": 1.7293, "grad_norm": 0.37341275811195374, "learning_rate": 0.0002, "epoch": 1.6491375683634835, "step": 1960}, {"loss": 1.6781, "grad_norm": 0.3722686469554901, "learning_rate": 0.0002, "epoch": 1.6575515355490114, "step": 1970}, {"loss": 1.6383, "grad_norm": 0.37467387318611145, "learning_rate": 0.0002, "epoch": 1.6659655027345392, "step": 1980}, {"loss": 1.7439, "grad_norm": 0.37109461426734924, "learning_rate": 0.0002, "epoch": 1.6743794699200674, "step": 1990}, {"loss": 1.7206, "grad_norm": 0.4008837044239044, "learning_rate": 0.0002, "epoch": 1.6827934371055953, "step": 2000}, {"loss": 1.7604, "grad_norm": 0.3316999673843384, "learning_rate": 0.0002, "epoch": 1.6912074042911232, "step": 2010}, {"loss": 1.7325, "grad_norm": 0.3683805465698242, "learning_rate": 0.0002, "epoch": 1.6996213714766513, "step": 2020}, {"loss": 1.7451, "grad_norm": 0.4163658320903778, "learning_rate": 0.0002, "epoch": 1.7080353386621794, "step": 2030}, {"loss": 1.741, "grad_norm": 0.4245431125164032, "learning_rate": 0.0002, "epoch": 1.716449305847707, "step": 2040}, {"loss": 1.7184, "grad_norm": 0.36732038855552673, "learning_rate": 0.0002, "epoch": 1.7248632730332352, "step": 2050}, {"loss": 1.7031, "grad_norm": 0.34981656074523926, "learning_rate": 0.0002, "epoch": 1.7332772402187633, "step": 2060}, {"loss": 1.7545, "grad_norm": 0.38588812947273254, "learning_rate": 0.0002, "epoch": 1.7416912074042912, "step": 2070}, {"loss": 1.7728, "grad_norm": 0.39914557337760925, "learning_rate": 0.0002, "epoch": 1.750105174589819, "step": 2080}, {"loss": 1.7049, "grad_norm": 0.36068692803382874, "learning_rate": 0.0002, "epoch": 1.7585191417753472, "step": 2090}, {"loss": 1.7537, "grad_norm": 0.3983287215232849, "learning_rate": 0.0002, "epoch": 1.766933108960875, "step": 2100}, {"loss": 1.7016, "grad_norm": 0.45008400082588196, "learning_rate": 0.0002, "epoch": 1.775347076146403, "step": 2110}, {"loss": 1.7163, "grad_norm": 0.3618052303791046, "learning_rate": 0.0002, "epoch": 1.783761043331931, "step": 2120}, {"loss": 1.7335, "grad_norm": 0.38745400309562683, "learning_rate": 0.0002, "epoch": 1.792175010517459, "step": 2130}, {"loss": 1.7387, "grad_norm": 0.3413826525211334, "learning_rate": 0.0002, "epoch": 1.8005889777029869, "step": 2140}, {"loss": 1.7414, "grad_norm": 0.35983747243881226, "learning_rate": 0.0002, "epoch": 1.809002944888515, "step": 2150}, {"loss": 1.7892, "grad_norm": 0.40926849842071533, "learning_rate": 0.0002, "epoch": 1.8174169120740429, "step": 2160}, {"loss": 1.6823, "grad_norm": 0.3543093800544739, "learning_rate": 0.0002, "epoch": 1.8258308792595708, "step": 2170}, {"loss": 1.7812, "grad_norm": 0.42690935730934143, "learning_rate": 0.0002, "epoch": 1.8342448464450989, "step": 2180}, {"loss": 1.7471, "grad_norm": 0.40282756090164185, "learning_rate": 0.0002, "epoch": 1.842658813630627, "step": 2190}, {"loss": 1.7411, "grad_norm": 0.36568400263786316, "learning_rate": 0.0002, "epoch": 1.8510727808161547, "step": 2200}, {"loss": 1.7024, "grad_norm": 0.43159013986587524, "learning_rate": 0.0002, "epoch": 1.8594867480016828, "step": 2210}, {"loss": 1.7298, "grad_norm": 0.3554118573665619, "learning_rate": 0.0002, "epoch": 1.867900715187211, "step": 2220}, {"loss": 1.7157, "grad_norm": 0.43349072337150574, "learning_rate": 0.0002, "epoch": 1.8763146823727388, "step": 2230}, {"loss": 1.7302, "grad_norm": 0.36486536264419556, "learning_rate": 0.0002, "epoch": 1.8847286495582667, "step": 2240}, {"loss": 1.6901, "grad_norm": 0.39260047674179077, "learning_rate": 0.0002, "epoch": 1.8931426167437948, "step": 2250}, {"loss": 1.6691, "grad_norm": 0.3741776943206787, "learning_rate": 0.0002, "epoch": 1.9015565839293227, "step": 2260}, {"loss": 1.6931, "grad_norm": 0.3961946964263916, "learning_rate": 0.0002, "epoch": 1.9099705511148506, "step": 2270}, {"loss": 1.737, "grad_norm": 0.3659731149673462, "learning_rate": 0.0002, "epoch": 1.9183845183003787, "step": 2280}, {"loss": 1.7342, "grad_norm": 0.34744107723236084, "learning_rate": 0.0002, "epoch": 1.9267984854859066, "step": 2290}, {"loss": 1.7162, "grad_norm": 0.3607442378997803, "learning_rate": 0.0002, "epoch": 1.9352124526714345, "step": 2300}, {"loss": 1.6673, "grad_norm": 0.331464558839798, "learning_rate": 0.0002, "epoch": 1.9436264198569626, "step": 2310}, {"loss": 1.7101, "grad_norm": 0.3904414474964142, "learning_rate": 0.0002, "epoch": 1.9520403870424905, "step": 2320}, {"loss": 1.7327, "grad_norm": 0.37584832310676575, "learning_rate": 0.0002, "epoch": 1.9604543542280184, "step": 2330}, {"loss": 1.7586, "grad_norm": 0.3698684275150299, "learning_rate": 0.0002, "epoch": 1.9688683214135465, "step": 2340}, {"loss": 1.7764, "grad_norm": 0.40571412444114685, "learning_rate": 0.0002, "epoch": 1.9772822885990746, "step": 2350}, {"loss": 1.744, "grad_norm": 0.40059587359428406, "learning_rate": 0.0002, "epoch": 1.9856962557846023, "step": 2360}, {"loss": 1.7033, "grad_norm": 0.4168248474597931, "learning_rate": 0.0002, "epoch": 1.9941102229701304, "step": 2370}, {"eval_loss": 1.8055059909820557, "eval_runtime": 38.422, "eval_samples_per_second": 13.404, "eval_steps_per_second": 1.692, "epoch": 2.0, "step": 2377}, {"loss": 1.7673, "grad_norm": 0.35205352306365967, "learning_rate": 0.0002, "epoch": 2.0025241901556585, "step": 2380}, {"loss": 1.6556, "grad_norm": 0.3979377746582031, "learning_rate": 0.0002, "epoch": 2.010938157341186, "step": 2390}, {"loss": 1.6421, "grad_norm": 0.396491676568985, "learning_rate": 0.0002, "epoch": 2.0193521245267143, "step": 2400}, {"loss": 1.6847, "grad_norm": 0.44712209701538086, "learning_rate": 0.0002, "epoch": 2.0277660917122424, "step": 2410}, {"loss": 1.6877, "grad_norm": 0.4454420208930969, "learning_rate": 0.0002, "epoch": 2.03618005889777, "step": 2420}, {"loss": 1.6635, "grad_norm": 0.4170038402080536, "learning_rate": 0.0002, "epoch": 2.044594026083298, "step": 2430}, {"loss": 1.6512, "grad_norm": 0.4309595227241516, "learning_rate": 0.0002, "epoch": 2.0530079932688263, "step": 2440}, {"loss": 1.6223, "grad_norm": 0.4241602122783661, "learning_rate": 0.0002, "epoch": 2.0614219604543544, "step": 2450}, {"loss": 1.6162, "grad_norm": 0.4370540678501129, "learning_rate": 0.0002, "epoch": 2.069835927639882, "step": 2460}, {"loss": 1.6354, "grad_norm": 0.43985554575920105, "learning_rate": 0.0002, "epoch": 2.0782498948254102, "step": 2470}, {"loss": 1.6954, "grad_norm": 0.4158105254173279, "learning_rate": 0.0002, "epoch": 2.0866638620109383, "step": 2480}, {"loss": 1.6114, "grad_norm": 0.441549152135849, "learning_rate": 0.0002, "epoch": 2.095077829196466, "step": 2490}, {"loss": 1.5485, "grad_norm": 0.385718435049057, "learning_rate": 0.0002, "epoch": 2.103491796381994, "step": 2500}, {"loss": 1.5894, "grad_norm": 0.43146514892578125, "learning_rate": 0.0002, "epoch": 2.1119057635675222, "step": 2510}, {"loss": 1.6414, "grad_norm": 0.41663315892219543, "learning_rate": 0.0002, "epoch": 2.12031973075305, "step": 2520}, {"loss": 1.6527, "grad_norm": 0.4410698115825653, "learning_rate": 0.0002, "epoch": 2.128733697938578, "step": 2530}, {"loss": 1.6124, "grad_norm": 0.4472278952598572, "learning_rate": 0.0002, "epoch": 2.137147665124106, "step": 2540}, {"loss": 1.6257, "grad_norm": 0.3879167437553406, "learning_rate": 0.0002, "epoch": 2.145561632309634, "step": 2550}, {"loss": 1.6682, "grad_norm": 0.4212203025817871, "learning_rate": 0.0002, "epoch": 2.153975599495162, "step": 2560}, {"loss": 1.6036, "grad_norm": 0.42841723561286926, "learning_rate": 0.0002, "epoch": 2.16238956668069, "step": 2570}, {"loss": 1.5962, "grad_norm": 0.39272481203079224, "learning_rate": 0.0002, "epoch": 2.1708035338662177, "step": 2580}, {"loss": 1.681, "grad_norm": 0.4075261354446411, "learning_rate": 0.0002, "epoch": 2.179217501051746, "step": 2590}, {"loss": 1.6601, "grad_norm": 0.5358437895774841, "learning_rate": 0.0002, "epoch": 2.187631468237274, "step": 2600}, {"loss": 1.6423, "grad_norm": 0.4738350212574005, "learning_rate": 0.0002, "epoch": 2.1960454354228016, "step": 2610}, {"loss": 1.6386, "grad_norm": 0.446789026260376, "learning_rate": 0.0002, "epoch": 2.2044594026083297, "step": 2620}, {"loss": 1.6246, "grad_norm": 0.4615374505519867, "learning_rate": 0.0002, "epoch": 2.212873369793858, "step": 2630}, {"loss": 1.6205, "grad_norm": 0.46901994943618774, "learning_rate": 0.0002, "epoch": 2.221287336979386, "step": 2640}, {"loss": 1.6774, "grad_norm": 0.46267789602279663, "learning_rate": 0.0002, "epoch": 2.2297013041649136, "step": 2650}, {"loss": 1.6584, "grad_norm": 0.4383080005645752, "learning_rate": 0.0002, "epoch": 2.2381152713504417, "step": 2660}, {"loss": 1.5745, "grad_norm": 0.4070609509944916, "learning_rate": 0.0002, "epoch": 2.24652923853597, "step": 2670}, {"loss": 1.6125, "grad_norm": 0.4572339951992035, "learning_rate": 0.0002, "epoch": 2.2549432057214975, "step": 2680}, {"loss": 1.5671, "grad_norm": 0.393265038728714, "learning_rate": 0.0002, "epoch": 2.2633571729070256, "step": 2690}, {"loss": 1.6239, "grad_norm": 0.46144717931747437, "learning_rate": 0.0002, "epoch": 2.2717711400925538, "step": 2700}, {"loss": 1.5992, "grad_norm": 0.45077767968177795, "learning_rate": 0.0002, "epoch": 2.2801851072780814, "step": 2710}, {"loss": 1.6261, "grad_norm": 0.5697639584541321, "learning_rate": 0.0002, "epoch": 2.2885990744636096, "step": 2720}, {"loss": 1.6192, "grad_norm": 0.4855510890483856, "learning_rate": 0.0002, "epoch": 2.2970130416491377, "step": 2730}, {"loss": 1.7419, "grad_norm": 0.4440622627735138, "learning_rate": 0.0002, "epoch": 2.3054270088346653, "step": 2740}, {"loss": 1.6496, "grad_norm": 0.3904096782207489, "learning_rate": 0.0002, "epoch": 2.3138409760201935, "step": 2750}, {"loss": 1.5888, "grad_norm": 0.5225510597229004, "learning_rate": 0.0002, "epoch": 2.3222549432057216, "step": 2760}, {"loss": 1.6082, "grad_norm": 0.44866397976875305, "learning_rate": 0.0002, "epoch": 2.3306689103912497, "step": 2770}, {"loss": 1.6087, "grad_norm": 0.5167056322097778, "learning_rate": 0.0002, "epoch": 2.3390828775767774, "step": 2780}, {"loss": 1.6136, "grad_norm": 0.45913267135620117, "learning_rate": 0.0002, "epoch": 2.3474968447623055, "step": 2790}, {"loss": 1.6564, "grad_norm": 0.45787590742111206, "learning_rate": 0.0002, "epoch": 2.3559108119478336, "step": 2800}, {"loss": 1.6868, "grad_norm": 0.4633352756500244, "learning_rate": 0.0002, "epoch": 2.3643247791333613, "step": 2810}, {"loss": 1.6316, "grad_norm": 0.46390071511268616, "learning_rate": 0.0002, "epoch": 2.3727387463188894, "step": 2820}, {"loss": 1.6039, "grad_norm": 0.4261005222797394, "learning_rate": 0.0002, "epoch": 2.3811527135044175, "step": 2830}, {"loss": 1.6364, "grad_norm": 0.4283634424209595, "learning_rate": 0.0002, "epoch": 2.389566680689945, "step": 2840}, {"loss": 1.6382, "grad_norm": 0.4955291450023651, "learning_rate": 0.0002, "epoch": 2.3979806478754733, "step": 2850}, {"loss": 1.6173, "grad_norm": 0.4740189015865326, "learning_rate": 0.0002, "epoch": 2.4063946150610014, "step": 2860}, {"loss": 1.6403, "grad_norm": 0.4222276508808136, "learning_rate": 0.0002, "epoch": 2.414808582246529, "step": 2870}, {"loss": 1.5602, "grad_norm": 0.4982149004936218, "learning_rate": 0.0002, "epoch": 2.423222549432057, "step": 2880}, {"loss": 1.6313, "grad_norm": 0.5217409133911133, "learning_rate": 0.0002, "epoch": 2.4316365166175853, "step": 2890}, {"loss": 1.5804, "grad_norm": 0.4555884897708893, "learning_rate": 0.0002, "epoch": 2.4400504838031134, "step": 2900}, {"loss": 1.6189, "grad_norm": 0.43178579211235046, "learning_rate": 0.0002, "epoch": 2.448464450988641, "step": 2910}, {"loss": 1.6824, "grad_norm": 0.4788478910923004, "learning_rate": 0.0002, "epoch": 2.456878418174169, "step": 2920}, {"loss": 1.6829, "grad_norm": 0.43689873814582825, "learning_rate": 0.0002, "epoch": 2.465292385359697, "step": 2930}, {"loss": 1.6196, "grad_norm": 0.5115197896957397, "learning_rate": 0.0002, "epoch": 2.473706352545225, "step": 2940}, {"loss": 1.689, "grad_norm": 0.5290159583091736, "learning_rate": 0.0002, "epoch": 2.482120319730753, "step": 2950}, {"loss": 1.6499, "grad_norm": 0.46042463183403015, "learning_rate": 0.0002, "epoch": 2.490534286916281, "step": 2960}, {"loss": 1.6664, "grad_norm": 0.4359915852546692, "learning_rate": 0.0002, "epoch": 2.498948254101809, "step": 2970}, {"loss": 1.5812, "grad_norm": 0.46352964639663696, "learning_rate": 0.0002, "epoch": 2.507362221287337, "step": 2980}, {"loss": 1.6501, "grad_norm": 0.5324268341064453, "learning_rate": 0.0002, "epoch": 2.515776188472865, "step": 2990}, {"loss": 1.6115, "grad_norm": 0.5929607152938843, "learning_rate": 0.0002, "epoch": 2.5241901556583928, "step": 3000}, {"loss": 1.6772, "grad_norm": 0.4811333417892456, "learning_rate": 0.0002, "epoch": 2.532604122843921, "step": 3010}, {"loss": 1.7023, "grad_norm": 0.4662701487541199, "learning_rate": 0.0002, "epoch": 2.541018090029449, "step": 3020}, {"loss": 1.5426, "grad_norm": 0.4582270681858063, "learning_rate": 0.0002, "epoch": 2.549432057214977, "step": 3030}, {"loss": 1.6737, "grad_norm": 0.4679982662200928, "learning_rate": 0.0002, "epoch": 2.557846024400505, "step": 3040}, {"loss": 1.5442, "grad_norm": 0.4380294680595398, "learning_rate": 0.0002, "epoch": 2.566259991586033, "step": 3050}, {"loss": 1.6055, "grad_norm": 0.44295763969421387, "learning_rate": 0.0002, "epoch": 2.5746739587715606, "step": 3060}, {"loss": 1.5775, "grad_norm": 0.5131027698516846, "learning_rate": 0.0002, "epoch": 2.5830879259570887, "step": 3070}, {"loss": 1.546, "grad_norm": 0.47567516565322876, "learning_rate": 0.0002, "epoch": 2.591501893142617, "step": 3080}, {"loss": 1.5671, "grad_norm": 0.49002596735954285, "learning_rate": 0.0002, "epoch": 2.599915860328145, "step": 3090}, {"loss": 1.5445, "grad_norm": 0.44856327772140503, "learning_rate": 0.0002, "epoch": 2.6083298275136726, "step": 3100}, {"loss": 1.5797, "grad_norm": 0.4480142593383789, "learning_rate": 0.0002, "epoch": 2.6167437946992007, "step": 3110}, {"loss": 1.7132, "grad_norm": 0.4317494034767151, "learning_rate": 0.0002, "epoch": 2.6251577618847284, "step": 3120}, {"loss": 1.6321, "grad_norm": 0.42580848932266235, "learning_rate": 0.0002, "epoch": 2.6335717290702565, "step": 3130}, {"loss": 1.6483, "grad_norm": 0.4516814947128296, "learning_rate": 0.0002, "epoch": 2.6419856962557846, "step": 3140}, {"loss": 1.695, "grad_norm": 0.4438435733318329, "learning_rate": 0.0002, "epoch": 2.6503996634413127, "step": 3150}, {"loss": 1.6938, "grad_norm": 0.4385356307029724, "learning_rate": 0.0002, "epoch": 2.6588136306268404, "step": 3160}, {"loss": 1.6139, "grad_norm": 0.5064112544059753, "learning_rate": 0.0002, "epoch": 2.6672275978123685, "step": 3170}, {"loss": 1.7189, "grad_norm": 0.49163177609443665, "learning_rate": 0.0002, "epoch": 2.6756415649978966, "step": 3180}, {"loss": 1.7323, "grad_norm": 0.49339258670806885, "learning_rate": 0.0002, "epoch": 2.6840555321834243, "step": 3190}, {"loss": 1.6508, "grad_norm": 0.440950870513916, "learning_rate": 0.0002, "epoch": 2.6924694993689524, "step": 3200}, {"loss": 1.6305, "grad_norm": 0.4283970594406128, "learning_rate": 0.0002, "epoch": 2.7008834665544805, "step": 3210}, {"loss": 1.5935, "grad_norm": 0.43875712156295776, "learning_rate": 0.0002, "epoch": 2.7092974337400086, "step": 3220}, {"loss": 1.6129, "grad_norm": 0.49332964420318604, "learning_rate": 0.0002, "epoch": 2.7177114009255363, "step": 3230}, {"loss": 1.642, "grad_norm": 0.5225692391395569, "learning_rate": 0.0002, "epoch": 2.7261253681110644, "step": 3240}, {"loss": 1.6759, "grad_norm": 0.4856489300727844, "learning_rate": 0.0002, "epoch": 2.734539335296592, "step": 3250}, {"loss": 1.6463, "grad_norm": 0.46918296813964844, "learning_rate": 0.0002, "epoch": 2.74295330248212, "step": 3260}, {"loss": 1.6819, "grad_norm": 0.4802931249141693, "learning_rate": 0.0002, "epoch": 2.7513672696676483, "step": 3270}, {"loss": 1.6246, "grad_norm": 0.4485355615615845, "learning_rate": 0.0002, "epoch": 2.7597812368531764, "step": 3280}, {"loss": 1.6251, "grad_norm": 0.43944594264030457, "learning_rate": 0.0002, "epoch": 2.768195204038704, "step": 3290}, {"loss": 1.6501, "grad_norm": 0.46847742795944214, "learning_rate": 0.0002, "epoch": 2.7766091712242322, "step": 3300}, {"loss": 1.5969, "grad_norm": 0.4816027879714966, "learning_rate": 0.0002, "epoch": 2.7850231384097603, "step": 3310}, {"loss": 1.6293, "grad_norm": 0.453960120677948, "learning_rate": 0.0002, "epoch": 2.793437105595288, "step": 3320}, {"loss": 1.6429, "grad_norm": 0.4816017150878906, "learning_rate": 0.0002, "epoch": 2.801851072780816, "step": 3330}, {"loss": 1.6683, "grad_norm": 0.4461034834384918, "learning_rate": 0.0002, "epoch": 2.8102650399663442, "step": 3340}, {"loss": 1.7048, "grad_norm": 0.48821821808815, "learning_rate": 0.0002, "epoch": 2.8186790071518724, "step": 3350}, {"loss": 1.6076, "grad_norm": 0.4574853777885437, "learning_rate": 0.0002, "epoch": 2.8270929743374, "step": 3360}, {"loss": 1.6651, "grad_norm": 0.42062026262283325, "learning_rate": 0.0002, "epoch": 2.835506941522928, "step": 3370}, {"loss": 1.624, "grad_norm": 0.4499834477901459, "learning_rate": 0.0002, "epoch": 2.843920908708456, "step": 3380}, {"loss": 1.621, "grad_norm": 0.4780360758304596, "learning_rate": 0.0002, "epoch": 2.852334875893984, "step": 3390}, {"loss": 1.5882, "grad_norm": 0.45422887802124023, "learning_rate": 0.0002, "epoch": 2.860748843079512, "step": 3400}, {"loss": 1.6028, "grad_norm": 0.4590015709400177, "learning_rate": 0.0002, "epoch": 2.86916281026504, "step": 3410}, {"loss": 1.6746, "grad_norm": 0.45689624547958374, "learning_rate": 0.0002, "epoch": 2.877576777450568, "step": 3420}, {"loss": 1.6326, "grad_norm": 0.46953922510147095, "learning_rate": 0.0002, "epoch": 2.885990744636096, "step": 3430}, {"loss": 1.6015, "grad_norm": 0.4791966378688812, "learning_rate": 0.0002, "epoch": 2.8944047118216236, "step": 3440}, {"loss": 1.694, "grad_norm": 0.4842296242713928, "learning_rate": 0.0002, "epoch": 2.9028186790071517, "step": 3450}, {"loss": 1.6326, "grad_norm": 0.47219768166542053, "learning_rate": 0.0002, "epoch": 2.91123264619268, "step": 3460}, {"loss": 1.6486, "grad_norm": 0.4622127115726471, "learning_rate": 0.0002, "epoch": 2.919646613378208, "step": 3470}, {"loss": 1.6485, "grad_norm": 0.46832820773124695, "learning_rate": 0.0002, "epoch": 2.9280605805637356, "step": 3480}, {"loss": 1.6366, "grad_norm": 0.44582483172416687, "learning_rate": 0.0002, "epoch": 2.9364745477492638, "step": 3490}, {"loss": 1.6859, "grad_norm": 0.4987219274044037, "learning_rate": 0.0002, "epoch": 2.944888514934792, "step": 3500}, {"loss": 1.5991, "grad_norm": 0.43750956654548645, "learning_rate": 0.0002, "epoch": 2.9533024821203195, "step": 3510}, {"loss": 1.6236, "grad_norm": 0.49962925910949707, "learning_rate": 0.0002, "epoch": 2.9617164493058477, "step": 3520}, {"loss": 1.5859, "grad_norm": 0.5189590454101562, "learning_rate": 0.0002, "epoch": 2.9701304164913758, "step": 3530}, {"loss": 1.6688, "grad_norm": 0.391317754983902, "learning_rate": 0.0002, "epoch": 2.978544383676904, "step": 3540}, {"loss": 1.5884, "grad_norm": 0.44934695959091187, "learning_rate": 0.0002, "epoch": 2.9869583508624316, "step": 3550}, {"loss": 1.5688, "grad_norm": 0.4740142226219177, "learning_rate": 0.0002, "epoch": 2.9953723180479597, "step": 3560}, {"eval_loss": 1.8266887664794922, "eval_runtime": 37.9445, "eval_samples_per_second": 13.572, "eval_steps_per_second": 1.713, "epoch": 2.9995793016407237, "step": 3565}, {"loss": 1.5939, "grad_norm": 0.4523724615573883, "learning_rate": 0.0002, "epoch": 3.003786285233488, "step": 3570}, {"loss": 1.526, "grad_norm": 0.5261380076408386, "learning_rate": 0.0002, "epoch": 3.0122002524190155, "step": 3580}, {"loss": 1.4946, "grad_norm": 0.48664888739585876, "learning_rate": 0.0002, "epoch": 3.0206142196045436, "step": 3590}, {"loss": 1.5193, "grad_norm": 0.5070882439613342, "learning_rate": 0.0002, "epoch": 3.0290281867900717, "step": 3600}, {"loss": 1.5316, "grad_norm": 0.5816011428833008, "learning_rate": 0.0002, "epoch": 3.0374421539755994, "step": 3610}, {"loss": 1.5682, "grad_norm": 0.6610211730003357, "learning_rate": 0.0002, "epoch": 3.0458561211611275, "step": 3620}, {"loss": 1.5699, "grad_norm": 0.5257703065872192, "learning_rate": 0.0002, "epoch": 3.0542700883466556, "step": 3630}, {"loss": 1.4438, "grad_norm": 0.5574390888214111, "learning_rate": 0.0002, "epoch": 3.0626840555321833, "step": 3640}, {"loss": 1.547, "grad_norm": 0.5682297348976135, "learning_rate": 0.0002, "epoch": 3.0710980227177114, "step": 3650}, {"loss": 1.5743, "grad_norm": 0.5798383355140686, "learning_rate": 0.0002, "epoch": 3.0795119899032395, "step": 3660}, {"loss": 1.4339, "grad_norm": 0.5458289980888367, "learning_rate": 0.0002, "epoch": 3.087925957088767, "step": 3670}, {"loss": 1.46, "grad_norm": 0.5599102973937988, "learning_rate": 0.0002, "epoch": 3.0963399242742953, "step": 3680}, {"loss": 1.4589, "grad_norm": 0.5023021697998047, "learning_rate": 0.0002, "epoch": 3.1047538914598234, "step": 3690}, {"loss": 1.5114, "grad_norm": 0.5448206067085266, "learning_rate": 0.0002, "epoch": 3.113167858645351, "step": 3700}, {"loss": 1.4692, "grad_norm": 0.5760458707809448, "learning_rate": 0.0002, "epoch": 3.121581825830879, "step": 3710}, {"loss": 1.4789, "grad_norm": 0.6018968224525452, "learning_rate": 0.0002, "epoch": 3.1299957930164073, "step": 3720}, {"loss": 1.5518, "grad_norm": 0.5767101049423218, "learning_rate": 0.0002, "epoch": 3.1384097602019354, "step": 3730}, {"loss": 1.5032, "grad_norm": 0.5333963632583618, "learning_rate": 0.0002, "epoch": 3.146823727387463, "step": 3740}, {"loss": 1.4812, "grad_norm": 0.5918396711349487, "learning_rate": 0.0002, "epoch": 3.155237694572991, "step": 3750}, {"loss": 1.4618, "grad_norm": 0.5931203365325928, "learning_rate": 0.0002, "epoch": 3.1636516617585193, "step": 3760}, {"loss": 1.5592, "grad_norm": 0.6562168598175049, "learning_rate": 0.0002, "epoch": 3.172065628944047, "step": 3770}, {"loss": 1.4932, "grad_norm": 0.5820156335830688, "learning_rate": 0.0002, "epoch": 3.180479596129575, "step": 3780}, {"loss": 1.4523, "grad_norm": 0.5784737467765808, "learning_rate": 0.0002, "epoch": 3.188893563315103, "step": 3790}, {"loss": 1.498, "grad_norm": 0.5506529808044434, "learning_rate": 0.0002, "epoch": 3.197307530500631, "step": 3800}, {"loss": 1.4819, "grad_norm": 0.6101595163345337, "learning_rate": 0.0002, "epoch": 3.205721497686159, "step": 3810}, {"loss": 1.5185, "grad_norm": 0.5597806572914124, "learning_rate": 0.0002, "epoch": 3.214135464871687, "step": 3820}, {"loss": 1.5664, "grad_norm": 0.5641011595726013, "learning_rate": 0.0002, "epoch": 3.222549432057215, "step": 3830}, {"loss": 1.4702, "grad_norm": 0.5892080068588257, "learning_rate": 0.0002, "epoch": 3.230963399242743, "step": 3840}, {"loss": 1.4194, "grad_norm": 0.6034760475158691, "learning_rate": 0.0002, "epoch": 3.239377366428271, "step": 3850}, {"loss": 1.5499, "grad_norm": 0.5112439393997192, "learning_rate": 0.0002, "epoch": 3.247791333613799, "step": 3860}, {"loss": 1.5132, "grad_norm": 0.56565922498703, "learning_rate": 0.0002, "epoch": 3.256205300799327, "step": 3870}, {"loss": 1.4892, "grad_norm": 0.6155247092247009, "learning_rate": 0.0002, "epoch": 3.264619267984855, "step": 3880}, {"loss": 1.5118, "grad_norm": 0.6064623594284058, "learning_rate": 0.0002, "epoch": 3.273033235170383, "step": 3890}, {"loss": 1.5236, "grad_norm": 0.6313768029212952, "learning_rate": 0.0002, "epoch": 3.2814472023559107, "step": 3900}, {"loss": 1.5551, "grad_norm": 0.5903939008712769, "learning_rate": 0.0002, "epoch": 3.289861169541439, "step": 3910}, {"loss": 1.5703, "grad_norm": 0.5770667195320129, "learning_rate": 0.0002, "epoch": 3.298275136726967, "step": 3920}, {"loss": 1.5159, "grad_norm": 0.5785196423530579, "learning_rate": 0.0002, "epoch": 3.3066891039124946, "step": 3930}, {"loss": 1.5277, "grad_norm": 0.6468310356140137, "learning_rate": 0.0002, "epoch": 3.3151030710980227, "step": 3940}, {"loss": 1.6002, "grad_norm": 0.6200279593467712, "learning_rate": 0.0002, "epoch": 3.323517038283551, "step": 3950}, {"loss": 1.5264, "grad_norm": 0.5779302716255188, "learning_rate": 0.0002, "epoch": 3.3319310054690785, "step": 3960}, {"loss": 1.4861, "grad_norm": 0.5463796854019165, "learning_rate": 0.0002, "epoch": 3.3403449726546066, "step": 3970}, {"loss": 1.541, "grad_norm": 0.6117855906486511, "learning_rate": 0.0002, "epoch": 3.3487589398401347, "step": 3980}, {"loss": 1.5566, "grad_norm": 0.5554766058921814, "learning_rate": 0.0002, "epoch": 3.357172907025663, "step": 3990}, {"loss": 1.5004, "grad_norm": 0.6012870073318481, "learning_rate": 0.0002, "epoch": 3.3655868742111905, "step": 4000}, {"loss": 1.473, "grad_norm": 0.5443974137306213, "learning_rate": 0.0002, "epoch": 3.3740008413967186, "step": 4010}, {"loss": 1.5139, "grad_norm": 0.6636057496070862, "learning_rate": 0.0002, "epoch": 3.3824148085822463, "step": 4020}, {"loss": 1.5141, "grad_norm": 0.5801246166229248, "learning_rate": 0.0002, "epoch": 3.3908287757677744, "step": 4030}, {"loss": 1.5026, "grad_norm": 0.5668839812278748, "learning_rate": 0.0002, "epoch": 3.3992427429533025, "step": 4040}, {"loss": 1.523, "grad_norm": 0.7763481736183167, "learning_rate": 0.0002, "epoch": 3.4076567101388306, "step": 4050}, {"loss": 1.4932, "grad_norm": 0.6675992608070374, "learning_rate": 0.0002, "epoch": 3.4160706773243583, "step": 4060}, {"loss": 1.4959, "grad_norm": 0.6290077567100525, "learning_rate": 0.0002, "epoch": 3.4244846445098864, "step": 4070}, {"loss": 1.5766, "grad_norm": 0.6040239930152893, "learning_rate": 0.0002, "epoch": 3.4328986116954145, "step": 4080}, {"loss": 1.5711, "grad_norm": 0.6237877607345581, "learning_rate": 0.0002, "epoch": 3.441312578880942, "step": 4090}, {"loss": 1.4961, "grad_norm": 0.5343508124351501, "learning_rate": 0.0002, "epoch": 3.4497265460664703, "step": 4100}, {"loss": 1.5123, "grad_norm": 0.6817412972450256, "learning_rate": 0.0002, "epoch": 3.4581405132519984, "step": 4110}, {"loss": 1.5377, "grad_norm": 0.7115170359611511, "learning_rate": 0.0002, "epoch": 3.466554480437526, "step": 4120}, {"loss": 1.5275, "grad_norm": 0.6127332448959351, "learning_rate": 0.0002, "epoch": 3.4749684476230542, "step": 4130}, {"loss": 1.557, "grad_norm": 0.5745994448661804, "learning_rate": 0.0002, "epoch": 3.4833824148085824, "step": 4140}, {"loss": 1.4873, "grad_norm": 0.6248795390129089, "learning_rate": 0.0002, "epoch": 3.49179638199411, "step": 4150}, {"loss": 1.4885, "grad_norm": 0.5821124911308289, "learning_rate": 0.0002, "epoch": 3.500210349179638, "step": 4160}, {"loss": 1.4937, "grad_norm": 0.561416506767273, "learning_rate": 0.0002, "epoch": 3.5086243163651663, "step": 4170}, {"loss": 1.5453, "grad_norm": 0.5848962664604187, "learning_rate": 0.0002, "epoch": 3.5170382835506944, "step": 4180}, {"loss": 1.5892, "grad_norm": 0.5335569977760315, "learning_rate": 0.0002, "epoch": 3.525452250736222, "step": 4190}, {"loss": 1.5152, "grad_norm": 0.547964870929718, "learning_rate": 0.0002, "epoch": 3.53386621792175, "step": 4200}, {"loss": 1.4887, "grad_norm": 0.6157727241516113, "learning_rate": 0.0002, "epoch": 3.542280185107278, "step": 4210}, {"loss": 1.5484, "grad_norm": 0.6163121461868286, "learning_rate": 0.0002, "epoch": 3.550694152292806, "step": 4220}, {"loss": 1.5833, "grad_norm": 0.5844616293907166, "learning_rate": 0.0002, "epoch": 3.559108119478334, "step": 4230}, {"loss": 1.5305, "grad_norm": 0.7104926109313965, "learning_rate": 0.0002, "epoch": 3.567522086663862, "step": 4240}, {"loss": 1.5161, "grad_norm": 0.5055213570594788, "learning_rate": 0.0002, "epoch": 3.57593605384939, "step": 4250}, {"loss": 1.482, "grad_norm": 0.611676812171936, "learning_rate": 0.0002, "epoch": 3.584350021034918, "step": 4260}, {"loss": 1.5048, "grad_norm": 0.6326440572738647, "learning_rate": 0.0002, "epoch": 3.592763988220446, "step": 4270}, {"loss": 1.5122, "grad_norm": 0.6290925741195679, "learning_rate": 0.0002, "epoch": 3.6011779554059737, "step": 4280}, {"loss": 1.5654, "grad_norm": 0.5691978931427002, "learning_rate": 0.0002, "epoch": 3.609591922591502, "step": 4290}, {"loss": 1.4854, "grad_norm": 0.6071329116821289, "learning_rate": 0.0002, "epoch": 3.61800588977703, "step": 4300}, {"loss": 1.5336, "grad_norm": 0.606573224067688, "learning_rate": 0.0002, "epoch": 3.626419856962558, "step": 4310}, {"loss": 1.6437, "grad_norm": 0.5515419244766235, "learning_rate": 0.0002, "epoch": 3.6348338241480858, "step": 4320}, {"loss": 1.498, "grad_norm": 0.5964660048484802, "learning_rate": 0.0002, "epoch": 3.643247791333614, "step": 4330}, {"loss": 1.544, "grad_norm": 0.5774146914482117, "learning_rate": 0.0002, "epoch": 3.6516617585191415, "step": 4340}, {"loss": 1.5566, "grad_norm": 0.5732731223106384, "learning_rate": 0.0002, "epoch": 3.6600757257046697, "step": 4350}, {"loss": 1.5682, "grad_norm": 0.7354163527488708, "learning_rate": 0.0002, "epoch": 3.6684896928901978, "step": 4360}, {"loss": 1.5225, "grad_norm": 0.6220902800559998, "learning_rate": 0.0002, "epoch": 3.676903660075726, "step": 4370}, {"loss": 1.4838, "grad_norm": 0.6053991317749023, "learning_rate": 0.0002, "epoch": 3.6853176272612536, "step": 4380}, {"loss": 1.5161, "grad_norm": 0.67010897397995, "learning_rate": 0.0002, "epoch": 3.6937315944467817, "step": 4390}, {"loss": 1.5381, "grad_norm": 0.6139186024665833, "learning_rate": 0.0002, "epoch": 3.70214556163231, "step": 4400}, {"loss": 1.5088, "grad_norm": 0.5433071851730347, "learning_rate": 0.0002, "epoch": 3.7105595288178375, "step": 4410}, {"loss": 1.5337, "grad_norm": 0.5453870296478271, "learning_rate": 0.0002, "epoch": 3.7189734960033656, "step": 4420}, {"loss": 1.4549, "grad_norm": 0.6401727199554443, "learning_rate": 0.0002, "epoch": 3.7273874631888937, "step": 4430}, {"loss": 1.503, "grad_norm": 0.6049367189407349, "learning_rate": 0.0002, "epoch": 3.735801430374422, "step": 4440}, {"loss": 1.5268, "grad_norm": 0.5740529298782349, "learning_rate": 0.0002, "epoch": 3.7442153975599495, "step": 4450}, {"loss": 1.5183, "grad_norm": 0.6521880626678467, "learning_rate": 0.0002, "epoch": 3.7526293647454776, "step": 4460}, {"loss": 1.5741, "grad_norm": 0.7096368074417114, "learning_rate": 0.0002, "epoch": 3.7610433319310053, "step": 4470}, {"loss": 1.5786, "grad_norm": 0.5886474251747131, "learning_rate": 0.0002, "epoch": 3.7694572991165334, "step": 4480}, {"loss": 1.5887, "grad_norm": 0.5821043252944946, "learning_rate": 0.0002, "epoch": 3.7778712663020615, "step": 4490}, {"loss": 1.5777, "grad_norm": 0.628892183303833, "learning_rate": 0.0002, "epoch": 3.7862852334875896, "step": 4500}, {"loss": 1.4708, "grad_norm": 0.5962669849395752, "learning_rate": 0.0002, "epoch": 3.7946992006731173, "step": 4510}, {"loss": 1.5267, "grad_norm": 0.6635549068450928, "learning_rate": 0.0002, "epoch": 3.8031131678586454, "step": 4520}, {"loss": 1.5058, "grad_norm": 0.6010760068893433, "learning_rate": 0.0002, "epoch": 3.811527135044173, "step": 4530}, {"loss": 1.6228, "grad_norm": 0.6322658658027649, "learning_rate": 0.0002, "epoch": 3.819941102229701, "step": 4540}, {"loss": 1.5029, "grad_norm": 0.5893137454986572, "learning_rate": 0.0002, "epoch": 3.8283550694152293, "step": 4550}, {"loss": 1.5435, "grad_norm": 0.7829602360725403, "learning_rate": 0.0002, "epoch": 3.8367690366007574, "step": 4560}, {"loss": 1.5453, "grad_norm": 0.6190396547317505, "learning_rate": 0.0002, "epoch": 3.845183003786285, "step": 4570}, {"loss": 1.5292, "grad_norm": 0.6662813425064087, "learning_rate": 0.0002, "epoch": 3.853596970971813, "step": 4580}, {"loss": 1.5065, "grad_norm": 0.5809855461120605, "learning_rate": 0.0002, "epoch": 3.8620109381573413, "step": 4590}, {"loss": 1.5041, "grad_norm": 0.5779069662094116, "learning_rate": 0.0002, "epoch": 3.870424905342869, "step": 4600}, {"loss": 1.498, "grad_norm": 0.5603038668632507, "learning_rate": 0.0002, "epoch": 3.878838872528397, "step": 4610}, {"loss": 1.5372, "grad_norm": 0.6274181008338928, "learning_rate": 0.0002, "epoch": 3.887252839713925, "step": 4620}, {"loss": 1.4996, "grad_norm": 0.6810959577560425, "learning_rate": 0.0002, "epoch": 3.8956668068994533, "step": 4630}, {"loss": 1.4956, "grad_norm": 0.5647315979003906, "learning_rate": 0.0002, "epoch": 3.904080774084981, "step": 4640}, {"loss": 1.5424, "grad_norm": 0.6830295324325562, "learning_rate": 0.0002, "epoch": 3.912494741270509, "step": 4650}, {"loss": 1.535, "grad_norm": 0.652565598487854, "learning_rate": 0.0002, "epoch": 3.920908708456037, "step": 4660}, {"loss": 1.4772, "grad_norm": 0.5806284546852112, "learning_rate": 0.0002, "epoch": 3.929322675641565, "step": 4670}, {"loss": 1.5812, "grad_norm": 0.6825073957443237, "learning_rate": 0.0002, "epoch": 3.937736642827093, "step": 4680}, {"loss": 1.5516, "grad_norm": 0.6149451732635498, "learning_rate": 0.0002, "epoch": 3.946150610012621, "step": 4690}, {"loss": 1.5608, "grad_norm": 0.6152557134628296, "learning_rate": 0.0002, "epoch": 3.954564577198149, "step": 4700}, {"loss": 1.4897, "grad_norm": 0.6239011883735657, "learning_rate": 0.0002, "epoch": 3.962978544383677, "step": 4710}, {"loss": 1.538, "grad_norm": 0.6485443115234375, "learning_rate": 0.0002, "epoch": 3.971392511569205, "step": 4720}, {"loss": 1.5226, "grad_norm": 0.6449228525161743, "learning_rate": 0.0002, "epoch": 3.9798064787547327, "step": 4730}, {"loss": 1.5087, "grad_norm": 0.6526407599449158, "learning_rate": 0.0002, "epoch": 3.988220445940261, "step": 4740}, {"loss": 1.5026, "grad_norm": 0.6277706027030945, "learning_rate": 0.0002, "epoch": 3.996634413125789, "step": 4750}, {"eval_loss": 1.871641755104065, "eval_runtime": 37.9637, "eval_samples_per_second": 13.566, "eval_steps_per_second": 1.712, "epoch": 4.0, "step": 4754}, {"loss": 1.4744, "grad_norm": 0.6994837522506714, "learning_rate": 0.0002, "epoch": 4.005048380311317, "step": 4760}, {"loss": 1.4433, "grad_norm": 0.8728373050689697, "learning_rate": 0.0002, "epoch": 4.013462347496845, "step": 4770}, {"loss": 1.3329, "grad_norm": 0.688679575920105, "learning_rate": 0.0002, "epoch": 4.021876314682372, "step": 4780}, {"loss": 1.3999, "grad_norm": 0.6313387155532837, "learning_rate": 0.0002, "epoch": 4.0302902818679005, "step": 4790}, {"loss": 1.3346, "grad_norm": 0.6577984690666199, "learning_rate": 0.0002, "epoch": 4.038704249053429, "step": 4800}, {"loss": 1.3403, "grad_norm": 0.7938185930252075, "learning_rate": 0.0002, "epoch": 4.047118216238957, "step": 4810}, {"loss": 1.3716, "grad_norm": 0.760399580001831, "learning_rate": 0.0002, "epoch": 4.055532183424485, "step": 4820}, {"loss": 1.4321, "grad_norm": 0.7329602241516113, "learning_rate": 0.0002, "epoch": 4.063946150610013, "step": 4830}, {"loss": 1.4133, "grad_norm": 0.7778576016426086, "learning_rate": 0.0002, "epoch": 4.07236011779554, "step": 4840}, {"loss": 1.4372, "grad_norm": 0.8235865235328674, "learning_rate": 0.0002, "epoch": 4.080774084981068, "step": 4850}, {"loss": 1.3719, "grad_norm": 0.7743754386901855, "learning_rate": 0.0002, "epoch": 4.089188052166596, "step": 4860}, {"loss": 1.3787, "grad_norm": 0.8145367503166199, "learning_rate": 0.0002, "epoch": 4.0976020193521245, "step": 4870}, {"loss": 1.356, "grad_norm": 0.8517307639122009, "learning_rate": 0.0002, "epoch": 4.106015986537653, "step": 4880}, {"loss": 1.4191, "grad_norm": 0.8208953142166138, "learning_rate": 0.0002, "epoch": 4.114429953723181, "step": 4890}, {"loss": 1.3189, "grad_norm": 0.8437790870666504, "learning_rate": 0.0002, "epoch": 4.122843920908709, "step": 4900}, {"loss": 1.3987, "grad_norm": 0.716672420501709, "learning_rate": 0.0002, "epoch": 4.131257888094236, "step": 4910}, {"loss": 1.4392, "grad_norm": 0.7656235098838806, "learning_rate": 0.0002, "epoch": 4.139671855279764, "step": 4920}, {"loss": 1.3408, "grad_norm": 0.7209306955337524, "learning_rate": 0.0002, "epoch": 4.148085822465292, "step": 4930}, {"loss": 1.3639, "grad_norm": 0.7731267809867859, "learning_rate": 0.0002, "epoch": 4.1564997896508205, "step": 4940}, {"loss": 1.4151, "grad_norm": 0.7477553486824036, "learning_rate": 0.0002, "epoch": 4.164913756836349, "step": 4950}, {"loss": 1.3485, "grad_norm": 0.7372981309890747, "learning_rate": 0.0002, "epoch": 4.173327724021877, "step": 4960}, {"loss": 1.3901, "grad_norm": 0.6582154035568237, "learning_rate": 0.0002, "epoch": 4.181741691207404, "step": 4970}, {"loss": 1.3343, "grad_norm": 0.7003206610679626, "learning_rate": 0.0002, "epoch": 4.190155658392932, "step": 4980}, {"loss": 1.4098, "grad_norm": 0.735223650932312, "learning_rate": 0.0002, "epoch": 4.19856962557846, "step": 4990}, {"loss": 1.3564, "grad_norm": 0.7832302451133728, "learning_rate": 0.0002, "epoch": 4.206983592763988, "step": 5000}, {"loss": 1.3622, "grad_norm": 0.8819546103477478, "learning_rate": 0.0002, "epoch": 4.215397559949516, "step": 5010}, {"loss": 1.4438, "grad_norm": 0.9325336813926697, "learning_rate": 0.0002, "epoch": 4.2238115271350445, "step": 5020}, {"loss": 1.3886, "grad_norm": 0.7007517218589783, "learning_rate": 0.0002, "epoch": 4.232225494320572, "step": 5030}, {"loss": 1.3683, "grad_norm": 0.7118321061134338, "learning_rate": 0.0002, "epoch": 4.2406394615061, "step": 5040}, {"loss": 1.2365, "grad_norm": 0.6578946709632874, "learning_rate": 0.0002, "epoch": 4.249053428691628, "step": 5050}, {"loss": 1.3696, "grad_norm": 0.9438983798027039, "learning_rate": 0.0002, "epoch": 4.257467395877156, "step": 5060}, {"loss": 1.3868, "grad_norm": 0.703037679195404, "learning_rate": 0.0002, "epoch": 4.265881363062684, "step": 5070}, {"loss": 1.3687, "grad_norm": 0.7286025285720825, "learning_rate": 0.0002, "epoch": 4.274295330248212, "step": 5080}, {"loss": 1.3605, "grad_norm": 0.750689685344696, "learning_rate": 0.0002, "epoch": 4.28270929743374, "step": 5090}, {"loss": 1.5089, "grad_norm": 0.869753360748291, "learning_rate": 0.0002, "epoch": 4.291123264619268, "step": 5100}, {"loss": 1.4128, "grad_norm": 0.8712980151176453, "learning_rate": 0.0002, "epoch": 4.299537231804796, "step": 5110}, {"loss": 1.3977, "grad_norm": 0.690263569355011, "learning_rate": 0.0002, "epoch": 4.307951198990324, "step": 5120}, {"loss": 1.4088, "grad_norm": 0.7114760279655457, "learning_rate": 0.0002, "epoch": 4.316365166175852, "step": 5130}, {"loss": 1.363, "grad_norm": 0.7588112354278564, "learning_rate": 0.0002, "epoch": 4.32477913336138, "step": 5140}, {"loss": 1.4408, "grad_norm": 0.7556202411651611, "learning_rate": 0.0002, "epoch": 4.333193100546908, "step": 5150}, {"loss": 1.4203, "grad_norm": 0.8357610702514648, "learning_rate": 0.0002, "epoch": 4.341607067732435, "step": 5160}, {"loss": 1.3348, "grad_norm": 0.8054035902023315, "learning_rate": 0.0002, "epoch": 4.3500210349179635, "step": 5170}, {"loss": 1.3109, "grad_norm": 0.7637107968330383, "learning_rate": 0.0002, "epoch": 4.358435002103492, "step": 5180}, {"loss": 1.3744, "grad_norm": 0.757481038570404, "learning_rate": 0.0002, "epoch": 4.36684896928902, "step": 5190}, {"loss": 1.3622, "grad_norm": 0.7185863852500916, "learning_rate": 0.0002, "epoch": 4.375262936474548, "step": 5200}, {"loss": 1.3896, "grad_norm": 0.7326455116271973, "learning_rate": 0.0002, "epoch": 4.383676903660076, "step": 5210}, {"loss": 1.4098, "grad_norm": 0.7980523109436035, "learning_rate": 0.0002, "epoch": 4.392090870845603, "step": 5220}, {"loss": 1.3783, "grad_norm": 0.8526999354362488, "learning_rate": 0.0002, "epoch": 4.400504838031131, "step": 5230}, {"loss": 1.4022, "grad_norm": 0.7012337446212769, "learning_rate": 0.0002, "epoch": 4.4089188052166595, "step": 5240}, {"loss": 1.3552, "grad_norm": 0.8217827677726746, "learning_rate": 0.0002, "epoch": 4.417332772402188, "step": 5250}, {"loss": 1.3482, "grad_norm": 0.7141005396842957, "learning_rate": 0.0002, "epoch": 4.425746739587716, "step": 5260}, {"loss": 1.3699, "grad_norm": 0.7094302177429199, "learning_rate": 0.0002, "epoch": 4.434160706773244, "step": 5270}, {"loss": 1.3527, "grad_norm": 0.7234613299369812, "learning_rate": 0.0002, "epoch": 4.442574673958772, "step": 5280}, {"loss": 1.4769, "grad_norm": 0.7530457973480225, "learning_rate": 0.0002, "epoch": 4.450988641144299, "step": 5290}, {"loss": 1.3944, "grad_norm": 0.7300912141799927, "learning_rate": 0.0002, "epoch": 4.459402608329827, "step": 5300}, {"loss": 1.3844, "grad_norm": 0.825443685054779, "learning_rate": 0.0002, "epoch": 4.467816575515355, "step": 5310}, {"loss": 1.3648, "grad_norm": 0.7559658885002136, "learning_rate": 0.0002, "epoch": 4.4762305427008835, "step": 5320}, {"loss": 1.4364, "grad_norm": 0.8817561268806458, "learning_rate": 0.0002, "epoch": 4.484644509886412, "step": 5330}, {"loss": 1.3618, "grad_norm": 0.8203575611114502, "learning_rate": 0.0002, "epoch": 4.49305847707194, "step": 5340}, {"loss": 1.3996, "grad_norm": 0.7677690982818604, "learning_rate": 0.0002, "epoch": 4.501472444257468, "step": 5350}, {"loss": 1.4142, "grad_norm": 0.657085120677948, "learning_rate": 0.0002, "epoch": 4.509886411442995, "step": 5360}, {"loss": 1.3722, "grad_norm": 0.7939504384994507, "learning_rate": 0.0002, "epoch": 4.518300378628523, "step": 5370}, {"loss": 1.4361, "grad_norm": 0.6971889138221741, "learning_rate": 0.0002, "epoch": 4.526714345814051, "step": 5380}, {"loss": 1.3637, "grad_norm": 0.6984175443649292, "learning_rate": 0.0002, "epoch": 4.535128312999579, "step": 5390}, {"loss": 1.341, "grad_norm": 0.8504858613014221, "learning_rate": 0.0002, "epoch": 4.5435422801851075, "step": 5400}, {"loss": 1.4026, "grad_norm": 0.9134073853492737, "learning_rate": 0.0002, "epoch": 4.551956247370635, "step": 5410}, {"loss": 1.4375, "grad_norm": 0.7765598893165588, "learning_rate": 0.0002, "epoch": 4.560370214556163, "step": 5420}, {"loss": 1.4832, "grad_norm": 0.6991009712219238, "learning_rate": 0.0002, "epoch": 4.568784181741691, "step": 5430}, {"loss": 1.4021, "grad_norm": 0.8393039107322693, "learning_rate": 0.0002, "epoch": 4.577198148927219, "step": 5440}, {"loss": 1.3976, "grad_norm": 0.7685918211936951, "learning_rate": 0.0002, "epoch": 4.585612116112747, "step": 5450}, {"loss": 1.3883, "grad_norm": 0.7135679721832275, "learning_rate": 0.0002, "epoch": 4.594026083298275, "step": 5460}, {"loss": 1.4083, "grad_norm": 0.6728870868682861, "learning_rate": 0.0002, "epoch": 4.6024400504838034, "step": 5470}, {"loss": 1.3698, "grad_norm": 0.7139479517936707, "learning_rate": 0.0002, "epoch": 4.610854017669331, "step": 5480}, {"loss": 1.3498, "grad_norm": 0.8476598858833313, "learning_rate": 0.0002, "epoch": 4.619267984854859, "step": 5490}, {"loss": 1.3389, "grad_norm": 0.8034361004829407, "learning_rate": 0.0002, "epoch": 4.627681952040387, "step": 5500}, {"loss": 1.4179, "grad_norm": 0.7452183961868286, "learning_rate": 0.0002, "epoch": 4.636095919225915, "step": 5510}, {"loss": 1.4031, "grad_norm": 0.8394148945808411, "learning_rate": 0.0002, "epoch": 4.644509886411443, "step": 5520}, {"loss": 1.4561, "grad_norm": 0.7480153441429138, "learning_rate": 0.0002, "epoch": 4.652923853596971, "step": 5530}, {"loss": 1.378, "grad_norm": 0.7781714797019958, "learning_rate": 0.0002, "epoch": 4.661337820782499, "step": 5540}, {"loss": 1.3924, "grad_norm": 1.0058213472366333, "learning_rate": 0.0002, "epoch": 4.669751787968027, "step": 5550}, {"loss": 1.4198, "grad_norm": 0.7403179407119751, "learning_rate": 0.0002, "epoch": 4.678165755153555, "step": 5560}, {"loss": 1.4328, "grad_norm": 0.7270476818084717, "learning_rate": 0.0002, "epoch": 4.686579722339083, "step": 5570}, {"loss": 1.378, "grad_norm": 0.760877788066864, "learning_rate": 0.0002, "epoch": 4.694993689524611, "step": 5580}, {"loss": 1.387, "grad_norm": 0.8097004890441895, "learning_rate": 0.0002, "epoch": 4.703407656710139, "step": 5590}, {"loss": 1.3661, "grad_norm": 0.9096523523330688, "learning_rate": 0.0002, "epoch": 4.711821623895667, "step": 5600}, {"loss": 1.4012, "grad_norm": 0.7262444496154785, "learning_rate": 0.0002, "epoch": 4.720235591081195, "step": 5610}, {"loss": 1.422, "grad_norm": 0.8207762837409973, "learning_rate": 0.0002, "epoch": 4.7286495582667225, "step": 5620}, {"loss": 1.4017, "grad_norm": 0.8089601993560791, "learning_rate": 0.0002, "epoch": 4.737063525452251, "step": 5630}, {"loss": 1.3675, "grad_norm": 0.7609543800354004, "learning_rate": 0.0002, "epoch": 4.745477492637779, "step": 5640}, {"loss": 1.4085, "grad_norm": 0.7273501753807068, "learning_rate": 0.0002, "epoch": 4.753891459823307, "step": 5650}, {"loss": 1.3849, "grad_norm": 0.7800219058990479, "learning_rate": 0.0002, "epoch": 4.762305427008835, "step": 5660}, {"loss": 1.4319, "grad_norm": 0.8558377623558044, "learning_rate": 0.0002, "epoch": 4.770719394194362, "step": 5670}, {"loss": 1.3831, "grad_norm": 0.7131547927856445, "learning_rate": 0.0002, "epoch": 4.77913336137989, "step": 5680}, {"loss": 1.407, "grad_norm": 0.7651025056838989, "learning_rate": 0.0002, "epoch": 4.787547328565418, "step": 5690}, {"loss": 1.3882, "grad_norm": 0.8129976391792297, "learning_rate": 0.0002, "epoch": 4.7959612957509465, "step": 5700}, {"loss": 1.4347, "grad_norm": 0.8019895553588867, "learning_rate": 0.0002, "epoch": 4.804375262936475, "step": 5710}, {"loss": 1.3961, "grad_norm": 0.7692018151283264, "learning_rate": 0.0002, "epoch": 4.812789230122003, "step": 5720}, {"loss": 1.419, "grad_norm": 0.6893943548202515, "learning_rate": 0.0002, "epoch": 4.821203197307531, "step": 5730}, {"loss": 1.4453, "grad_norm": 0.6881810426712036, "learning_rate": 0.0002, "epoch": 4.829617164493058, "step": 5740}, {"loss": 1.4775, "grad_norm": 0.7838267683982849, "learning_rate": 0.0002, "epoch": 4.838031131678586, "step": 5750}, {"loss": 1.3857, "grad_norm": 0.727799117565155, "learning_rate": 0.0002, "epoch": 4.846445098864114, "step": 5760}, {"loss": 1.4685, "grad_norm": 0.7458277344703674, "learning_rate": 0.0002, "epoch": 4.8548590660496425, "step": 5770}, {"loss": 1.4426, "grad_norm": 0.903802216053009, "learning_rate": 0.0002, "epoch": 4.863273033235171, "step": 5780}, {"loss": 1.451, "grad_norm": 0.7983472347259521, "learning_rate": 0.0002, "epoch": 4.871687000420699, "step": 5790}, {"loss": 1.4534, "grad_norm": 0.6894361972808838, "learning_rate": 0.0002, "epoch": 4.880100967606227, "step": 5800}, {"loss": 1.4486, "grad_norm": 0.7499409317970276, "learning_rate": 0.0002, "epoch": 4.888514934791754, "step": 5810}, {"loss": 1.4253, "grad_norm": 0.7362820506095886, "learning_rate": 0.0002, "epoch": 4.896928901977282, "step": 5820}, {"loss": 1.3763, "grad_norm": 0.8341619968414307, "learning_rate": 0.0002, "epoch": 4.90534286916281, "step": 5830}, {"loss": 1.3748, "grad_norm": 0.9604470133781433, "learning_rate": 0.0002, "epoch": 4.913756836348338, "step": 5840}, {"loss": 1.3658, "grad_norm": 0.8916844129562378, "learning_rate": 0.0002, "epoch": 4.9221708035338665, "step": 5850}, {"loss": 1.363, "grad_norm": 0.8519647121429443, "learning_rate": 0.0002, "epoch": 4.930584770719394, "step": 5860}, {"loss": 1.424, "grad_norm": 0.7946906089782715, "learning_rate": 0.0002, "epoch": 4.938998737904922, "step": 5870}, {"loss": 1.4071, "grad_norm": 0.7843789458274841, "learning_rate": 0.0002, "epoch": 4.94741270509045, "step": 5880}, {"loss": 1.4021, "grad_norm": 0.707618772983551, "learning_rate": 0.0002, "epoch": 4.955826672275978, "step": 5890}, {"loss": 1.502, "grad_norm": 0.7704206109046936, "learning_rate": 0.0002, "epoch": 4.964240639461506, "step": 5900}, {"loss": 1.4456, "grad_norm": 0.7160256505012512, "learning_rate": 0.0002, "epoch": 4.972654606647034, "step": 5910}, {"loss": 1.3874, "grad_norm": 0.7020420432090759, "learning_rate": 0.0002, "epoch": 4.981068573832562, "step": 5920}, {"loss": 1.4037, "grad_norm": 0.7576286792755127, "learning_rate": 0.0002, "epoch": 4.98948254101809, "step": 5930}, {"loss": 1.414, "grad_norm": 0.8573036789894104, "learning_rate": 0.0002, "epoch": 4.997896508203618, "step": 5940}, {"eval_loss": 1.9353811740875244, "eval_runtime": 37.9208, "eval_samples_per_second": 13.581, "eval_steps_per_second": 1.714, "epoch": 4.999579301640724, "step": 5942}, {"loss": 1.2418, "grad_norm": 0.8204267621040344, "learning_rate": 0.0002, "epoch": 5.006310475389146, "step": 5950}, {"loss": 1.235, "grad_norm": 0.976840615272522, "learning_rate": 0.0002, "epoch": 5.014724442574674, "step": 5960}, {"loss": 1.2134, "grad_norm": 0.8765613436698914, "learning_rate": 0.0002, "epoch": 5.023138409760202, "step": 5970}, {"loss": 1.2748, "grad_norm": 1.1793042421340942, "learning_rate": 0.0002, "epoch": 5.03155237694573, "step": 5980}, {"loss": 1.2412, "grad_norm": 0.971062958240509, "learning_rate": 0.0002, "epoch": 5.039966344131258, "step": 5990}, {"loss": 1.1819, "grad_norm": 0.8649757504463196, "learning_rate": 0.0002, "epoch": 5.0483803113167856, "step": 6000}, {"loss": 1.1654, "grad_norm": 0.9563034176826477, "learning_rate": 0.0002, "epoch": 5.056794278502314, "step": 6010}, {"loss": 1.2238, "grad_norm": 1.0093994140625, "learning_rate": 0.0002, "epoch": 5.065208245687842, "step": 6020}, {"loss": 1.2519, "grad_norm": 1.004213571548462, "learning_rate": 0.0002, "epoch": 5.07362221287337, "step": 6030}, {"loss": 1.2379, "grad_norm": 0.8307787179946899, "learning_rate": 0.0002, "epoch": 5.082036180058898, "step": 6040}, {"loss": 1.2282, "grad_norm": 0.9117848873138428, "learning_rate": 0.0002, "epoch": 5.090450147244426, "step": 6050}, {"loss": 1.2582, "grad_norm": 1.0269840955734253, "learning_rate": 0.0002, "epoch": 5.098864114429953, "step": 6060}, {"loss": 1.1836, "grad_norm": 0.9079542756080627, "learning_rate": 0.0002, "epoch": 5.1072780816154815, "step": 6070}, {"loss": 1.215, "grad_norm": 0.885702908039093, "learning_rate": 0.0002, "epoch": 5.11569204880101, "step": 6080}, {"loss": 1.2406, "grad_norm": 0.9976128339767456, "learning_rate": 0.0002, "epoch": 5.124106015986538, "step": 6090}, {"loss": 1.3082, "grad_norm": 0.8472117185592651, "learning_rate": 0.0002, "epoch": 5.132519983172066, "step": 6100}, {"loss": 1.226, "grad_norm": 1.0385161638259888, "learning_rate": 0.0002, "epoch": 5.140933950357594, "step": 6110}, {"loss": 1.213, "grad_norm": 0.8948383927345276, "learning_rate": 0.0002, "epoch": 5.149347917543121, "step": 6120}, {"loss": 1.2213, "grad_norm": 1.2613716125488281, "learning_rate": 0.0002, "epoch": 5.157761884728649, "step": 6130}, {"loss": 1.2632, "grad_norm": 0.9933410286903381, "learning_rate": 0.0002, "epoch": 5.166175851914177, "step": 6140}, {"loss": 1.1715, "grad_norm": 0.9673663973808289, "learning_rate": 0.0002, "epoch": 5.1745898190997055, "step": 6150}, {"loss": 1.2947, "grad_norm": 0.9969648122787476, "learning_rate": 0.0002, "epoch": 5.183003786285234, "step": 6160}, {"loss": 1.2416, "grad_norm": 1.2163258790969849, "learning_rate": 0.0002, "epoch": 5.191417753470762, "step": 6170}, {"loss": 1.2221, "grad_norm": 0.9163419604301453, "learning_rate": 0.0002, "epoch": 5.19983172065629, "step": 6180}, {"loss": 1.2624, "grad_norm": 0.9225585460662842, "learning_rate": 0.0002, "epoch": 5.208245687841817, "step": 6190}, {"loss": 1.2932, "grad_norm": 0.9205296635627747, "learning_rate": 0.0002, "epoch": 5.216659655027345, "step": 6200}, {"loss": 1.1825, "grad_norm": 1.0655443668365479, "learning_rate": 0.0002, "epoch": 5.225073622212873, "step": 6210}, {"loss": 1.2613, "grad_norm": 1.0854865312576294, "learning_rate": 0.0002, "epoch": 5.233487589398401, "step": 6220}, {"loss": 1.3045, "grad_norm": 0.8489186763763428, "learning_rate": 0.0002, "epoch": 5.2419015565839295, "step": 6230}, {"loss": 1.2708, "grad_norm": 0.910391628742218, "learning_rate": 0.0002, "epoch": 5.250315523769458, "step": 6240}, {"loss": 1.1914, "grad_norm": 0.925507128238678, "learning_rate": 0.0002, "epoch": 5.258729490954985, "step": 6250}, {"loss": 1.3368, "grad_norm": 1.1069735288619995, "learning_rate": 0.0002, "epoch": 5.267143458140513, "step": 6260}, {"loss": 1.2505, "grad_norm": 0.9705119132995605, "learning_rate": 0.0002, "epoch": 5.275557425326041, "step": 6270}, {"loss": 1.2602, "grad_norm": 0.9752426147460938, "learning_rate": 0.0002, "epoch": 5.283971392511569, "step": 6280}, {"loss": 1.2043, "grad_norm": 1.021359920501709, "learning_rate": 0.0002, "epoch": 5.292385359697097, "step": 6290}, {"loss": 1.2848, "grad_norm": 1.148606300354004, "learning_rate": 0.0002, "epoch": 5.3007993268826255, "step": 6300}, {"loss": 1.2201, "grad_norm": 0.8909247517585754, "learning_rate": 0.0002, "epoch": 5.309213294068153, "step": 6310}, {"loss": 1.2376, "grad_norm": 0.9879156351089478, "learning_rate": 0.0002, "epoch": 5.317627261253681, "step": 6320}, {"loss": 1.2638, "grad_norm": 0.9473357200622559, "learning_rate": 0.0002, "epoch": 5.326041228439209, "step": 6330}, {"loss": 1.232, "grad_norm": 1.1422028541564941, "learning_rate": 0.0002, "epoch": 5.334455195624737, "step": 6340}, {"loss": 1.263, "grad_norm": 0.9942235350608826, "learning_rate": 0.0002, "epoch": 5.342869162810265, "step": 6350}, {"loss": 1.3032, "grad_norm": 0.9535723924636841, "learning_rate": 0.0002, "epoch": 5.351283129995793, "step": 6360}, {"loss": 1.2908, "grad_norm": 0.9020892381668091, "learning_rate": 0.0002, "epoch": 5.359697097181321, "step": 6370}, {"loss": 1.2023, "grad_norm": 1.0626472234725952, "learning_rate": 0.0002, "epoch": 5.368111064366849, "step": 6380}, {"loss": 1.2555, "grad_norm": 1.1395848989486694, "learning_rate": 0.0002, "epoch": 5.376525031552377, "step": 6390}, {"loss": 1.2839, "grad_norm": 0.9274451732635498, "learning_rate": 0.0002, "epoch": 5.384938998737905, "step": 6400}, {"loss": 1.2819, "grad_norm": 0.8108699917793274, "learning_rate": 0.0002, "epoch": 5.393352965923433, "step": 6410}, {"loss": 1.2589, "grad_norm": 1.1805564165115356, "learning_rate": 0.0002, "epoch": 5.401766933108961, "step": 6420}, {"loss": 1.3549, "grad_norm": 0.8321298360824585, "learning_rate": 0.0002, "epoch": 5.410180900294489, "step": 6430}, {"loss": 1.2925, "grad_norm": 0.8981925249099731, "learning_rate": 0.0002, "epoch": 5.418594867480017, "step": 6440}, {"loss": 1.258, "grad_norm": 1.0730986595153809, "learning_rate": 0.0002, "epoch": 5.4270088346655445, "step": 6450}, {"loss": 1.26, "grad_norm": 1.0584609508514404, "learning_rate": 0.0002, "epoch": 5.435422801851073, "step": 6460}, {"loss": 1.2847, "grad_norm": 1.0792299509048462, "learning_rate": 0.0002, "epoch": 5.443836769036601, "step": 6470}, {"loss": 1.2035, "grad_norm": 0.9101872444152832, "learning_rate": 0.0002, "epoch": 5.452250736222129, "step": 6480}, {"loss": 1.2574, "grad_norm": 0.9910100698471069, "learning_rate": 0.0002, "epoch": 5.460664703407657, "step": 6490}, {"loss": 1.3098, "grad_norm": 1.041412353515625, "learning_rate": 0.0002, "epoch": 5.469078670593185, "step": 6500}, {"loss": 1.2812, "grad_norm": 1.0091687440872192, "learning_rate": 0.0002, "epoch": 5.477492637778712, "step": 6510}, {"loss": 1.2523, "grad_norm": 0.8755383491516113, "learning_rate": 0.0002, "epoch": 5.48590660496424, "step": 6520}, {"loss": 1.3042, "grad_norm": 0.980212390422821, "learning_rate": 0.0002, "epoch": 5.4943205721497685, "step": 6530}, {"loss": 1.2873, "grad_norm": 0.9356869459152222, "learning_rate": 0.0002, "epoch": 5.502734539335297, "step": 6540}, {"loss": 1.2254, "grad_norm": 0.9008095264434814, "learning_rate": 0.0002, "epoch": 5.511148506520825, "step": 6550}, {"loss": 1.2818, "grad_norm": 0.8908938765525818, "learning_rate": 0.0002, "epoch": 5.519562473706353, "step": 6560}, {"loss": 1.2212, "grad_norm": 1.1423932313919067, "learning_rate": 0.0002, "epoch": 5.52797644089188, "step": 6570}, {"loss": 1.3039, "grad_norm": 1.0508161783218384, "learning_rate": 0.0002, "epoch": 5.536390408077408, "step": 6580}, {"loss": 1.2446, "grad_norm": 0.8357517719268799, "learning_rate": 0.0002, "epoch": 5.544804375262936, "step": 6590}, {"loss": 1.3037, "grad_norm": 0.9892540574073792, "learning_rate": 0.0002, "epoch": 5.5532183424484645, "step": 6600}, {"loss": 1.3028, "grad_norm": 1.0048326253890991, "learning_rate": 0.0002, "epoch": 5.561632309633993, "step": 6610}, {"loss": 1.2152, "grad_norm": 0.9801995158195496, "learning_rate": 0.0002, "epoch": 5.570046276819521, "step": 6620}, {"loss": 1.2606, "grad_norm": 0.9899214506149292, "learning_rate": 0.0002, "epoch": 5.578460244005049, "step": 6630}, {"loss": 1.2043, "grad_norm": 1.1911814212799072, "learning_rate": 0.0002, "epoch": 5.586874211190576, "step": 6640}, {"loss": 1.3458, "grad_norm": 1.0368894338607788, "learning_rate": 0.0002, "epoch": 5.595288178376104, "step": 6650}, {"loss": 1.2595, "grad_norm": 1.1248382329940796, "learning_rate": 0.0002, "epoch": 5.603702145561632, "step": 6660}, {"loss": 1.2548, "grad_norm": 0.9765539765357971, "learning_rate": 0.0002, "epoch": 5.61211611274716, "step": 6670}, {"loss": 1.3451, "grad_norm": 0.9810206890106201, "learning_rate": 0.0002, "epoch": 5.6205300799326885, "step": 6680}, {"loss": 1.2952, "grad_norm": 1.100386619567871, "learning_rate": 0.0002, "epoch": 5.628944047118217, "step": 6690}, {"loss": 1.2467, "grad_norm": 0.8824519515037537, "learning_rate": 0.0002, "epoch": 5.637358014303744, "step": 6700}, {"loss": 1.25, "grad_norm": 1.0864064693450928, "learning_rate": 0.0002, "epoch": 5.645771981489272, "step": 6710}, {"loss": 1.2479, "grad_norm": 1.1614511013031006, "learning_rate": 0.0002, "epoch": 5.6541859486748, "step": 6720}, {"loss": 1.2753, "grad_norm": 1.0762972831726074, "learning_rate": 0.0002, "epoch": 5.662599915860328, "step": 6730}, {"loss": 1.2741, "grad_norm": 0.9408974647521973, "learning_rate": 0.0002, "epoch": 5.671013883045856, "step": 6740}, {"loss": 1.2431, "grad_norm": 0.8906030058860779, "learning_rate": 0.0002, "epoch": 5.679427850231384, "step": 6750}, {"loss": 1.2643, "grad_norm": 0.9527303576469421, "learning_rate": 0.0002, "epoch": 5.687841817416912, "step": 6760}, {"loss": 1.322, "grad_norm": 0.9471196532249451, "learning_rate": 0.0002, "epoch": 5.69625578460244, "step": 6770}, {"loss": 1.2514, "grad_norm": 0.9186838865280151, "learning_rate": 0.0002, "epoch": 5.704669751787968, "step": 6780}, {"loss": 1.2347, "grad_norm": 0.9225441813468933, "learning_rate": 0.0002, "epoch": 5.713083718973496, "step": 6790}, {"loss": 1.1849, "grad_norm": 0.9712982773780823, "learning_rate": 0.0002, "epoch": 5.721497686159024, "step": 6800}, {"loss": 1.2431, "grad_norm": 1.0743170976638794, "learning_rate": 0.0002, "epoch": 5.729911653344552, "step": 6810}, {"loss": 1.2136, "grad_norm": 1.2738113403320312, "learning_rate": 0.0002, "epoch": 5.73832562053008, "step": 6820}, {"loss": 1.2176, "grad_norm": 0.9386790990829468, "learning_rate": 0.0002, "epoch": 5.7467395877156076, "step": 6830}, {"loss": 1.285, "grad_norm": 1.0817769765853882, "learning_rate": 0.0002, "epoch": 5.755153554901136, "step": 6840}, {"loss": 1.2247, "grad_norm": 1.1040263175964355, "learning_rate": 0.0002, "epoch": 5.763567522086664, "step": 6850}, {"loss": 1.2507, "grad_norm": 1.0656492710113525, "learning_rate": 0.0002, "epoch": 5.771981489272192, "step": 6860}, {"loss": 1.2999, "grad_norm": 0.9550157189369202, "learning_rate": 0.0002, "epoch": 5.78039545645772, "step": 6870}, {"loss": 1.3201, "grad_norm": 1.0130870342254639, "learning_rate": 0.0002, "epoch": 5.788809423643248, "step": 6880}, {"loss": 1.3392, "grad_norm": 1.0675787925720215, "learning_rate": 0.0002, "epoch": 5.797223390828776, "step": 6890}, {"loss": 1.2949, "grad_norm": 0.9537774920463562, "learning_rate": 0.0002, "epoch": 5.8056373580143035, "step": 6900}, {"loss": 1.2658, "grad_norm": 0.9640319347381592, "learning_rate": 0.0002, "epoch": 5.814051325199832, "step": 6910}, {"loss": 1.2199, "grad_norm": 0.8917992115020752, "learning_rate": 0.0002, "epoch": 5.82246529238536, "step": 6920}, {"loss": 1.373, "grad_norm": 0.9881822466850281, "learning_rate": 0.0002, "epoch": 5.830879259570888, "step": 6930}, {"loss": 1.323, "grad_norm": 0.9136882424354553, "learning_rate": 0.0002, "epoch": 5.839293226756416, "step": 6940}, {"loss": 1.3159, "grad_norm": 0.9086098074913025, "learning_rate": 0.0002, "epoch": 5.847707193941943, "step": 6950}, {"loss": 1.2624, "grad_norm": 0.9443018436431885, "learning_rate": 0.0002, "epoch": 5.856121161127471, "step": 6960}, {"loss": 1.3224, "grad_norm": 0.9915381669998169, "learning_rate": 0.0002, "epoch": 5.864535128312999, "step": 6970}, {"loss": 1.337, "grad_norm": 0.8939146995544434, "learning_rate": 0.0002, "epoch": 5.8729490954985275, "step": 6980}, {"loss": 1.2611, "grad_norm": 1.3672245740890503, "learning_rate": 0.0002, "epoch": 5.881363062684056, "step": 6990}, {"loss": 1.3012, "grad_norm": 1.0116257667541504, "learning_rate": 0.0002, "epoch": 5.889777029869584, "step": 7000}, {"loss": 1.3128, "grad_norm": 1.1561565399169922, "learning_rate": 0.0002, "epoch": 5.898190997055112, "step": 7010}, {"loss": 1.2301, "grad_norm": 0.9900678992271423, "learning_rate": 0.0002, "epoch": 5.906604964240639, "step": 7020}, {"loss": 1.2845, "grad_norm": 0.9297345876693726, "learning_rate": 0.0002, "epoch": 5.915018931426167, "step": 7030}, {"loss": 1.2317, "grad_norm": 0.9357825517654419, "learning_rate": 0.0002, "epoch": 5.923432898611695, "step": 7040}, {"loss": 1.2303, "grad_norm": 1.049317717552185, "learning_rate": 0.0002, "epoch": 5.931846865797223, "step": 7050}, {"loss": 1.3243, "grad_norm": 0.950633704662323, "learning_rate": 0.0002, "epoch": 5.9402608329827515, "step": 7060}, {"loss": 1.2758, "grad_norm": 0.854581892490387, "learning_rate": 0.0002, "epoch": 5.94867480016828, "step": 7070}, {"loss": 1.3252, "grad_norm": 0.9097039699554443, "learning_rate": 0.0002, "epoch": 5.957088767353808, "step": 7080}, {"loss": 1.291, "grad_norm": 0.9072173237800598, "learning_rate": 0.0002, "epoch": 5.965502734539335, "step": 7090}, {"loss": 1.2724, "grad_norm": 1.0470727682113647, "learning_rate": 0.0002, "epoch": 5.973916701724863, "step": 7100}, {"loss": 1.3324, "grad_norm": 1.2628462314605713, "learning_rate": 0.0002, "epoch": 5.982330668910391, "step": 7110}, {"loss": 1.2701, "grad_norm": 1.055279016494751, "learning_rate": 0.0002, "epoch": 5.990744636095919, "step": 7120}, {"loss": 1.3234, "grad_norm": 0.966194212436676, "learning_rate": 0.0002, "epoch": 5.9991586032814475, "step": 7130}]} +{"epoch": 6.999579301640724, "step": 8319, "epoch_duration": 1250.1847441196442, "total_accumulated_duration": 9815.15891122818, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.56, "grad_norm": 0.5458821654319763, "learning_rate": 0.0002, "epoch": 0.008413967185527976, "step": 10}, {"loss": 2.3235, "grad_norm": 0.7293308973312378, "learning_rate": 0.0002, "epoch": 0.016827934371055953, "step": 20}, {"loss": 2.0815, "grad_norm": 0.47792306542396545, "learning_rate": 0.0002, "epoch": 0.02524190155658393, "step": 30}, {"loss": 1.9718, "grad_norm": 0.5944402813911438, "learning_rate": 0.0002, "epoch": 0.033655868742111905, "step": 40}, {"loss": 1.8848, "grad_norm": 0.5415359735488892, "learning_rate": 0.0002, "epoch": 0.04206983592763988, "step": 50}, {"loss": 1.8953, "grad_norm": 0.535713791847229, "learning_rate": 0.0002, "epoch": 0.05048380311316786, "step": 60}, {"loss": 1.937, "grad_norm": 0.5184146761894226, "learning_rate": 0.0002, "epoch": 0.058897770298695834, "step": 70}, {"loss": 1.8396, "grad_norm": 0.458926796913147, "learning_rate": 0.0002, "epoch": 0.06731173748422381, "step": 80}, {"loss": 1.8677, "grad_norm": 0.4780142307281494, "learning_rate": 0.0002, "epoch": 0.07572570466975179, "step": 90}, {"loss": 1.8593, "grad_norm": 0.79965740442276, "learning_rate": 0.0002, "epoch": 0.08413967185527976, "step": 100}, {"loss": 1.9081, "grad_norm": 0.4498862028121948, "learning_rate": 0.0002, "epoch": 0.09255363904080774, "step": 110}, {"loss": 1.8503, "grad_norm": 0.39338430762290955, "learning_rate": 0.0002, "epoch": 0.10096760622633572, "step": 120}, {"loss": 1.8637, "grad_norm": 0.9588953852653503, "learning_rate": 0.0002, "epoch": 0.10938157341186369, "step": 130}, {"loss": 1.8676, "grad_norm": 0.41675639152526855, "learning_rate": 0.0002, "epoch": 0.11779554059739167, "step": 140}, {"loss": 1.8904, "grad_norm": 0.44519832730293274, "learning_rate": 0.0002, "epoch": 0.12620950778291964, "step": 150}, {"loss": 1.798, "grad_norm": 0.4176260530948639, "learning_rate": 0.0002, "epoch": 0.13462347496844762, "step": 160}, {"loss": 1.8398, "grad_norm": 0.35840365290641785, "learning_rate": 0.0002, "epoch": 0.1430374421539756, "step": 170}, {"loss": 1.8666, "grad_norm": 0.3794495463371277, "learning_rate": 0.0002, "epoch": 0.15145140933950357, "step": 180}, {"loss": 1.8111, "grad_norm": 0.4563522934913635, "learning_rate": 0.0002, "epoch": 0.15986537652503155, "step": 190}, {"loss": 1.8893, "grad_norm": 0.37057486176490784, "learning_rate": 0.0002, "epoch": 0.16827934371055953, "step": 200}, {"loss": 1.7995, "grad_norm": 0.44081518054008484, "learning_rate": 0.0002, "epoch": 0.1766933108960875, "step": 210}, {"loss": 1.9048, "grad_norm": 0.46078577637672424, "learning_rate": 0.0002, "epoch": 0.18510727808161548, "step": 220}, {"loss": 1.8403, "grad_norm": 0.36132094264030457, "learning_rate": 0.0002, "epoch": 0.19352124526714345, "step": 230}, {"loss": 1.8827, "grad_norm": 0.3747289180755615, "learning_rate": 0.0002, "epoch": 0.20193521245267143, "step": 240}, {"loss": 1.8382, "grad_norm": 0.3540179133415222, "learning_rate": 0.0002, "epoch": 0.2103491796381994, "step": 250}, {"loss": 1.8196, "grad_norm": 0.3461375832557678, "learning_rate": 0.0002, "epoch": 0.21876314682372738, "step": 260}, {"loss": 1.8509, "grad_norm": 0.3436960279941559, "learning_rate": 0.0002, "epoch": 0.22717711400925536, "step": 270}, {"loss": 1.8285, "grad_norm": 0.35403719544410706, "learning_rate": 0.0002, "epoch": 0.23559108119478334, "step": 280}, {"loss": 1.8369, "grad_norm": 0.37142616510391235, "learning_rate": 0.0002, "epoch": 0.2440050483803113, "step": 290}, {"loss": 1.8044, "grad_norm": 0.3307955861091614, "learning_rate": 0.0002, "epoch": 0.2524190155658393, "step": 300}, {"loss": 1.817, "grad_norm": 0.32855314016342163, "learning_rate": 0.0002, "epoch": 0.2608329827513673, "step": 310}, {"loss": 1.7803, "grad_norm": 0.3299003839492798, "learning_rate": 0.0002, "epoch": 0.26924694993689524, "step": 320}, {"loss": 1.8129, "grad_norm": 0.44311287999153137, "learning_rate": 0.0002, "epoch": 0.27766091712242325, "step": 330}, {"loss": 1.8232, "grad_norm": 0.32989758253097534, "learning_rate": 0.0002, "epoch": 0.2860748843079512, "step": 340}, {"loss": 1.7716, "grad_norm": 0.34400200843811035, "learning_rate": 0.0002, "epoch": 0.2944888514934792, "step": 350}, {"loss": 1.7619, "grad_norm": 0.36286211013793945, "learning_rate": 0.0002, "epoch": 0.30290281867900715, "step": 360}, {"loss": 1.8025, "grad_norm": 0.406827837228775, "learning_rate": 0.0002, "epoch": 0.31131678586453515, "step": 370}, {"loss": 1.7515, "grad_norm": 0.36299195885658264, "learning_rate": 0.0002, "epoch": 0.3197307530500631, "step": 380}, {"loss": 1.837, "grad_norm": 0.3477257192134857, "learning_rate": 0.0002, "epoch": 0.3281447202355911, "step": 390}, {"loss": 1.7767, "grad_norm": 0.3730369210243225, "learning_rate": 0.0002, "epoch": 0.33655868742111905, "step": 400}, {"loss": 1.7747, "grad_norm": 0.4644559919834137, "learning_rate": 0.0002, "epoch": 0.34497265460664706, "step": 410}, {"loss": 1.7538, "grad_norm": 0.406576544046402, "learning_rate": 0.0002, "epoch": 0.353386621792175, "step": 420}, {"loss": 1.7501, "grad_norm": 0.3612699508666992, "learning_rate": 0.0002, "epoch": 0.361800588977703, "step": 430}, {"loss": 1.7473, "grad_norm": 0.3243742287158966, "learning_rate": 0.0002, "epoch": 0.37021455616323096, "step": 440}, {"loss": 1.8851, "grad_norm": 0.36671221256256104, "learning_rate": 0.0002, "epoch": 0.37862852334875896, "step": 450}, {"loss": 1.8853, "grad_norm": 0.3565002381801605, "learning_rate": 0.0002, "epoch": 0.3870424905342869, "step": 460}, {"loss": 1.8923, "grad_norm": 0.34630221128463745, "learning_rate": 0.0002, "epoch": 0.3954564577198149, "step": 470}, {"loss": 1.8234, "grad_norm": 0.3353537321090698, "learning_rate": 0.0002, "epoch": 0.40387042490534286, "step": 480}, {"loss": 1.7135, "grad_norm": 0.4015921950340271, "learning_rate": 0.0002, "epoch": 0.41228439209087087, "step": 490}, {"loss": 1.7815, "grad_norm": 0.5489419102668762, "learning_rate": 0.0002, "epoch": 0.4206983592763988, "step": 500}, {"loss": 1.7903, "grad_norm": 0.4193589985370636, "learning_rate": 0.0002, "epoch": 0.4291123264619268, "step": 510}, {"loss": 1.8416, "grad_norm": 0.3418922424316406, "learning_rate": 0.0002, "epoch": 0.43752629364745477, "step": 520}, {"loss": 1.7982, "grad_norm": 0.32668185234069824, "learning_rate": 0.0002, "epoch": 0.44594026083298277, "step": 530}, {"loss": 1.7501, "grad_norm": 0.3094325661659241, "learning_rate": 0.0002, "epoch": 0.4543542280185107, "step": 540}, {"loss": 1.7438, "grad_norm": 0.3743017315864563, "learning_rate": 0.0002, "epoch": 0.4627681952040387, "step": 550}, {"loss": 1.8451, "grad_norm": 0.3295630216598511, "learning_rate": 0.0002, "epoch": 0.47118216238956667, "step": 560}, {"loss": 1.7529, "grad_norm": 1.6124513149261475, "learning_rate": 0.0002, "epoch": 0.4795961295750947, "step": 570}, {"loss": 1.8028, "grad_norm": 0.3245585858821869, "learning_rate": 0.0002, "epoch": 0.4880100967606226, "step": 580}, {"loss": 1.7976, "grad_norm": 0.3332934081554413, "learning_rate": 0.0002, "epoch": 0.49642406394615063, "step": 590}, {"loss": 1.7912, "grad_norm": 0.3836138844490051, "learning_rate": 0.0002, "epoch": 0.5048380311316786, "step": 600}, {"loss": 1.8347, "grad_norm": 0.32953888177871704, "learning_rate": 0.0002, "epoch": 0.5132519983172066, "step": 610}, {"loss": 1.7729, "grad_norm": 0.36291512846946716, "learning_rate": 0.0002, "epoch": 0.5216659655027346, "step": 620}, {"loss": 1.7758, "grad_norm": 0.3237783908843994, "learning_rate": 0.0002, "epoch": 0.5300799326882625, "step": 630}, {"loss": 1.8352, "grad_norm": 0.38882696628570557, "learning_rate": 0.0002, "epoch": 0.5384938998737905, "step": 640}, {"loss": 1.8624, "grad_norm": 0.37821972370147705, "learning_rate": 0.0002, "epoch": 0.5469078670593185, "step": 650}, {"loss": 1.8075, "grad_norm": 0.3556285500526428, "learning_rate": 0.0002, "epoch": 0.5553218342448465, "step": 660}, {"loss": 1.778, "grad_norm": 0.347499281167984, "learning_rate": 0.0002, "epoch": 0.5637358014303744, "step": 670}, {"loss": 1.8066, "grad_norm": 0.3176489472389221, "learning_rate": 0.0002, "epoch": 0.5721497686159024, "step": 680}, {"loss": 1.7257, "grad_norm": 0.30220088362693787, "learning_rate": 0.0002, "epoch": 0.5805637358014304, "step": 690}, {"loss": 1.8415, "grad_norm": 0.3711601793766022, "learning_rate": 0.0002, "epoch": 0.5889777029869584, "step": 700}, {"loss": 1.7906, "grad_norm": 0.3311759829521179, "learning_rate": 0.0002, "epoch": 0.5973916701724863, "step": 710}, {"loss": 1.7712, "grad_norm": 0.34824270009994507, "learning_rate": 0.0002, "epoch": 0.6058056373580143, "step": 720}, {"loss": 1.7954, "grad_norm": 0.29668381810188293, "learning_rate": 0.0002, "epoch": 0.6142196045435423, "step": 730}, {"loss": 1.8321, "grad_norm": 0.36087489128112793, "learning_rate": 0.0002, "epoch": 0.6226335717290703, "step": 740}, {"loss": 1.7956, "grad_norm": 0.31590089201927185, "learning_rate": 0.0002, "epoch": 0.6310475389145982, "step": 750}, {"loss": 1.7343, "grad_norm": 0.37632957100868225, "learning_rate": 0.0002, "epoch": 0.6394615061001262, "step": 760}, {"loss": 1.8499, "grad_norm": 0.3360748589038849, "learning_rate": 0.0002, "epoch": 0.6478754732856542, "step": 770}, {"loss": 1.8076, "grad_norm": 0.3420640528202057, "learning_rate": 0.0002, "epoch": 0.6562894404711822, "step": 780}, {"loss": 1.8353, "grad_norm": 0.5734959244728088, "learning_rate": 0.0002, "epoch": 0.6647034076567101, "step": 790}, {"loss": 1.7746, "grad_norm": 0.36440837383270264, "learning_rate": 0.0002, "epoch": 0.6731173748422381, "step": 800}, {"loss": 1.7532, "grad_norm": 0.3179708421230316, "learning_rate": 0.0002, "epoch": 0.6815313420277661, "step": 810}, {"loss": 1.7815, "grad_norm": 0.34122881293296814, "learning_rate": 0.0002, "epoch": 0.6899453092132941, "step": 820}, {"loss": 1.8167, "grad_norm": 0.31886112689971924, "learning_rate": 0.0002, "epoch": 0.698359276398822, "step": 830}, {"loss": 1.7505, "grad_norm": 0.31782326102256775, "learning_rate": 0.0002, "epoch": 0.70677324358435, "step": 840}, {"loss": 1.7588, "grad_norm": 0.36052989959716797, "learning_rate": 0.0002, "epoch": 0.715187210769878, "step": 850}, {"loss": 1.7891, "grad_norm": 0.28946155309677124, "learning_rate": 0.0002, "epoch": 0.723601177955406, "step": 860}, {"loss": 1.7923, "grad_norm": 0.3095663785934448, "learning_rate": 0.0002, "epoch": 0.7320151451409339, "step": 870}, {"loss": 1.785, "grad_norm": 0.3317491412162781, "learning_rate": 0.0002, "epoch": 0.7404291123264619, "step": 880}, {"loss": 1.7709, "grad_norm": 0.31324660778045654, "learning_rate": 0.0002, "epoch": 0.7488430795119899, "step": 890}, {"loss": 1.8753, "grad_norm": 0.3290475606918335, "learning_rate": 0.0002, "epoch": 0.7572570466975179, "step": 900}, {"loss": 1.7679, "grad_norm": 0.35690343379974365, "learning_rate": 0.0002, "epoch": 0.7656710138830458, "step": 910}, {"loss": 1.826, "grad_norm": 0.39558273553848267, "learning_rate": 0.0002, "epoch": 0.7740849810685738, "step": 920}, {"loss": 1.8722, "grad_norm": 0.34254348278045654, "learning_rate": 0.0002, "epoch": 0.7824989482541018, "step": 930}, {"loss": 1.7603, "grad_norm": 0.3560165464878082, "learning_rate": 0.0002, "epoch": 0.7909129154396298, "step": 940}, {"loss": 1.7992, "grad_norm": 0.30693164467811584, "learning_rate": 0.0002, "epoch": 0.7993268826251577, "step": 950}, {"loss": 1.8029, "grad_norm": 0.3394823372364044, "learning_rate": 0.0002, "epoch": 0.8077408498106857, "step": 960}, {"loss": 1.8105, "grad_norm": 0.3741514980792999, "learning_rate": 0.0002, "epoch": 0.8161548169962137, "step": 970}, {"loss": 1.7849, "grad_norm": 0.3655228316783905, "learning_rate": 0.0002, "epoch": 0.8245687841817417, "step": 980}, {"loss": 1.8449, "grad_norm": 0.3586033880710602, "learning_rate": 0.0002, "epoch": 0.8329827513672696, "step": 990}, {"loss": 1.7033, "grad_norm": 0.3459678888320923, "learning_rate": 0.0002, "epoch": 0.8413967185527976, "step": 1000}, {"loss": 1.8498, "grad_norm": 0.3184349834918976, "learning_rate": 0.0002, "epoch": 0.8498106857383256, "step": 1010}, {"loss": 1.7632, "grad_norm": 0.3099786043167114, "learning_rate": 0.0002, "epoch": 0.8582246529238536, "step": 1020}, {"loss": 1.8067, "grad_norm": 0.30300915241241455, "learning_rate": 0.0002, "epoch": 0.8666386201093815, "step": 1030}, {"loss": 1.7923, "grad_norm": 0.3128705620765686, "learning_rate": 0.0002, "epoch": 0.8750525872949095, "step": 1040}, {"loss": 1.8252, "grad_norm": 0.3336263597011566, "learning_rate": 0.0002, "epoch": 0.8834665544804375, "step": 1050}, {"loss": 1.8375, "grad_norm": 0.3801328241825104, "learning_rate": 0.0002, "epoch": 0.8918805216659655, "step": 1060}, {"loss": 1.7757, "grad_norm": 0.3122096359729767, "learning_rate": 0.0002, "epoch": 0.9002944888514934, "step": 1070}, {"loss": 1.8251, "grad_norm": 0.35990869998931885, "learning_rate": 0.0002, "epoch": 0.9087084560370214, "step": 1080}, {"loss": 1.7343, "grad_norm": 0.3321819305419922, "learning_rate": 0.0002, "epoch": 0.9171224232225494, "step": 1090}, {"loss": 1.7595, "grad_norm": 0.4202139377593994, "learning_rate": 0.0002, "epoch": 0.9255363904080774, "step": 1100}, {"loss": 1.8056, "grad_norm": 0.32559722661972046, "learning_rate": 0.0002, "epoch": 0.9339503575936053, "step": 1110}, {"loss": 1.812, "grad_norm": 0.3098459839820862, "learning_rate": 0.0002, "epoch": 0.9423643247791333, "step": 1120}, {"loss": 1.8252, "grad_norm": 0.33917108178138733, "learning_rate": 0.0002, "epoch": 0.9507782919646613, "step": 1130}, {"loss": 1.7709, "grad_norm": 0.4055837094783783, "learning_rate": 0.0002, "epoch": 0.9591922591501894, "step": 1140}, {"loss": 1.8259, "grad_norm": 0.32508623600006104, "learning_rate": 0.0002, "epoch": 0.9676062263357172, "step": 1150}, {"loss": 1.782, "grad_norm": 0.30150601267814636, "learning_rate": 0.0002, "epoch": 0.9760201935212452, "step": 1160}, {"loss": 1.8291, "grad_norm": 0.3042563199996948, "learning_rate": 0.0002, "epoch": 0.9844341607067733, "step": 1170}, {"loss": 1.7847, "grad_norm": 0.33254584670066833, "learning_rate": 0.0002, "epoch": 0.9928481278923013, "step": 1180}, {"eval_loss": 1.8077726364135742, "eval_runtime": 38.4359, "eval_samples_per_second": 13.399, "eval_steps_per_second": 1.691, "epoch": 0.9995793016407236, "step": 1188}, {"loss": 1.7414, "grad_norm": 0.35073035955429077, "learning_rate": 0.0002, "epoch": 1.0012620950778293, "step": 1190}, {"loss": 1.7483, "grad_norm": 0.3217269778251648, "learning_rate": 0.0002, "epoch": 1.0096760622633572, "step": 1200}, {"loss": 1.7517, "grad_norm": 0.3635033369064331, "learning_rate": 0.0002, "epoch": 1.018090029448885, "step": 1210}, {"loss": 1.6949, "grad_norm": 0.32468414306640625, "learning_rate": 0.0002, "epoch": 1.0265039966344132, "step": 1220}, {"loss": 1.711, "grad_norm": 0.3307163417339325, "learning_rate": 0.0002, "epoch": 1.034917963819941, "step": 1230}, {"loss": 1.7881, "grad_norm": 0.34381359815597534, "learning_rate": 0.0002, "epoch": 1.0433319310054692, "step": 1240}, {"loss": 1.612, "grad_norm": 0.35874804854393005, "learning_rate": 0.0002, "epoch": 1.051745898190997, "step": 1250}, {"loss": 1.7314, "grad_norm": 0.3615919351577759, "learning_rate": 0.0002, "epoch": 1.060159865376525, "step": 1260}, {"loss": 1.7517, "grad_norm": 0.32835808396339417, "learning_rate": 0.0002, "epoch": 1.068573832562053, "step": 1270}, {"loss": 1.7193, "grad_norm": 0.3876388370990753, "learning_rate": 0.0002, "epoch": 1.076987799747581, "step": 1280}, {"loss": 1.7442, "grad_norm": 0.39895930886268616, "learning_rate": 0.0002, "epoch": 1.0854017669331089, "step": 1290}, {"loss": 1.6601, "grad_norm": 0.39081698656082153, "learning_rate": 0.0002, "epoch": 1.093815734118637, "step": 1300}, {"loss": 1.7623, "grad_norm": 0.39974215626716614, "learning_rate": 0.0002, "epoch": 1.1022297013041649, "step": 1310}, {"loss": 1.7506, "grad_norm": 0.3887332081794739, "learning_rate": 0.0002, "epoch": 1.110643668489693, "step": 1320}, {"loss": 1.7381, "grad_norm": 0.36216408014297485, "learning_rate": 0.0002, "epoch": 1.1190576356752209, "step": 1330}, {"loss": 1.762, "grad_norm": 0.36979028582572937, "learning_rate": 0.0002, "epoch": 1.1274716028607488, "step": 1340}, {"loss": 1.7515, "grad_norm": 0.34052133560180664, "learning_rate": 0.0002, "epoch": 1.1358855700462769, "step": 1350}, {"loss": 1.7513, "grad_norm": 0.3467716574668884, "learning_rate": 0.0002, "epoch": 1.1442995372318048, "step": 1360}, {"loss": 1.7086, "grad_norm": 0.35528799891471863, "learning_rate": 0.0002, "epoch": 1.1527135044173327, "step": 1370}, {"loss": 1.794, "grad_norm": 0.36282262206077576, "learning_rate": 0.0002, "epoch": 1.1611274716028608, "step": 1380}, {"loss": 1.7731, "grad_norm": 0.37355899810791016, "learning_rate": 0.0002, "epoch": 1.1695414387883887, "step": 1390}, {"loss": 1.7483, "grad_norm": 0.37292736768722534, "learning_rate": 0.0002, "epoch": 1.1779554059739168, "step": 1400}, {"loss": 1.6916, "grad_norm": 0.5892812013626099, "learning_rate": 0.0002, "epoch": 1.1863693731594447, "step": 1410}, {"loss": 1.7302, "grad_norm": 0.3712292015552521, "learning_rate": 0.0002, "epoch": 1.1947833403449726, "step": 1420}, {"loss": 1.7709, "grad_norm": 0.3349577486515045, "learning_rate": 0.0002, "epoch": 1.2031973075305007, "step": 1430}, {"loss": 1.7412, "grad_norm": 0.32591062784194946, "learning_rate": 0.0002, "epoch": 1.2116112747160286, "step": 1440}, {"loss": 1.7406, "grad_norm": 0.3840635418891907, "learning_rate": 0.0002, "epoch": 1.2200252419015567, "step": 1450}, {"loss": 1.7276, "grad_norm": 0.37238365411758423, "learning_rate": 0.0002, "epoch": 1.2284392090870846, "step": 1460}, {"loss": 1.7052, "grad_norm": 0.3731217682361603, "learning_rate": 0.0002, "epoch": 1.2368531762726125, "step": 1470}, {"loss": 1.7255, "grad_norm": 0.3318967819213867, "learning_rate": 0.0002, "epoch": 1.2452671434581406, "step": 1480}, {"loss": 1.7463, "grad_norm": 0.3784034848213196, "learning_rate": 0.0002, "epoch": 1.2536811106436685, "step": 1490}, {"loss": 1.6862, "grad_norm": 0.3541383147239685, "learning_rate": 0.0002, "epoch": 1.2620950778291964, "step": 1500}, {"loss": 1.8394, "grad_norm": 0.35312485694885254, "learning_rate": 0.0002, "epoch": 1.2705090450147245, "step": 1510}, {"loss": 1.7029, "grad_norm": 0.35272929072380066, "learning_rate": 0.0002, "epoch": 1.2789230122002524, "step": 1520}, {"loss": 1.7016, "grad_norm": 0.40988272428512573, "learning_rate": 0.0002, "epoch": 1.2873369793857803, "step": 1530}, {"loss": 1.6912, "grad_norm": 0.3543946146965027, "learning_rate": 0.0002, "epoch": 1.2957509465713084, "step": 1540}, {"loss": 1.6757, "grad_norm": 0.35639145970344543, "learning_rate": 0.0002, "epoch": 1.3041649137568363, "step": 1550}, {"loss": 1.6814, "grad_norm": 0.3290826678276062, "learning_rate": 0.0002, "epoch": 1.3125788809423642, "step": 1560}, {"loss": 1.7369, "grad_norm": 0.39264336228370667, "learning_rate": 0.0002, "epoch": 1.3209928481278923, "step": 1570}, {"loss": 1.6804, "grad_norm": 0.5390415191650391, "learning_rate": 0.0002, "epoch": 1.3294068153134202, "step": 1580}, {"loss": 1.708, "grad_norm": 0.5188116431236267, "learning_rate": 0.0002, "epoch": 1.3378207824989483, "step": 1590}, {"loss": 1.6763, "grad_norm": 0.37445148825645447, "learning_rate": 0.0002, "epoch": 1.3462347496844762, "step": 1600}, {"loss": 1.7386, "grad_norm": 0.3296085298061371, "learning_rate": 0.0002, "epoch": 1.3546487168700043, "step": 1610}, {"loss": 1.8107, "grad_norm": 0.39879581332206726, "learning_rate": 0.0002, "epoch": 1.3630626840555322, "step": 1620}, {"loss": 1.6744, "grad_norm": 0.36092764139175415, "learning_rate": 0.0002, "epoch": 1.37147665124106, "step": 1630}, {"loss": 1.7144, "grad_norm": 0.37011823058128357, "learning_rate": 0.0002, "epoch": 1.3798906184265882, "step": 1640}, {"loss": 1.7396, "grad_norm": 0.40863534808158875, "learning_rate": 0.0002, "epoch": 1.3883045856121161, "step": 1650}, {"loss": 1.7901, "grad_norm": 0.337001770734787, "learning_rate": 0.0002, "epoch": 1.396718552797644, "step": 1660}, {"loss": 1.7044, "grad_norm": 0.35596707463264465, "learning_rate": 0.0002, "epoch": 1.4051325199831721, "step": 1670}, {"loss": 1.7717, "grad_norm": 0.3857671916484833, "learning_rate": 0.0002, "epoch": 1.4135464871687, "step": 1680}, {"loss": 1.7015, "grad_norm": 0.419502317905426, "learning_rate": 0.0002, "epoch": 1.421960454354228, "step": 1690}, {"loss": 1.7261, "grad_norm": 0.35459452867507935, "learning_rate": 0.0002, "epoch": 1.430374421539756, "step": 1700}, {"loss": 1.7361, "grad_norm": 0.37246978282928467, "learning_rate": 0.0002, "epoch": 1.438788388725284, "step": 1710}, {"loss": 1.6762, "grad_norm": 0.33091893792152405, "learning_rate": 0.0002, "epoch": 1.4472023559108118, "step": 1720}, {"loss": 1.7044, "grad_norm": 0.37029674649238586, "learning_rate": 0.0002, "epoch": 1.45561632309634, "step": 1730}, {"loss": 1.7117, "grad_norm": 0.374025821685791, "learning_rate": 0.0002, "epoch": 1.4640302902818678, "step": 1740}, {"loss": 1.7549, "grad_norm": 0.3416315019130707, "learning_rate": 0.0002, "epoch": 1.472444257467396, "step": 1750}, {"loss": 1.7093, "grad_norm": 0.36502841114997864, "learning_rate": 0.0002, "epoch": 1.4808582246529238, "step": 1760}, {"loss": 1.6597, "grad_norm": 0.35458803176879883, "learning_rate": 0.0002, "epoch": 1.489272191838452, "step": 1770}, {"loss": 1.675, "grad_norm": 0.4462839663028717, "learning_rate": 0.0002, "epoch": 1.4976861590239798, "step": 1780}, {"loss": 1.7267, "grad_norm": 0.34836092591285706, "learning_rate": 0.0002, "epoch": 1.5061001262095077, "step": 1790}, {"loss": 1.7295, "grad_norm": 0.3445749282836914, "learning_rate": 0.0002, "epoch": 1.5145140933950358, "step": 1800}, {"loss": 1.7386, "grad_norm": 0.36012160778045654, "learning_rate": 0.0002, "epoch": 1.5229280605805637, "step": 1810}, {"loss": 1.6594, "grad_norm": 0.4052616059780121, "learning_rate": 0.0002, "epoch": 1.5313420277660916, "step": 1820}, {"loss": 1.72, "grad_norm": 0.3966905474662781, "learning_rate": 0.0002, "epoch": 1.5397559949516197, "step": 1830}, {"loss": 1.7595, "grad_norm": 0.35028719902038574, "learning_rate": 0.0002, "epoch": 1.5481699621371476, "step": 1840}, {"loss": 1.6829, "grad_norm": 0.3936742842197418, "learning_rate": 0.0002, "epoch": 1.5565839293226755, "step": 1850}, {"loss": 1.7579, "grad_norm": 0.34473296999931335, "learning_rate": 0.0002, "epoch": 1.5649978965082036, "step": 1860}, {"loss": 1.7207, "grad_norm": 0.4328365623950958, "learning_rate": 0.0002, "epoch": 1.5734118636937318, "step": 1870}, {"loss": 1.7098, "grad_norm": 0.3566315472126007, "learning_rate": 0.0002, "epoch": 1.5818258308792594, "step": 1880}, {"loss": 1.6095, "grad_norm": 0.3301256597042084, "learning_rate": 0.0002, "epoch": 1.5902397980647875, "step": 1890}, {"loss": 1.748, "grad_norm": 0.3743041455745697, "learning_rate": 0.0002, "epoch": 1.5986537652503157, "step": 1900}, {"loss": 1.7259, "grad_norm": 0.3735344707965851, "learning_rate": 0.0002, "epoch": 1.6070677324358436, "step": 1910}, {"loss": 1.7445, "grad_norm": 0.42191144824028015, "learning_rate": 0.0002, "epoch": 1.6154816996213714, "step": 1920}, {"loss": 1.6978, "grad_norm": 0.3787207305431366, "learning_rate": 0.0002, "epoch": 1.6238956668068996, "step": 1930}, {"loss": 1.6893, "grad_norm": 0.35647350549697876, "learning_rate": 0.0002, "epoch": 1.6323096339924275, "step": 1940}, {"loss": 1.7825, "grad_norm": 0.39791446924209595, "learning_rate": 0.0002, "epoch": 1.6407236011779553, "step": 1950}, {"loss": 1.7293, "grad_norm": 0.37341275811195374, "learning_rate": 0.0002, "epoch": 1.6491375683634835, "step": 1960}, {"loss": 1.6781, "grad_norm": 0.3722686469554901, "learning_rate": 0.0002, "epoch": 1.6575515355490114, "step": 1970}, {"loss": 1.6383, "grad_norm": 0.37467387318611145, "learning_rate": 0.0002, "epoch": 1.6659655027345392, "step": 1980}, {"loss": 1.7439, "grad_norm": 0.37109461426734924, "learning_rate": 0.0002, "epoch": 1.6743794699200674, "step": 1990}, {"loss": 1.7206, "grad_norm": 0.4008837044239044, "learning_rate": 0.0002, "epoch": 1.6827934371055953, "step": 2000}, {"loss": 1.7604, "grad_norm": 0.3316999673843384, "learning_rate": 0.0002, "epoch": 1.6912074042911232, "step": 2010}, {"loss": 1.7325, "grad_norm": 0.3683805465698242, "learning_rate": 0.0002, "epoch": 1.6996213714766513, "step": 2020}, {"loss": 1.7451, "grad_norm": 0.4163658320903778, "learning_rate": 0.0002, "epoch": 1.7080353386621794, "step": 2030}, {"loss": 1.741, "grad_norm": 0.4245431125164032, "learning_rate": 0.0002, "epoch": 1.716449305847707, "step": 2040}, {"loss": 1.7184, "grad_norm": 0.36732038855552673, "learning_rate": 0.0002, "epoch": 1.7248632730332352, "step": 2050}, {"loss": 1.7031, "grad_norm": 0.34981656074523926, "learning_rate": 0.0002, "epoch": 1.7332772402187633, "step": 2060}, {"loss": 1.7545, "grad_norm": 0.38588812947273254, "learning_rate": 0.0002, "epoch": 1.7416912074042912, "step": 2070}, {"loss": 1.7728, "grad_norm": 0.39914557337760925, "learning_rate": 0.0002, "epoch": 1.750105174589819, "step": 2080}, {"loss": 1.7049, "grad_norm": 0.36068692803382874, "learning_rate": 0.0002, "epoch": 1.7585191417753472, "step": 2090}, {"loss": 1.7537, "grad_norm": 0.3983287215232849, "learning_rate": 0.0002, "epoch": 1.766933108960875, "step": 2100}, {"loss": 1.7016, "grad_norm": 0.45008400082588196, "learning_rate": 0.0002, "epoch": 1.775347076146403, "step": 2110}, {"loss": 1.7163, "grad_norm": 0.3618052303791046, "learning_rate": 0.0002, "epoch": 1.783761043331931, "step": 2120}, {"loss": 1.7335, "grad_norm": 0.38745400309562683, "learning_rate": 0.0002, "epoch": 1.792175010517459, "step": 2130}, {"loss": 1.7387, "grad_norm": 0.3413826525211334, "learning_rate": 0.0002, "epoch": 1.8005889777029869, "step": 2140}, {"loss": 1.7414, "grad_norm": 0.35983747243881226, "learning_rate": 0.0002, "epoch": 1.809002944888515, "step": 2150}, {"loss": 1.7892, "grad_norm": 0.40926849842071533, "learning_rate": 0.0002, "epoch": 1.8174169120740429, "step": 2160}, {"loss": 1.6823, "grad_norm": 0.3543093800544739, "learning_rate": 0.0002, "epoch": 1.8258308792595708, "step": 2170}, {"loss": 1.7812, "grad_norm": 0.42690935730934143, "learning_rate": 0.0002, "epoch": 1.8342448464450989, "step": 2180}, {"loss": 1.7471, "grad_norm": 0.40282756090164185, "learning_rate": 0.0002, "epoch": 1.842658813630627, "step": 2190}, {"loss": 1.7411, "grad_norm": 0.36568400263786316, "learning_rate": 0.0002, "epoch": 1.8510727808161547, "step": 2200}, {"loss": 1.7024, "grad_norm": 0.43159013986587524, "learning_rate": 0.0002, "epoch": 1.8594867480016828, "step": 2210}, {"loss": 1.7298, "grad_norm": 0.3554118573665619, "learning_rate": 0.0002, "epoch": 1.867900715187211, "step": 2220}, {"loss": 1.7157, "grad_norm": 0.43349072337150574, "learning_rate": 0.0002, "epoch": 1.8763146823727388, "step": 2230}, {"loss": 1.7302, "grad_norm": 0.36486536264419556, "learning_rate": 0.0002, "epoch": 1.8847286495582667, "step": 2240}, {"loss": 1.6901, "grad_norm": 0.39260047674179077, "learning_rate": 0.0002, "epoch": 1.8931426167437948, "step": 2250}, {"loss": 1.6691, "grad_norm": 0.3741776943206787, "learning_rate": 0.0002, "epoch": 1.9015565839293227, "step": 2260}, {"loss": 1.6931, "grad_norm": 0.3961946964263916, "learning_rate": 0.0002, "epoch": 1.9099705511148506, "step": 2270}, {"loss": 1.737, "grad_norm": 0.3659731149673462, "learning_rate": 0.0002, "epoch": 1.9183845183003787, "step": 2280}, {"loss": 1.7342, "grad_norm": 0.34744107723236084, "learning_rate": 0.0002, "epoch": 1.9267984854859066, "step": 2290}, {"loss": 1.7162, "grad_norm": 0.3607442378997803, "learning_rate": 0.0002, "epoch": 1.9352124526714345, "step": 2300}, {"loss": 1.6673, "grad_norm": 0.331464558839798, "learning_rate": 0.0002, "epoch": 1.9436264198569626, "step": 2310}, {"loss": 1.7101, "grad_norm": 0.3904414474964142, "learning_rate": 0.0002, "epoch": 1.9520403870424905, "step": 2320}, {"loss": 1.7327, "grad_norm": 0.37584832310676575, "learning_rate": 0.0002, "epoch": 1.9604543542280184, "step": 2330}, {"loss": 1.7586, "grad_norm": 0.3698684275150299, "learning_rate": 0.0002, "epoch": 1.9688683214135465, "step": 2340}, {"loss": 1.7764, "grad_norm": 0.40571412444114685, "learning_rate": 0.0002, "epoch": 1.9772822885990746, "step": 2350}, {"loss": 1.744, "grad_norm": 0.40059587359428406, "learning_rate": 0.0002, "epoch": 1.9856962557846023, "step": 2360}, {"loss": 1.7033, "grad_norm": 0.4168248474597931, "learning_rate": 0.0002, "epoch": 1.9941102229701304, "step": 2370}, {"eval_loss": 1.8055059909820557, "eval_runtime": 38.422, "eval_samples_per_second": 13.404, "eval_steps_per_second": 1.692, "epoch": 2.0, "step": 2377}, {"loss": 1.7673, "grad_norm": 0.35205352306365967, "learning_rate": 0.0002, "epoch": 2.0025241901556585, "step": 2380}, {"loss": 1.6556, "grad_norm": 0.3979377746582031, "learning_rate": 0.0002, "epoch": 2.010938157341186, "step": 2390}, {"loss": 1.6421, "grad_norm": 0.396491676568985, "learning_rate": 0.0002, "epoch": 2.0193521245267143, "step": 2400}, {"loss": 1.6847, "grad_norm": 0.44712209701538086, "learning_rate": 0.0002, "epoch": 2.0277660917122424, "step": 2410}, {"loss": 1.6877, "grad_norm": 0.4454420208930969, "learning_rate": 0.0002, "epoch": 2.03618005889777, "step": 2420}, {"loss": 1.6635, "grad_norm": 0.4170038402080536, "learning_rate": 0.0002, "epoch": 2.044594026083298, "step": 2430}, {"loss": 1.6512, "grad_norm": 0.4309595227241516, "learning_rate": 0.0002, "epoch": 2.0530079932688263, "step": 2440}, {"loss": 1.6223, "grad_norm": 0.4241602122783661, "learning_rate": 0.0002, "epoch": 2.0614219604543544, "step": 2450}, {"loss": 1.6162, "grad_norm": 0.4370540678501129, "learning_rate": 0.0002, "epoch": 2.069835927639882, "step": 2460}, {"loss": 1.6354, "grad_norm": 0.43985554575920105, "learning_rate": 0.0002, "epoch": 2.0782498948254102, "step": 2470}, {"loss": 1.6954, "grad_norm": 0.4158105254173279, "learning_rate": 0.0002, "epoch": 2.0866638620109383, "step": 2480}, {"loss": 1.6114, "grad_norm": 0.441549152135849, "learning_rate": 0.0002, "epoch": 2.095077829196466, "step": 2490}, {"loss": 1.5485, "grad_norm": 0.385718435049057, "learning_rate": 0.0002, "epoch": 2.103491796381994, "step": 2500}, {"loss": 1.5894, "grad_norm": 0.43146514892578125, "learning_rate": 0.0002, "epoch": 2.1119057635675222, "step": 2510}, {"loss": 1.6414, "grad_norm": 0.41663315892219543, "learning_rate": 0.0002, "epoch": 2.12031973075305, "step": 2520}, {"loss": 1.6527, "grad_norm": 0.4410698115825653, "learning_rate": 0.0002, "epoch": 2.128733697938578, "step": 2530}, {"loss": 1.6124, "grad_norm": 0.4472278952598572, "learning_rate": 0.0002, "epoch": 2.137147665124106, "step": 2540}, {"loss": 1.6257, "grad_norm": 0.3879167437553406, "learning_rate": 0.0002, "epoch": 2.145561632309634, "step": 2550}, {"loss": 1.6682, "grad_norm": 0.4212203025817871, "learning_rate": 0.0002, "epoch": 2.153975599495162, "step": 2560}, {"loss": 1.6036, "grad_norm": 0.42841723561286926, "learning_rate": 0.0002, "epoch": 2.16238956668069, "step": 2570}, {"loss": 1.5962, "grad_norm": 0.39272481203079224, "learning_rate": 0.0002, "epoch": 2.1708035338662177, "step": 2580}, {"loss": 1.681, "grad_norm": 0.4075261354446411, "learning_rate": 0.0002, "epoch": 2.179217501051746, "step": 2590}, {"loss": 1.6601, "grad_norm": 0.5358437895774841, "learning_rate": 0.0002, "epoch": 2.187631468237274, "step": 2600}, {"loss": 1.6423, "grad_norm": 0.4738350212574005, "learning_rate": 0.0002, "epoch": 2.1960454354228016, "step": 2610}, {"loss": 1.6386, "grad_norm": 0.446789026260376, "learning_rate": 0.0002, "epoch": 2.2044594026083297, "step": 2620}, {"loss": 1.6246, "grad_norm": 0.4615374505519867, "learning_rate": 0.0002, "epoch": 2.212873369793858, "step": 2630}, {"loss": 1.6205, "grad_norm": 0.46901994943618774, "learning_rate": 0.0002, "epoch": 2.221287336979386, "step": 2640}, {"loss": 1.6774, "grad_norm": 0.46267789602279663, "learning_rate": 0.0002, "epoch": 2.2297013041649136, "step": 2650}, {"loss": 1.6584, "grad_norm": 0.4383080005645752, "learning_rate": 0.0002, "epoch": 2.2381152713504417, "step": 2660}, {"loss": 1.5745, "grad_norm": 0.4070609509944916, "learning_rate": 0.0002, "epoch": 2.24652923853597, "step": 2670}, {"loss": 1.6125, "grad_norm": 0.4572339951992035, "learning_rate": 0.0002, "epoch": 2.2549432057214975, "step": 2680}, {"loss": 1.5671, "grad_norm": 0.393265038728714, "learning_rate": 0.0002, "epoch": 2.2633571729070256, "step": 2690}, {"loss": 1.6239, "grad_norm": 0.46144717931747437, "learning_rate": 0.0002, "epoch": 2.2717711400925538, "step": 2700}, {"loss": 1.5992, "grad_norm": 0.45077767968177795, "learning_rate": 0.0002, "epoch": 2.2801851072780814, "step": 2710}, {"loss": 1.6261, "grad_norm": 0.5697639584541321, "learning_rate": 0.0002, "epoch": 2.2885990744636096, "step": 2720}, {"loss": 1.6192, "grad_norm": 0.4855510890483856, "learning_rate": 0.0002, "epoch": 2.2970130416491377, "step": 2730}, {"loss": 1.7419, "grad_norm": 0.4440622627735138, "learning_rate": 0.0002, "epoch": 2.3054270088346653, "step": 2740}, {"loss": 1.6496, "grad_norm": 0.3904096782207489, "learning_rate": 0.0002, "epoch": 2.3138409760201935, "step": 2750}, {"loss": 1.5888, "grad_norm": 0.5225510597229004, "learning_rate": 0.0002, "epoch": 2.3222549432057216, "step": 2760}, {"loss": 1.6082, "grad_norm": 0.44866397976875305, "learning_rate": 0.0002, "epoch": 2.3306689103912497, "step": 2770}, {"loss": 1.6087, "grad_norm": 0.5167056322097778, "learning_rate": 0.0002, "epoch": 2.3390828775767774, "step": 2780}, {"loss": 1.6136, "grad_norm": 0.45913267135620117, "learning_rate": 0.0002, "epoch": 2.3474968447623055, "step": 2790}, {"loss": 1.6564, "grad_norm": 0.45787590742111206, "learning_rate": 0.0002, "epoch": 2.3559108119478336, "step": 2800}, {"loss": 1.6868, "grad_norm": 0.4633352756500244, "learning_rate": 0.0002, "epoch": 2.3643247791333613, "step": 2810}, {"loss": 1.6316, "grad_norm": 0.46390071511268616, "learning_rate": 0.0002, "epoch": 2.3727387463188894, "step": 2820}, {"loss": 1.6039, "grad_norm": 0.4261005222797394, "learning_rate": 0.0002, "epoch": 2.3811527135044175, "step": 2830}, {"loss": 1.6364, "grad_norm": 0.4283634424209595, "learning_rate": 0.0002, "epoch": 2.389566680689945, "step": 2840}, {"loss": 1.6382, "grad_norm": 0.4955291450023651, "learning_rate": 0.0002, "epoch": 2.3979806478754733, "step": 2850}, {"loss": 1.6173, "grad_norm": 0.4740189015865326, "learning_rate": 0.0002, "epoch": 2.4063946150610014, "step": 2860}, {"loss": 1.6403, "grad_norm": 0.4222276508808136, "learning_rate": 0.0002, "epoch": 2.414808582246529, "step": 2870}, {"loss": 1.5602, "grad_norm": 0.4982149004936218, "learning_rate": 0.0002, "epoch": 2.423222549432057, "step": 2880}, {"loss": 1.6313, "grad_norm": 0.5217409133911133, "learning_rate": 0.0002, "epoch": 2.4316365166175853, "step": 2890}, {"loss": 1.5804, "grad_norm": 0.4555884897708893, "learning_rate": 0.0002, "epoch": 2.4400504838031134, "step": 2900}, {"loss": 1.6189, "grad_norm": 0.43178579211235046, "learning_rate": 0.0002, "epoch": 2.448464450988641, "step": 2910}, {"loss": 1.6824, "grad_norm": 0.4788478910923004, "learning_rate": 0.0002, "epoch": 2.456878418174169, "step": 2920}, {"loss": 1.6829, "grad_norm": 0.43689873814582825, "learning_rate": 0.0002, "epoch": 2.465292385359697, "step": 2930}, {"loss": 1.6196, "grad_norm": 0.5115197896957397, "learning_rate": 0.0002, "epoch": 2.473706352545225, "step": 2940}, {"loss": 1.689, "grad_norm": 0.5290159583091736, "learning_rate": 0.0002, "epoch": 2.482120319730753, "step": 2950}, {"loss": 1.6499, "grad_norm": 0.46042463183403015, "learning_rate": 0.0002, "epoch": 2.490534286916281, "step": 2960}, {"loss": 1.6664, "grad_norm": 0.4359915852546692, "learning_rate": 0.0002, "epoch": 2.498948254101809, "step": 2970}, {"loss": 1.5812, "grad_norm": 0.46352964639663696, "learning_rate": 0.0002, "epoch": 2.507362221287337, "step": 2980}, {"loss": 1.6501, "grad_norm": 0.5324268341064453, "learning_rate": 0.0002, "epoch": 2.515776188472865, "step": 2990}, {"loss": 1.6115, "grad_norm": 0.5929607152938843, "learning_rate": 0.0002, "epoch": 2.5241901556583928, "step": 3000}, {"loss": 1.6772, "grad_norm": 0.4811333417892456, "learning_rate": 0.0002, "epoch": 2.532604122843921, "step": 3010}, {"loss": 1.7023, "grad_norm": 0.4662701487541199, "learning_rate": 0.0002, "epoch": 2.541018090029449, "step": 3020}, {"loss": 1.5426, "grad_norm": 0.4582270681858063, "learning_rate": 0.0002, "epoch": 2.549432057214977, "step": 3030}, {"loss": 1.6737, "grad_norm": 0.4679982662200928, "learning_rate": 0.0002, "epoch": 2.557846024400505, "step": 3040}, {"loss": 1.5442, "grad_norm": 0.4380294680595398, "learning_rate": 0.0002, "epoch": 2.566259991586033, "step": 3050}, {"loss": 1.6055, "grad_norm": 0.44295763969421387, "learning_rate": 0.0002, "epoch": 2.5746739587715606, "step": 3060}, {"loss": 1.5775, "grad_norm": 0.5131027698516846, "learning_rate": 0.0002, "epoch": 2.5830879259570887, "step": 3070}, {"loss": 1.546, "grad_norm": 0.47567516565322876, "learning_rate": 0.0002, "epoch": 2.591501893142617, "step": 3080}, {"loss": 1.5671, "grad_norm": 0.49002596735954285, "learning_rate": 0.0002, "epoch": 2.599915860328145, "step": 3090}, {"loss": 1.5445, "grad_norm": 0.44856327772140503, "learning_rate": 0.0002, "epoch": 2.6083298275136726, "step": 3100}, {"loss": 1.5797, "grad_norm": 0.4480142593383789, "learning_rate": 0.0002, "epoch": 2.6167437946992007, "step": 3110}, {"loss": 1.7132, "grad_norm": 0.4317494034767151, "learning_rate": 0.0002, "epoch": 2.6251577618847284, "step": 3120}, {"loss": 1.6321, "grad_norm": 0.42580848932266235, "learning_rate": 0.0002, "epoch": 2.6335717290702565, "step": 3130}, {"loss": 1.6483, "grad_norm": 0.4516814947128296, "learning_rate": 0.0002, "epoch": 2.6419856962557846, "step": 3140}, {"loss": 1.695, "grad_norm": 0.4438435733318329, "learning_rate": 0.0002, "epoch": 2.6503996634413127, "step": 3150}, {"loss": 1.6938, "grad_norm": 0.4385356307029724, "learning_rate": 0.0002, "epoch": 2.6588136306268404, "step": 3160}, {"loss": 1.6139, "grad_norm": 0.5064112544059753, "learning_rate": 0.0002, "epoch": 2.6672275978123685, "step": 3170}, {"loss": 1.7189, "grad_norm": 0.49163177609443665, "learning_rate": 0.0002, "epoch": 2.6756415649978966, "step": 3180}, {"loss": 1.7323, "grad_norm": 0.49339258670806885, "learning_rate": 0.0002, "epoch": 2.6840555321834243, "step": 3190}, {"loss": 1.6508, "grad_norm": 0.440950870513916, "learning_rate": 0.0002, "epoch": 2.6924694993689524, "step": 3200}, {"loss": 1.6305, "grad_norm": 0.4283970594406128, "learning_rate": 0.0002, "epoch": 2.7008834665544805, "step": 3210}, {"loss": 1.5935, "grad_norm": 0.43875712156295776, "learning_rate": 0.0002, "epoch": 2.7092974337400086, "step": 3220}, {"loss": 1.6129, "grad_norm": 0.49332964420318604, "learning_rate": 0.0002, "epoch": 2.7177114009255363, "step": 3230}, {"loss": 1.642, "grad_norm": 0.5225692391395569, "learning_rate": 0.0002, "epoch": 2.7261253681110644, "step": 3240}, {"loss": 1.6759, "grad_norm": 0.4856489300727844, "learning_rate": 0.0002, "epoch": 2.734539335296592, "step": 3250}, {"loss": 1.6463, "grad_norm": 0.46918296813964844, "learning_rate": 0.0002, "epoch": 2.74295330248212, "step": 3260}, {"loss": 1.6819, "grad_norm": 0.4802931249141693, "learning_rate": 0.0002, "epoch": 2.7513672696676483, "step": 3270}, {"loss": 1.6246, "grad_norm": 0.4485355615615845, "learning_rate": 0.0002, "epoch": 2.7597812368531764, "step": 3280}, {"loss": 1.6251, "grad_norm": 0.43944594264030457, "learning_rate": 0.0002, "epoch": 2.768195204038704, "step": 3290}, {"loss": 1.6501, "grad_norm": 0.46847742795944214, "learning_rate": 0.0002, "epoch": 2.7766091712242322, "step": 3300}, {"loss": 1.5969, "grad_norm": 0.4816027879714966, "learning_rate": 0.0002, "epoch": 2.7850231384097603, "step": 3310}, {"loss": 1.6293, "grad_norm": 0.453960120677948, "learning_rate": 0.0002, "epoch": 2.793437105595288, "step": 3320}, {"loss": 1.6429, "grad_norm": 0.4816017150878906, "learning_rate": 0.0002, "epoch": 2.801851072780816, "step": 3330}, {"loss": 1.6683, "grad_norm": 0.4461034834384918, "learning_rate": 0.0002, "epoch": 2.8102650399663442, "step": 3340}, {"loss": 1.7048, "grad_norm": 0.48821821808815, "learning_rate": 0.0002, "epoch": 2.8186790071518724, "step": 3350}, {"loss": 1.6076, "grad_norm": 0.4574853777885437, "learning_rate": 0.0002, "epoch": 2.8270929743374, "step": 3360}, {"loss": 1.6651, "grad_norm": 0.42062026262283325, "learning_rate": 0.0002, "epoch": 2.835506941522928, "step": 3370}, {"loss": 1.624, "grad_norm": 0.4499834477901459, "learning_rate": 0.0002, "epoch": 2.843920908708456, "step": 3380}, {"loss": 1.621, "grad_norm": 0.4780360758304596, "learning_rate": 0.0002, "epoch": 2.852334875893984, "step": 3390}, {"loss": 1.5882, "grad_norm": 0.45422887802124023, "learning_rate": 0.0002, "epoch": 2.860748843079512, "step": 3400}, {"loss": 1.6028, "grad_norm": 0.4590015709400177, "learning_rate": 0.0002, "epoch": 2.86916281026504, "step": 3410}, {"loss": 1.6746, "grad_norm": 0.45689624547958374, "learning_rate": 0.0002, "epoch": 2.877576777450568, "step": 3420}, {"loss": 1.6326, "grad_norm": 0.46953922510147095, "learning_rate": 0.0002, "epoch": 2.885990744636096, "step": 3430}, {"loss": 1.6015, "grad_norm": 0.4791966378688812, "learning_rate": 0.0002, "epoch": 2.8944047118216236, "step": 3440}, {"loss": 1.694, "grad_norm": 0.4842296242713928, "learning_rate": 0.0002, "epoch": 2.9028186790071517, "step": 3450}, {"loss": 1.6326, "grad_norm": 0.47219768166542053, "learning_rate": 0.0002, "epoch": 2.91123264619268, "step": 3460}, {"loss": 1.6486, "grad_norm": 0.4622127115726471, "learning_rate": 0.0002, "epoch": 2.919646613378208, "step": 3470}, {"loss": 1.6485, "grad_norm": 0.46832820773124695, "learning_rate": 0.0002, "epoch": 2.9280605805637356, "step": 3480}, {"loss": 1.6366, "grad_norm": 0.44582483172416687, "learning_rate": 0.0002, "epoch": 2.9364745477492638, "step": 3490}, {"loss": 1.6859, "grad_norm": 0.4987219274044037, "learning_rate": 0.0002, "epoch": 2.944888514934792, "step": 3500}, {"loss": 1.5991, "grad_norm": 0.43750956654548645, "learning_rate": 0.0002, "epoch": 2.9533024821203195, "step": 3510}, {"loss": 1.6236, "grad_norm": 0.49962925910949707, "learning_rate": 0.0002, "epoch": 2.9617164493058477, "step": 3520}, {"loss": 1.5859, "grad_norm": 0.5189590454101562, "learning_rate": 0.0002, "epoch": 2.9701304164913758, "step": 3530}, {"loss": 1.6688, "grad_norm": 0.391317754983902, "learning_rate": 0.0002, "epoch": 2.978544383676904, "step": 3540}, {"loss": 1.5884, "grad_norm": 0.44934695959091187, "learning_rate": 0.0002, "epoch": 2.9869583508624316, "step": 3550}, {"loss": 1.5688, "grad_norm": 0.4740142226219177, "learning_rate": 0.0002, "epoch": 2.9953723180479597, "step": 3560}, {"eval_loss": 1.8266887664794922, "eval_runtime": 37.9445, "eval_samples_per_second": 13.572, "eval_steps_per_second": 1.713, "epoch": 2.9995793016407237, "step": 3565}, {"loss": 1.5939, "grad_norm": 0.4523724615573883, "learning_rate": 0.0002, "epoch": 3.003786285233488, "step": 3570}, {"loss": 1.526, "grad_norm": 0.5261380076408386, "learning_rate": 0.0002, "epoch": 3.0122002524190155, "step": 3580}, {"loss": 1.4946, "grad_norm": 0.48664888739585876, "learning_rate": 0.0002, "epoch": 3.0206142196045436, "step": 3590}, {"loss": 1.5193, "grad_norm": 0.5070882439613342, "learning_rate": 0.0002, "epoch": 3.0290281867900717, "step": 3600}, {"loss": 1.5316, "grad_norm": 0.5816011428833008, "learning_rate": 0.0002, "epoch": 3.0374421539755994, "step": 3610}, {"loss": 1.5682, "grad_norm": 0.6610211730003357, "learning_rate": 0.0002, "epoch": 3.0458561211611275, "step": 3620}, {"loss": 1.5699, "grad_norm": 0.5257703065872192, "learning_rate": 0.0002, "epoch": 3.0542700883466556, "step": 3630}, {"loss": 1.4438, "grad_norm": 0.5574390888214111, "learning_rate": 0.0002, "epoch": 3.0626840555321833, "step": 3640}, {"loss": 1.547, "grad_norm": 0.5682297348976135, "learning_rate": 0.0002, "epoch": 3.0710980227177114, "step": 3650}, {"loss": 1.5743, "grad_norm": 0.5798383355140686, "learning_rate": 0.0002, "epoch": 3.0795119899032395, "step": 3660}, {"loss": 1.4339, "grad_norm": 0.5458289980888367, "learning_rate": 0.0002, "epoch": 3.087925957088767, "step": 3670}, {"loss": 1.46, "grad_norm": 0.5599102973937988, "learning_rate": 0.0002, "epoch": 3.0963399242742953, "step": 3680}, {"loss": 1.4589, "grad_norm": 0.5023021697998047, "learning_rate": 0.0002, "epoch": 3.1047538914598234, "step": 3690}, {"loss": 1.5114, "grad_norm": 0.5448206067085266, "learning_rate": 0.0002, "epoch": 3.113167858645351, "step": 3700}, {"loss": 1.4692, "grad_norm": 0.5760458707809448, "learning_rate": 0.0002, "epoch": 3.121581825830879, "step": 3710}, {"loss": 1.4789, "grad_norm": 0.6018968224525452, "learning_rate": 0.0002, "epoch": 3.1299957930164073, "step": 3720}, {"loss": 1.5518, "grad_norm": 0.5767101049423218, "learning_rate": 0.0002, "epoch": 3.1384097602019354, "step": 3730}, {"loss": 1.5032, "grad_norm": 0.5333963632583618, "learning_rate": 0.0002, "epoch": 3.146823727387463, "step": 3740}, {"loss": 1.4812, "grad_norm": 0.5918396711349487, "learning_rate": 0.0002, "epoch": 3.155237694572991, "step": 3750}, {"loss": 1.4618, "grad_norm": 0.5931203365325928, "learning_rate": 0.0002, "epoch": 3.1636516617585193, "step": 3760}, {"loss": 1.5592, "grad_norm": 0.6562168598175049, "learning_rate": 0.0002, "epoch": 3.172065628944047, "step": 3770}, {"loss": 1.4932, "grad_norm": 0.5820156335830688, "learning_rate": 0.0002, "epoch": 3.180479596129575, "step": 3780}, {"loss": 1.4523, "grad_norm": 0.5784737467765808, "learning_rate": 0.0002, "epoch": 3.188893563315103, "step": 3790}, {"loss": 1.498, "grad_norm": 0.5506529808044434, "learning_rate": 0.0002, "epoch": 3.197307530500631, "step": 3800}, {"loss": 1.4819, "grad_norm": 0.6101595163345337, "learning_rate": 0.0002, "epoch": 3.205721497686159, "step": 3810}, {"loss": 1.5185, "grad_norm": 0.5597806572914124, "learning_rate": 0.0002, "epoch": 3.214135464871687, "step": 3820}, {"loss": 1.5664, "grad_norm": 0.5641011595726013, "learning_rate": 0.0002, "epoch": 3.222549432057215, "step": 3830}, {"loss": 1.4702, "grad_norm": 0.5892080068588257, "learning_rate": 0.0002, "epoch": 3.230963399242743, "step": 3840}, {"loss": 1.4194, "grad_norm": 0.6034760475158691, "learning_rate": 0.0002, "epoch": 3.239377366428271, "step": 3850}, {"loss": 1.5499, "grad_norm": 0.5112439393997192, "learning_rate": 0.0002, "epoch": 3.247791333613799, "step": 3860}, {"loss": 1.5132, "grad_norm": 0.56565922498703, "learning_rate": 0.0002, "epoch": 3.256205300799327, "step": 3870}, {"loss": 1.4892, "grad_norm": 0.6155247092247009, "learning_rate": 0.0002, "epoch": 3.264619267984855, "step": 3880}, {"loss": 1.5118, "grad_norm": 0.6064623594284058, "learning_rate": 0.0002, "epoch": 3.273033235170383, "step": 3890}, {"loss": 1.5236, "grad_norm": 0.6313768029212952, "learning_rate": 0.0002, "epoch": 3.2814472023559107, "step": 3900}, {"loss": 1.5551, "grad_norm": 0.5903939008712769, "learning_rate": 0.0002, "epoch": 3.289861169541439, "step": 3910}, {"loss": 1.5703, "grad_norm": 0.5770667195320129, "learning_rate": 0.0002, "epoch": 3.298275136726967, "step": 3920}, {"loss": 1.5159, "grad_norm": 0.5785196423530579, "learning_rate": 0.0002, "epoch": 3.3066891039124946, "step": 3930}, {"loss": 1.5277, "grad_norm": 0.6468310356140137, "learning_rate": 0.0002, "epoch": 3.3151030710980227, "step": 3940}, {"loss": 1.6002, "grad_norm": 0.6200279593467712, "learning_rate": 0.0002, "epoch": 3.323517038283551, "step": 3950}, {"loss": 1.5264, "grad_norm": 0.5779302716255188, "learning_rate": 0.0002, "epoch": 3.3319310054690785, "step": 3960}, {"loss": 1.4861, "grad_norm": 0.5463796854019165, "learning_rate": 0.0002, "epoch": 3.3403449726546066, "step": 3970}, {"loss": 1.541, "grad_norm": 0.6117855906486511, "learning_rate": 0.0002, "epoch": 3.3487589398401347, "step": 3980}, {"loss": 1.5566, "grad_norm": 0.5554766058921814, "learning_rate": 0.0002, "epoch": 3.357172907025663, "step": 3990}, {"loss": 1.5004, "grad_norm": 0.6012870073318481, "learning_rate": 0.0002, "epoch": 3.3655868742111905, "step": 4000}, {"loss": 1.473, "grad_norm": 0.5443974137306213, "learning_rate": 0.0002, "epoch": 3.3740008413967186, "step": 4010}, {"loss": 1.5139, "grad_norm": 0.6636057496070862, "learning_rate": 0.0002, "epoch": 3.3824148085822463, "step": 4020}, {"loss": 1.5141, "grad_norm": 0.5801246166229248, "learning_rate": 0.0002, "epoch": 3.3908287757677744, "step": 4030}, {"loss": 1.5026, "grad_norm": 0.5668839812278748, "learning_rate": 0.0002, "epoch": 3.3992427429533025, "step": 4040}, {"loss": 1.523, "grad_norm": 0.7763481736183167, "learning_rate": 0.0002, "epoch": 3.4076567101388306, "step": 4050}, {"loss": 1.4932, "grad_norm": 0.6675992608070374, "learning_rate": 0.0002, "epoch": 3.4160706773243583, "step": 4060}, {"loss": 1.4959, "grad_norm": 0.6290077567100525, "learning_rate": 0.0002, "epoch": 3.4244846445098864, "step": 4070}, {"loss": 1.5766, "grad_norm": 0.6040239930152893, "learning_rate": 0.0002, "epoch": 3.4328986116954145, "step": 4080}, {"loss": 1.5711, "grad_norm": 0.6237877607345581, "learning_rate": 0.0002, "epoch": 3.441312578880942, "step": 4090}, {"loss": 1.4961, "grad_norm": 0.5343508124351501, "learning_rate": 0.0002, "epoch": 3.4497265460664703, "step": 4100}, {"loss": 1.5123, "grad_norm": 0.6817412972450256, "learning_rate": 0.0002, "epoch": 3.4581405132519984, "step": 4110}, {"loss": 1.5377, "grad_norm": 0.7115170359611511, "learning_rate": 0.0002, "epoch": 3.466554480437526, "step": 4120}, {"loss": 1.5275, "grad_norm": 0.6127332448959351, "learning_rate": 0.0002, "epoch": 3.4749684476230542, "step": 4130}, {"loss": 1.557, "grad_norm": 0.5745994448661804, "learning_rate": 0.0002, "epoch": 3.4833824148085824, "step": 4140}, {"loss": 1.4873, "grad_norm": 0.6248795390129089, "learning_rate": 0.0002, "epoch": 3.49179638199411, "step": 4150}, {"loss": 1.4885, "grad_norm": 0.5821124911308289, "learning_rate": 0.0002, "epoch": 3.500210349179638, "step": 4160}, {"loss": 1.4937, "grad_norm": 0.561416506767273, "learning_rate": 0.0002, "epoch": 3.5086243163651663, "step": 4170}, {"loss": 1.5453, "grad_norm": 0.5848962664604187, "learning_rate": 0.0002, "epoch": 3.5170382835506944, "step": 4180}, {"loss": 1.5892, "grad_norm": 0.5335569977760315, "learning_rate": 0.0002, "epoch": 3.525452250736222, "step": 4190}, {"loss": 1.5152, "grad_norm": 0.547964870929718, "learning_rate": 0.0002, "epoch": 3.53386621792175, "step": 4200}, {"loss": 1.4887, "grad_norm": 0.6157727241516113, "learning_rate": 0.0002, "epoch": 3.542280185107278, "step": 4210}, {"loss": 1.5484, "grad_norm": 0.6163121461868286, "learning_rate": 0.0002, "epoch": 3.550694152292806, "step": 4220}, {"loss": 1.5833, "grad_norm": 0.5844616293907166, "learning_rate": 0.0002, "epoch": 3.559108119478334, "step": 4230}, {"loss": 1.5305, "grad_norm": 0.7104926109313965, "learning_rate": 0.0002, "epoch": 3.567522086663862, "step": 4240}, {"loss": 1.5161, "grad_norm": 0.5055213570594788, "learning_rate": 0.0002, "epoch": 3.57593605384939, "step": 4250}, {"loss": 1.482, "grad_norm": 0.611676812171936, "learning_rate": 0.0002, "epoch": 3.584350021034918, "step": 4260}, {"loss": 1.5048, "grad_norm": 0.6326440572738647, "learning_rate": 0.0002, "epoch": 3.592763988220446, "step": 4270}, {"loss": 1.5122, "grad_norm": 0.6290925741195679, "learning_rate": 0.0002, "epoch": 3.6011779554059737, "step": 4280}, {"loss": 1.5654, "grad_norm": 0.5691978931427002, "learning_rate": 0.0002, "epoch": 3.609591922591502, "step": 4290}, {"loss": 1.4854, "grad_norm": 0.6071329116821289, "learning_rate": 0.0002, "epoch": 3.61800588977703, "step": 4300}, {"loss": 1.5336, "grad_norm": 0.606573224067688, "learning_rate": 0.0002, "epoch": 3.626419856962558, "step": 4310}, {"loss": 1.6437, "grad_norm": 0.5515419244766235, "learning_rate": 0.0002, "epoch": 3.6348338241480858, "step": 4320}, {"loss": 1.498, "grad_norm": 0.5964660048484802, "learning_rate": 0.0002, "epoch": 3.643247791333614, "step": 4330}, {"loss": 1.544, "grad_norm": 0.5774146914482117, "learning_rate": 0.0002, "epoch": 3.6516617585191415, "step": 4340}, {"loss": 1.5566, "grad_norm": 0.5732731223106384, "learning_rate": 0.0002, "epoch": 3.6600757257046697, "step": 4350}, {"loss": 1.5682, "grad_norm": 0.7354163527488708, "learning_rate": 0.0002, "epoch": 3.6684896928901978, "step": 4360}, {"loss": 1.5225, "grad_norm": 0.6220902800559998, "learning_rate": 0.0002, "epoch": 3.676903660075726, "step": 4370}, {"loss": 1.4838, "grad_norm": 0.6053991317749023, "learning_rate": 0.0002, "epoch": 3.6853176272612536, "step": 4380}, {"loss": 1.5161, "grad_norm": 0.67010897397995, "learning_rate": 0.0002, "epoch": 3.6937315944467817, "step": 4390}, {"loss": 1.5381, "grad_norm": 0.6139186024665833, "learning_rate": 0.0002, "epoch": 3.70214556163231, "step": 4400}, {"loss": 1.5088, "grad_norm": 0.5433071851730347, "learning_rate": 0.0002, "epoch": 3.7105595288178375, "step": 4410}, {"loss": 1.5337, "grad_norm": 0.5453870296478271, "learning_rate": 0.0002, "epoch": 3.7189734960033656, "step": 4420}, {"loss": 1.4549, "grad_norm": 0.6401727199554443, "learning_rate": 0.0002, "epoch": 3.7273874631888937, "step": 4430}, {"loss": 1.503, "grad_norm": 0.6049367189407349, "learning_rate": 0.0002, "epoch": 3.735801430374422, "step": 4440}, {"loss": 1.5268, "grad_norm": 0.5740529298782349, "learning_rate": 0.0002, "epoch": 3.7442153975599495, "step": 4450}, {"loss": 1.5183, "grad_norm": 0.6521880626678467, "learning_rate": 0.0002, "epoch": 3.7526293647454776, "step": 4460}, {"loss": 1.5741, "grad_norm": 0.7096368074417114, "learning_rate": 0.0002, "epoch": 3.7610433319310053, "step": 4470}, {"loss": 1.5786, "grad_norm": 0.5886474251747131, "learning_rate": 0.0002, "epoch": 3.7694572991165334, "step": 4480}, {"loss": 1.5887, "grad_norm": 0.5821043252944946, "learning_rate": 0.0002, "epoch": 3.7778712663020615, "step": 4490}, {"loss": 1.5777, "grad_norm": 0.628892183303833, "learning_rate": 0.0002, "epoch": 3.7862852334875896, "step": 4500}, {"loss": 1.4708, "grad_norm": 0.5962669849395752, "learning_rate": 0.0002, "epoch": 3.7946992006731173, "step": 4510}, {"loss": 1.5267, "grad_norm": 0.6635549068450928, "learning_rate": 0.0002, "epoch": 3.8031131678586454, "step": 4520}, {"loss": 1.5058, "grad_norm": 0.6010760068893433, "learning_rate": 0.0002, "epoch": 3.811527135044173, "step": 4530}, {"loss": 1.6228, "grad_norm": 0.6322658658027649, "learning_rate": 0.0002, "epoch": 3.819941102229701, "step": 4540}, {"loss": 1.5029, "grad_norm": 0.5893137454986572, "learning_rate": 0.0002, "epoch": 3.8283550694152293, "step": 4550}, {"loss": 1.5435, "grad_norm": 0.7829602360725403, "learning_rate": 0.0002, "epoch": 3.8367690366007574, "step": 4560}, {"loss": 1.5453, "grad_norm": 0.6190396547317505, "learning_rate": 0.0002, "epoch": 3.845183003786285, "step": 4570}, {"loss": 1.5292, "grad_norm": 0.6662813425064087, "learning_rate": 0.0002, "epoch": 3.853596970971813, "step": 4580}, {"loss": 1.5065, "grad_norm": 0.5809855461120605, "learning_rate": 0.0002, "epoch": 3.8620109381573413, "step": 4590}, {"loss": 1.5041, "grad_norm": 0.5779069662094116, "learning_rate": 0.0002, "epoch": 3.870424905342869, "step": 4600}, {"loss": 1.498, "grad_norm": 0.5603038668632507, "learning_rate": 0.0002, "epoch": 3.878838872528397, "step": 4610}, {"loss": 1.5372, "grad_norm": 0.6274181008338928, "learning_rate": 0.0002, "epoch": 3.887252839713925, "step": 4620}, {"loss": 1.4996, "grad_norm": 0.6810959577560425, "learning_rate": 0.0002, "epoch": 3.8956668068994533, "step": 4630}, {"loss": 1.4956, "grad_norm": 0.5647315979003906, "learning_rate": 0.0002, "epoch": 3.904080774084981, "step": 4640}, {"loss": 1.5424, "grad_norm": 0.6830295324325562, "learning_rate": 0.0002, "epoch": 3.912494741270509, "step": 4650}, {"loss": 1.535, "grad_norm": 0.652565598487854, "learning_rate": 0.0002, "epoch": 3.920908708456037, "step": 4660}, {"loss": 1.4772, "grad_norm": 0.5806284546852112, "learning_rate": 0.0002, "epoch": 3.929322675641565, "step": 4670}, {"loss": 1.5812, "grad_norm": 0.6825073957443237, "learning_rate": 0.0002, "epoch": 3.937736642827093, "step": 4680}, {"loss": 1.5516, "grad_norm": 0.6149451732635498, "learning_rate": 0.0002, "epoch": 3.946150610012621, "step": 4690}, {"loss": 1.5608, "grad_norm": 0.6152557134628296, "learning_rate": 0.0002, "epoch": 3.954564577198149, "step": 4700}, {"loss": 1.4897, "grad_norm": 0.6239011883735657, "learning_rate": 0.0002, "epoch": 3.962978544383677, "step": 4710}, {"loss": 1.538, "grad_norm": 0.6485443115234375, "learning_rate": 0.0002, "epoch": 3.971392511569205, "step": 4720}, {"loss": 1.5226, "grad_norm": 0.6449228525161743, "learning_rate": 0.0002, "epoch": 3.9798064787547327, "step": 4730}, {"loss": 1.5087, "grad_norm": 0.6526407599449158, "learning_rate": 0.0002, "epoch": 3.988220445940261, "step": 4740}, {"loss": 1.5026, "grad_norm": 0.6277706027030945, "learning_rate": 0.0002, "epoch": 3.996634413125789, "step": 4750}, {"eval_loss": 1.871641755104065, "eval_runtime": 37.9637, "eval_samples_per_second": 13.566, "eval_steps_per_second": 1.712, "epoch": 4.0, "step": 4754}, {"loss": 1.4744, "grad_norm": 0.6994837522506714, "learning_rate": 0.0002, "epoch": 4.005048380311317, "step": 4760}, {"loss": 1.4433, "grad_norm": 0.8728373050689697, "learning_rate": 0.0002, "epoch": 4.013462347496845, "step": 4770}, {"loss": 1.3329, "grad_norm": 0.688679575920105, "learning_rate": 0.0002, "epoch": 4.021876314682372, "step": 4780}, {"loss": 1.3999, "grad_norm": 0.6313387155532837, "learning_rate": 0.0002, "epoch": 4.0302902818679005, "step": 4790}, {"loss": 1.3346, "grad_norm": 0.6577984690666199, "learning_rate": 0.0002, "epoch": 4.038704249053429, "step": 4800}, {"loss": 1.3403, "grad_norm": 0.7938185930252075, "learning_rate": 0.0002, "epoch": 4.047118216238957, "step": 4810}, {"loss": 1.3716, "grad_norm": 0.760399580001831, "learning_rate": 0.0002, "epoch": 4.055532183424485, "step": 4820}, {"loss": 1.4321, "grad_norm": 0.7329602241516113, "learning_rate": 0.0002, "epoch": 4.063946150610013, "step": 4830}, {"loss": 1.4133, "grad_norm": 0.7778576016426086, "learning_rate": 0.0002, "epoch": 4.07236011779554, "step": 4840}, {"loss": 1.4372, "grad_norm": 0.8235865235328674, "learning_rate": 0.0002, "epoch": 4.080774084981068, "step": 4850}, {"loss": 1.3719, "grad_norm": 0.7743754386901855, "learning_rate": 0.0002, "epoch": 4.089188052166596, "step": 4860}, {"loss": 1.3787, "grad_norm": 0.8145367503166199, "learning_rate": 0.0002, "epoch": 4.0976020193521245, "step": 4870}, {"loss": 1.356, "grad_norm": 0.8517307639122009, "learning_rate": 0.0002, "epoch": 4.106015986537653, "step": 4880}, {"loss": 1.4191, "grad_norm": 0.8208953142166138, "learning_rate": 0.0002, "epoch": 4.114429953723181, "step": 4890}, {"loss": 1.3189, "grad_norm": 0.8437790870666504, "learning_rate": 0.0002, "epoch": 4.122843920908709, "step": 4900}, {"loss": 1.3987, "grad_norm": 0.716672420501709, "learning_rate": 0.0002, "epoch": 4.131257888094236, "step": 4910}, {"loss": 1.4392, "grad_norm": 0.7656235098838806, "learning_rate": 0.0002, "epoch": 4.139671855279764, "step": 4920}, {"loss": 1.3408, "grad_norm": 0.7209306955337524, "learning_rate": 0.0002, "epoch": 4.148085822465292, "step": 4930}, {"loss": 1.3639, "grad_norm": 0.7731267809867859, "learning_rate": 0.0002, "epoch": 4.1564997896508205, "step": 4940}, {"loss": 1.4151, "grad_norm": 0.7477553486824036, "learning_rate": 0.0002, "epoch": 4.164913756836349, "step": 4950}, {"loss": 1.3485, "grad_norm": 0.7372981309890747, "learning_rate": 0.0002, "epoch": 4.173327724021877, "step": 4960}, {"loss": 1.3901, "grad_norm": 0.6582154035568237, "learning_rate": 0.0002, "epoch": 4.181741691207404, "step": 4970}, {"loss": 1.3343, "grad_norm": 0.7003206610679626, "learning_rate": 0.0002, "epoch": 4.190155658392932, "step": 4980}, {"loss": 1.4098, "grad_norm": 0.735223650932312, "learning_rate": 0.0002, "epoch": 4.19856962557846, "step": 4990}, {"loss": 1.3564, "grad_norm": 0.7832302451133728, "learning_rate": 0.0002, "epoch": 4.206983592763988, "step": 5000}, {"loss": 1.3622, "grad_norm": 0.8819546103477478, "learning_rate": 0.0002, "epoch": 4.215397559949516, "step": 5010}, {"loss": 1.4438, "grad_norm": 0.9325336813926697, "learning_rate": 0.0002, "epoch": 4.2238115271350445, "step": 5020}, {"loss": 1.3886, "grad_norm": 0.7007517218589783, "learning_rate": 0.0002, "epoch": 4.232225494320572, "step": 5030}, {"loss": 1.3683, "grad_norm": 0.7118321061134338, "learning_rate": 0.0002, "epoch": 4.2406394615061, "step": 5040}, {"loss": 1.2365, "grad_norm": 0.6578946709632874, "learning_rate": 0.0002, "epoch": 4.249053428691628, "step": 5050}, {"loss": 1.3696, "grad_norm": 0.9438983798027039, "learning_rate": 0.0002, "epoch": 4.257467395877156, "step": 5060}, {"loss": 1.3868, "grad_norm": 0.703037679195404, "learning_rate": 0.0002, "epoch": 4.265881363062684, "step": 5070}, {"loss": 1.3687, "grad_norm": 0.7286025285720825, "learning_rate": 0.0002, "epoch": 4.274295330248212, "step": 5080}, {"loss": 1.3605, "grad_norm": 0.750689685344696, "learning_rate": 0.0002, "epoch": 4.28270929743374, "step": 5090}, {"loss": 1.5089, "grad_norm": 0.869753360748291, "learning_rate": 0.0002, "epoch": 4.291123264619268, "step": 5100}, {"loss": 1.4128, "grad_norm": 0.8712980151176453, "learning_rate": 0.0002, "epoch": 4.299537231804796, "step": 5110}, {"loss": 1.3977, "grad_norm": 0.690263569355011, "learning_rate": 0.0002, "epoch": 4.307951198990324, "step": 5120}, {"loss": 1.4088, "grad_norm": 0.7114760279655457, "learning_rate": 0.0002, "epoch": 4.316365166175852, "step": 5130}, {"loss": 1.363, "grad_norm": 0.7588112354278564, "learning_rate": 0.0002, "epoch": 4.32477913336138, "step": 5140}, {"loss": 1.4408, "grad_norm": 0.7556202411651611, "learning_rate": 0.0002, "epoch": 4.333193100546908, "step": 5150}, {"loss": 1.4203, "grad_norm": 0.8357610702514648, "learning_rate": 0.0002, "epoch": 4.341607067732435, "step": 5160}, {"loss": 1.3348, "grad_norm": 0.8054035902023315, "learning_rate": 0.0002, "epoch": 4.3500210349179635, "step": 5170}, {"loss": 1.3109, "grad_norm": 0.7637107968330383, "learning_rate": 0.0002, "epoch": 4.358435002103492, "step": 5180}, {"loss": 1.3744, "grad_norm": 0.757481038570404, "learning_rate": 0.0002, "epoch": 4.36684896928902, "step": 5190}, {"loss": 1.3622, "grad_norm": 0.7185863852500916, "learning_rate": 0.0002, "epoch": 4.375262936474548, "step": 5200}, {"loss": 1.3896, "grad_norm": 0.7326455116271973, "learning_rate": 0.0002, "epoch": 4.383676903660076, "step": 5210}, {"loss": 1.4098, "grad_norm": 0.7980523109436035, "learning_rate": 0.0002, "epoch": 4.392090870845603, "step": 5220}, {"loss": 1.3783, "grad_norm": 0.8526999354362488, "learning_rate": 0.0002, "epoch": 4.400504838031131, "step": 5230}, {"loss": 1.4022, "grad_norm": 0.7012337446212769, "learning_rate": 0.0002, "epoch": 4.4089188052166595, "step": 5240}, {"loss": 1.3552, "grad_norm": 0.8217827677726746, "learning_rate": 0.0002, "epoch": 4.417332772402188, "step": 5250}, {"loss": 1.3482, "grad_norm": 0.7141005396842957, "learning_rate": 0.0002, "epoch": 4.425746739587716, "step": 5260}, {"loss": 1.3699, "grad_norm": 0.7094302177429199, "learning_rate": 0.0002, "epoch": 4.434160706773244, "step": 5270}, {"loss": 1.3527, "grad_norm": 0.7234613299369812, "learning_rate": 0.0002, "epoch": 4.442574673958772, "step": 5280}, {"loss": 1.4769, "grad_norm": 0.7530457973480225, "learning_rate": 0.0002, "epoch": 4.450988641144299, "step": 5290}, {"loss": 1.3944, "grad_norm": 0.7300912141799927, "learning_rate": 0.0002, "epoch": 4.459402608329827, "step": 5300}, {"loss": 1.3844, "grad_norm": 0.825443685054779, "learning_rate": 0.0002, "epoch": 4.467816575515355, "step": 5310}, {"loss": 1.3648, "grad_norm": 0.7559658885002136, "learning_rate": 0.0002, "epoch": 4.4762305427008835, "step": 5320}, {"loss": 1.4364, "grad_norm": 0.8817561268806458, "learning_rate": 0.0002, "epoch": 4.484644509886412, "step": 5330}, {"loss": 1.3618, "grad_norm": 0.8203575611114502, "learning_rate": 0.0002, "epoch": 4.49305847707194, "step": 5340}, {"loss": 1.3996, "grad_norm": 0.7677690982818604, "learning_rate": 0.0002, "epoch": 4.501472444257468, "step": 5350}, {"loss": 1.4142, "grad_norm": 0.657085120677948, "learning_rate": 0.0002, "epoch": 4.509886411442995, "step": 5360}, {"loss": 1.3722, "grad_norm": 0.7939504384994507, "learning_rate": 0.0002, "epoch": 4.518300378628523, "step": 5370}, {"loss": 1.4361, "grad_norm": 0.6971889138221741, "learning_rate": 0.0002, "epoch": 4.526714345814051, "step": 5380}, {"loss": 1.3637, "grad_norm": 0.6984175443649292, "learning_rate": 0.0002, "epoch": 4.535128312999579, "step": 5390}, {"loss": 1.341, "grad_norm": 0.8504858613014221, "learning_rate": 0.0002, "epoch": 4.5435422801851075, "step": 5400}, {"loss": 1.4026, "grad_norm": 0.9134073853492737, "learning_rate": 0.0002, "epoch": 4.551956247370635, "step": 5410}, {"loss": 1.4375, "grad_norm": 0.7765598893165588, "learning_rate": 0.0002, "epoch": 4.560370214556163, "step": 5420}, {"loss": 1.4832, "grad_norm": 0.6991009712219238, "learning_rate": 0.0002, "epoch": 4.568784181741691, "step": 5430}, {"loss": 1.4021, "grad_norm": 0.8393039107322693, "learning_rate": 0.0002, "epoch": 4.577198148927219, "step": 5440}, {"loss": 1.3976, "grad_norm": 0.7685918211936951, "learning_rate": 0.0002, "epoch": 4.585612116112747, "step": 5450}, {"loss": 1.3883, "grad_norm": 0.7135679721832275, "learning_rate": 0.0002, "epoch": 4.594026083298275, "step": 5460}, {"loss": 1.4083, "grad_norm": 0.6728870868682861, "learning_rate": 0.0002, "epoch": 4.6024400504838034, "step": 5470}, {"loss": 1.3698, "grad_norm": 0.7139479517936707, "learning_rate": 0.0002, "epoch": 4.610854017669331, "step": 5480}, {"loss": 1.3498, "grad_norm": 0.8476598858833313, "learning_rate": 0.0002, "epoch": 4.619267984854859, "step": 5490}, {"loss": 1.3389, "grad_norm": 0.8034361004829407, "learning_rate": 0.0002, "epoch": 4.627681952040387, "step": 5500}, {"loss": 1.4179, "grad_norm": 0.7452183961868286, "learning_rate": 0.0002, "epoch": 4.636095919225915, "step": 5510}, {"loss": 1.4031, "grad_norm": 0.8394148945808411, "learning_rate": 0.0002, "epoch": 4.644509886411443, "step": 5520}, {"loss": 1.4561, "grad_norm": 0.7480153441429138, "learning_rate": 0.0002, "epoch": 4.652923853596971, "step": 5530}, {"loss": 1.378, "grad_norm": 0.7781714797019958, "learning_rate": 0.0002, "epoch": 4.661337820782499, "step": 5540}, {"loss": 1.3924, "grad_norm": 1.0058213472366333, "learning_rate": 0.0002, "epoch": 4.669751787968027, "step": 5550}, {"loss": 1.4198, "grad_norm": 0.7403179407119751, "learning_rate": 0.0002, "epoch": 4.678165755153555, "step": 5560}, {"loss": 1.4328, "grad_norm": 0.7270476818084717, "learning_rate": 0.0002, "epoch": 4.686579722339083, "step": 5570}, {"loss": 1.378, "grad_norm": 0.760877788066864, "learning_rate": 0.0002, "epoch": 4.694993689524611, "step": 5580}, {"loss": 1.387, "grad_norm": 0.8097004890441895, "learning_rate": 0.0002, "epoch": 4.703407656710139, "step": 5590}, {"loss": 1.3661, "grad_norm": 0.9096523523330688, "learning_rate": 0.0002, "epoch": 4.711821623895667, "step": 5600}, {"loss": 1.4012, "grad_norm": 0.7262444496154785, "learning_rate": 0.0002, "epoch": 4.720235591081195, "step": 5610}, {"loss": 1.422, "grad_norm": 0.8207762837409973, "learning_rate": 0.0002, "epoch": 4.7286495582667225, "step": 5620}, {"loss": 1.4017, "grad_norm": 0.8089601993560791, "learning_rate": 0.0002, "epoch": 4.737063525452251, "step": 5630}, {"loss": 1.3675, "grad_norm": 0.7609543800354004, "learning_rate": 0.0002, "epoch": 4.745477492637779, "step": 5640}, {"loss": 1.4085, "grad_norm": 0.7273501753807068, "learning_rate": 0.0002, "epoch": 4.753891459823307, "step": 5650}, {"loss": 1.3849, "grad_norm": 0.7800219058990479, "learning_rate": 0.0002, "epoch": 4.762305427008835, "step": 5660}, {"loss": 1.4319, "grad_norm": 0.8558377623558044, "learning_rate": 0.0002, "epoch": 4.770719394194362, "step": 5670}, {"loss": 1.3831, "grad_norm": 0.7131547927856445, "learning_rate": 0.0002, "epoch": 4.77913336137989, "step": 5680}, {"loss": 1.407, "grad_norm": 0.7651025056838989, "learning_rate": 0.0002, "epoch": 4.787547328565418, "step": 5690}, {"loss": 1.3882, "grad_norm": 0.8129976391792297, "learning_rate": 0.0002, "epoch": 4.7959612957509465, "step": 5700}, {"loss": 1.4347, "grad_norm": 0.8019895553588867, "learning_rate": 0.0002, "epoch": 4.804375262936475, "step": 5710}, {"loss": 1.3961, "grad_norm": 0.7692018151283264, "learning_rate": 0.0002, "epoch": 4.812789230122003, "step": 5720}, {"loss": 1.419, "grad_norm": 0.6893943548202515, "learning_rate": 0.0002, "epoch": 4.821203197307531, "step": 5730}, {"loss": 1.4453, "grad_norm": 0.6881810426712036, "learning_rate": 0.0002, "epoch": 4.829617164493058, "step": 5740}, {"loss": 1.4775, "grad_norm": 0.7838267683982849, "learning_rate": 0.0002, "epoch": 4.838031131678586, "step": 5750}, {"loss": 1.3857, "grad_norm": 0.727799117565155, "learning_rate": 0.0002, "epoch": 4.846445098864114, "step": 5760}, {"loss": 1.4685, "grad_norm": 0.7458277344703674, "learning_rate": 0.0002, "epoch": 4.8548590660496425, "step": 5770}, {"loss": 1.4426, "grad_norm": 0.903802216053009, "learning_rate": 0.0002, "epoch": 4.863273033235171, "step": 5780}, {"loss": 1.451, "grad_norm": 0.7983472347259521, "learning_rate": 0.0002, "epoch": 4.871687000420699, "step": 5790}, {"loss": 1.4534, "grad_norm": 0.6894361972808838, "learning_rate": 0.0002, "epoch": 4.880100967606227, "step": 5800}, {"loss": 1.4486, "grad_norm": 0.7499409317970276, "learning_rate": 0.0002, "epoch": 4.888514934791754, "step": 5810}, {"loss": 1.4253, "grad_norm": 0.7362820506095886, "learning_rate": 0.0002, "epoch": 4.896928901977282, "step": 5820}, {"loss": 1.3763, "grad_norm": 0.8341619968414307, "learning_rate": 0.0002, "epoch": 4.90534286916281, "step": 5830}, {"loss": 1.3748, "grad_norm": 0.9604470133781433, "learning_rate": 0.0002, "epoch": 4.913756836348338, "step": 5840}, {"loss": 1.3658, "grad_norm": 0.8916844129562378, "learning_rate": 0.0002, "epoch": 4.9221708035338665, "step": 5850}, {"loss": 1.363, "grad_norm": 0.8519647121429443, "learning_rate": 0.0002, "epoch": 4.930584770719394, "step": 5860}, {"loss": 1.424, "grad_norm": 0.7946906089782715, "learning_rate": 0.0002, "epoch": 4.938998737904922, "step": 5870}, {"loss": 1.4071, "grad_norm": 0.7843789458274841, "learning_rate": 0.0002, "epoch": 4.94741270509045, "step": 5880}, {"loss": 1.4021, "grad_norm": 0.707618772983551, "learning_rate": 0.0002, "epoch": 4.955826672275978, "step": 5890}, {"loss": 1.502, "grad_norm": 0.7704206109046936, "learning_rate": 0.0002, "epoch": 4.964240639461506, "step": 5900}, {"loss": 1.4456, "grad_norm": 0.7160256505012512, "learning_rate": 0.0002, "epoch": 4.972654606647034, "step": 5910}, {"loss": 1.3874, "grad_norm": 0.7020420432090759, "learning_rate": 0.0002, "epoch": 4.981068573832562, "step": 5920}, {"loss": 1.4037, "grad_norm": 0.7576286792755127, "learning_rate": 0.0002, "epoch": 4.98948254101809, "step": 5930}, {"loss": 1.414, "grad_norm": 0.8573036789894104, "learning_rate": 0.0002, "epoch": 4.997896508203618, "step": 5940}, {"eval_loss": 1.9353811740875244, "eval_runtime": 37.9208, "eval_samples_per_second": 13.581, "eval_steps_per_second": 1.714, "epoch": 4.999579301640724, "step": 5942}, {"loss": 1.2418, "grad_norm": 0.8204267621040344, "learning_rate": 0.0002, "epoch": 5.006310475389146, "step": 5950}, {"loss": 1.235, "grad_norm": 0.976840615272522, "learning_rate": 0.0002, "epoch": 5.014724442574674, "step": 5960}, {"loss": 1.2134, "grad_norm": 0.8765613436698914, "learning_rate": 0.0002, "epoch": 5.023138409760202, "step": 5970}, {"loss": 1.2748, "grad_norm": 1.1793042421340942, "learning_rate": 0.0002, "epoch": 5.03155237694573, "step": 5980}, {"loss": 1.2412, "grad_norm": 0.971062958240509, "learning_rate": 0.0002, "epoch": 5.039966344131258, "step": 5990}, {"loss": 1.1819, "grad_norm": 0.8649757504463196, "learning_rate": 0.0002, "epoch": 5.0483803113167856, "step": 6000}, {"loss": 1.1654, "grad_norm": 0.9563034176826477, "learning_rate": 0.0002, "epoch": 5.056794278502314, "step": 6010}, {"loss": 1.2238, "grad_norm": 1.0093994140625, "learning_rate": 0.0002, "epoch": 5.065208245687842, "step": 6020}, {"loss": 1.2519, "grad_norm": 1.004213571548462, "learning_rate": 0.0002, "epoch": 5.07362221287337, "step": 6030}, {"loss": 1.2379, "grad_norm": 0.8307787179946899, "learning_rate": 0.0002, "epoch": 5.082036180058898, "step": 6040}, {"loss": 1.2282, "grad_norm": 0.9117848873138428, "learning_rate": 0.0002, "epoch": 5.090450147244426, "step": 6050}, {"loss": 1.2582, "grad_norm": 1.0269840955734253, "learning_rate": 0.0002, "epoch": 5.098864114429953, "step": 6060}, {"loss": 1.1836, "grad_norm": 0.9079542756080627, "learning_rate": 0.0002, "epoch": 5.1072780816154815, "step": 6070}, {"loss": 1.215, "grad_norm": 0.885702908039093, "learning_rate": 0.0002, "epoch": 5.11569204880101, "step": 6080}, {"loss": 1.2406, "grad_norm": 0.9976128339767456, "learning_rate": 0.0002, "epoch": 5.124106015986538, "step": 6090}, {"loss": 1.3082, "grad_norm": 0.8472117185592651, "learning_rate": 0.0002, "epoch": 5.132519983172066, "step": 6100}, {"loss": 1.226, "grad_norm": 1.0385161638259888, "learning_rate": 0.0002, "epoch": 5.140933950357594, "step": 6110}, {"loss": 1.213, "grad_norm": 0.8948383927345276, "learning_rate": 0.0002, "epoch": 5.149347917543121, "step": 6120}, {"loss": 1.2213, "grad_norm": 1.2613716125488281, "learning_rate": 0.0002, "epoch": 5.157761884728649, "step": 6130}, {"loss": 1.2632, "grad_norm": 0.9933410286903381, "learning_rate": 0.0002, "epoch": 5.166175851914177, "step": 6140}, {"loss": 1.1715, "grad_norm": 0.9673663973808289, "learning_rate": 0.0002, "epoch": 5.1745898190997055, "step": 6150}, {"loss": 1.2947, "grad_norm": 0.9969648122787476, "learning_rate": 0.0002, "epoch": 5.183003786285234, "step": 6160}, {"loss": 1.2416, "grad_norm": 1.2163258790969849, "learning_rate": 0.0002, "epoch": 5.191417753470762, "step": 6170}, {"loss": 1.2221, "grad_norm": 0.9163419604301453, "learning_rate": 0.0002, "epoch": 5.19983172065629, "step": 6180}, {"loss": 1.2624, "grad_norm": 0.9225585460662842, "learning_rate": 0.0002, "epoch": 5.208245687841817, "step": 6190}, {"loss": 1.2932, "grad_norm": 0.9205296635627747, "learning_rate": 0.0002, "epoch": 5.216659655027345, "step": 6200}, {"loss": 1.1825, "grad_norm": 1.0655443668365479, "learning_rate": 0.0002, "epoch": 5.225073622212873, "step": 6210}, {"loss": 1.2613, "grad_norm": 1.0854865312576294, "learning_rate": 0.0002, "epoch": 5.233487589398401, "step": 6220}, {"loss": 1.3045, "grad_norm": 0.8489186763763428, "learning_rate": 0.0002, "epoch": 5.2419015565839295, "step": 6230}, {"loss": 1.2708, "grad_norm": 0.910391628742218, "learning_rate": 0.0002, "epoch": 5.250315523769458, "step": 6240}, {"loss": 1.1914, "grad_norm": 0.925507128238678, "learning_rate": 0.0002, "epoch": 5.258729490954985, "step": 6250}, {"loss": 1.3368, "grad_norm": 1.1069735288619995, "learning_rate": 0.0002, "epoch": 5.267143458140513, "step": 6260}, {"loss": 1.2505, "grad_norm": 0.9705119132995605, "learning_rate": 0.0002, "epoch": 5.275557425326041, "step": 6270}, {"loss": 1.2602, "grad_norm": 0.9752426147460938, "learning_rate": 0.0002, "epoch": 5.283971392511569, "step": 6280}, {"loss": 1.2043, "grad_norm": 1.021359920501709, "learning_rate": 0.0002, "epoch": 5.292385359697097, "step": 6290}, {"loss": 1.2848, "grad_norm": 1.148606300354004, "learning_rate": 0.0002, "epoch": 5.3007993268826255, "step": 6300}, {"loss": 1.2201, "grad_norm": 0.8909247517585754, "learning_rate": 0.0002, "epoch": 5.309213294068153, "step": 6310}, {"loss": 1.2376, "grad_norm": 0.9879156351089478, "learning_rate": 0.0002, "epoch": 5.317627261253681, "step": 6320}, {"loss": 1.2638, "grad_norm": 0.9473357200622559, "learning_rate": 0.0002, "epoch": 5.326041228439209, "step": 6330}, {"loss": 1.232, "grad_norm": 1.1422028541564941, "learning_rate": 0.0002, "epoch": 5.334455195624737, "step": 6340}, {"loss": 1.263, "grad_norm": 0.9942235350608826, "learning_rate": 0.0002, "epoch": 5.342869162810265, "step": 6350}, {"loss": 1.3032, "grad_norm": 0.9535723924636841, "learning_rate": 0.0002, "epoch": 5.351283129995793, "step": 6360}, {"loss": 1.2908, "grad_norm": 0.9020892381668091, "learning_rate": 0.0002, "epoch": 5.359697097181321, "step": 6370}, {"loss": 1.2023, "grad_norm": 1.0626472234725952, "learning_rate": 0.0002, "epoch": 5.368111064366849, "step": 6380}, {"loss": 1.2555, "grad_norm": 1.1395848989486694, "learning_rate": 0.0002, "epoch": 5.376525031552377, "step": 6390}, {"loss": 1.2839, "grad_norm": 0.9274451732635498, "learning_rate": 0.0002, "epoch": 5.384938998737905, "step": 6400}, {"loss": 1.2819, "grad_norm": 0.8108699917793274, "learning_rate": 0.0002, "epoch": 5.393352965923433, "step": 6410}, {"loss": 1.2589, "grad_norm": 1.1805564165115356, "learning_rate": 0.0002, "epoch": 5.401766933108961, "step": 6420}, {"loss": 1.3549, "grad_norm": 0.8321298360824585, "learning_rate": 0.0002, "epoch": 5.410180900294489, "step": 6430}, {"loss": 1.2925, "grad_norm": 0.8981925249099731, "learning_rate": 0.0002, "epoch": 5.418594867480017, "step": 6440}, {"loss": 1.258, "grad_norm": 1.0730986595153809, "learning_rate": 0.0002, "epoch": 5.4270088346655445, "step": 6450}, {"loss": 1.26, "grad_norm": 1.0584609508514404, "learning_rate": 0.0002, "epoch": 5.435422801851073, "step": 6460}, {"loss": 1.2847, "grad_norm": 1.0792299509048462, "learning_rate": 0.0002, "epoch": 5.443836769036601, "step": 6470}, {"loss": 1.2035, "grad_norm": 0.9101872444152832, "learning_rate": 0.0002, "epoch": 5.452250736222129, "step": 6480}, {"loss": 1.2574, "grad_norm": 0.9910100698471069, "learning_rate": 0.0002, "epoch": 5.460664703407657, "step": 6490}, {"loss": 1.3098, "grad_norm": 1.041412353515625, "learning_rate": 0.0002, "epoch": 5.469078670593185, "step": 6500}, {"loss": 1.2812, "grad_norm": 1.0091687440872192, "learning_rate": 0.0002, "epoch": 5.477492637778712, "step": 6510}, {"loss": 1.2523, "grad_norm": 0.8755383491516113, "learning_rate": 0.0002, "epoch": 5.48590660496424, "step": 6520}, {"loss": 1.3042, "grad_norm": 0.980212390422821, "learning_rate": 0.0002, "epoch": 5.4943205721497685, "step": 6530}, {"loss": 1.2873, "grad_norm": 0.9356869459152222, "learning_rate": 0.0002, "epoch": 5.502734539335297, "step": 6540}, {"loss": 1.2254, "grad_norm": 0.9008095264434814, "learning_rate": 0.0002, "epoch": 5.511148506520825, "step": 6550}, {"loss": 1.2818, "grad_norm": 0.8908938765525818, "learning_rate": 0.0002, "epoch": 5.519562473706353, "step": 6560}, {"loss": 1.2212, "grad_norm": 1.1423932313919067, "learning_rate": 0.0002, "epoch": 5.52797644089188, "step": 6570}, {"loss": 1.3039, "grad_norm": 1.0508161783218384, "learning_rate": 0.0002, "epoch": 5.536390408077408, "step": 6580}, {"loss": 1.2446, "grad_norm": 0.8357517719268799, "learning_rate": 0.0002, "epoch": 5.544804375262936, "step": 6590}, {"loss": 1.3037, "grad_norm": 0.9892540574073792, "learning_rate": 0.0002, "epoch": 5.5532183424484645, "step": 6600}, {"loss": 1.3028, "grad_norm": 1.0048326253890991, "learning_rate": 0.0002, "epoch": 5.561632309633993, "step": 6610}, {"loss": 1.2152, "grad_norm": 0.9801995158195496, "learning_rate": 0.0002, "epoch": 5.570046276819521, "step": 6620}, {"loss": 1.2606, "grad_norm": 0.9899214506149292, "learning_rate": 0.0002, "epoch": 5.578460244005049, "step": 6630}, {"loss": 1.2043, "grad_norm": 1.1911814212799072, "learning_rate": 0.0002, "epoch": 5.586874211190576, "step": 6640}, {"loss": 1.3458, "grad_norm": 1.0368894338607788, "learning_rate": 0.0002, "epoch": 5.595288178376104, "step": 6650}, {"loss": 1.2595, "grad_norm": 1.1248382329940796, "learning_rate": 0.0002, "epoch": 5.603702145561632, "step": 6660}, {"loss": 1.2548, "grad_norm": 0.9765539765357971, "learning_rate": 0.0002, "epoch": 5.61211611274716, "step": 6670}, {"loss": 1.3451, "grad_norm": 0.9810206890106201, "learning_rate": 0.0002, "epoch": 5.6205300799326885, "step": 6680}, {"loss": 1.2952, "grad_norm": 1.100386619567871, "learning_rate": 0.0002, "epoch": 5.628944047118217, "step": 6690}, {"loss": 1.2467, "grad_norm": 0.8824519515037537, "learning_rate": 0.0002, "epoch": 5.637358014303744, "step": 6700}, {"loss": 1.25, "grad_norm": 1.0864064693450928, "learning_rate": 0.0002, "epoch": 5.645771981489272, "step": 6710}, {"loss": 1.2479, "grad_norm": 1.1614511013031006, "learning_rate": 0.0002, "epoch": 5.6541859486748, "step": 6720}, {"loss": 1.2753, "grad_norm": 1.0762972831726074, "learning_rate": 0.0002, "epoch": 5.662599915860328, "step": 6730}, {"loss": 1.2741, "grad_norm": 0.9408974647521973, "learning_rate": 0.0002, "epoch": 5.671013883045856, "step": 6740}, {"loss": 1.2431, "grad_norm": 0.8906030058860779, "learning_rate": 0.0002, "epoch": 5.679427850231384, "step": 6750}, {"loss": 1.2643, "grad_norm": 0.9527303576469421, "learning_rate": 0.0002, "epoch": 5.687841817416912, "step": 6760}, {"loss": 1.322, "grad_norm": 0.9471196532249451, "learning_rate": 0.0002, "epoch": 5.69625578460244, "step": 6770}, {"loss": 1.2514, "grad_norm": 0.9186838865280151, "learning_rate": 0.0002, "epoch": 5.704669751787968, "step": 6780}, {"loss": 1.2347, "grad_norm": 0.9225441813468933, "learning_rate": 0.0002, "epoch": 5.713083718973496, "step": 6790}, {"loss": 1.1849, "grad_norm": 0.9712982773780823, "learning_rate": 0.0002, "epoch": 5.721497686159024, "step": 6800}, {"loss": 1.2431, "grad_norm": 1.0743170976638794, "learning_rate": 0.0002, "epoch": 5.729911653344552, "step": 6810}, {"loss": 1.2136, "grad_norm": 1.2738113403320312, "learning_rate": 0.0002, "epoch": 5.73832562053008, "step": 6820}, {"loss": 1.2176, "grad_norm": 0.9386790990829468, "learning_rate": 0.0002, "epoch": 5.7467395877156076, "step": 6830}, {"loss": 1.285, "grad_norm": 1.0817769765853882, "learning_rate": 0.0002, "epoch": 5.755153554901136, "step": 6840}, {"loss": 1.2247, "grad_norm": 1.1040263175964355, "learning_rate": 0.0002, "epoch": 5.763567522086664, "step": 6850}, {"loss": 1.2507, "grad_norm": 1.0656492710113525, "learning_rate": 0.0002, "epoch": 5.771981489272192, "step": 6860}, {"loss": 1.2999, "grad_norm": 0.9550157189369202, "learning_rate": 0.0002, "epoch": 5.78039545645772, "step": 6870}, {"loss": 1.3201, "grad_norm": 1.0130870342254639, "learning_rate": 0.0002, "epoch": 5.788809423643248, "step": 6880}, {"loss": 1.3392, "grad_norm": 1.0675787925720215, "learning_rate": 0.0002, "epoch": 5.797223390828776, "step": 6890}, {"loss": 1.2949, "grad_norm": 0.9537774920463562, "learning_rate": 0.0002, "epoch": 5.8056373580143035, "step": 6900}, {"loss": 1.2658, "grad_norm": 0.9640319347381592, "learning_rate": 0.0002, "epoch": 5.814051325199832, "step": 6910}, {"loss": 1.2199, "grad_norm": 0.8917992115020752, "learning_rate": 0.0002, "epoch": 5.82246529238536, "step": 6920}, {"loss": 1.373, "grad_norm": 0.9881822466850281, "learning_rate": 0.0002, "epoch": 5.830879259570888, "step": 6930}, {"loss": 1.323, "grad_norm": 0.9136882424354553, "learning_rate": 0.0002, "epoch": 5.839293226756416, "step": 6940}, {"loss": 1.3159, "grad_norm": 0.9086098074913025, "learning_rate": 0.0002, "epoch": 5.847707193941943, "step": 6950}, {"loss": 1.2624, "grad_norm": 0.9443018436431885, "learning_rate": 0.0002, "epoch": 5.856121161127471, "step": 6960}, {"loss": 1.3224, "grad_norm": 0.9915381669998169, "learning_rate": 0.0002, "epoch": 5.864535128312999, "step": 6970}, {"loss": 1.337, "grad_norm": 0.8939146995544434, "learning_rate": 0.0002, "epoch": 5.8729490954985275, "step": 6980}, {"loss": 1.2611, "grad_norm": 1.3672245740890503, "learning_rate": 0.0002, "epoch": 5.881363062684056, "step": 6990}, {"loss": 1.3012, "grad_norm": 1.0116257667541504, "learning_rate": 0.0002, "epoch": 5.889777029869584, "step": 7000}, {"loss": 1.3128, "grad_norm": 1.1561565399169922, "learning_rate": 0.0002, "epoch": 5.898190997055112, "step": 7010}, {"loss": 1.2301, "grad_norm": 0.9900678992271423, "learning_rate": 0.0002, "epoch": 5.906604964240639, "step": 7020}, {"loss": 1.2845, "grad_norm": 0.9297345876693726, "learning_rate": 0.0002, "epoch": 5.915018931426167, "step": 7030}, {"loss": 1.2317, "grad_norm": 0.9357825517654419, "learning_rate": 0.0002, "epoch": 5.923432898611695, "step": 7040}, {"loss": 1.2303, "grad_norm": 1.049317717552185, "learning_rate": 0.0002, "epoch": 5.931846865797223, "step": 7050}, {"loss": 1.3243, "grad_norm": 0.950633704662323, "learning_rate": 0.0002, "epoch": 5.9402608329827515, "step": 7060}, {"loss": 1.2758, "grad_norm": 0.854581892490387, "learning_rate": 0.0002, "epoch": 5.94867480016828, "step": 7070}, {"loss": 1.3252, "grad_norm": 0.9097039699554443, "learning_rate": 0.0002, "epoch": 5.957088767353808, "step": 7080}, {"loss": 1.291, "grad_norm": 0.9072173237800598, "learning_rate": 0.0002, "epoch": 5.965502734539335, "step": 7090}, {"loss": 1.2724, "grad_norm": 1.0470727682113647, "learning_rate": 0.0002, "epoch": 5.973916701724863, "step": 7100}, {"loss": 1.3324, "grad_norm": 1.2628462314605713, "learning_rate": 0.0002, "epoch": 5.982330668910391, "step": 7110}, {"loss": 1.2701, "grad_norm": 1.055279016494751, "learning_rate": 0.0002, "epoch": 5.990744636095919, "step": 7120}, {"loss": 1.3234, "grad_norm": 0.966194212436676, "learning_rate": 0.0002, "epoch": 5.9991586032814475, "step": 7130}, {"eval_loss": 2.0427448749542236, "eval_runtime": 37.8426, "eval_samples_per_second": 13.609, "eval_steps_per_second": 1.718, "epoch": 6.0, "step": 7131}, {"loss": 1.1308, "grad_norm": 1.4037928581237793, "learning_rate": 0.0002, "epoch": 6.007572570466976, "step": 7140}, {"loss": 1.047, "grad_norm": 1.1081010103225708, "learning_rate": 0.0002, "epoch": 6.015986537652503, "step": 7150}, {"loss": 1.1368, "grad_norm": 1.1585499048233032, "learning_rate": 0.0002, "epoch": 6.024400504838031, "step": 7160}, {"loss": 1.0192, "grad_norm": 1.0822780132293701, "learning_rate": 0.0002, "epoch": 6.032814472023559, "step": 7170}, {"loss": 1.0755, "grad_norm": 0.9662094712257385, "learning_rate": 0.0002, "epoch": 6.041228439209087, "step": 7180}, {"loss": 1.1366, "grad_norm": 1.063936710357666, "learning_rate": 0.0002, "epoch": 6.049642406394615, "step": 7190}, {"loss": 1.0121, "grad_norm": 1.0349032878875732, "learning_rate": 0.0002, "epoch": 6.058056373580143, "step": 7200}, {"loss": 1.0591, "grad_norm": 1.0312575101852417, "learning_rate": 0.0002, "epoch": 6.066470340765671, "step": 7210}, {"loss": 1.1824, "grad_norm": 1.1942846775054932, "learning_rate": 0.0002, "epoch": 6.074884307951199, "step": 7220}, {"loss": 1.1034, "grad_norm": 1.0816049575805664, "learning_rate": 0.0002, "epoch": 6.083298275136727, "step": 7230}, {"loss": 1.0859, "grad_norm": 0.9985513687133789, "learning_rate": 0.0002, "epoch": 6.091712242322255, "step": 7240}, {"loss": 1.0367, "grad_norm": 1.2573972940444946, "learning_rate": 0.0002, "epoch": 6.100126209507783, "step": 7250}, {"loss": 1.1051, "grad_norm": 1.1182395219802856, "learning_rate": 0.0002, "epoch": 6.108540176693311, "step": 7260}, {"loss": 1.1219, "grad_norm": 0.9679344296455383, "learning_rate": 0.0002, "epoch": 6.116954143878839, "step": 7270}, {"loss": 1.1192, "grad_norm": 1.0913981199264526, "learning_rate": 0.0002, "epoch": 6.1253681110643665, "step": 7280}, {"loss": 1.0411, "grad_norm": 1.1291013956069946, "learning_rate": 0.0002, "epoch": 6.133782078249895, "step": 7290}, {"loss": 1.0963, "grad_norm": 1.2679595947265625, "learning_rate": 0.0002, "epoch": 6.142196045435423, "step": 7300}, {"loss": 1.0875, "grad_norm": 1.2350026369094849, "learning_rate": 0.0002, "epoch": 6.150610012620951, "step": 7310}, {"loss": 1.1139, "grad_norm": 1.3213104009628296, "learning_rate": 0.0002, "epoch": 6.159023979806479, "step": 7320}, {"loss": 1.1167, "grad_norm": 1.1924850940704346, "learning_rate": 0.0002, "epoch": 6.167437946992007, "step": 7330}, {"loss": 1.1242, "grad_norm": 1.1890000104904175, "learning_rate": 0.0002, "epoch": 6.175851914177534, "step": 7340}, {"loss": 1.1341, "grad_norm": 1.3821455240249634, "learning_rate": 0.0002, "epoch": 6.184265881363062, "step": 7350}, {"loss": 1.0748, "grad_norm": 1.1217057704925537, "learning_rate": 0.0002, "epoch": 6.1926798485485905, "step": 7360}, {"loss": 1.159, "grad_norm": 1.2441548109054565, "learning_rate": 0.0002, "epoch": 6.201093815734119, "step": 7370}, {"loss": 1.1199, "grad_norm": 1.0837615728378296, "learning_rate": 0.0002, "epoch": 6.209507782919647, "step": 7380}, {"loss": 1.1641, "grad_norm": 1.164304256439209, "learning_rate": 0.0002, "epoch": 6.217921750105175, "step": 7390}, {"loss": 1.1325, "grad_norm": 1.3129467964172363, "learning_rate": 0.0002, "epoch": 6.226335717290702, "step": 7400}, {"loss": 1.1537, "grad_norm": 1.1938153505325317, "learning_rate": 0.0002, "epoch": 6.23474968447623, "step": 7410}, {"loss": 1.1238, "grad_norm": 1.4348443746566772, "learning_rate": 0.0002, "epoch": 6.243163651661758, "step": 7420}, {"loss": 1.0778, "grad_norm": 1.132301926612854, "learning_rate": 0.0002, "epoch": 6.2515776188472865, "step": 7430}, {"loss": 1.1148, "grad_norm": 1.136966586112976, "learning_rate": 0.0002, "epoch": 6.259991586032815, "step": 7440}, {"loss": 1.096, "grad_norm": 1.12801194190979, "learning_rate": 0.0002, "epoch": 6.268405553218343, "step": 7450}, {"loss": 1.0408, "grad_norm": 1.0246902704238892, "learning_rate": 0.0002, "epoch": 6.276819520403871, "step": 7460}, {"loss": 1.0389, "grad_norm": 1.1066974401474, "learning_rate": 0.0002, "epoch": 6.285233487589398, "step": 7470}, {"loss": 1.1589, "grad_norm": 1.012710690498352, "learning_rate": 0.0002, "epoch": 6.293647454774926, "step": 7480}, {"loss": 1.1049, "grad_norm": 1.2227119207382202, "learning_rate": 0.0002, "epoch": 6.302061421960454, "step": 7490}, {"loss": 1.1376, "grad_norm": 0.9736923575401306, "learning_rate": 0.0002, "epoch": 6.310475389145982, "step": 7500}, {"loss": 1.1017, "grad_norm": 1.2945268154144287, "learning_rate": 0.0002, "epoch": 6.3188893563315105, "step": 7510}, {"loss": 1.0724, "grad_norm": 1.1579312086105347, "learning_rate": 0.0002, "epoch": 6.327303323517039, "step": 7520}, {"loss": 1.0899, "grad_norm": 1.2404558658599854, "learning_rate": 0.0002, "epoch": 6.335717290702567, "step": 7530}, {"loss": 1.1635, "grad_norm": 1.4673258066177368, "learning_rate": 0.0002, "epoch": 6.344131257888094, "step": 7540}, {"loss": 1.128, "grad_norm": 1.2268997430801392, "learning_rate": 0.0002, "epoch": 6.352545225073622, "step": 7550}, {"loss": 1.0932, "grad_norm": 0.9772747159004211, "learning_rate": 0.0002, "epoch": 6.36095919225915, "step": 7560}, {"loss": 1.1214, "grad_norm": 1.0205204486846924, "learning_rate": 0.0002, "epoch": 6.369373159444678, "step": 7570}, {"loss": 1.1095, "grad_norm": 1.2227109670639038, "learning_rate": 0.0002, "epoch": 6.377787126630206, "step": 7580}, {"loss": 1.1115, "grad_norm": 1.0708507299423218, "learning_rate": 0.0002, "epoch": 6.3862010938157345, "step": 7590}, {"loss": 1.1018, "grad_norm": 1.1427522897720337, "learning_rate": 0.0002, "epoch": 6.394615061001262, "step": 7600}, {"loss": 1.1079, "grad_norm": 1.0706431865692139, "learning_rate": 0.0002, "epoch": 6.40302902818679, "step": 7610}, {"loss": 1.0933, "grad_norm": 1.1358282566070557, "learning_rate": 0.0002, "epoch": 6.411442995372318, "step": 7620}, {"loss": 1.1075, "grad_norm": 1.4011822938919067, "learning_rate": 0.0002, "epoch": 6.419856962557846, "step": 7630}, {"loss": 1.1269, "grad_norm": 1.5616450309753418, "learning_rate": 0.0002, "epoch": 6.428270929743374, "step": 7640}, {"loss": 1.0953, "grad_norm": 1.1442687511444092, "learning_rate": 0.0002, "epoch": 6.436684896928902, "step": 7650}, {"loss": 1.1341, "grad_norm": 1.164803147315979, "learning_rate": 0.0002, "epoch": 6.44509886411443, "step": 7660}, {"loss": 1.14, "grad_norm": 1.3184553384780884, "learning_rate": 0.0002, "epoch": 6.453512831299958, "step": 7670}, {"loss": 1.1526, "grad_norm": 1.2701894044876099, "learning_rate": 0.0002, "epoch": 6.461926798485486, "step": 7680}, {"loss": 1.2119, "grad_norm": 1.1998416185379028, "learning_rate": 0.0002, "epoch": 6.470340765671014, "step": 7690}, {"loss": 1.1528, "grad_norm": 1.156459927558899, "learning_rate": 0.0002, "epoch": 6.478754732856542, "step": 7700}, {"loss": 1.2122, "grad_norm": 1.0217190980911255, "learning_rate": 0.0002, "epoch": 6.48716870004207, "step": 7710}, {"loss": 1.0917, "grad_norm": 1.230372428894043, "learning_rate": 0.0002, "epoch": 6.495582667227598, "step": 7720}, {"loss": 1.119, "grad_norm": 1.105675220489502, "learning_rate": 0.0002, "epoch": 6.5039966344131255, "step": 7730}, {"loss": 1.0758, "grad_norm": 1.1623669862747192, "learning_rate": 0.0002, "epoch": 6.512410601598654, "step": 7740}, {"loss": 1.1548, "grad_norm": 1.2884684801101685, "learning_rate": 0.0002, "epoch": 6.520824568784182, "step": 7750}, {"loss": 1.142, "grad_norm": 1.1785279512405396, "learning_rate": 0.0002, "epoch": 6.52923853596971, "step": 7760}, {"loss": 1.1598, "grad_norm": 1.0607101917266846, "learning_rate": 0.0002, "epoch": 6.537652503155238, "step": 7770}, {"loss": 1.1472, "grad_norm": 1.21990168094635, "learning_rate": 0.0002, "epoch": 6.546066470340766, "step": 7780}, {"loss": 1.1468, "grad_norm": 1.1498621702194214, "learning_rate": 0.0002, "epoch": 6.554480437526293, "step": 7790}, {"loss": 1.1847, "grad_norm": 1.263929009437561, "learning_rate": 0.0002, "epoch": 6.562894404711821, "step": 7800}, {"loss": 1.1177, "grad_norm": 1.1580625772476196, "learning_rate": 0.0002, "epoch": 6.5713083718973495, "step": 7810}, {"loss": 1.1313, "grad_norm": 1.4431294202804565, "learning_rate": 0.0002, "epoch": 6.579722339082878, "step": 7820}, {"loss": 1.1944, "grad_norm": 1.1309990882873535, "learning_rate": 0.0002, "epoch": 6.588136306268406, "step": 7830}, {"loss": 1.1156, "grad_norm": 1.0543386936187744, "learning_rate": 0.0002, "epoch": 6.596550273453934, "step": 7840}, {"loss": 1.0945, "grad_norm": 1.2180639505386353, "learning_rate": 0.0002, "epoch": 6.604964240639461, "step": 7850}, {"loss": 1.1318, "grad_norm": 1.0631271600723267, "learning_rate": 0.0002, "epoch": 6.613378207824989, "step": 7860}, {"loss": 1.1792, "grad_norm": 1.138885498046875, "learning_rate": 0.0002, "epoch": 6.621792175010517, "step": 7870}, {"loss": 1.1805, "grad_norm": 1.1117745637893677, "learning_rate": 0.0002, "epoch": 6.630206142196045, "step": 7880}, {"loss": 1.15, "grad_norm": 1.3734886646270752, "learning_rate": 0.0002, "epoch": 6.6386201093815735, "step": 7890}, {"loss": 1.1584, "grad_norm": 1.236003041267395, "learning_rate": 0.0002, "epoch": 6.647034076567102, "step": 7900}, {"loss": 1.1718, "grad_norm": 1.2206000089645386, "learning_rate": 0.0002, "epoch": 6.65544804375263, "step": 7910}, {"loss": 1.1637, "grad_norm": 1.2842656373977661, "learning_rate": 0.0002, "epoch": 6.663862010938157, "step": 7920}, {"loss": 1.2219, "grad_norm": 1.2365005016326904, "learning_rate": 0.0002, "epoch": 6.672275978123685, "step": 7930}, {"loss": 1.0827, "grad_norm": 1.256620168685913, "learning_rate": 0.0002, "epoch": 6.680689945309213, "step": 7940}, {"loss": 1.1788, "grad_norm": 1.3232917785644531, "learning_rate": 0.0002, "epoch": 6.689103912494741, "step": 7950}, {"loss": 1.2042, "grad_norm": 1.2470088005065918, "learning_rate": 0.0002, "epoch": 6.6975178796802695, "step": 7960}, {"loss": 1.0959, "grad_norm": 1.0511926412582397, "learning_rate": 0.0002, "epoch": 6.705931846865798, "step": 7970}, {"loss": 1.118, "grad_norm": 1.107310175895691, "learning_rate": 0.0002, "epoch": 6.714345814051326, "step": 7980}, {"loss": 1.2109, "grad_norm": 1.4069843292236328, "learning_rate": 0.0002, "epoch": 6.722759781236853, "step": 7990}, {"loss": 1.1298, "grad_norm": 1.0800836086273193, "learning_rate": 0.0002, "epoch": 6.731173748422381, "step": 8000}, {"loss": 1.1824, "grad_norm": 1.1676300764083862, "learning_rate": 0.0002, "epoch": 6.739587715607909, "step": 8010}, {"loss": 1.1253, "grad_norm": 1.0579663515090942, "learning_rate": 0.0002, "epoch": 6.748001682793437, "step": 8020}, {"loss": 1.1542, "grad_norm": 1.2770029306411743, "learning_rate": 0.0002, "epoch": 6.756415649978965, "step": 8030}, {"loss": 1.1519, "grad_norm": 1.0981038808822632, "learning_rate": 0.0002, "epoch": 6.764829617164493, "step": 8040}, {"loss": 1.1422, "grad_norm": 1.1194742918014526, "learning_rate": 0.0002, "epoch": 6.773243584350021, "step": 8050}, {"loss": 1.1463, "grad_norm": 1.0130012035369873, "learning_rate": 0.0002, "epoch": 6.781657551535549, "step": 8060}, {"loss": 1.2008, "grad_norm": 1.2051167488098145, "learning_rate": 0.0002, "epoch": 6.790071518721077, "step": 8070}, {"loss": 1.142, "grad_norm": 1.095689058303833, "learning_rate": 0.0002, "epoch": 6.798485485906605, "step": 8080}, {"loss": 1.1352, "grad_norm": 1.2275174856185913, "learning_rate": 0.0002, "epoch": 6.806899453092133, "step": 8090}, {"loss": 1.1453, "grad_norm": 1.1439805030822754, "learning_rate": 0.0002, "epoch": 6.815313420277661, "step": 8100}, {"loss": 1.1624, "grad_norm": 1.276331901550293, "learning_rate": 0.0002, "epoch": 6.8237273874631885, "step": 8110}, {"loss": 1.1686, "grad_norm": 1.0450139045715332, "learning_rate": 0.0002, "epoch": 6.832141354648717, "step": 8120}, {"loss": 1.1783, "grad_norm": 1.1189453601837158, "learning_rate": 0.0002, "epoch": 6.840555321834245, "step": 8130}, {"loss": 1.1093, "grad_norm": 1.194640874862671, "learning_rate": 0.0002, "epoch": 6.848969289019773, "step": 8140}, {"loss": 1.1559, "grad_norm": 1.095372200012207, "learning_rate": 0.0002, "epoch": 6.857383256205301, "step": 8150}, {"loss": 1.165, "grad_norm": 1.2416104078292847, "learning_rate": 0.0002, "epoch": 6.865797223390829, "step": 8160}, {"loss": 1.2174, "grad_norm": 1.2402868270874023, "learning_rate": 0.0002, "epoch": 6.874211190576357, "step": 8170}, {"loss": 1.1306, "grad_norm": 1.1317291259765625, "learning_rate": 0.0002, "epoch": 6.882625157761884, "step": 8180}, {"loss": 1.1944, "grad_norm": 1.0581914186477661, "learning_rate": 0.0002, "epoch": 6.8910391249474126, "step": 8190}, {"loss": 1.1271, "grad_norm": 1.3540890216827393, "learning_rate": 0.0002, "epoch": 6.899453092132941, "step": 8200}, {"loss": 1.2119, "grad_norm": 1.213672399520874, "learning_rate": 0.0002, "epoch": 6.907867059318469, "step": 8210}, {"loss": 1.1406, "grad_norm": 1.2654485702514648, "learning_rate": 0.0002, "epoch": 6.916281026503997, "step": 8220}, {"loss": 1.205, "grad_norm": 1.203903317451477, "learning_rate": 0.0002, "epoch": 6.924694993689524, "step": 8230}, {"loss": 1.1635, "grad_norm": 1.1332030296325684, "learning_rate": 0.0002, "epoch": 6.933108960875052, "step": 8240}, {"loss": 1.1148, "grad_norm": 1.2699192762374878, "learning_rate": 0.0002, "epoch": 6.94152292806058, "step": 8250}, {"loss": 1.1831, "grad_norm": 1.2728958129882812, "learning_rate": 0.0002, "epoch": 6.9499368952461085, "step": 8260}, {"loss": 1.1757, "grad_norm": 1.238410472869873, "learning_rate": 0.0002, "epoch": 6.958350862431637, "step": 8270}, {"loss": 1.1499, "grad_norm": 1.403863549232483, "learning_rate": 0.0002, "epoch": 6.966764829617165, "step": 8280}, {"loss": 1.1515, "grad_norm": 1.1096396446228027, "learning_rate": 0.0002, "epoch": 6.975178796802693, "step": 8290}, {"loss": 1.2049, "grad_norm": 1.1043379306793213, "learning_rate": 0.0002, "epoch": 6.98359276398822, "step": 8300}, {"loss": 1.1255, "grad_norm": 1.391754388809204, "learning_rate": 0.0002, "epoch": 6.992006731173748, "step": 8310}]} +{"epoch": 7.996634413125789, "step": 9504, "epoch_duration": 1245.9935760498047, "total_accumulated_duration": 11061.152487277985, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-42/checkpoint-2377", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.56, "grad_norm": 0.5458821654319763, "learning_rate": 0.0002, "epoch": 0.008413967185527976, "step": 10}, {"loss": 2.3235, "grad_norm": 0.7293308973312378, "learning_rate": 0.0002, "epoch": 0.016827934371055953, "step": 20}, {"loss": 2.0815, "grad_norm": 0.47792306542396545, "learning_rate": 0.0002, "epoch": 0.02524190155658393, "step": 30}, {"loss": 1.9718, "grad_norm": 0.5944402813911438, "learning_rate": 0.0002, "epoch": 0.033655868742111905, "step": 40}, {"loss": 1.8848, "grad_norm": 0.5415359735488892, "learning_rate": 0.0002, "epoch": 0.04206983592763988, "step": 50}, {"loss": 1.8953, "grad_norm": 0.535713791847229, "learning_rate": 0.0002, "epoch": 0.05048380311316786, "step": 60}, {"loss": 1.937, "grad_norm": 0.5184146761894226, "learning_rate": 0.0002, "epoch": 0.058897770298695834, "step": 70}, {"loss": 1.8396, "grad_norm": 0.458926796913147, "learning_rate": 0.0002, "epoch": 0.06731173748422381, "step": 80}, {"loss": 1.8677, "grad_norm": 0.4780142307281494, "learning_rate": 0.0002, "epoch": 0.07572570466975179, "step": 90}, {"loss": 1.8593, "grad_norm": 0.79965740442276, "learning_rate": 0.0002, "epoch": 0.08413967185527976, "step": 100}, {"loss": 1.9081, "grad_norm": 0.4498862028121948, "learning_rate": 0.0002, "epoch": 0.09255363904080774, "step": 110}, {"loss": 1.8503, "grad_norm": 0.39338430762290955, "learning_rate": 0.0002, "epoch": 0.10096760622633572, "step": 120}, {"loss": 1.8637, "grad_norm": 0.9588953852653503, "learning_rate": 0.0002, "epoch": 0.10938157341186369, "step": 130}, {"loss": 1.8676, "grad_norm": 0.41675639152526855, "learning_rate": 0.0002, "epoch": 0.11779554059739167, "step": 140}, {"loss": 1.8904, "grad_norm": 0.44519832730293274, "learning_rate": 0.0002, "epoch": 0.12620950778291964, "step": 150}, {"loss": 1.798, "grad_norm": 0.4176260530948639, "learning_rate": 0.0002, "epoch": 0.13462347496844762, "step": 160}, {"loss": 1.8398, "grad_norm": 0.35840365290641785, "learning_rate": 0.0002, "epoch": 0.1430374421539756, "step": 170}, {"loss": 1.8666, "grad_norm": 0.3794495463371277, "learning_rate": 0.0002, "epoch": 0.15145140933950357, "step": 180}, {"loss": 1.8111, "grad_norm": 0.4563522934913635, "learning_rate": 0.0002, "epoch": 0.15986537652503155, "step": 190}, {"loss": 1.8893, "grad_norm": 0.37057486176490784, "learning_rate": 0.0002, "epoch": 0.16827934371055953, "step": 200}, {"loss": 1.7995, "grad_norm": 0.44081518054008484, "learning_rate": 0.0002, "epoch": 0.1766933108960875, "step": 210}, {"loss": 1.9048, "grad_norm": 0.46078577637672424, "learning_rate": 0.0002, "epoch": 0.18510727808161548, "step": 220}, {"loss": 1.8403, "grad_norm": 0.36132094264030457, "learning_rate": 0.0002, "epoch": 0.19352124526714345, "step": 230}, {"loss": 1.8827, "grad_norm": 0.3747289180755615, "learning_rate": 0.0002, "epoch": 0.20193521245267143, "step": 240}, {"loss": 1.8382, "grad_norm": 0.3540179133415222, "learning_rate": 0.0002, "epoch": 0.2103491796381994, "step": 250}, {"loss": 1.8196, "grad_norm": 0.3461375832557678, "learning_rate": 0.0002, "epoch": 0.21876314682372738, "step": 260}, {"loss": 1.8509, "grad_norm": 0.3436960279941559, "learning_rate": 0.0002, "epoch": 0.22717711400925536, "step": 270}, {"loss": 1.8285, "grad_norm": 0.35403719544410706, "learning_rate": 0.0002, "epoch": 0.23559108119478334, "step": 280}, {"loss": 1.8369, "grad_norm": 0.37142616510391235, "learning_rate": 0.0002, "epoch": 0.2440050483803113, "step": 290}, {"loss": 1.8044, "grad_norm": 0.3307955861091614, "learning_rate": 0.0002, "epoch": 0.2524190155658393, "step": 300}, {"loss": 1.817, "grad_norm": 0.32855314016342163, "learning_rate": 0.0002, "epoch": 0.2608329827513673, "step": 310}, {"loss": 1.7803, "grad_norm": 0.3299003839492798, "learning_rate": 0.0002, "epoch": 0.26924694993689524, "step": 320}, {"loss": 1.8129, "grad_norm": 0.44311287999153137, "learning_rate": 0.0002, "epoch": 0.27766091712242325, "step": 330}, {"loss": 1.8232, "grad_norm": 0.32989758253097534, "learning_rate": 0.0002, "epoch": 0.2860748843079512, "step": 340}, {"loss": 1.7716, "grad_norm": 0.34400200843811035, "learning_rate": 0.0002, "epoch": 0.2944888514934792, "step": 350}, {"loss": 1.7619, "grad_norm": 0.36286211013793945, "learning_rate": 0.0002, "epoch": 0.30290281867900715, "step": 360}, {"loss": 1.8025, "grad_norm": 0.406827837228775, "learning_rate": 0.0002, "epoch": 0.31131678586453515, "step": 370}, {"loss": 1.7515, "grad_norm": 0.36299195885658264, "learning_rate": 0.0002, "epoch": 0.3197307530500631, "step": 380}, {"loss": 1.837, "grad_norm": 0.3477257192134857, "learning_rate": 0.0002, "epoch": 0.3281447202355911, "step": 390}, {"loss": 1.7767, "grad_norm": 0.3730369210243225, "learning_rate": 0.0002, "epoch": 0.33655868742111905, "step": 400}, {"loss": 1.7747, "grad_norm": 0.4644559919834137, "learning_rate": 0.0002, "epoch": 0.34497265460664706, "step": 410}, {"loss": 1.7538, "grad_norm": 0.406576544046402, "learning_rate": 0.0002, "epoch": 0.353386621792175, "step": 420}, {"loss": 1.7501, "grad_norm": 0.3612699508666992, "learning_rate": 0.0002, "epoch": 0.361800588977703, "step": 430}, {"loss": 1.7473, "grad_norm": 0.3243742287158966, "learning_rate": 0.0002, "epoch": 0.37021455616323096, "step": 440}, {"loss": 1.8851, "grad_norm": 0.36671221256256104, "learning_rate": 0.0002, "epoch": 0.37862852334875896, "step": 450}, {"loss": 1.8853, "grad_norm": 0.3565002381801605, "learning_rate": 0.0002, "epoch": 0.3870424905342869, "step": 460}, {"loss": 1.8923, "grad_norm": 0.34630221128463745, "learning_rate": 0.0002, "epoch": 0.3954564577198149, "step": 470}, {"loss": 1.8234, "grad_norm": 0.3353537321090698, "learning_rate": 0.0002, "epoch": 0.40387042490534286, "step": 480}, {"loss": 1.7135, "grad_norm": 0.4015921950340271, "learning_rate": 0.0002, "epoch": 0.41228439209087087, "step": 490}, {"loss": 1.7815, "grad_norm": 0.5489419102668762, "learning_rate": 0.0002, "epoch": 0.4206983592763988, "step": 500}, {"loss": 1.7903, "grad_norm": 0.4193589985370636, "learning_rate": 0.0002, "epoch": 0.4291123264619268, "step": 510}, {"loss": 1.8416, "grad_norm": 0.3418922424316406, "learning_rate": 0.0002, "epoch": 0.43752629364745477, "step": 520}, {"loss": 1.7982, "grad_norm": 0.32668185234069824, "learning_rate": 0.0002, "epoch": 0.44594026083298277, "step": 530}, {"loss": 1.7501, "grad_norm": 0.3094325661659241, "learning_rate": 0.0002, "epoch": 0.4543542280185107, "step": 540}, {"loss": 1.7438, "grad_norm": 0.3743017315864563, "learning_rate": 0.0002, "epoch": 0.4627681952040387, "step": 550}, {"loss": 1.8451, "grad_norm": 0.3295630216598511, "learning_rate": 0.0002, "epoch": 0.47118216238956667, "step": 560}, {"loss": 1.7529, "grad_norm": 1.6124513149261475, "learning_rate": 0.0002, "epoch": 0.4795961295750947, "step": 570}, {"loss": 1.8028, "grad_norm": 0.3245585858821869, "learning_rate": 0.0002, "epoch": 0.4880100967606226, "step": 580}, {"loss": 1.7976, "grad_norm": 0.3332934081554413, "learning_rate": 0.0002, "epoch": 0.49642406394615063, "step": 590}, {"loss": 1.7912, "grad_norm": 0.3836138844490051, "learning_rate": 0.0002, "epoch": 0.5048380311316786, "step": 600}, {"loss": 1.8347, "grad_norm": 0.32953888177871704, "learning_rate": 0.0002, "epoch": 0.5132519983172066, "step": 610}, {"loss": 1.7729, "grad_norm": 0.36291512846946716, "learning_rate": 0.0002, "epoch": 0.5216659655027346, "step": 620}, {"loss": 1.7758, "grad_norm": 0.3237783908843994, "learning_rate": 0.0002, "epoch": 0.5300799326882625, "step": 630}, {"loss": 1.8352, "grad_norm": 0.38882696628570557, "learning_rate": 0.0002, "epoch": 0.5384938998737905, "step": 640}, {"loss": 1.8624, "grad_norm": 0.37821972370147705, "learning_rate": 0.0002, "epoch": 0.5469078670593185, "step": 650}, {"loss": 1.8075, "grad_norm": 0.3556285500526428, "learning_rate": 0.0002, "epoch": 0.5553218342448465, "step": 660}, {"loss": 1.778, "grad_norm": 0.347499281167984, "learning_rate": 0.0002, "epoch": 0.5637358014303744, "step": 670}, {"loss": 1.8066, "grad_norm": 0.3176489472389221, "learning_rate": 0.0002, "epoch": 0.5721497686159024, "step": 680}, {"loss": 1.7257, "grad_norm": 0.30220088362693787, "learning_rate": 0.0002, "epoch": 0.5805637358014304, "step": 690}, {"loss": 1.8415, "grad_norm": 0.3711601793766022, "learning_rate": 0.0002, "epoch": 0.5889777029869584, "step": 700}, {"loss": 1.7906, "grad_norm": 0.3311759829521179, "learning_rate": 0.0002, "epoch": 0.5973916701724863, "step": 710}, {"loss": 1.7712, "grad_norm": 0.34824270009994507, "learning_rate": 0.0002, "epoch": 0.6058056373580143, "step": 720}, {"loss": 1.7954, "grad_norm": 0.29668381810188293, "learning_rate": 0.0002, "epoch": 0.6142196045435423, "step": 730}, {"loss": 1.8321, "grad_norm": 0.36087489128112793, "learning_rate": 0.0002, "epoch": 0.6226335717290703, "step": 740}, {"loss": 1.7956, "grad_norm": 0.31590089201927185, "learning_rate": 0.0002, "epoch": 0.6310475389145982, "step": 750}, {"loss": 1.7343, "grad_norm": 0.37632957100868225, "learning_rate": 0.0002, "epoch": 0.6394615061001262, "step": 760}, {"loss": 1.8499, "grad_norm": 0.3360748589038849, "learning_rate": 0.0002, "epoch": 0.6478754732856542, "step": 770}, {"loss": 1.8076, "grad_norm": 0.3420640528202057, "learning_rate": 0.0002, "epoch": 0.6562894404711822, "step": 780}, {"loss": 1.8353, "grad_norm": 0.5734959244728088, "learning_rate": 0.0002, "epoch": 0.6647034076567101, "step": 790}, {"loss": 1.7746, "grad_norm": 0.36440837383270264, "learning_rate": 0.0002, "epoch": 0.6731173748422381, "step": 800}, {"loss": 1.7532, "grad_norm": 0.3179708421230316, "learning_rate": 0.0002, "epoch": 0.6815313420277661, "step": 810}, {"loss": 1.7815, "grad_norm": 0.34122881293296814, "learning_rate": 0.0002, "epoch": 0.6899453092132941, "step": 820}, {"loss": 1.8167, "grad_norm": 0.31886112689971924, "learning_rate": 0.0002, "epoch": 0.698359276398822, "step": 830}, {"loss": 1.7505, "grad_norm": 0.31782326102256775, "learning_rate": 0.0002, "epoch": 0.70677324358435, "step": 840}, {"loss": 1.7588, "grad_norm": 0.36052989959716797, "learning_rate": 0.0002, "epoch": 0.715187210769878, "step": 850}, {"loss": 1.7891, "grad_norm": 0.28946155309677124, "learning_rate": 0.0002, "epoch": 0.723601177955406, "step": 860}, {"loss": 1.7923, "grad_norm": 0.3095663785934448, "learning_rate": 0.0002, "epoch": 0.7320151451409339, "step": 870}, {"loss": 1.785, "grad_norm": 0.3317491412162781, "learning_rate": 0.0002, "epoch": 0.7404291123264619, "step": 880}, {"loss": 1.7709, "grad_norm": 0.31324660778045654, "learning_rate": 0.0002, "epoch": 0.7488430795119899, "step": 890}, {"loss": 1.8753, "grad_norm": 0.3290475606918335, "learning_rate": 0.0002, "epoch": 0.7572570466975179, "step": 900}, {"loss": 1.7679, "grad_norm": 0.35690343379974365, "learning_rate": 0.0002, "epoch": 0.7656710138830458, "step": 910}, {"loss": 1.826, "grad_norm": 0.39558273553848267, "learning_rate": 0.0002, "epoch": 0.7740849810685738, "step": 920}, {"loss": 1.8722, "grad_norm": 0.34254348278045654, "learning_rate": 0.0002, "epoch": 0.7824989482541018, "step": 930}, {"loss": 1.7603, "grad_norm": 0.3560165464878082, "learning_rate": 0.0002, "epoch": 0.7909129154396298, "step": 940}, {"loss": 1.7992, "grad_norm": 0.30693164467811584, "learning_rate": 0.0002, "epoch": 0.7993268826251577, "step": 950}, {"loss": 1.8029, "grad_norm": 0.3394823372364044, "learning_rate": 0.0002, "epoch": 0.8077408498106857, "step": 960}, {"loss": 1.8105, "grad_norm": 0.3741514980792999, "learning_rate": 0.0002, "epoch": 0.8161548169962137, "step": 970}, {"loss": 1.7849, "grad_norm": 0.3655228316783905, "learning_rate": 0.0002, "epoch": 0.8245687841817417, "step": 980}, {"loss": 1.8449, "grad_norm": 0.3586033880710602, "learning_rate": 0.0002, "epoch": 0.8329827513672696, "step": 990}, {"loss": 1.7033, "grad_norm": 0.3459678888320923, "learning_rate": 0.0002, "epoch": 0.8413967185527976, "step": 1000}, {"loss": 1.8498, "grad_norm": 0.3184349834918976, "learning_rate": 0.0002, "epoch": 0.8498106857383256, "step": 1010}, {"loss": 1.7632, "grad_norm": 0.3099786043167114, "learning_rate": 0.0002, "epoch": 0.8582246529238536, "step": 1020}, {"loss": 1.8067, "grad_norm": 0.30300915241241455, "learning_rate": 0.0002, "epoch": 0.8666386201093815, "step": 1030}, {"loss": 1.7923, "grad_norm": 0.3128705620765686, "learning_rate": 0.0002, "epoch": 0.8750525872949095, "step": 1040}, {"loss": 1.8252, "grad_norm": 0.3336263597011566, "learning_rate": 0.0002, "epoch": 0.8834665544804375, "step": 1050}, {"loss": 1.8375, "grad_norm": 0.3801328241825104, "learning_rate": 0.0002, "epoch": 0.8918805216659655, "step": 1060}, {"loss": 1.7757, "grad_norm": 0.3122096359729767, "learning_rate": 0.0002, "epoch": 0.9002944888514934, "step": 1070}, {"loss": 1.8251, "grad_norm": 0.35990869998931885, "learning_rate": 0.0002, "epoch": 0.9087084560370214, "step": 1080}, {"loss": 1.7343, "grad_norm": 0.3321819305419922, "learning_rate": 0.0002, "epoch": 0.9171224232225494, "step": 1090}, {"loss": 1.7595, "grad_norm": 0.4202139377593994, "learning_rate": 0.0002, "epoch": 0.9255363904080774, "step": 1100}, {"loss": 1.8056, "grad_norm": 0.32559722661972046, "learning_rate": 0.0002, "epoch": 0.9339503575936053, "step": 1110}, {"loss": 1.812, "grad_norm": 0.3098459839820862, "learning_rate": 0.0002, "epoch": 0.9423643247791333, "step": 1120}, {"loss": 1.8252, "grad_norm": 0.33917108178138733, "learning_rate": 0.0002, "epoch": 0.9507782919646613, "step": 1130}, {"loss": 1.7709, "grad_norm": 0.4055837094783783, "learning_rate": 0.0002, "epoch": 0.9591922591501894, "step": 1140}, {"loss": 1.8259, "grad_norm": 0.32508623600006104, "learning_rate": 0.0002, "epoch": 0.9676062263357172, "step": 1150}, {"loss": 1.782, "grad_norm": 0.30150601267814636, "learning_rate": 0.0002, "epoch": 0.9760201935212452, "step": 1160}, {"loss": 1.8291, "grad_norm": 0.3042563199996948, "learning_rate": 0.0002, "epoch": 0.9844341607067733, "step": 1170}, {"loss": 1.7847, "grad_norm": 0.33254584670066833, "learning_rate": 0.0002, "epoch": 0.9928481278923013, "step": 1180}, {"eval_loss": 1.8077726364135742, "eval_runtime": 38.4359, "eval_samples_per_second": 13.399, "eval_steps_per_second": 1.691, "epoch": 0.9995793016407236, "step": 1188}, {"loss": 1.7414, "grad_norm": 0.35073035955429077, "learning_rate": 0.0002, "epoch": 1.0012620950778293, "step": 1190}, {"loss": 1.7483, "grad_norm": 0.3217269778251648, "learning_rate": 0.0002, "epoch": 1.0096760622633572, "step": 1200}, {"loss": 1.7517, "grad_norm": 0.3635033369064331, "learning_rate": 0.0002, "epoch": 1.018090029448885, "step": 1210}, {"loss": 1.6949, "grad_norm": 0.32468414306640625, "learning_rate": 0.0002, "epoch": 1.0265039966344132, "step": 1220}, {"loss": 1.711, "grad_norm": 0.3307163417339325, "learning_rate": 0.0002, "epoch": 1.034917963819941, "step": 1230}, {"loss": 1.7881, "grad_norm": 0.34381359815597534, "learning_rate": 0.0002, "epoch": 1.0433319310054692, "step": 1240}, {"loss": 1.612, "grad_norm": 0.35874804854393005, "learning_rate": 0.0002, "epoch": 1.051745898190997, "step": 1250}, {"loss": 1.7314, "grad_norm": 0.3615919351577759, "learning_rate": 0.0002, "epoch": 1.060159865376525, "step": 1260}, {"loss": 1.7517, "grad_norm": 0.32835808396339417, "learning_rate": 0.0002, "epoch": 1.068573832562053, "step": 1270}, {"loss": 1.7193, "grad_norm": 0.3876388370990753, "learning_rate": 0.0002, "epoch": 1.076987799747581, "step": 1280}, {"loss": 1.7442, "grad_norm": 0.39895930886268616, "learning_rate": 0.0002, "epoch": 1.0854017669331089, "step": 1290}, {"loss": 1.6601, "grad_norm": 0.39081698656082153, "learning_rate": 0.0002, "epoch": 1.093815734118637, "step": 1300}, {"loss": 1.7623, "grad_norm": 0.39974215626716614, "learning_rate": 0.0002, "epoch": 1.1022297013041649, "step": 1310}, {"loss": 1.7506, "grad_norm": 0.3887332081794739, "learning_rate": 0.0002, "epoch": 1.110643668489693, "step": 1320}, {"loss": 1.7381, "grad_norm": 0.36216408014297485, "learning_rate": 0.0002, "epoch": 1.1190576356752209, "step": 1330}, {"loss": 1.762, "grad_norm": 0.36979028582572937, "learning_rate": 0.0002, "epoch": 1.1274716028607488, "step": 1340}, {"loss": 1.7515, "grad_norm": 0.34052133560180664, "learning_rate": 0.0002, "epoch": 1.1358855700462769, "step": 1350}, {"loss": 1.7513, "grad_norm": 0.3467716574668884, "learning_rate": 0.0002, "epoch": 1.1442995372318048, "step": 1360}, {"loss": 1.7086, "grad_norm": 0.35528799891471863, "learning_rate": 0.0002, "epoch": 1.1527135044173327, "step": 1370}, {"loss": 1.794, "grad_norm": 0.36282262206077576, "learning_rate": 0.0002, "epoch": 1.1611274716028608, "step": 1380}, {"loss": 1.7731, "grad_norm": 0.37355899810791016, "learning_rate": 0.0002, "epoch": 1.1695414387883887, "step": 1390}, {"loss": 1.7483, "grad_norm": 0.37292736768722534, "learning_rate": 0.0002, "epoch": 1.1779554059739168, "step": 1400}, {"loss": 1.6916, "grad_norm": 0.5892812013626099, "learning_rate": 0.0002, "epoch": 1.1863693731594447, "step": 1410}, {"loss": 1.7302, "grad_norm": 0.3712292015552521, "learning_rate": 0.0002, "epoch": 1.1947833403449726, "step": 1420}, {"loss": 1.7709, "grad_norm": 0.3349577486515045, "learning_rate": 0.0002, "epoch": 1.2031973075305007, "step": 1430}, {"loss": 1.7412, "grad_norm": 0.32591062784194946, "learning_rate": 0.0002, "epoch": 1.2116112747160286, "step": 1440}, {"loss": 1.7406, "grad_norm": 0.3840635418891907, "learning_rate": 0.0002, "epoch": 1.2200252419015567, "step": 1450}, {"loss": 1.7276, "grad_norm": 0.37238365411758423, "learning_rate": 0.0002, "epoch": 1.2284392090870846, "step": 1460}, {"loss": 1.7052, "grad_norm": 0.3731217682361603, "learning_rate": 0.0002, "epoch": 1.2368531762726125, "step": 1470}, {"loss": 1.7255, "grad_norm": 0.3318967819213867, "learning_rate": 0.0002, "epoch": 1.2452671434581406, "step": 1480}, {"loss": 1.7463, "grad_norm": 0.3784034848213196, "learning_rate": 0.0002, "epoch": 1.2536811106436685, "step": 1490}, {"loss": 1.6862, "grad_norm": 0.3541383147239685, "learning_rate": 0.0002, "epoch": 1.2620950778291964, "step": 1500}, {"loss": 1.8394, "grad_norm": 0.35312485694885254, "learning_rate": 0.0002, "epoch": 1.2705090450147245, "step": 1510}, {"loss": 1.7029, "grad_norm": 0.35272929072380066, "learning_rate": 0.0002, "epoch": 1.2789230122002524, "step": 1520}, {"loss": 1.7016, "grad_norm": 0.40988272428512573, "learning_rate": 0.0002, "epoch": 1.2873369793857803, "step": 1530}, {"loss": 1.6912, "grad_norm": 0.3543946146965027, "learning_rate": 0.0002, "epoch": 1.2957509465713084, "step": 1540}, {"loss": 1.6757, "grad_norm": 0.35639145970344543, "learning_rate": 0.0002, "epoch": 1.3041649137568363, "step": 1550}, {"loss": 1.6814, "grad_norm": 0.3290826678276062, "learning_rate": 0.0002, "epoch": 1.3125788809423642, "step": 1560}, {"loss": 1.7369, "grad_norm": 0.39264336228370667, "learning_rate": 0.0002, "epoch": 1.3209928481278923, "step": 1570}, {"loss": 1.6804, "grad_norm": 0.5390415191650391, "learning_rate": 0.0002, "epoch": 1.3294068153134202, "step": 1580}, {"loss": 1.708, "grad_norm": 0.5188116431236267, "learning_rate": 0.0002, "epoch": 1.3378207824989483, "step": 1590}, {"loss": 1.6763, "grad_norm": 0.37445148825645447, "learning_rate": 0.0002, "epoch": 1.3462347496844762, "step": 1600}, {"loss": 1.7386, "grad_norm": 0.3296085298061371, "learning_rate": 0.0002, "epoch": 1.3546487168700043, "step": 1610}, {"loss": 1.8107, "grad_norm": 0.39879581332206726, "learning_rate": 0.0002, "epoch": 1.3630626840555322, "step": 1620}, {"loss": 1.6744, "grad_norm": 0.36092764139175415, "learning_rate": 0.0002, "epoch": 1.37147665124106, "step": 1630}, {"loss": 1.7144, "grad_norm": 0.37011823058128357, "learning_rate": 0.0002, "epoch": 1.3798906184265882, "step": 1640}, {"loss": 1.7396, "grad_norm": 0.40863534808158875, "learning_rate": 0.0002, "epoch": 1.3883045856121161, "step": 1650}, {"loss": 1.7901, "grad_norm": 0.337001770734787, "learning_rate": 0.0002, "epoch": 1.396718552797644, "step": 1660}, {"loss": 1.7044, "grad_norm": 0.35596707463264465, "learning_rate": 0.0002, "epoch": 1.4051325199831721, "step": 1670}, {"loss": 1.7717, "grad_norm": 0.3857671916484833, "learning_rate": 0.0002, "epoch": 1.4135464871687, "step": 1680}, {"loss": 1.7015, "grad_norm": 0.419502317905426, "learning_rate": 0.0002, "epoch": 1.421960454354228, "step": 1690}, {"loss": 1.7261, "grad_norm": 0.35459452867507935, "learning_rate": 0.0002, "epoch": 1.430374421539756, "step": 1700}, {"loss": 1.7361, "grad_norm": 0.37246978282928467, "learning_rate": 0.0002, "epoch": 1.438788388725284, "step": 1710}, {"loss": 1.6762, "grad_norm": 0.33091893792152405, "learning_rate": 0.0002, "epoch": 1.4472023559108118, "step": 1720}, {"loss": 1.7044, "grad_norm": 0.37029674649238586, "learning_rate": 0.0002, "epoch": 1.45561632309634, "step": 1730}, {"loss": 1.7117, "grad_norm": 0.374025821685791, "learning_rate": 0.0002, "epoch": 1.4640302902818678, "step": 1740}, {"loss": 1.7549, "grad_norm": 0.3416315019130707, "learning_rate": 0.0002, "epoch": 1.472444257467396, "step": 1750}, {"loss": 1.7093, "grad_norm": 0.36502841114997864, "learning_rate": 0.0002, "epoch": 1.4808582246529238, "step": 1760}, {"loss": 1.6597, "grad_norm": 0.35458803176879883, "learning_rate": 0.0002, "epoch": 1.489272191838452, "step": 1770}, {"loss": 1.675, "grad_norm": 0.4462839663028717, "learning_rate": 0.0002, "epoch": 1.4976861590239798, "step": 1780}, {"loss": 1.7267, "grad_norm": 0.34836092591285706, "learning_rate": 0.0002, "epoch": 1.5061001262095077, "step": 1790}, {"loss": 1.7295, "grad_norm": 0.3445749282836914, "learning_rate": 0.0002, "epoch": 1.5145140933950358, "step": 1800}, {"loss": 1.7386, "grad_norm": 0.36012160778045654, "learning_rate": 0.0002, "epoch": 1.5229280605805637, "step": 1810}, {"loss": 1.6594, "grad_norm": 0.4052616059780121, "learning_rate": 0.0002, "epoch": 1.5313420277660916, "step": 1820}, {"loss": 1.72, "grad_norm": 0.3966905474662781, "learning_rate": 0.0002, "epoch": 1.5397559949516197, "step": 1830}, {"loss": 1.7595, "grad_norm": 0.35028719902038574, "learning_rate": 0.0002, "epoch": 1.5481699621371476, "step": 1840}, {"loss": 1.6829, "grad_norm": 0.3936742842197418, "learning_rate": 0.0002, "epoch": 1.5565839293226755, "step": 1850}, {"loss": 1.7579, "grad_norm": 0.34473296999931335, "learning_rate": 0.0002, "epoch": 1.5649978965082036, "step": 1860}, {"loss": 1.7207, "grad_norm": 0.4328365623950958, "learning_rate": 0.0002, "epoch": 1.5734118636937318, "step": 1870}, {"loss": 1.7098, "grad_norm": 0.3566315472126007, "learning_rate": 0.0002, "epoch": 1.5818258308792594, "step": 1880}, {"loss": 1.6095, "grad_norm": 0.3301256597042084, "learning_rate": 0.0002, "epoch": 1.5902397980647875, "step": 1890}, {"loss": 1.748, "grad_norm": 0.3743041455745697, "learning_rate": 0.0002, "epoch": 1.5986537652503157, "step": 1900}, {"loss": 1.7259, "grad_norm": 0.3735344707965851, "learning_rate": 0.0002, "epoch": 1.6070677324358436, "step": 1910}, {"loss": 1.7445, "grad_norm": 0.42191144824028015, "learning_rate": 0.0002, "epoch": 1.6154816996213714, "step": 1920}, {"loss": 1.6978, "grad_norm": 0.3787207305431366, "learning_rate": 0.0002, "epoch": 1.6238956668068996, "step": 1930}, {"loss": 1.6893, "grad_norm": 0.35647350549697876, "learning_rate": 0.0002, "epoch": 1.6323096339924275, "step": 1940}, {"loss": 1.7825, "grad_norm": 0.39791446924209595, "learning_rate": 0.0002, "epoch": 1.6407236011779553, "step": 1950}, {"loss": 1.7293, "grad_norm": 0.37341275811195374, "learning_rate": 0.0002, "epoch": 1.6491375683634835, "step": 1960}, {"loss": 1.6781, "grad_norm": 0.3722686469554901, "learning_rate": 0.0002, "epoch": 1.6575515355490114, "step": 1970}, {"loss": 1.6383, "grad_norm": 0.37467387318611145, "learning_rate": 0.0002, "epoch": 1.6659655027345392, "step": 1980}, {"loss": 1.7439, "grad_norm": 0.37109461426734924, "learning_rate": 0.0002, "epoch": 1.6743794699200674, "step": 1990}, {"loss": 1.7206, "grad_norm": 0.4008837044239044, "learning_rate": 0.0002, "epoch": 1.6827934371055953, "step": 2000}, {"loss": 1.7604, "grad_norm": 0.3316999673843384, "learning_rate": 0.0002, "epoch": 1.6912074042911232, "step": 2010}, {"loss": 1.7325, "grad_norm": 0.3683805465698242, "learning_rate": 0.0002, "epoch": 1.6996213714766513, "step": 2020}, {"loss": 1.7451, "grad_norm": 0.4163658320903778, "learning_rate": 0.0002, "epoch": 1.7080353386621794, "step": 2030}, {"loss": 1.741, "grad_norm": 0.4245431125164032, "learning_rate": 0.0002, "epoch": 1.716449305847707, "step": 2040}, {"loss": 1.7184, "grad_norm": 0.36732038855552673, "learning_rate": 0.0002, "epoch": 1.7248632730332352, "step": 2050}, {"loss": 1.7031, "grad_norm": 0.34981656074523926, "learning_rate": 0.0002, "epoch": 1.7332772402187633, "step": 2060}, {"loss": 1.7545, "grad_norm": 0.38588812947273254, "learning_rate": 0.0002, "epoch": 1.7416912074042912, "step": 2070}, {"loss": 1.7728, "grad_norm": 0.39914557337760925, "learning_rate": 0.0002, "epoch": 1.750105174589819, "step": 2080}, {"loss": 1.7049, "grad_norm": 0.36068692803382874, "learning_rate": 0.0002, "epoch": 1.7585191417753472, "step": 2090}, {"loss": 1.7537, "grad_norm": 0.3983287215232849, "learning_rate": 0.0002, "epoch": 1.766933108960875, "step": 2100}, {"loss": 1.7016, "grad_norm": 0.45008400082588196, "learning_rate": 0.0002, "epoch": 1.775347076146403, "step": 2110}, {"loss": 1.7163, "grad_norm": 0.3618052303791046, "learning_rate": 0.0002, "epoch": 1.783761043331931, "step": 2120}, {"loss": 1.7335, "grad_norm": 0.38745400309562683, "learning_rate": 0.0002, "epoch": 1.792175010517459, "step": 2130}, {"loss": 1.7387, "grad_norm": 0.3413826525211334, "learning_rate": 0.0002, "epoch": 1.8005889777029869, "step": 2140}, {"loss": 1.7414, "grad_norm": 0.35983747243881226, "learning_rate": 0.0002, "epoch": 1.809002944888515, "step": 2150}, {"loss": 1.7892, "grad_norm": 0.40926849842071533, "learning_rate": 0.0002, "epoch": 1.8174169120740429, "step": 2160}, {"loss": 1.6823, "grad_norm": 0.3543093800544739, "learning_rate": 0.0002, "epoch": 1.8258308792595708, "step": 2170}, {"loss": 1.7812, "grad_norm": 0.42690935730934143, "learning_rate": 0.0002, "epoch": 1.8342448464450989, "step": 2180}, {"loss": 1.7471, "grad_norm": 0.40282756090164185, "learning_rate": 0.0002, "epoch": 1.842658813630627, "step": 2190}, {"loss": 1.7411, "grad_norm": 0.36568400263786316, "learning_rate": 0.0002, "epoch": 1.8510727808161547, "step": 2200}, {"loss": 1.7024, "grad_norm": 0.43159013986587524, "learning_rate": 0.0002, "epoch": 1.8594867480016828, "step": 2210}, {"loss": 1.7298, "grad_norm": 0.3554118573665619, "learning_rate": 0.0002, "epoch": 1.867900715187211, "step": 2220}, {"loss": 1.7157, "grad_norm": 0.43349072337150574, "learning_rate": 0.0002, "epoch": 1.8763146823727388, "step": 2230}, {"loss": 1.7302, "grad_norm": 0.36486536264419556, "learning_rate": 0.0002, "epoch": 1.8847286495582667, "step": 2240}, {"loss": 1.6901, "grad_norm": 0.39260047674179077, "learning_rate": 0.0002, "epoch": 1.8931426167437948, "step": 2250}, {"loss": 1.6691, "grad_norm": 0.3741776943206787, "learning_rate": 0.0002, "epoch": 1.9015565839293227, "step": 2260}, {"loss": 1.6931, "grad_norm": 0.3961946964263916, "learning_rate": 0.0002, "epoch": 1.9099705511148506, "step": 2270}, {"loss": 1.737, "grad_norm": 0.3659731149673462, "learning_rate": 0.0002, "epoch": 1.9183845183003787, "step": 2280}, {"loss": 1.7342, "grad_norm": 0.34744107723236084, "learning_rate": 0.0002, "epoch": 1.9267984854859066, "step": 2290}, {"loss": 1.7162, "grad_norm": 0.3607442378997803, "learning_rate": 0.0002, "epoch": 1.9352124526714345, "step": 2300}, {"loss": 1.6673, "grad_norm": 0.331464558839798, "learning_rate": 0.0002, "epoch": 1.9436264198569626, "step": 2310}, {"loss": 1.7101, "grad_norm": 0.3904414474964142, "learning_rate": 0.0002, "epoch": 1.9520403870424905, "step": 2320}, {"loss": 1.7327, "grad_norm": 0.37584832310676575, "learning_rate": 0.0002, "epoch": 1.9604543542280184, "step": 2330}, {"loss": 1.7586, "grad_norm": 0.3698684275150299, "learning_rate": 0.0002, "epoch": 1.9688683214135465, "step": 2340}, {"loss": 1.7764, "grad_norm": 0.40571412444114685, "learning_rate": 0.0002, "epoch": 1.9772822885990746, "step": 2350}, {"loss": 1.744, "grad_norm": 0.40059587359428406, "learning_rate": 0.0002, "epoch": 1.9856962557846023, "step": 2360}, {"loss": 1.7033, "grad_norm": 0.4168248474597931, "learning_rate": 0.0002, "epoch": 1.9941102229701304, "step": 2370}, {"eval_loss": 1.8055059909820557, "eval_runtime": 38.422, "eval_samples_per_second": 13.404, "eval_steps_per_second": 1.692, "epoch": 2.0, "step": 2377}, {"loss": 1.7673, "grad_norm": 0.35205352306365967, "learning_rate": 0.0002, "epoch": 2.0025241901556585, "step": 2380}, {"loss": 1.6556, "grad_norm": 0.3979377746582031, "learning_rate": 0.0002, "epoch": 2.010938157341186, "step": 2390}, {"loss": 1.6421, "grad_norm": 0.396491676568985, "learning_rate": 0.0002, "epoch": 2.0193521245267143, "step": 2400}, {"loss": 1.6847, "grad_norm": 0.44712209701538086, "learning_rate": 0.0002, "epoch": 2.0277660917122424, "step": 2410}, {"loss": 1.6877, "grad_norm": 0.4454420208930969, "learning_rate": 0.0002, "epoch": 2.03618005889777, "step": 2420}, {"loss": 1.6635, "grad_norm": 0.4170038402080536, "learning_rate": 0.0002, "epoch": 2.044594026083298, "step": 2430}, {"loss": 1.6512, "grad_norm": 0.4309595227241516, "learning_rate": 0.0002, "epoch": 2.0530079932688263, "step": 2440}, {"loss": 1.6223, "grad_norm": 0.4241602122783661, "learning_rate": 0.0002, "epoch": 2.0614219604543544, "step": 2450}, {"loss": 1.6162, "grad_norm": 0.4370540678501129, "learning_rate": 0.0002, "epoch": 2.069835927639882, "step": 2460}, {"loss": 1.6354, "grad_norm": 0.43985554575920105, "learning_rate": 0.0002, "epoch": 2.0782498948254102, "step": 2470}, {"loss": 1.6954, "grad_norm": 0.4158105254173279, "learning_rate": 0.0002, "epoch": 2.0866638620109383, "step": 2480}, {"loss": 1.6114, "grad_norm": 0.441549152135849, "learning_rate": 0.0002, "epoch": 2.095077829196466, "step": 2490}, {"loss": 1.5485, "grad_norm": 0.385718435049057, "learning_rate": 0.0002, "epoch": 2.103491796381994, "step": 2500}, {"loss": 1.5894, "grad_norm": 0.43146514892578125, "learning_rate": 0.0002, "epoch": 2.1119057635675222, "step": 2510}, {"loss": 1.6414, "grad_norm": 0.41663315892219543, "learning_rate": 0.0002, "epoch": 2.12031973075305, "step": 2520}, {"loss": 1.6527, "grad_norm": 0.4410698115825653, "learning_rate": 0.0002, "epoch": 2.128733697938578, "step": 2530}, {"loss": 1.6124, "grad_norm": 0.4472278952598572, "learning_rate": 0.0002, "epoch": 2.137147665124106, "step": 2540}, {"loss": 1.6257, "grad_norm": 0.3879167437553406, "learning_rate": 0.0002, "epoch": 2.145561632309634, "step": 2550}, {"loss": 1.6682, "grad_norm": 0.4212203025817871, "learning_rate": 0.0002, "epoch": 2.153975599495162, "step": 2560}, {"loss": 1.6036, "grad_norm": 0.42841723561286926, "learning_rate": 0.0002, "epoch": 2.16238956668069, "step": 2570}, {"loss": 1.5962, "grad_norm": 0.39272481203079224, "learning_rate": 0.0002, "epoch": 2.1708035338662177, "step": 2580}, {"loss": 1.681, "grad_norm": 0.4075261354446411, "learning_rate": 0.0002, "epoch": 2.179217501051746, "step": 2590}, {"loss": 1.6601, "grad_norm": 0.5358437895774841, "learning_rate": 0.0002, "epoch": 2.187631468237274, "step": 2600}, {"loss": 1.6423, "grad_norm": 0.4738350212574005, "learning_rate": 0.0002, "epoch": 2.1960454354228016, "step": 2610}, {"loss": 1.6386, "grad_norm": 0.446789026260376, "learning_rate": 0.0002, "epoch": 2.2044594026083297, "step": 2620}, {"loss": 1.6246, "grad_norm": 0.4615374505519867, "learning_rate": 0.0002, "epoch": 2.212873369793858, "step": 2630}, {"loss": 1.6205, "grad_norm": 0.46901994943618774, "learning_rate": 0.0002, "epoch": 2.221287336979386, "step": 2640}, {"loss": 1.6774, "grad_norm": 0.46267789602279663, "learning_rate": 0.0002, "epoch": 2.2297013041649136, "step": 2650}, {"loss": 1.6584, "grad_norm": 0.4383080005645752, "learning_rate": 0.0002, "epoch": 2.2381152713504417, "step": 2660}, {"loss": 1.5745, "grad_norm": 0.4070609509944916, "learning_rate": 0.0002, "epoch": 2.24652923853597, "step": 2670}, {"loss": 1.6125, "grad_norm": 0.4572339951992035, "learning_rate": 0.0002, "epoch": 2.2549432057214975, "step": 2680}, {"loss": 1.5671, "grad_norm": 0.393265038728714, "learning_rate": 0.0002, "epoch": 2.2633571729070256, "step": 2690}, {"loss": 1.6239, "grad_norm": 0.46144717931747437, "learning_rate": 0.0002, "epoch": 2.2717711400925538, "step": 2700}, {"loss": 1.5992, "grad_norm": 0.45077767968177795, "learning_rate": 0.0002, "epoch": 2.2801851072780814, "step": 2710}, {"loss": 1.6261, "grad_norm": 0.5697639584541321, "learning_rate": 0.0002, "epoch": 2.2885990744636096, "step": 2720}, {"loss": 1.6192, "grad_norm": 0.4855510890483856, "learning_rate": 0.0002, "epoch": 2.2970130416491377, "step": 2730}, {"loss": 1.7419, "grad_norm": 0.4440622627735138, "learning_rate": 0.0002, "epoch": 2.3054270088346653, "step": 2740}, {"loss": 1.6496, "grad_norm": 0.3904096782207489, "learning_rate": 0.0002, "epoch": 2.3138409760201935, "step": 2750}, {"loss": 1.5888, "grad_norm": 0.5225510597229004, "learning_rate": 0.0002, "epoch": 2.3222549432057216, "step": 2760}, {"loss": 1.6082, "grad_norm": 0.44866397976875305, "learning_rate": 0.0002, "epoch": 2.3306689103912497, "step": 2770}, {"loss": 1.6087, "grad_norm": 0.5167056322097778, "learning_rate": 0.0002, "epoch": 2.3390828775767774, "step": 2780}, {"loss": 1.6136, "grad_norm": 0.45913267135620117, "learning_rate": 0.0002, "epoch": 2.3474968447623055, "step": 2790}, {"loss": 1.6564, "grad_norm": 0.45787590742111206, "learning_rate": 0.0002, "epoch": 2.3559108119478336, "step": 2800}, {"loss": 1.6868, "grad_norm": 0.4633352756500244, "learning_rate": 0.0002, "epoch": 2.3643247791333613, "step": 2810}, {"loss": 1.6316, "grad_norm": 0.46390071511268616, "learning_rate": 0.0002, "epoch": 2.3727387463188894, "step": 2820}, {"loss": 1.6039, "grad_norm": 0.4261005222797394, "learning_rate": 0.0002, "epoch": 2.3811527135044175, "step": 2830}, {"loss": 1.6364, "grad_norm": 0.4283634424209595, "learning_rate": 0.0002, "epoch": 2.389566680689945, "step": 2840}, {"loss": 1.6382, "grad_norm": 0.4955291450023651, "learning_rate": 0.0002, "epoch": 2.3979806478754733, "step": 2850}, {"loss": 1.6173, "grad_norm": 0.4740189015865326, "learning_rate": 0.0002, "epoch": 2.4063946150610014, "step": 2860}, {"loss": 1.6403, "grad_norm": 0.4222276508808136, "learning_rate": 0.0002, "epoch": 2.414808582246529, "step": 2870}, {"loss": 1.5602, "grad_norm": 0.4982149004936218, "learning_rate": 0.0002, "epoch": 2.423222549432057, "step": 2880}, {"loss": 1.6313, "grad_norm": 0.5217409133911133, "learning_rate": 0.0002, "epoch": 2.4316365166175853, "step": 2890}, {"loss": 1.5804, "grad_norm": 0.4555884897708893, "learning_rate": 0.0002, "epoch": 2.4400504838031134, "step": 2900}, {"loss": 1.6189, "grad_norm": 0.43178579211235046, "learning_rate": 0.0002, "epoch": 2.448464450988641, "step": 2910}, {"loss": 1.6824, "grad_norm": 0.4788478910923004, "learning_rate": 0.0002, "epoch": 2.456878418174169, "step": 2920}, {"loss": 1.6829, "grad_norm": 0.43689873814582825, "learning_rate": 0.0002, "epoch": 2.465292385359697, "step": 2930}, {"loss": 1.6196, "grad_norm": 0.5115197896957397, "learning_rate": 0.0002, "epoch": 2.473706352545225, "step": 2940}, {"loss": 1.689, "grad_norm": 0.5290159583091736, "learning_rate": 0.0002, "epoch": 2.482120319730753, "step": 2950}, {"loss": 1.6499, "grad_norm": 0.46042463183403015, "learning_rate": 0.0002, "epoch": 2.490534286916281, "step": 2960}, {"loss": 1.6664, "grad_norm": 0.4359915852546692, "learning_rate": 0.0002, "epoch": 2.498948254101809, "step": 2970}, {"loss": 1.5812, "grad_norm": 0.46352964639663696, "learning_rate": 0.0002, "epoch": 2.507362221287337, "step": 2980}, {"loss": 1.6501, "grad_norm": 0.5324268341064453, "learning_rate": 0.0002, "epoch": 2.515776188472865, "step": 2990}, {"loss": 1.6115, "grad_norm": 0.5929607152938843, "learning_rate": 0.0002, "epoch": 2.5241901556583928, "step": 3000}, {"loss": 1.6772, "grad_norm": 0.4811333417892456, "learning_rate": 0.0002, "epoch": 2.532604122843921, "step": 3010}, {"loss": 1.7023, "grad_norm": 0.4662701487541199, "learning_rate": 0.0002, "epoch": 2.541018090029449, "step": 3020}, {"loss": 1.5426, "grad_norm": 0.4582270681858063, "learning_rate": 0.0002, "epoch": 2.549432057214977, "step": 3030}, {"loss": 1.6737, "grad_norm": 0.4679982662200928, "learning_rate": 0.0002, "epoch": 2.557846024400505, "step": 3040}, {"loss": 1.5442, "grad_norm": 0.4380294680595398, "learning_rate": 0.0002, "epoch": 2.566259991586033, "step": 3050}, {"loss": 1.6055, "grad_norm": 0.44295763969421387, "learning_rate": 0.0002, "epoch": 2.5746739587715606, "step": 3060}, {"loss": 1.5775, "grad_norm": 0.5131027698516846, "learning_rate": 0.0002, "epoch": 2.5830879259570887, "step": 3070}, {"loss": 1.546, "grad_norm": 0.47567516565322876, "learning_rate": 0.0002, "epoch": 2.591501893142617, "step": 3080}, {"loss": 1.5671, "grad_norm": 0.49002596735954285, "learning_rate": 0.0002, "epoch": 2.599915860328145, "step": 3090}, {"loss": 1.5445, "grad_norm": 0.44856327772140503, "learning_rate": 0.0002, "epoch": 2.6083298275136726, "step": 3100}, {"loss": 1.5797, "grad_norm": 0.4480142593383789, "learning_rate": 0.0002, "epoch": 2.6167437946992007, "step": 3110}, {"loss": 1.7132, "grad_norm": 0.4317494034767151, "learning_rate": 0.0002, "epoch": 2.6251577618847284, "step": 3120}, {"loss": 1.6321, "grad_norm": 0.42580848932266235, "learning_rate": 0.0002, "epoch": 2.6335717290702565, "step": 3130}, {"loss": 1.6483, "grad_norm": 0.4516814947128296, "learning_rate": 0.0002, "epoch": 2.6419856962557846, "step": 3140}, {"loss": 1.695, "grad_norm": 0.4438435733318329, "learning_rate": 0.0002, "epoch": 2.6503996634413127, "step": 3150}, {"loss": 1.6938, "grad_norm": 0.4385356307029724, "learning_rate": 0.0002, "epoch": 2.6588136306268404, "step": 3160}, {"loss": 1.6139, "grad_norm": 0.5064112544059753, "learning_rate": 0.0002, "epoch": 2.6672275978123685, "step": 3170}, {"loss": 1.7189, "grad_norm": 0.49163177609443665, "learning_rate": 0.0002, "epoch": 2.6756415649978966, "step": 3180}, {"loss": 1.7323, "grad_norm": 0.49339258670806885, "learning_rate": 0.0002, "epoch": 2.6840555321834243, "step": 3190}, {"loss": 1.6508, "grad_norm": 0.440950870513916, "learning_rate": 0.0002, "epoch": 2.6924694993689524, "step": 3200}, {"loss": 1.6305, "grad_norm": 0.4283970594406128, "learning_rate": 0.0002, "epoch": 2.7008834665544805, "step": 3210}, {"loss": 1.5935, "grad_norm": 0.43875712156295776, "learning_rate": 0.0002, "epoch": 2.7092974337400086, "step": 3220}, {"loss": 1.6129, "grad_norm": 0.49332964420318604, "learning_rate": 0.0002, "epoch": 2.7177114009255363, "step": 3230}, {"loss": 1.642, "grad_norm": 0.5225692391395569, "learning_rate": 0.0002, "epoch": 2.7261253681110644, "step": 3240}, {"loss": 1.6759, "grad_norm": 0.4856489300727844, "learning_rate": 0.0002, "epoch": 2.734539335296592, "step": 3250}, {"loss": 1.6463, "grad_norm": 0.46918296813964844, "learning_rate": 0.0002, "epoch": 2.74295330248212, "step": 3260}, {"loss": 1.6819, "grad_norm": 0.4802931249141693, "learning_rate": 0.0002, "epoch": 2.7513672696676483, "step": 3270}, {"loss": 1.6246, "grad_norm": 0.4485355615615845, "learning_rate": 0.0002, "epoch": 2.7597812368531764, "step": 3280}, {"loss": 1.6251, "grad_norm": 0.43944594264030457, "learning_rate": 0.0002, "epoch": 2.768195204038704, "step": 3290}, {"loss": 1.6501, "grad_norm": 0.46847742795944214, "learning_rate": 0.0002, "epoch": 2.7766091712242322, "step": 3300}, {"loss": 1.5969, "grad_norm": 0.4816027879714966, "learning_rate": 0.0002, "epoch": 2.7850231384097603, "step": 3310}, {"loss": 1.6293, "grad_norm": 0.453960120677948, "learning_rate": 0.0002, "epoch": 2.793437105595288, "step": 3320}, {"loss": 1.6429, "grad_norm": 0.4816017150878906, "learning_rate": 0.0002, "epoch": 2.801851072780816, "step": 3330}, {"loss": 1.6683, "grad_norm": 0.4461034834384918, "learning_rate": 0.0002, "epoch": 2.8102650399663442, "step": 3340}, {"loss": 1.7048, "grad_norm": 0.48821821808815, "learning_rate": 0.0002, "epoch": 2.8186790071518724, "step": 3350}, {"loss": 1.6076, "grad_norm": 0.4574853777885437, "learning_rate": 0.0002, "epoch": 2.8270929743374, "step": 3360}, {"loss": 1.6651, "grad_norm": 0.42062026262283325, "learning_rate": 0.0002, "epoch": 2.835506941522928, "step": 3370}, {"loss": 1.624, "grad_norm": 0.4499834477901459, "learning_rate": 0.0002, "epoch": 2.843920908708456, "step": 3380}, {"loss": 1.621, "grad_norm": 0.4780360758304596, "learning_rate": 0.0002, "epoch": 2.852334875893984, "step": 3390}, {"loss": 1.5882, "grad_norm": 0.45422887802124023, "learning_rate": 0.0002, "epoch": 2.860748843079512, "step": 3400}, {"loss": 1.6028, "grad_norm": 0.4590015709400177, "learning_rate": 0.0002, "epoch": 2.86916281026504, "step": 3410}, {"loss": 1.6746, "grad_norm": 0.45689624547958374, "learning_rate": 0.0002, "epoch": 2.877576777450568, "step": 3420}, {"loss": 1.6326, "grad_norm": 0.46953922510147095, "learning_rate": 0.0002, "epoch": 2.885990744636096, "step": 3430}, {"loss": 1.6015, "grad_norm": 0.4791966378688812, "learning_rate": 0.0002, "epoch": 2.8944047118216236, "step": 3440}, {"loss": 1.694, "grad_norm": 0.4842296242713928, "learning_rate": 0.0002, "epoch": 2.9028186790071517, "step": 3450}, {"loss": 1.6326, "grad_norm": 0.47219768166542053, "learning_rate": 0.0002, "epoch": 2.91123264619268, "step": 3460}, {"loss": 1.6486, "grad_norm": 0.4622127115726471, "learning_rate": 0.0002, "epoch": 2.919646613378208, "step": 3470}, {"loss": 1.6485, "grad_norm": 0.46832820773124695, "learning_rate": 0.0002, "epoch": 2.9280605805637356, "step": 3480}, {"loss": 1.6366, "grad_norm": 0.44582483172416687, "learning_rate": 0.0002, "epoch": 2.9364745477492638, "step": 3490}, {"loss": 1.6859, "grad_norm": 0.4987219274044037, "learning_rate": 0.0002, "epoch": 2.944888514934792, "step": 3500}, {"loss": 1.5991, "grad_norm": 0.43750956654548645, "learning_rate": 0.0002, "epoch": 2.9533024821203195, "step": 3510}, {"loss": 1.6236, "grad_norm": 0.49962925910949707, "learning_rate": 0.0002, "epoch": 2.9617164493058477, "step": 3520}, {"loss": 1.5859, "grad_norm": 0.5189590454101562, "learning_rate": 0.0002, "epoch": 2.9701304164913758, "step": 3530}, {"loss": 1.6688, "grad_norm": 0.391317754983902, "learning_rate": 0.0002, "epoch": 2.978544383676904, "step": 3540}, {"loss": 1.5884, "grad_norm": 0.44934695959091187, "learning_rate": 0.0002, "epoch": 2.9869583508624316, "step": 3550}, {"loss": 1.5688, "grad_norm": 0.4740142226219177, "learning_rate": 0.0002, "epoch": 2.9953723180479597, "step": 3560}, {"eval_loss": 1.8266887664794922, "eval_runtime": 37.9445, "eval_samples_per_second": 13.572, "eval_steps_per_second": 1.713, "epoch": 2.9995793016407237, "step": 3565}, {"loss": 1.5939, "grad_norm": 0.4523724615573883, "learning_rate": 0.0002, "epoch": 3.003786285233488, "step": 3570}, {"loss": 1.526, "grad_norm": 0.5261380076408386, "learning_rate": 0.0002, "epoch": 3.0122002524190155, "step": 3580}, {"loss": 1.4946, "grad_norm": 0.48664888739585876, "learning_rate": 0.0002, "epoch": 3.0206142196045436, "step": 3590}, {"loss": 1.5193, "grad_norm": 0.5070882439613342, "learning_rate": 0.0002, "epoch": 3.0290281867900717, "step": 3600}, {"loss": 1.5316, "grad_norm": 0.5816011428833008, "learning_rate": 0.0002, "epoch": 3.0374421539755994, "step": 3610}, {"loss": 1.5682, "grad_norm": 0.6610211730003357, "learning_rate": 0.0002, "epoch": 3.0458561211611275, "step": 3620}, {"loss": 1.5699, "grad_norm": 0.5257703065872192, "learning_rate": 0.0002, "epoch": 3.0542700883466556, "step": 3630}, {"loss": 1.4438, "grad_norm": 0.5574390888214111, "learning_rate": 0.0002, "epoch": 3.0626840555321833, "step": 3640}, {"loss": 1.547, "grad_norm": 0.5682297348976135, "learning_rate": 0.0002, "epoch": 3.0710980227177114, "step": 3650}, {"loss": 1.5743, "grad_norm": 0.5798383355140686, "learning_rate": 0.0002, "epoch": 3.0795119899032395, "step": 3660}, {"loss": 1.4339, "grad_norm": 0.5458289980888367, "learning_rate": 0.0002, "epoch": 3.087925957088767, "step": 3670}, {"loss": 1.46, "grad_norm": 0.5599102973937988, "learning_rate": 0.0002, "epoch": 3.0963399242742953, "step": 3680}, {"loss": 1.4589, "grad_norm": 0.5023021697998047, "learning_rate": 0.0002, "epoch": 3.1047538914598234, "step": 3690}, {"loss": 1.5114, "grad_norm": 0.5448206067085266, "learning_rate": 0.0002, "epoch": 3.113167858645351, "step": 3700}, {"loss": 1.4692, "grad_norm": 0.5760458707809448, "learning_rate": 0.0002, "epoch": 3.121581825830879, "step": 3710}, {"loss": 1.4789, "grad_norm": 0.6018968224525452, "learning_rate": 0.0002, "epoch": 3.1299957930164073, "step": 3720}, {"loss": 1.5518, "grad_norm": 0.5767101049423218, "learning_rate": 0.0002, "epoch": 3.1384097602019354, "step": 3730}, {"loss": 1.5032, "grad_norm": 0.5333963632583618, "learning_rate": 0.0002, "epoch": 3.146823727387463, "step": 3740}, {"loss": 1.4812, "grad_norm": 0.5918396711349487, "learning_rate": 0.0002, "epoch": 3.155237694572991, "step": 3750}, {"loss": 1.4618, "grad_norm": 0.5931203365325928, "learning_rate": 0.0002, "epoch": 3.1636516617585193, "step": 3760}, {"loss": 1.5592, "grad_norm": 0.6562168598175049, "learning_rate": 0.0002, "epoch": 3.172065628944047, "step": 3770}, {"loss": 1.4932, "grad_norm": 0.5820156335830688, "learning_rate": 0.0002, "epoch": 3.180479596129575, "step": 3780}, {"loss": 1.4523, "grad_norm": 0.5784737467765808, "learning_rate": 0.0002, "epoch": 3.188893563315103, "step": 3790}, {"loss": 1.498, "grad_norm": 0.5506529808044434, "learning_rate": 0.0002, "epoch": 3.197307530500631, "step": 3800}, {"loss": 1.4819, "grad_norm": 0.6101595163345337, "learning_rate": 0.0002, "epoch": 3.205721497686159, "step": 3810}, {"loss": 1.5185, "grad_norm": 0.5597806572914124, "learning_rate": 0.0002, "epoch": 3.214135464871687, "step": 3820}, {"loss": 1.5664, "grad_norm": 0.5641011595726013, "learning_rate": 0.0002, "epoch": 3.222549432057215, "step": 3830}, {"loss": 1.4702, "grad_norm": 0.5892080068588257, "learning_rate": 0.0002, "epoch": 3.230963399242743, "step": 3840}, {"loss": 1.4194, "grad_norm": 0.6034760475158691, "learning_rate": 0.0002, "epoch": 3.239377366428271, "step": 3850}, {"loss": 1.5499, "grad_norm": 0.5112439393997192, "learning_rate": 0.0002, "epoch": 3.247791333613799, "step": 3860}, {"loss": 1.5132, "grad_norm": 0.56565922498703, "learning_rate": 0.0002, "epoch": 3.256205300799327, "step": 3870}, {"loss": 1.4892, "grad_norm": 0.6155247092247009, "learning_rate": 0.0002, "epoch": 3.264619267984855, "step": 3880}, {"loss": 1.5118, "grad_norm": 0.6064623594284058, "learning_rate": 0.0002, "epoch": 3.273033235170383, "step": 3890}, {"loss": 1.5236, "grad_norm": 0.6313768029212952, "learning_rate": 0.0002, "epoch": 3.2814472023559107, "step": 3900}, {"loss": 1.5551, "grad_norm": 0.5903939008712769, "learning_rate": 0.0002, "epoch": 3.289861169541439, "step": 3910}, {"loss": 1.5703, "grad_norm": 0.5770667195320129, "learning_rate": 0.0002, "epoch": 3.298275136726967, "step": 3920}, {"loss": 1.5159, "grad_norm": 0.5785196423530579, "learning_rate": 0.0002, "epoch": 3.3066891039124946, "step": 3930}, {"loss": 1.5277, "grad_norm": 0.6468310356140137, "learning_rate": 0.0002, "epoch": 3.3151030710980227, "step": 3940}, {"loss": 1.6002, "grad_norm": 0.6200279593467712, "learning_rate": 0.0002, "epoch": 3.323517038283551, "step": 3950}, {"loss": 1.5264, "grad_norm": 0.5779302716255188, "learning_rate": 0.0002, "epoch": 3.3319310054690785, "step": 3960}, {"loss": 1.4861, "grad_norm": 0.5463796854019165, "learning_rate": 0.0002, "epoch": 3.3403449726546066, "step": 3970}, {"loss": 1.541, "grad_norm": 0.6117855906486511, "learning_rate": 0.0002, "epoch": 3.3487589398401347, "step": 3980}, {"loss": 1.5566, "grad_norm": 0.5554766058921814, "learning_rate": 0.0002, "epoch": 3.357172907025663, "step": 3990}, {"loss": 1.5004, "grad_norm": 0.6012870073318481, "learning_rate": 0.0002, "epoch": 3.3655868742111905, "step": 4000}, {"loss": 1.473, "grad_norm": 0.5443974137306213, "learning_rate": 0.0002, "epoch": 3.3740008413967186, "step": 4010}, {"loss": 1.5139, "grad_norm": 0.6636057496070862, "learning_rate": 0.0002, "epoch": 3.3824148085822463, "step": 4020}, {"loss": 1.5141, "grad_norm": 0.5801246166229248, "learning_rate": 0.0002, "epoch": 3.3908287757677744, "step": 4030}, {"loss": 1.5026, "grad_norm": 0.5668839812278748, "learning_rate": 0.0002, "epoch": 3.3992427429533025, "step": 4040}, {"loss": 1.523, "grad_norm": 0.7763481736183167, "learning_rate": 0.0002, "epoch": 3.4076567101388306, "step": 4050}, {"loss": 1.4932, "grad_norm": 0.6675992608070374, "learning_rate": 0.0002, "epoch": 3.4160706773243583, "step": 4060}, {"loss": 1.4959, "grad_norm": 0.6290077567100525, "learning_rate": 0.0002, "epoch": 3.4244846445098864, "step": 4070}, {"loss": 1.5766, "grad_norm": 0.6040239930152893, "learning_rate": 0.0002, "epoch": 3.4328986116954145, "step": 4080}, {"loss": 1.5711, "grad_norm": 0.6237877607345581, "learning_rate": 0.0002, "epoch": 3.441312578880942, "step": 4090}, {"loss": 1.4961, "grad_norm": 0.5343508124351501, "learning_rate": 0.0002, "epoch": 3.4497265460664703, "step": 4100}, {"loss": 1.5123, "grad_norm": 0.6817412972450256, "learning_rate": 0.0002, "epoch": 3.4581405132519984, "step": 4110}, {"loss": 1.5377, "grad_norm": 0.7115170359611511, "learning_rate": 0.0002, "epoch": 3.466554480437526, "step": 4120}, {"loss": 1.5275, "grad_norm": 0.6127332448959351, "learning_rate": 0.0002, "epoch": 3.4749684476230542, "step": 4130}, {"loss": 1.557, "grad_norm": 0.5745994448661804, "learning_rate": 0.0002, "epoch": 3.4833824148085824, "step": 4140}, {"loss": 1.4873, "grad_norm": 0.6248795390129089, "learning_rate": 0.0002, "epoch": 3.49179638199411, "step": 4150}, {"loss": 1.4885, "grad_norm": 0.5821124911308289, "learning_rate": 0.0002, "epoch": 3.500210349179638, "step": 4160}, {"loss": 1.4937, "grad_norm": 0.561416506767273, "learning_rate": 0.0002, "epoch": 3.5086243163651663, "step": 4170}, {"loss": 1.5453, "grad_norm": 0.5848962664604187, "learning_rate": 0.0002, "epoch": 3.5170382835506944, "step": 4180}, {"loss": 1.5892, "grad_norm": 0.5335569977760315, "learning_rate": 0.0002, "epoch": 3.525452250736222, "step": 4190}, {"loss": 1.5152, "grad_norm": 0.547964870929718, "learning_rate": 0.0002, "epoch": 3.53386621792175, "step": 4200}, {"loss": 1.4887, "grad_norm": 0.6157727241516113, "learning_rate": 0.0002, "epoch": 3.542280185107278, "step": 4210}, {"loss": 1.5484, "grad_norm": 0.6163121461868286, "learning_rate": 0.0002, "epoch": 3.550694152292806, "step": 4220}, {"loss": 1.5833, "grad_norm": 0.5844616293907166, "learning_rate": 0.0002, "epoch": 3.559108119478334, "step": 4230}, {"loss": 1.5305, "grad_norm": 0.7104926109313965, "learning_rate": 0.0002, "epoch": 3.567522086663862, "step": 4240}, {"loss": 1.5161, "grad_norm": 0.5055213570594788, "learning_rate": 0.0002, "epoch": 3.57593605384939, "step": 4250}, {"loss": 1.482, "grad_norm": 0.611676812171936, "learning_rate": 0.0002, "epoch": 3.584350021034918, "step": 4260}, {"loss": 1.5048, "grad_norm": 0.6326440572738647, "learning_rate": 0.0002, "epoch": 3.592763988220446, "step": 4270}, {"loss": 1.5122, "grad_norm": 0.6290925741195679, "learning_rate": 0.0002, "epoch": 3.6011779554059737, "step": 4280}, {"loss": 1.5654, "grad_norm": 0.5691978931427002, "learning_rate": 0.0002, "epoch": 3.609591922591502, "step": 4290}, {"loss": 1.4854, "grad_norm": 0.6071329116821289, "learning_rate": 0.0002, "epoch": 3.61800588977703, "step": 4300}, {"loss": 1.5336, "grad_norm": 0.606573224067688, "learning_rate": 0.0002, "epoch": 3.626419856962558, "step": 4310}, {"loss": 1.6437, "grad_norm": 0.5515419244766235, "learning_rate": 0.0002, "epoch": 3.6348338241480858, "step": 4320}, {"loss": 1.498, "grad_norm": 0.5964660048484802, "learning_rate": 0.0002, "epoch": 3.643247791333614, "step": 4330}, {"loss": 1.544, "grad_norm": 0.5774146914482117, "learning_rate": 0.0002, "epoch": 3.6516617585191415, "step": 4340}, {"loss": 1.5566, "grad_norm": 0.5732731223106384, "learning_rate": 0.0002, "epoch": 3.6600757257046697, "step": 4350}, {"loss": 1.5682, "grad_norm": 0.7354163527488708, "learning_rate": 0.0002, "epoch": 3.6684896928901978, "step": 4360}, {"loss": 1.5225, "grad_norm": 0.6220902800559998, "learning_rate": 0.0002, "epoch": 3.676903660075726, "step": 4370}, {"loss": 1.4838, "grad_norm": 0.6053991317749023, "learning_rate": 0.0002, "epoch": 3.6853176272612536, "step": 4380}, {"loss": 1.5161, "grad_norm": 0.67010897397995, "learning_rate": 0.0002, "epoch": 3.6937315944467817, "step": 4390}, {"loss": 1.5381, "grad_norm": 0.6139186024665833, "learning_rate": 0.0002, "epoch": 3.70214556163231, "step": 4400}, {"loss": 1.5088, "grad_norm": 0.5433071851730347, "learning_rate": 0.0002, "epoch": 3.7105595288178375, "step": 4410}, {"loss": 1.5337, "grad_norm": 0.5453870296478271, "learning_rate": 0.0002, "epoch": 3.7189734960033656, "step": 4420}, {"loss": 1.4549, "grad_norm": 0.6401727199554443, "learning_rate": 0.0002, "epoch": 3.7273874631888937, "step": 4430}, {"loss": 1.503, "grad_norm": 0.6049367189407349, "learning_rate": 0.0002, "epoch": 3.735801430374422, "step": 4440}, {"loss": 1.5268, "grad_norm": 0.5740529298782349, "learning_rate": 0.0002, "epoch": 3.7442153975599495, "step": 4450}, {"loss": 1.5183, "grad_norm": 0.6521880626678467, "learning_rate": 0.0002, "epoch": 3.7526293647454776, "step": 4460}, {"loss": 1.5741, "grad_norm": 0.7096368074417114, "learning_rate": 0.0002, "epoch": 3.7610433319310053, "step": 4470}, {"loss": 1.5786, "grad_norm": 0.5886474251747131, "learning_rate": 0.0002, "epoch": 3.7694572991165334, "step": 4480}, {"loss": 1.5887, "grad_norm": 0.5821043252944946, "learning_rate": 0.0002, "epoch": 3.7778712663020615, "step": 4490}, {"loss": 1.5777, "grad_norm": 0.628892183303833, "learning_rate": 0.0002, "epoch": 3.7862852334875896, "step": 4500}, {"loss": 1.4708, "grad_norm": 0.5962669849395752, "learning_rate": 0.0002, "epoch": 3.7946992006731173, "step": 4510}, {"loss": 1.5267, "grad_norm": 0.6635549068450928, "learning_rate": 0.0002, "epoch": 3.8031131678586454, "step": 4520}, {"loss": 1.5058, "grad_norm": 0.6010760068893433, "learning_rate": 0.0002, "epoch": 3.811527135044173, "step": 4530}, {"loss": 1.6228, "grad_norm": 0.6322658658027649, "learning_rate": 0.0002, "epoch": 3.819941102229701, "step": 4540}, {"loss": 1.5029, "grad_norm": 0.5893137454986572, "learning_rate": 0.0002, "epoch": 3.8283550694152293, "step": 4550}, {"loss": 1.5435, "grad_norm": 0.7829602360725403, "learning_rate": 0.0002, "epoch": 3.8367690366007574, "step": 4560}, {"loss": 1.5453, "grad_norm": 0.6190396547317505, "learning_rate": 0.0002, "epoch": 3.845183003786285, "step": 4570}, {"loss": 1.5292, "grad_norm": 0.6662813425064087, "learning_rate": 0.0002, "epoch": 3.853596970971813, "step": 4580}, {"loss": 1.5065, "grad_norm": 0.5809855461120605, "learning_rate": 0.0002, "epoch": 3.8620109381573413, "step": 4590}, {"loss": 1.5041, "grad_norm": 0.5779069662094116, "learning_rate": 0.0002, "epoch": 3.870424905342869, "step": 4600}, {"loss": 1.498, "grad_norm": 0.5603038668632507, "learning_rate": 0.0002, "epoch": 3.878838872528397, "step": 4610}, {"loss": 1.5372, "grad_norm": 0.6274181008338928, "learning_rate": 0.0002, "epoch": 3.887252839713925, "step": 4620}, {"loss": 1.4996, "grad_norm": 0.6810959577560425, "learning_rate": 0.0002, "epoch": 3.8956668068994533, "step": 4630}, {"loss": 1.4956, "grad_norm": 0.5647315979003906, "learning_rate": 0.0002, "epoch": 3.904080774084981, "step": 4640}, {"loss": 1.5424, "grad_norm": 0.6830295324325562, "learning_rate": 0.0002, "epoch": 3.912494741270509, "step": 4650}, {"loss": 1.535, "grad_norm": 0.652565598487854, "learning_rate": 0.0002, "epoch": 3.920908708456037, "step": 4660}, {"loss": 1.4772, "grad_norm": 0.5806284546852112, "learning_rate": 0.0002, "epoch": 3.929322675641565, "step": 4670}, {"loss": 1.5812, "grad_norm": 0.6825073957443237, "learning_rate": 0.0002, "epoch": 3.937736642827093, "step": 4680}, {"loss": 1.5516, "grad_norm": 0.6149451732635498, "learning_rate": 0.0002, "epoch": 3.946150610012621, "step": 4690}, {"loss": 1.5608, "grad_norm": 0.6152557134628296, "learning_rate": 0.0002, "epoch": 3.954564577198149, "step": 4700}, {"loss": 1.4897, "grad_norm": 0.6239011883735657, "learning_rate": 0.0002, "epoch": 3.962978544383677, "step": 4710}, {"loss": 1.538, "grad_norm": 0.6485443115234375, "learning_rate": 0.0002, "epoch": 3.971392511569205, "step": 4720}, {"loss": 1.5226, "grad_norm": 0.6449228525161743, "learning_rate": 0.0002, "epoch": 3.9798064787547327, "step": 4730}, {"loss": 1.5087, "grad_norm": 0.6526407599449158, "learning_rate": 0.0002, "epoch": 3.988220445940261, "step": 4740}, {"loss": 1.5026, "grad_norm": 0.6277706027030945, "learning_rate": 0.0002, "epoch": 3.996634413125789, "step": 4750}, {"eval_loss": 1.871641755104065, "eval_runtime": 37.9637, "eval_samples_per_second": 13.566, "eval_steps_per_second": 1.712, "epoch": 4.0, "step": 4754}, {"loss": 1.4744, "grad_norm": 0.6994837522506714, "learning_rate": 0.0002, "epoch": 4.005048380311317, "step": 4760}, {"loss": 1.4433, "grad_norm": 0.8728373050689697, "learning_rate": 0.0002, "epoch": 4.013462347496845, "step": 4770}, {"loss": 1.3329, "grad_norm": 0.688679575920105, "learning_rate": 0.0002, "epoch": 4.021876314682372, "step": 4780}, {"loss": 1.3999, "grad_norm": 0.6313387155532837, "learning_rate": 0.0002, "epoch": 4.0302902818679005, "step": 4790}, {"loss": 1.3346, "grad_norm": 0.6577984690666199, "learning_rate": 0.0002, "epoch": 4.038704249053429, "step": 4800}, {"loss": 1.3403, "grad_norm": 0.7938185930252075, "learning_rate": 0.0002, "epoch": 4.047118216238957, "step": 4810}, {"loss": 1.3716, "grad_norm": 0.760399580001831, "learning_rate": 0.0002, "epoch": 4.055532183424485, "step": 4820}, {"loss": 1.4321, "grad_norm": 0.7329602241516113, "learning_rate": 0.0002, "epoch": 4.063946150610013, "step": 4830}, {"loss": 1.4133, "grad_norm": 0.7778576016426086, "learning_rate": 0.0002, "epoch": 4.07236011779554, "step": 4840}, {"loss": 1.4372, "grad_norm": 0.8235865235328674, "learning_rate": 0.0002, "epoch": 4.080774084981068, "step": 4850}, {"loss": 1.3719, "grad_norm": 0.7743754386901855, "learning_rate": 0.0002, "epoch": 4.089188052166596, "step": 4860}, {"loss": 1.3787, "grad_norm": 0.8145367503166199, "learning_rate": 0.0002, "epoch": 4.0976020193521245, "step": 4870}, {"loss": 1.356, "grad_norm": 0.8517307639122009, "learning_rate": 0.0002, "epoch": 4.106015986537653, "step": 4880}, {"loss": 1.4191, "grad_norm": 0.8208953142166138, "learning_rate": 0.0002, "epoch": 4.114429953723181, "step": 4890}, {"loss": 1.3189, "grad_norm": 0.8437790870666504, "learning_rate": 0.0002, "epoch": 4.122843920908709, "step": 4900}, {"loss": 1.3987, "grad_norm": 0.716672420501709, "learning_rate": 0.0002, "epoch": 4.131257888094236, "step": 4910}, {"loss": 1.4392, "grad_norm": 0.7656235098838806, "learning_rate": 0.0002, "epoch": 4.139671855279764, "step": 4920}, {"loss": 1.3408, "grad_norm": 0.7209306955337524, "learning_rate": 0.0002, "epoch": 4.148085822465292, "step": 4930}, {"loss": 1.3639, "grad_norm": 0.7731267809867859, "learning_rate": 0.0002, "epoch": 4.1564997896508205, "step": 4940}, {"loss": 1.4151, "grad_norm": 0.7477553486824036, "learning_rate": 0.0002, "epoch": 4.164913756836349, "step": 4950}, {"loss": 1.3485, "grad_norm": 0.7372981309890747, "learning_rate": 0.0002, "epoch": 4.173327724021877, "step": 4960}, {"loss": 1.3901, "grad_norm": 0.6582154035568237, "learning_rate": 0.0002, "epoch": 4.181741691207404, "step": 4970}, {"loss": 1.3343, "grad_norm": 0.7003206610679626, "learning_rate": 0.0002, "epoch": 4.190155658392932, "step": 4980}, {"loss": 1.4098, "grad_norm": 0.735223650932312, "learning_rate": 0.0002, "epoch": 4.19856962557846, "step": 4990}, {"loss": 1.3564, "grad_norm": 0.7832302451133728, "learning_rate": 0.0002, "epoch": 4.206983592763988, "step": 5000}, {"loss": 1.3622, "grad_norm": 0.8819546103477478, "learning_rate": 0.0002, "epoch": 4.215397559949516, "step": 5010}, {"loss": 1.4438, "grad_norm": 0.9325336813926697, "learning_rate": 0.0002, "epoch": 4.2238115271350445, "step": 5020}, {"loss": 1.3886, "grad_norm": 0.7007517218589783, "learning_rate": 0.0002, "epoch": 4.232225494320572, "step": 5030}, {"loss": 1.3683, "grad_norm": 0.7118321061134338, "learning_rate": 0.0002, "epoch": 4.2406394615061, "step": 5040}, {"loss": 1.2365, "grad_norm": 0.6578946709632874, "learning_rate": 0.0002, "epoch": 4.249053428691628, "step": 5050}, {"loss": 1.3696, "grad_norm": 0.9438983798027039, "learning_rate": 0.0002, "epoch": 4.257467395877156, "step": 5060}, {"loss": 1.3868, "grad_norm": 0.703037679195404, "learning_rate": 0.0002, "epoch": 4.265881363062684, "step": 5070}, {"loss": 1.3687, "grad_norm": 0.7286025285720825, "learning_rate": 0.0002, "epoch": 4.274295330248212, "step": 5080}, {"loss": 1.3605, "grad_norm": 0.750689685344696, "learning_rate": 0.0002, "epoch": 4.28270929743374, "step": 5090}, {"loss": 1.5089, "grad_norm": 0.869753360748291, "learning_rate": 0.0002, "epoch": 4.291123264619268, "step": 5100}, {"loss": 1.4128, "grad_norm": 0.8712980151176453, "learning_rate": 0.0002, "epoch": 4.299537231804796, "step": 5110}, {"loss": 1.3977, "grad_norm": 0.690263569355011, "learning_rate": 0.0002, "epoch": 4.307951198990324, "step": 5120}, {"loss": 1.4088, "grad_norm": 0.7114760279655457, "learning_rate": 0.0002, "epoch": 4.316365166175852, "step": 5130}, {"loss": 1.363, "grad_norm": 0.7588112354278564, "learning_rate": 0.0002, "epoch": 4.32477913336138, "step": 5140}, {"loss": 1.4408, "grad_norm": 0.7556202411651611, "learning_rate": 0.0002, "epoch": 4.333193100546908, "step": 5150}, {"loss": 1.4203, "grad_norm": 0.8357610702514648, "learning_rate": 0.0002, "epoch": 4.341607067732435, "step": 5160}, {"loss": 1.3348, "grad_norm": 0.8054035902023315, "learning_rate": 0.0002, "epoch": 4.3500210349179635, "step": 5170}, {"loss": 1.3109, "grad_norm": 0.7637107968330383, "learning_rate": 0.0002, "epoch": 4.358435002103492, "step": 5180}, {"loss": 1.3744, "grad_norm": 0.757481038570404, "learning_rate": 0.0002, "epoch": 4.36684896928902, "step": 5190}, {"loss": 1.3622, "grad_norm": 0.7185863852500916, "learning_rate": 0.0002, "epoch": 4.375262936474548, "step": 5200}, {"loss": 1.3896, "grad_norm": 0.7326455116271973, "learning_rate": 0.0002, "epoch": 4.383676903660076, "step": 5210}, {"loss": 1.4098, "grad_norm": 0.7980523109436035, "learning_rate": 0.0002, "epoch": 4.392090870845603, "step": 5220}, {"loss": 1.3783, "grad_norm": 0.8526999354362488, "learning_rate": 0.0002, "epoch": 4.400504838031131, "step": 5230}, {"loss": 1.4022, "grad_norm": 0.7012337446212769, "learning_rate": 0.0002, "epoch": 4.4089188052166595, "step": 5240}, {"loss": 1.3552, "grad_norm": 0.8217827677726746, "learning_rate": 0.0002, "epoch": 4.417332772402188, "step": 5250}, {"loss": 1.3482, "grad_norm": 0.7141005396842957, "learning_rate": 0.0002, "epoch": 4.425746739587716, "step": 5260}, {"loss": 1.3699, "grad_norm": 0.7094302177429199, "learning_rate": 0.0002, "epoch": 4.434160706773244, "step": 5270}, {"loss": 1.3527, "grad_norm": 0.7234613299369812, "learning_rate": 0.0002, "epoch": 4.442574673958772, "step": 5280}, {"loss": 1.4769, "grad_norm": 0.7530457973480225, "learning_rate": 0.0002, "epoch": 4.450988641144299, "step": 5290}, {"loss": 1.3944, "grad_norm": 0.7300912141799927, "learning_rate": 0.0002, "epoch": 4.459402608329827, "step": 5300}, {"loss": 1.3844, "grad_norm": 0.825443685054779, "learning_rate": 0.0002, "epoch": 4.467816575515355, "step": 5310}, {"loss": 1.3648, "grad_norm": 0.7559658885002136, "learning_rate": 0.0002, "epoch": 4.4762305427008835, "step": 5320}, {"loss": 1.4364, "grad_norm": 0.8817561268806458, "learning_rate": 0.0002, "epoch": 4.484644509886412, "step": 5330}, {"loss": 1.3618, "grad_norm": 0.8203575611114502, "learning_rate": 0.0002, "epoch": 4.49305847707194, "step": 5340}, {"loss": 1.3996, "grad_norm": 0.7677690982818604, "learning_rate": 0.0002, "epoch": 4.501472444257468, "step": 5350}, {"loss": 1.4142, "grad_norm": 0.657085120677948, "learning_rate": 0.0002, "epoch": 4.509886411442995, "step": 5360}, {"loss": 1.3722, "grad_norm": 0.7939504384994507, "learning_rate": 0.0002, "epoch": 4.518300378628523, "step": 5370}, {"loss": 1.4361, "grad_norm": 0.6971889138221741, "learning_rate": 0.0002, "epoch": 4.526714345814051, "step": 5380}, {"loss": 1.3637, "grad_norm": 0.6984175443649292, "learning_rate": 0.0002, "epoch": 4.535128312999579, "step": 5390}, {"loss": 1.341, "grad_norm": 0.8504858613014221, "learning_rate": 0.0002, "epoch": 4.5435422801851075, "step": 5400}, {"loss": 1.4026, "grad_norm": 0.9134073853492737, "learning_rate": 0.0002, "epoch": 4.551956247370635, "step": 5410}, {"loss": 1.4375, "grad_norm": 0.7765598893165588, "learning_rate": 0.0002, "epoch": 4.560370214556163, "step": 5420}, {"loss": 1.4832, "grad_norm": 0.6991009712219238, "learning_rate": 0.0002, "epoch": 4.568784181741691, "step": 5430}, {"loss": 1.4021, "grad_norm": 0.8393039107322693, "learning_rate": 0.0002, "epoch": 4.577198148927219, "step": 5440}, {"loss": 1.3976, "grad_norm": 0.7685918211936951, "learning_rate": 0.0002, "epoch": 4.585612116112747, "step": 5450}, {"loss": 1.3883, "grad_norm": 0.7135679721832275, "learning_rate": 0.0002, "epoch": 4.594026083298275, "step": 5460}, {"loss": 1.4083, "grad_norm": 0.6728870868682861, "learning_rate": 0.0002, "epoch": 4.6024400504838034, "step": 5470}, {"loss": 1.3698, "grad_norm": 0.7139479517936707, "learning_rate": 0.0002, "epoch": 4.610854017669331, "step": 5480}, {"loss": 1.3498, "grad_norm": 0.8476598858833313, "learning_rate": 0.0002, "epoch": 4.619267984854859, "step": 5490}, {"loss": 1.3389, "grad_norm": 0.8034361004829407, "learning_rate": 0.0002, "epoch": 4.627681952040387, "step": 5500}, {"loss": 1.4179, "grad_norm": 0.7452183961868286, "learning_rate": 0.0002, "epoch": 4.636095919225915, "step": 5510}, {"loss": 1.4031, "grad_norm": 0.8394148945808411, "learning_rate": 0.0002, "epoch": 4.644509886411443, "step": 5520}, {"loss": 1.4561, "grad_norm": 0.7480153441429138, "learning_rate": 0.0002, "epoch": 4.652923853596971, "step": 5530}, {"loss": 1.378, "grad_norm": 0.7781714797019958, "learning_rate": 0.0002, "epoch": 4.661337820782499, "step": 5540}, {"loss": 1.3924, "grad_norm": 1.0058213472366333, "learning_rate": 0.0002, "epoch": 4.669751787968027, "step": 5550}, {"loss": 1.4198, "grad_norm": 0.7403179407119751, "learning_rate": 0.0002, "epoch": 4.678165755153555, "step": 5560}, {"loss": 1.4328, "grad_norm": 0.7270476818084717, "learning_rate": 0.0002, "epoch": 4.686579722339083, "step": 5570}, {"loss": 1.378, "grad_norm": 0.760877788066864, "learning_rate": 0.0002, "epoch": 4.694993689524611, "step": 5580}, {"loss": 1.387, "grad_norm": 0.8097004890441895, "learning_rate": 0.0002, "epoch": 4.703407656710139, "step": 5590}, {"loss": 1.3661, "grad_norm": 0.9096523523330688, "learning_rate": 0.0002, "epoch": 4.711821623895667, "step": 5600}, {"loss": 1.4012, "grad_norm": 0.7262444496154785, "learning_rate": 0.0002, "epoch": 4.720235591081195, "step": 5610}, {"loss": 1.422, "grad_norm": 0.8207762837409973, "learning_rate": 0.0002, "epoch": 4.7286495582667225, "step": 5620}, {"loss": 1.4017, "grad_norm": 0.8089601993560791, "learning_rate": 0.0002, "epoch": 4.737063525452251, "step": 5630}, {"loss": 1.3675, "grad_norm": 0.7609543800354004, "learning_rate": 0.0002, "epoch": 4.745477492637779, "step": 5640}, {"loss": 1.4085, "grad_norm": 0.7273501753807068, "learning_rate": 0.0002, "epoch": 4.753891459823307, "step": 5650}, {"loss": 1.3849, "grad_norm": 0.7800219058990479, "learning_rate": 0.0002, "epoch": 4.762305427008835, "step": 5660}, {"loss": 1.4319, "grad_norm": 0.8558377623558044, "learning_rate": 0.0002, "epoch": 4.770719394194362, "step": 5670}, {"loss": 1.3831, "grad_norm": 0.7131547927856445, "learning_rate": 0.0002, "epoch": 4.77913336137989, "step": 5680}, {"loss": 1.407, "grad_norm": 0.7651025056838989, "learning_rate": 0.0002, "epoch": 4.787547328565418, "step": 5690}, {"loss": 1.3882, "grad_norm": 0.8129976391792297, "learning_rate": 0.0002, "epoch": 4.7959612957509465, "step": 5700}, {"loss": 1.4347, "grad_norm": 0.8019895553588867, "learning_rate": 0.0002, "epoch": 4.804375262936475, "step": 5710}, {"loss": 1.3961, "grad_norm": 0.7692018151283264, "learning_rate": 0.0002, "epoch": 4.812789230122003, "step": 5720}, {"loss": 1.419, "grad_norm": 0.6893943548202515, "learning_rate": 0.0002, "epoch": 4.821203197307531, "step": 5730}, {"loss": 1.4453, "grad_norm": 0.6881810426712036, "learning_rate": 0.0002, "epoch": 4.829617164493058, "step": 5740}, {"loss": 1.4775, "grad_norm": 0.7838267683982849, "learning_rate": 0.0002, "epoch": 4.838031131678586, "step": 5750}, {"loss": 1.3857, "grad_norm": 0.727799117565155, "learning_rate": 0.0002, "epoch": 4.846445098864114, "step": 5760}, {"loss": 1.4685, "grad_norm": 0.7458277344703674, "learning_rate": 0.0002, "epoch": 4.8548590660496425, "step": 5770}, {"loss": 1.4426, "grad_norm": 0.903802216053009, "learning_rate": 0.0002, "epoch": 4.863273033235171, "step": 5780}, {"loss": 1.451, "grad_norm": 0.7983472347259521, "learning_rate": 0.0002, "epoch": 4.871687000420699, "step": 5790}, {"loss": 1.4534, "grad_norm": 0.6894361972808838, "learning_rate": 0.0002, "epoch": 4.880100967606227, "step": 5800}, {"loss": 1.4486, "grad_norm": 0.7499409317970276, "learning_rate": 0.0002, "epoch": 4.888514934791754, "step": 5810}, {"loss": 1.4253, "grad_norm": 0.7362820506095886, "learning_rate": 0.0002, "epoch": 4.896928901977282, "step": 5820}, {"loss": 1.3763, "grad_norm": 0.8341619968414307, "learning_rate": 0.0002, "epoch": 4.90534286916281, "step": 5830}, {"loss": 1.3748, "grad_norm": 0.9604470133781433, "learning_rate": 0.0002, "epoch": 4.913756836348338, "step": 5840}, {"loss": 1.3658, "grad_norm": 0.8916844129562378, "learning_rate": 0.0002, "epoch": 4.9221708035338665, "step": 5850}, {"loss": 1.363, "grad_norm": 0.8519647121429443, "learning_rate": 0.0002, "epoch": 4.930584770719394, "step": 5860}, {"loss": 1.424, "grad_norm": 0.7946906089782715, "learning_rate": 0.0002, "epoch": 4.938998737904922, "step": 5870}, {"loss": 1.4071, "grad_norm": 0.7843789458274841, "learning_rate": 0.0002, "epoch": 4.94741270509045, "step": 5880}, {"loss": 1.4021, "grad_norm": 0.707618772983551, "learning_rate": 0.0002, "epoch": 4.955826672275978, "step": 5890}, {"loss": 1.502, "grad_norm": 0.7704206109046936, "learning_rate": 0.0002, "epoch": 4.964240639461506, "step": 5900}, {"loss": 1.4456, "grad_norm": 0.7160256505012512, "learning_rate": 0.0002, "epoch": 4.972654606647034, "step": 5910}, {"loss": 1.3874, "grad_norm": 0.7020420432090759, "learning_rate": 0.0002, "epoch": 4.981068573832562, "step": 5920}, {"loss": 1.4037, "grad_norm": 0.7576286792755127, "learning_rate": 0.0002, "epoch": 4.98948254101809, "step": 5930}, {"loss": 1.414, "grad_norm": 0.8573036789894104, "learning_rate": 0.0002, "epoch": 4.997896508203618, "step": 5940}, {"eval_loss": 1.9353811740875244, "eval_runtime": 37.9208, "eval_samples_per_second": 13.581, "eval_steps_per_second": 1.714, "epoch": 4.999579301640724, "step": 5942}, {"loss": 1.2418, "grad_norm": 0.8204267621040344, "learning_rate": 0.0002, "epoch": 5.006310475389146, "step": 5950}, {"loss": 1.235, "grad_norm": 0.976840615272522, "learning_rate": 0.0002, "epoch": 5.014724442574674, "step": 5960}, {"loss": 1.2134, "grad_norm": 0.8765613436698914, "learning_rate": 0.0002, "epoch": 5.023138409760202, "step": 5970}, {"loss": 1.2748, "grad_norm": 1.1793042421340942, "learning_rate": 0.0002, "epoch": 5.03155237694573, "step": 5980}, {"loss": 1.2412, "grad_norm": 0.971062958240509, "learning_rate": 0.0002, "epoch": 5.039966344131258, "step": 5990}, {"loss": 1.1819, "grad_norm": 0.8649757504463196, "learning_rate": 0.0002, "epoch": 5.0483803113167856, "step": 6000}, {"loss": 1.1654, "grad_norm": 0.9563034176826477, "learning_rate": 0.0002, "epoch": 5.056794278502314, "step": 6010}, {"loss": 1.2238, "grad_norm": 1.0093994140625, "learning_rate": 0.0002, "epoch": 5.065208245687842, "step": 6020}, {"loss": 1.2519, "grad_norm": 1.004213571548462, "learning_rate": 0.0002, "epoch": 5.07362221287337, "step": 6030}, {"loss": 1.2379, "grad_norm": 0.8307787179946899, "learning_rate": 0.0002, "epoch": 5.082036180058898, "step": 6040}, {"loss": 1.2282, "grad_norm": 0.9117848873138428, "learning_rate": 0.0002, "epoch": 5.090450147244426, "step": 6050}, {"loss": 1.2582, "grad_norm": 1.0269840955734253, "learning_rate": 0.0002, "epoch": 5.098864114429953, "step": 6060}, {"loss": 1.1836, "grad_norm": 0.9079542756080627, "learning_rate": 0.0002, "epoch": 5.1072780816154815, "step": 6070}, {"loss": 1.215, "grad_norm": 0.885702908039093, "learning_rate": 0.0002, "epoch": 5.11569204880101, "step": 6080}, {"loss": 1.2406, "grad_norm": 0.9976128339767456, "learning_rate": 0.0002, "epoch": 5.124106015986538, "step": 6090}, {"loss": 1.3082, "grad_norm": 0.8472117185592651, "learning_rate": 0.0002, "epoch": 5.132519983172066, "step": 6100}, {"loss": 1.226, "grad_norm": 1.0385161638259888, "learning_rate": 0.0002, "epoch": 5.140933950357594, "step": 6110}, {"loss": 1.213, "grad_norm": 0.8948383927345276, "learning_rate": 0.0002, "epoch": 5.149347917543121, "step": 6120}, {"loss": 1.2213, "grad_norm": 1.2613716125488281, "learning_rate": 0.0002, "epoch": 5.157761884728649, "step": 6130}, {"loss": 1.2632, "grad_norm": 0.9933410286903381, "learning_rate": 0.0002, "epoch": 5.166175851914177, "step": 6140}, {"loss": 1.1715, "grad_norm": 0.9673663973808289, "learning_rate": 0.0002, "epoch": 5.1745898190997055, "step": 6150}, {"loss": 1.2947, "grad_norm": 0.9969648122787476, "learning_rate": 0.0002, "epoch": 5.183003786285234, "step": 6160}, {"loss": 1.2416, "grad_norm": 1.2163258790969849, "learning_rate": 0.0002, "epoch": 5.191417753470762, "step": 6170}, {"loss": 1.2221, "grad_norm": 0.9163419604301453, "learning_rate": 0.0002, "epoch": 5.19983172065629, "step": 6180}, {"loss": 1.2624, "grad_norm": 0.9225585460662842, "learning_rate": 0.0002, "epoch": 5.208245687841817, "step": 6190}, {"loss": 1.2932, "grad_norm": 0.9205296635627747, "learning_rate": 0.0002, "epoch": 5.216659655027345, "step": 6200}, {"loss": 1.1825, "grad_norm": 1.0655443668365479, "learning_rate": 0.0002, "epoch": 5.225073622212873, "step": 6210}, {"loss": 1.2613, "grad_norm": 1.0854865312576294, "learning_rate": 0.0002, "epoch": 5.233487589398401, "step": 6220}, {"loss": 1.3045, "grad_norm": 0.8489186763763428, "learning_rate": 0.0002, "epoch": 5.2419015565839295, "step": 6230}, {"loss": 1.2708, "grad_norm": 0.910391628742218, "learning_rate": 0.0002, "epoch": 5.250315523769458, "step": 6240}, {"loss": 1.1914, "grad_norm": 0.925507128238678, "learning_rate": 0.0002, "epoch": 5.258729490954985, "step": 6250}, {"loss": 1.3368, "grad_norm": 1.1069735288619995, "learning_rate": 0.0002, "epoch": 5.267143458140513, "step": 6260}, {"loss": 1.2505, "grad_norm": 0.9705119132995605, "learning_rate": 0.0002, "epoch": 5.275557425326041, "step": 6270}, {"loss": 1.2602, "grad_norm": 0.9752426147460938, "learning_rate": 0.0002, "epoch": 5.283971392511569, "step": 6280}, {"loss": 1.2043, "grad_norm": 1.021359920501709, "learning_rate": 0.0002, "epoch": 5.292385359697097, "step": 6290}, {"loss": 1.2848, "grad_norm": 1.148606300354004, "learning_rate": 0.0002, "epoch": 5.3007993268826255, "step": 6300}, {"loss": 1.2201, "grad_norm": 0.8909247517585754, "learning_rate": 0.0002, "epoch": 5.309213294068153, "step": 6310}, {"loss": 1.2376, "grad_norm": 0.9879156351089478, "learning_rate": 0.0002, "epoch": 5.317627261253681, "step": 6320}, {"loss": 1.2638, "grad_norm": 0.9473357200622559, "learning_rate": 0.0002, "epoch": 5.326041228439209, "step": 6330}, {"loss": 1.232, "grad_norm": 1.1422028541564941, "learning_rate": 0.0002, "epoch": 5.334455195624737, "step": 6340}, {"loss": 1.263, "grad_norm": 0.9942235350608826, "learning_rate": 0.0002, "epoch": 5.342869162810265, "step": 6350}, {"loss": 1.3032, "grad_norm": 0.9535723924636841, "learning_rate": 0.0002, "epoch": 5.351283129995793, "step": 6360}, {"loss": 1.2908, "grad_norm": 0.9020892381668091, "learning_rate": 0.0002, "epoch": 5.359697097181321, "step": 6370}, {"loss": 1.2023, "grad_norm": 1.0626472234725952, "learning_rate": 0.0002, "epoch": 5.368111064366849, "step": 6380}, {"loss": 1.2555, "grad_norm": 1.1395848989486694, "learning_rate": 0.0002, "epoch": 5.376525031552377, "step": 6390}, {"loss": 1.2839, "grad_norm": 0.9274451732635498, "learning_rate": 0.0002, "epoch": 5.384938998737905, "step": 6400}, {"loss": 1.2819, "grad_norm": 0.8108699917793274, "learning_rate": 0.0002, "epoch": 5.393352965923433, "step": 6410}, {"loss": 1.2589, "grad_norm": 1.1805564165115356, "learning_rate": 0.0002, "epoch": 5.401766933108961, "step": 6420}, {"loss": 1.3549, "grad_norm": 0.8321298360824585, "learning_rate": 0.0002, "epoch": 5.410180900294489, "step": 6430}, {"loss": 1.2925, "grad_norm": 0.8981925249099731, "learning_rate": 0.0002, "epoch": 5.418594867480017, "step": 6440}, {"loss": 1.258, "grad_norm": 1.0730986595153809, "learning_rate": 0.0002, "epoch": 5.4270088346655445, "step": 6450}, {"loss": 1.26, "grad_norm": 1.0584609508514404, "learning_rate": 0.0002, "epoch": 5.435422801851073, "step": 6460}, {"loss": 1.2847, "grad_norm": 1.0792299509048462, "learning_rate": 0.0002, "epoch": 5.443836769036601, "step": 6470}, {"loss": 1.2035, "grad_norm": 0.9101872444152832, "learning_rate": 0.0002, "epoch": 5.452250736222129, "step": 6480}, {"loss": 1.2574, "grad_norm": 0.9910100698471069, "learning_rate": 0.0002, "epoch": 5.460664703407657, "step": 6490}, {"loss": 1.3098, "grad_norm": 1.041412353515625, "learning_rate": 0.0002, "epoch": 5.469078670593185, "step": 6500}, {"loss": 1.2812, "grad_norm": 1.0091687440872192, "learning_rate": 0.0002, "epoch": 5.477492637778712, "step": 6510}, {"loss": 1.2523, "grad_norm": 0.8755383491516113, "learning_rate": 0.0002, "epoch": 5.48590660496424, "step": 6520}, {"loss": 1.3042, "grad_norm": 0.980212390422821, "learning_rate": 0.0002, "epoch": 5.4943205721497685, "step": 6530}, {"loss": 1.2873, "grad_norm": 0.9356869459152222, "learning_rate": 0.0002, "epoch": 5.502734539335297, "step": 6540}, {"loss": 1.2254, "grad_norm": 0.9008095264434814, "learning_rate": 0.0002, "epoch": 5.511148506520825, "step": 6550}, {"loss": 1.2818, "grad_norm": 0.8908938765525818, "learning_rate": 0.0002, "epoch": 5.519562473706353, "step": 6560}, {"loss": 1.2212, "grad_norm": 1.1423932313919067, "learning_rate": 0.0002, "epoch": 5.52797644089188, "step": 6570}, {"loss": 1.3039, "grad_norm": 1.0508161783218384, "learning_rate": 0.0002, "epoch": 5.536390408077408, "step": 6580}, {"loss": 1.2446, "grad_norm": 0.8357517719268799, "learning_rate": 0.0002, "epoch": 5.544804375262936, "step": 6590}, {"loss": 1.3037, "grad_norm": 0.9892540574073792, "learning_rate": 0.0002, "epoch": 5.5532183424484645, "step": 6600}, {"loss": 1.3028, "grad_norm": 1.0048326253890991, "learning_rate": 0.0002, "epoch": 5.561632309633993, "step": 6610}, {"loss": 1.2152, "grad_norm": 0.9801995158195496, "learning_rate": 0.0002, "epoch": 5.570046276819521, "step": 6620}, {"loss": 1.2606, "grad_norm": 0.9899214506149292, "learning_rate": 0.0002, "epoch": 5.578460244005049, "step": 6630}, {"loss": 1.2043, "grad_norm": 1.1911814212799072, "learning_rate": 0.0002, "epoch": 5.586874211190576, "step": 6640}, {"loss": 1.3458, "grad_norm": 1.0368894338607788, "learning_rate": 0.0002, "epoch": 5.595288178376104, "step": 6650}, {"loss": 1.2595, "grad_norm": 1.1248382329940796, "learning_rate": 0.0002, "epoch": 5.603702145561632, "step": 6660}, {"loss": 1.2548, "grad_norm": 0.9765539765357971, "learning_rate": 0.0002, "epoch": 5.61211611274716, "step": 6670}, {"loss": 1.3451, "grad_norm": 0.9810206890106201, "learning_rate": 0.0002, "epoch": 5.6205300799326885, "step": 6680}, {"loss": 1.2952, "grad_norm": 1.100386619567871, "learning_rate": 0.0002, "epoch": 5.628944047118217, "step": 6690}, {"loss": 1.2467, "grad_norm": 0.8824519515037537, "learning_rate": 0.0002, "epoch": 5.637358014303744, "step": 6700}, {"loss": 1.25, "grad_norm": 1.0864064693450928, "learning_rate": 0.0002, "epoch": 5.645771981489272, "step": 6710}, {"loss": 1.2479, "grad_norm": 1.1614511013031006, "learning_rate": 0.0002, "epoch": 5.6541859486748, "step": 6720}, {"loss": 1.2753, "grad_norm": 1.0762972831726074, "learning_rate": 0.0002, "epoch": 5.662599915860328, "step": 6730}, {"loss": 1.2741, "grad_norm": 0.9408974647521973, "learning_rate": 0.0002, "epoch": 5.671013883045856, "step": 6740}, {"loss": 1.2431, "grad_norm": 0.8906030058860779, "learning_rate": 0.0002, "epoch": 5.679427850231384, "step": 6750}, {"loss": 1.2643, "grad_norm": 0.9527303576469421, "learning_rate": 0.0002, "epoch": 5.687841817416912, "step": 6760}, {"loss": 1.322, "grad_norm": 0.9471196532249451, "learning_rate": 0.0002, "epoch": 5.69625578460244, "step": 6770}, {"loss": 1.2514, "grad_norm": 0.9186838865280151, "learning_rate": 0.0002, "epoch": 5.704669751787968, "step": 6780}, {"loss": 1.2347, "grad_norm": 0.9225441813468933, "learning_rate": 0.0002, "epoch": 5.713083718973496, "step": 6790}, {"loss": 1.1849, "grad_norm": 0.9712982773780823, "learning_rate": 0.0002, "epoch": 5.721497686159024, "step": 6800}, {"loss": 1.2431, "grad_norm": 1.0743170976638794, "learning_rate": 0.0002, "epoch": 5.729911653344552, "step": 6810}, {"loss": 1.2136, "grad_norm": 1.2738113403320312, "learning_rate": 0.0002, "epoch": 5.73832562053008, "step": 6820}, {"loss": 1.2176, "grad_norm": 0.9386790990829468, "learning_rate": 0.0002, "epoch": 5.7467395877156076, "step": 6830}, {"loss": 1.285, "grad_norm": 1.0817769765853882, "learning_rate": 0.0002, "epoch": 5.755153554901136, "step": 6840}, {"loss": 1.2247, "grad_norm": 1.1040263175964355, "learning_rate": 0.0002, "epoch": 5.763567522086664, "step": 6850}, {"loss": 1.2507, "grad_norm": 1.0656492710113525, "learning_rate": 0.0002, "epoch": 5.771981489272192, "step": 6860}, {"loss": 1.2999, "grad_norm": 0.9550157189369202, "learning_rate": 0.0002, "epoch": 5.78039545645772, "step": 6870}, {"loss": 1.3201, "grad_norm": 1.0130870342254639, "learning_rate": 0.0002, "epoch": 5.788809423643248, "step": 6880}, {"loss": 1.3392, "grad_norm": 1.0675787925720215, "learning_rate": 0.0002, "epoch": 5.797223390828776, "step": 6890}, {"loss": 1.2949, "grad_norm": 0.9537774920463562, "learning_rate": 0.0002, "epoch": 5.8056373580143035, "step": 6900}, {"loss": 1.2658, "grad_norm": 0.9640319347381592, "learning_rate": 0.0002, "epoch": 5.814051325199832, "step": 6910}, {"loss": 1.2199, "grad_norm": 0.8917992115020752, "learning_rate": 0.0002, "epoch": 5.82246529238536, "step": 6920}, {"loss": 1.373, "grad_norm": 0.9881822466850281, "learning_rate": 0.0002, "epoch": 5.830879259570888, "step": 6930}, {"loss": 1.323, "grad_norm": 0.9136882424354553, "learning_rate": 0.0002, "epoch": 5.839293226756416, "step": 6940}, {"loss": 1.3159, "grad_norm": 0.9086098074913025, "learning_rate": 0.0002, "epoch": 5.847707193941943, "step": 6950}, {"loss": 1.2624, "grad_norm": 0.9443018436431885, "learning_rate": 0.0002, "epoch": 5.856121161127471, "step": 6960}, {"loss": 1.3224, "grad_norm": 0.9915381669998169, "learning_rate": 0.0002, "epoch": 5.864535128312999, "step": 6970}, {"loss": 1.337, "grad_norm": 0.8939146995544434, "learning_rate": 0.0002, "epoch": 5.8729490954985275, "step": 6980}, {"loss": 1.2611, "grad_norm": 1.3672245740890503, "learning_rate": 0.0002, "epoch": 5.881363062684056, "step": 6990}, {"loss": 1.3012, "grad_norm": 1.0116257667541504, "learning_rate": 0.0002, "epoch": 5.889777029869584, "step": 7000}, {"loss": 1.3128, "grad_norm": 1.1561565399169922, "learning_rate": 0.0002, "epoch": 5.898190997055112, "step": 7010}, {"loss": 1.2301, "grad_norm": 0.9900678992271423, "learning_rate": 0.0002, "epoch": 5.906604964240639, "step": 7020}, {"loss": 1.2845, "grad_norm": 0.9297345876693726, "learning_rate": 0.0002, "epoch": 5.915018931426167, "step": 7030}, {"loss": 1.2317, "grad_norm": 0.9357825517654419, "learning_rate": 0.0002, "epoch": 5.923432898611695, "step": 7040}, {"loss": 1.2303, "grad_norm": 1.049317717552185, "learning_rate": 0.0002, "epoch": 5.931846865797223, "step": 7050}, {"loss": 1.3243, "grad_norm": 0.950633704662323, "learning_rate": 0.0002, "epoch": 5.9402608329827515, "step": 7060}, {"loss": 1.2758, "grad_norm": 0.854581892490387, "learning_rate": 0.0002, "epoch": 5.94867480016828, "step": 7070}, {"loss": 1.3252, "grad_norm": 0.9097039699554443, "learning_rate": 0.0002, "epoch": 5.957088767353808, "step": 7080}, {"loss": 1.291, "grad_norm": 0.9072173237800598, "learning_rate": 0.0002, "epoch": 5.965502734539335, "step": 7090}, {"loss": 1.2724, "grad_norm": 1.0470727682113647, "learning_rate": 0.0002, "epoch": 5.973916701724863, "step": 7100}, {"loss": 1.3324, "grad_norm": 1.2628462314605713, "learning_rate": 0.0002, "epoch": 5.982330668910391, "step": 7110}, {"loss": 1.2701, "grad_norm": 1.055279016494751, "learning_rate": 0.0002, "epoch": 5.990744636095919, "step": 7120}, {"loss": 1.3234, "grad_norm": 0.966194212436676, "learning_rate": 0.0002, "epoch": 5.9991586032814475, "step": 7130}, {"eval_loss": 2.0427448749542236, "eval_runtime": 37.8426, "eval_samples_per_second": 13.609, "eval_steps_per_second": 1.718, "epoch": 6.0, "step": 7131}, {"loss": 1.1308, "grad_norm": 1.4037928581237793, "learning_rate": 0.0002, "epoch": 6.007572570466976, "step": 7140}, {"loss": 1.047, "grad_norm": 1.1081010103225708, "learning_rate": 0.0002, "epoch": 6.015986537652503, "step": 7150}, {"loss": 1.1368, "grad_norm": 1.1585499048233032, "learning_rate": 0.0002, "epoch": 6.024400504838031, "step": 7160}, {"loss": 1.0192, "grad_norm": 1.0822780132293701, "learning_rate": 0.0002, "epoch": 6.032814472023559, "step": 7170}, {"loss": 1.0755, "grad_norm": 0.9662094712257385, "learning_rate": 0.0002, "epoch": 6.041228439209087, "step": 7180}, {"loss": 1.1366, "grad_norm": 1.063936710357666, "learning_rate": 0.0002, "epoch": 6.049642406394615, "step": 7190}, {"loss": 1.0121, "grad_norm": 1.0349032878875732, "learning_rate": 0.0002, "epoch": 6.058056373580143, "step": 7200}, {"loss": 1.0591, "grad_norm": 1.0312575101852417, "learning_rate": 0.0002, "epoch": 6.066470340765671, "step": 7210}, {"loss": 1.1824, "grad_norm": 1.1942846775054932, "learning_rate": 0.0002, "epoch": 6.074884307951199, "step": 7220}, {"loss": 1.1034, "grad_norm": 1.0816049575805664, "learning_rate": 0.0002, "epoch": 6.083298275136727, "step": 7230}, {"loss": 1.0859, "grad_norm": 0.9985513687133789, "learning_rate": 0.0002, "epoch": 6.091712242322255, "step": 7240}, {"loss": 1.0367, "grad_norm": 1.2573972940444946, "learning_rate": 0.0002, "epoch": 6.100126209507783, "step": 7250}, {"loss": 1.1051, "grad_norm": 1.1182395219802856, "learning_rate": 0.0002, "epoch": 6.108540176693311, "step": 7260}, {"loss": 1.1219, "grad_norm": 0.9679344296455383, "learning_rate": 0.0002, "epoch": 6.116954143878839, "step": 7270}, {"loss": 1.1192, "grad_norm": 1.0913981199264526, "learning_rate": 0.0002, "epoch": 6.1253681110643665, "step": 7280}, {"loss": 1.0411, "grad_norm": 1.1291013956069946, "learning_rate": 0.0002, "epoch": 6.133782078249895, "step": 7290}, {"loss": 1.0963, "grad_norm": 1.2679595947265625, "learning_rate": 0.0002, "epoch": 6.142196045435423, "step": 7300}, {"loss": 1.0875, "grad_norm": 1.2350026369094849, "learning_rate": 0.0002, "epoch": 6.150610012620951, "step": 7310}, {"loss": 1.1139, "grad_norm": 1.3213104009628296, "learning_rate": 0.0002, "epoch": 6.159023979806479, "step": 7320}, {"loss": 1.1167, "grad_norm": 1.1924850940704346, "learning_rate": 0.0002, "epoch": 6.167437946992007, "step": 7330}, {"loss": 1.1242, "grad_norm": 1.1890000104904175, "learning_rate": 0.0002, "epoch": 6.175851914177534, "step": 7340}, {"loss": 1.1341, "grad_norm": 1.3821455240249634, "learning_rate": 0.0002, "epoch": 6.184265881363062, "step": 7350}, {"loss": 1.0748, "grad_norm": 1.1217057704925537, "learning_rate": 0.0002, "epoch": 6.1926798485485905, "step": 7360}, {"loss": 1.159, "grad_norm": 1.2441548109054565, "learning_rate": 0.0002, "epoch": 6.201093815734119, "step": 7370}, {"loss": 1.1199, "grad_norm": 1.0837615728378296, "learning_rate": 0.0002, "epoch": 6.209507782919647, "step": 7380}, {"loss": 1.1641, "grad_norm": 1.164304256439209, "learning_rate": 0.0002, "epoch": 6.217921750105175, "step": 7390}, {"loss": 1.1325, "grad_norm": 1.3129467964172363, "learning_rate": 0.0002, "epoch": 6.226335717290702, "step": 7400}, {"loss": 1.1537, "grad_norm": 1.1938153505325317, "learning_rate": 0.0002, "epoch": 6.23474968447623, "step": 7410}, {"loss": 1.1238, "grad_norm": 1.4348443746566772, "learning_rate": 0.0002, "epoch": 6.243163651661758, "step": 7420}, {"loss": 1.0778, "grad_norm": 1.132301926612854, "learning_rate": 0.0002, "epoch": 6.2515776188472865, "step": 7430}, {"loss": 1.1148, "grad_norm": 1.136966586112976, "learning_rate": 0.0002, "epoch": 6.259991586032815, "step": 7440}, {"loss": 1.096, "grad_norm": 1.12801194190979, "learning_rate": 0.0002, "epoch": 6.268405553218343, "step": 7450}, {"loss": 1.0408, "grad_norm": 1.0246902704238892, "learning_rate": 0.0002, "epoch": 6.276819520403871, "step": 7460}, {"loss": 1.0389, "grad_norm": 1.1066974401474, "learning_rate": 0.0002, "epoch": 6.285233487589398, "step": 7470}, {"loss": 1.1589, "grad_norm": 1.012710690498352, "learning_rate": 0.0002, "epoch": 6.293647454774926, "step": 7480}, {"loss": 1.1049, "grad_norm": 1.2227119207382202, "learning_rate": 0.0002, "epoch": 6.302061421960454, "step": 7490}, {"loss": 1.1376, "grad_norm": 0.9736923575401306, "learning_rate": 0.0002, "epoch": 6.310475389145982, "step": 7500}, {"loss": 1.1017, "grad_norm": 1.2945268154144287, "learning_rate": 0.0002, "epoch": 6.3188893563315105, "step": 7510}, {"loss": 1.0724, "grad_norm": 1.1579312086105347, "learning_rate": 0.0002, "epoch": 6.327303323517039, "step": 7520}, {"loss": 1.0899, "grad_norm": 1.2404558658599854, "learning_rate": 0.0002, "epoch": 6.335717290702567, "step": 7530}, {"loss": 1.1635, "grad_norm": 1.4673258066177368, "learning_rate": 0.0002, "epoch": 6.344131257888094, "step": 7540}, {"loss": 1.128, "grad_norm": 1.2268997430801392, "learning_rate": 0.0002, "epoch": 6.352545225073622, "step": 7550}, {"loss": 1.0932, "grad_norm": 0.9772747159004211, "learning_rate": 0.0002, "epoch": 6.36095919225915, "step": 7560}, {"loss": 1.1214, "grad_norm": 1.0205204486846924, "learning_rate": 0.0002, "epoch": 6.369373159444678, "step": 7570}, {"loss": 1.1095, "grad_norm": 1.2227109670639038, "learning_rate": 0.0002, "epoch": 6.377787126630206, "step": 7580}, {"loss": 1.1115, "grad_norm": 1.0708507299423218, "learning_rate": 0.0002, "epoch": 6.3862010938157345, "step": 7590}, {"loss": 1.1018, "grad_norm": 1.1427522897720337, "learning_rate": 0.0002, "epoch": 6.394615061001262, "step": 7600}, {"loss": 1.1079, "grad_norm": 1.0706431865692139, "learning_rate": 0.0002, "epoch": 6.40302902818679, "step": 7610}, {"loss": 1.0933, "grad_norm": 1.1358282566070557, "learning_rate": 0.0002, "epoch": 6.411442995372318, "step": 7620}, {"loss": 1.1075, "grad_norm": 1.4011822938919067, "learning_rate": 0.0002, "epoch": 6.419856962557846, "step": 7630}, {"loss": 1.1269, "grad_norm": 1.5616450309753418, "learning_rate": 0.0002, "epoch": 6.428270929743374, "step": 7640}, {"loss": 1.0953, "grad_norm": 1.1442687511444092, "learning_rate": 0.0002, "epoch": 6.436684896928902, "step": 7650}, {"loss": 1.1341, "grad_norm": 1.164803147315979, "learning_rate": 0.0002, "epoch": 6.44509886411443, "step": 7660}, {"loss": 1.14, "grad_norm": 1.3184553384780884, "learning_rate": 0.0002, "epoch": 6.453512831299958, "step": 7670}, {"loss": 1.1526, "grad_norm": 1.2701894044876099, "learning_rate": 0.0002, "epoch": 6.461926798485486, "step": 7680}, {"loss": 1.2119, "grad_norm": 1.1998416185379028, "learning_rate": 0.0002, "epoch": 6.470340765671014, "step": 7690}, {"loss": 1.1528, "grad_norm": 1.156459927558899, "learning_rate": 0.0002, "epoch": 6.478754732856542, "step": 7700}, {"loss": 1.2122, "grad_norm": 1.0217190980911255, "learning_rate": 0.0002, "epoch": 6.48716870004207, "step": 7710}, {"loss": 1.0917, "grad_norm": 1.230372428894043, "learning_rate": 0.0002, "epoch": 6.495582667227598, "step": 7720}, {"loss": 1.119, "grad_norm": 1.105675220489502, "learning_rate": 0.0002, "epoch": 6.5039966344131255, "step": 7730}, {"loss": 1.0758, "grad_norm": 1.1623669862747192, "learning_rate": 0.0002, "epoch": 6.512410601598654, "step": 7740}, {"loss": 1.1548, "grad_norm": 1.2884684801101685, "learning_rate": 0.0002, "epoch": 6.520824568784182, "step": 7750}, {"loss": 1.142, "grad_norm": 1.1785279512405396, "learning_rate": 0.0002, "epoch": 6.52923853596971, "step": 7760}, {"loss": 1.1598, "grad_norm": 1.0607101917266846, "learning_rate": 0.0002, "epoch": 6.537652503155238, "step": 7770}, {"loss": 1.1472, "grad_norm": 1.21990168094635, "learning_rate": 0.0002, "epoch": 6.546066470340766, "step": 7780}, {"loss": 1.1468, "grad_norm": 1.1498621702194214, "learning_rate": 0.0002, "epoch": 6.554480437526293, "step": 7790}, {"loss": 1.1847, "grad_norm": 1.263929009437561, "learning_rate": 0.0002, "epoch": 6.562894404711821, "step": 7800}, {"loss": 1.1177, "grad_norm": 1.1580625772476196, "learning_rate": 0.0002, "epoch": 6.5713083718973495, "step": 7810}, {"loss": 1.1313, "grad_norm": 1.4431294202804565, "learning_rate": 0.0002, "epoch": 6.579722339082878, "step": 7820}, {"loss": 1.1944, "grad_norm": 1.1309990882873535, "learning_rate": 0.0002, "epoch": 6.588136306268406, "step": 7830}, {"loss": 1.1156, "grad_norm": 1.0543386936187744, "learning_rate": 0.0002, "epoch": 6.596550273453934, "step": 7840}, {"loss": 1.0945, "grad_norm": 1.2180639505386353, "learning_rate": 0.0002, "epoch": 6.604964240639461, "step": 7850}, {"loss": 1.1318, "grad_norm": 1.0631271600723267, "learning_rate": 0.0002, "epoch": 6.613378207824989, "step": 7860}, {"loss": 1.1792, "grad_norm": 1.138885498046875, "learning_rate": 0.0002, "epoch": 6.621792175010517, "step": 7870}, {"loss": 1.1805, "grad_norm": 1.1117745637893677, "learning_rate": 0.0002, "epoch": 6.630206142196045, "step": 7880}, {"loss": 1.15, "grad_norm": 1.3734886646270752, "learning_rate": 0.0002, "epoch": 6.6386201093815735, "step": 7890}, {"loss": 1.1584, "grad_norm": 1.236003041267395, "learning_rate": 0.0002, "epoch": 6.647034076567102, "step": 7900}, {"loss": 1.1718, "grad_norm": 1.2206000089645386, "learning_rate": 0.0002, "epoch": 6.65544804375263, "step": 7910}, {"loss": 1.1637, "grad_norm": 1.2842656373977661, "learning_rate": 0.0002, "epoch": 6.663862010938157, "step": 7920}, {"loss": 1.2219, "grad_norm": 1.2365005016326904, "learning_rate": 0.0002, "epoch": 6.672275978123685, "step": 7930}, {"loss": 1.0827, "grad_norm": 1.256620168685913, "learning_rate": 0.0002, "epoch": 6.680689945309213, "step": 7940}, {"loss": 1.1788, "grad_norm": 1.3232917785644531, "learning_rate": 0.0002, "epoch": 6.689103912494741, "step": 7950}, {"loss": 1.2042, "grad_norm": 1.2470088005065918, "learning_rate": 0.0002, "epoch": 6.6975178796802695, "step": 7960}, {"loss": 1.0959, "grad_norm": 1.0511926412582397, "learning_rate": 0.0002, "epoch": 6.705931846865798, "step": 7970}, {"loss": 1.118, "grad_norm": 1.107310175895691, "learning_rate": 0.0002, "epoch": 6.714345814051326, "step": 7980}, {"loss": 1.2109, "grad_norm": 1.4069843292236328, "learning_rate": 0.0002, "epoch": 6.722759781236853, "step": 7990}, {"loss": 1.1298, "grad_norm": 1.0800836086273193, "learning_rate": 0.0002, "epoch": 6.731173748422381, "step": 8000}, {"loss": 1.1824, "grad_norm": 1.1676300764083862, "learning_rate": 0.0002, "epoch": 6.739587715607909, "step": 8010}, {"loss": 1.1253, "grad_norm": 1.0579663515090942, "learning_rate": 0.0002, "epoch": 6.748001682793437, "step": 8020}, {"loss": 1.1542, "grad_norm": 1.2770029306411743, "learning_rate": 0.0002, "epoch": 6.756415649978965, "step": 8030}, {"loss": 1.1519, "grad_norm": 1.0981038808822632, "learning_rate": 0.0002, "epoch": 6.764829617164493, "step": 8040}, {"loss": 1.1422, "grad_norm": 1.1194742918014526, "learning_rate": 0.0002, "epoch": 6.773243584350021, "step": 8050}, {"loss": 1.1463, "grad_norm": 1.0130012035369873, "learning_rate": 0.0002, "epoch": 6.781657551535549, "step": 8060}, {"loss": 1.2008, "grad_norm": 1.2051167488098145, "learning_rate": 0.0002, "epoch": 6.790071518721077, "step": 8070}, {"loss": 1.142, "grad_norm": 1.095689058303833, "learning_rate": 0.0002, "epoch": 6.798485485906605, "step": 8080}, {"loss": 1.1352, "grad_norm": 1.2275174856185913, "learning_rate": 0.0002, "epoch": 6.806899453092133, "step": 8090}, {"loss": 1.1453, "grad_norm": 1.1439805030822754, "learning_rate": 0.0002, "epoch": 6.815313420277661, "step": 8100}, {"loss": 1.1624, "grad_norm": 1.276331901550293, "learning_rate": 0.0002, "epoch": 6.8237273874631885, "step": 8110}, {"loss": 1.1686, "grad_norm": 1.0450139045715332, "learning_rate": 0.0002, "epoch": 6.832141354648717, "step": 8120}, {"loss": 1.1783, "grad_norm": 1.1189453601837158, "learning_rate": 0.0002, "epoch": 6.840555321834245, "step": 8130}, {"loss": 1.1093, "grad_norm": 1.194640874862671, "learning_rate": 0.0002, "epoch": 6.848969289019773, "step": 8140}, {"loss": 1.1559, "grad_norm": 1.095372200012207, "learning_rate": 0.0002, "epoch": 6.857383256205301, "step": 8150}, {"loss": 1.165, "grad_norm": 1.2416104078292847, "learning_rate": 0.0002, "epoch": 6.865797223390829, "step": 8160}, {"loss": 1.2174, "grad_norm": 1.2402868270874023, "learning_rate": 0.0002, "epoch": 6.874211190576357, "step": 8170}, {"loss": 1.1306, "grad_norm": 1.1317291259765625, "learning_rate": 0.0002, "epoch": 6.882625157761884, "step": 8180}, {"loss": 1.1944, "grad_norm": 1.0581914186477661, "learning_rate": 0.0002, "epoch": 6.8910391249474126, "step": 8190}, {"loss": 1.1271, "grad_norm": 1.3540890216827393, "learning_rate": 0.0002, "epoch": 6.899453092132941, "step": 8200}, {"loss": 1.2119, "grad_norm": 1.213672399520874, "learning_rate": 0.0002, "epoch": 6.907867059318469, "step": 8210}, {"loss": 1.1406, "grad_norm": 1.2654485702514648, "learning_rate": 0.0002, "epoch": 6.916281026503997, "step": 8220}, {"loss": 1.205, "grad_norm": 1.203903317451477, "learning_rate": 0.0002, "epoch": 6.924694993689524, "step": 8230}, {"loss": 1.1635, "grad_norm": 1.1332030296325684, "learning_rate": 0.0002, "epoch": 6.933108960875052, "step": 8240}, {"loss": 1.1148, "grad_norm": 1.2699192762374878, "learning_rate": 0.0002, "epoch": 6.94152292806058, "step": 8250}, {"loss": 1.1831, "grad_norm": 1.2728958129882812, "learning_rate": 0.0002, "epoch": 6.9499368952461085, "step": 8260}, {"loss": 1.1757, "grad_norm": 1.238410472869873, "learning_rate": 0.0002, "epoch": 6.958350862431637, "step": 8270}, {"loss": 1.1499, "grad_norm": 1.403863549232483, "learning_rate": 0.0002, "epoch": 6.966764829617165, "step": 8280}, {"loss": 1.1515, "grad_norm": 1.1096396446228027, "learning_rate": 0.0002, "epoch": 6.975178796802693, "step": 8290}, {"loss": 1.2049, "grad_norm": 1.1043379306793213, "learning_rate": 0.0002, "epoch": 6.98359276398822, "step": 8300}, {"loss": 1.1255, "grad_norm": 1.391754388809204, "learning_rate": 0.0002, "epoch": 6.992006731173748, "step": 8310}, {"eval_loss": 2.1421656608581543, "eval_runtime": 37.8262, "eval_samples_per_second": 13.615, "eval_steps_per_second": 1.718, "epoch": 6.999579301640724, "step": 8319}, {"loss": 1.1107, "grad_norm": 1.1739230155944824, "learning_rate": 0.0002, "epoch": 7.000420698359276, "step": 8320}, {"loss": 1.0066, "grad_norm": 1.5428645610809326, "learning_rate": 0.0002, "epoch": 7.008834665544804, "step": 8330}, {"loss": 0.9885, "grad_norm": 1.307463526725769, "learning_rate": 0.0002, "epoch": 7.0172486327303325, "step": 8340}, {"loss": 0.9098, "grad_norm": 1.4964789152145386, "learning_rate": 0.0002, "epoch": 7.025662599915861, "step": 8350}, {"loss": 0.8976, "grad_norm": 1.2289477586746216, "learning_rate": 0.0002, "epoch": 7.034076567101389, "step": 8360}, {"loss": 0.9254, "grad_norm": 1.325327754020691, "learning_rate": 0.0002, "epoch": 7.042490534286916, "step": 8370}, {"loss": 0.8967, "grad_norm": 1.4672988653182983, "learning_rate": 0.0002, "epoch": 7.050904501472444, "step": 8380}, {"loss": 0.8927, "grad_norm": 1.4184634685516357, "learning_rate": 0.0002, "epoch": 7.059318468657972, "step": 8390}, {"loss": 0.9129, "grad_norm": 1.3103536367416382, "learning_rate": 0.0002, "epoch": 7.0677324358435, "step": 8400}, {"loss": 0.997, "grad_norm": 1.2364518642425537, "learning_rate": 0.0002, "epoch": 7.076146403029028, "step": 8410}, {"loss": 0.8776, "grad_norm": 1.3712464570999146, "learning_rate": 0.0002, "epoch": 7.0845603702145565, "step": 8420}, {"loss": 0.9685, "grad_norm": 1.4655892848968506, "learning_rate": 0.0002, "epoch": 7.092974337400084, "step": 8430}, {"loss": 0.9276, "grad_norm": 1.3276227712631226, "learning_rate": 0.0002, "epoch": 7.101388304585612, "step": 8440}, {"loss": 0.9695, "grad_norm": 1.1355878114700317, "learning_rate": 0.0002, "epoch": 7.10980227177114, "step": 8450}, {"loss": 0.9673, "grad_norm": 1.2767117023468018, "learning_rate": 0.0002, "epoch": 7.118216238956668, "step": 8460}, {"loss": 0.9296, "grad_norm": 1.4915258884429932, "learning_rate": 0.0002, "epoch": 7.126630206142196, "step": 8470}, {"loss": 0.9469, "grad_norm": 1.355043649673462, "learning_rate": 0.0002, "epoch": 7.135044173327724, "step": 8480}, {"loss": 0.9626, "grad_norm": 1.0848617553710938, "learning_rate": 0.0002, "epoch": 7.143458140513252, "step": 8490}, {"loss": 0.92, "grad_norm": 1.5321701765060425, "learning_rate": 0.0002, "epoch": 7.15187210769878, "step": 8500}, {"loss": 0.9787, "grad_norm": 1.4917421340942383, "learning_rate": 0.0002, "epoch": 7.160286074884308, "step": 8510}, {"loss": 0.9709, "grad_norm": 1.4249778985977173, "learning_rate": 0.0002, "epoch": 7.168700042069836, "step": 8520}, {"loss": 0.9023, "grad_norm": 1.5257216691970825, "learning_rate": 0.0002, "epoch": 7.177114009255364, "step": 8530}, {"loss": 0.9818, "grad_norm": 1.4094327688217163, "learning_rate": 0.0002, "epoch": 7.185527976440892, "step": 8540}, {"loss": 0.9676, "grad_norm": 1.5506917238235474, "learning_rate": 0.0002, "epoch": 7.19394194362642, "step": 8550}, {"loss": 1.0494, "grad_norm": 1.336599588394165, "learning_rate": 0.0002, "epoch": 7.2023559108119475, "step": 8560}, {"loss": 0.9902, "grad_norm": 1.2018364667892456, "learning_rate": 0.0002, "epoch": 7.210769877997476, "step": 8570}, {"loss": 0.9329, "grad_norm": 1.198525071144104, "learning_rate": 0.0002, "epoch": 7.219183845183004, "step": 8580}, {"loss": 0.8954, "grad_norm": 1.4427133798599243, "learning_rate": 0.0002, "epoch": 7.227597812368532, "step": 8590}, {"loss": 0.9827, "grad_norm": 1.3134386539459229, "learning_rate": 0.0002, "epoch": 7.23601177955406, "step": 8600}, {"loss": 1.025, "grad_norm": 1.4141706228256226, "learning_rate": 0.0002, "epoch": 7.244425746739588, "step": 8610}, {"loss": 1.023, "grad_norm": 1.4951153993606567, "learning_rate": 0.0002, "epoch": 7.252839713925115, "step": 8620}, {"loss": 0.9595, "grad_norm": 1.383599042892456, "learning_rate": 0.0002, "epoch": 7.261253681110643, "step": 8630}, {"loss": 0.9775, "grad_norm": 1.2315951585769653, "learning_rate": 0.0002, "epoch": 7.2696676482961715, "step": 8640}, {"loss": 0.9946, "grad_norm": 1.253337025642395, "learning_rate": 0.0002, "epoch": 7.2780816154817, "step": 8650}, {"loss": 1.0381, "grad_norm": 1.2234476804733276, "learning_rate": 0.0002, "epoch": 7.286495582667228, "step": 8660}, {"loss": 0.9774, "grad_norm": 1.395650863647461, "learning_rate": 0.0002, "epoch": 7.294909549852756, "step": 8670}, {"loss": 0.9234, "grad_norm": 1.2411445379257202, "learning_rate": 0.0002, "epoch": 7.303323517038283, "step": 8680}, {"loss": 0.975, "grad_norm": 1.22808837890625, "learning_rate": 0.0002, "epoch": 7.311737484223811, "step": 8690}, {"loss": 0.9808, "grad_norm": 1.5197114944458008, "learning_rate": 0.0002, "epoch": 7.320151451409339, "step": 8700}, {"loss": 0.96, "grad_norm": 1.3072681427001953, "learning_rate": 0.0002, "epoch": 7.328565418594867, "step": 8710}, {"loss": 0.9386, "grad_norm": 1.3035615682601929, "learning_rate": 0.0002, "epoch": 7.3369793857803955, "step": 8720}, {"loss": 0.9666, "grad_norm": 1.2765713930130005, "learning_rate": 0.0002, "epoch": 7.345393352965924, "step": 8730}, {"loss": 0.9581, "grad_norm": 1.419601321220398, "learning_rate": 0.0002, "epoch": 7.353807320151452, "step": 8740}, {"loss": 1.0378, "grad_norm": 1.376158595085144, "learning_rate": 0.0002, "epoch": 7.362221287336979, "step": 8750}, {"loss": 0.9947, "grad_norm": 1.3880754709243774, "learning_rate": 0.0002, "epoch": 7.370635254522507, "step": 8760}, {"loss": 1.0512, "grad_norm": 1.2978262901306152, "learning_rate": 0.0002, "epoch": 7.379049221708035, "step": 8770}, {"loss": 1.0312, "grad_norm": 1.5811840295791626, "learning_rate": 0.0002, "epoch": 7.387463188893563, "step": 8780}, {"loss": 0.9977, "grad_norm": 1.3790863752365112, "learning_rate": 0.0002, "epoch": 7.3958771560790915, "step": 8790}, {"loss": 1.008, "grad_norm": 1.475306510925293, "learning_rate": 0.0002, "epoch": 7.40429112326462, "step": 8800}, {"loss": 0.9752, "grad_norm": 1.1038212776184082, "learning_rate": 0.0002, "epoch": 7.412705090450148, "step": 8810}, {"loss": 1.0048, "grad_norm": 1.5204451084136963, "learning_rate": 0.0002, "epoch": 7.421119057635675, "step": 8820}, {"loss": 1.019, "grad_norm": 1.7151343822479248, "learning_rate": 0.0002, "epoch": 7.429533024821203, "step": 8830}, {"loss": 1.0038, "grad_norm": 1.128046989440918, "learning_rate": 0.0002, "epoch": 7.437946992006731, "step": 8840}, {"loss": 1.0377, "grad_norm": 1.5780670642852783, "learning_rate": 0.0002, "epoch": 7.446360959192259, "step": 8850}, {"loss": 1.0584, "grad_norm": 1.3571979999542236, "learning_rate": 0.0002, "epoch": 7.454774926377787, "step": 8860}, {"loss": 1.0141, "grad_norm": 1.2764537334442139, "learning_rate": 0.0002, "epoch": 7.4631888935633155, "step": 8870}, {"loss": 0.9982, "grad_norm": 1.3429038524627686, "learning_rate": 0.0002, "epoch": 7.471602860748843, "step": 8880}, {"loss": 0.9671, "grad_norm": 1.3288369178771973, "learning_rate": 0.0002, "epoch": 7.480016827934371, "step": 8890}, {"loss": 0.9461, "grad_norm": 1.360141396522522, "learning_rate": 0.0002, "epoch": 7.488430795119899, "step": 8900}, {"loss": 1.0278, "grad_norm": 1.31229829788208, "learning_rate": 0.0002, "epoch": 7.496844762305427, "step": 8910}, {"loss": 0.9945, "grad_norm": 1.530605435371399, "learning_rate": 0.0002, "epoch": 7.505258729490955, "step": 8920}, {"loss": 1.0442, "grad_norm": 1.2880185842514038, "learning_rate": 0.0002, "epoch": 7.513672696676483, "step": 8930}, {"loss": 0.9859, "grad_norm": 1.3219470977783203, "learning_rate": 0.0002, "epoch": 7.5220866638620105, "step": 8940}, {"loss": 1.0664, "grad_norm": 1.565633773803711, "learning_rate": 0.0002, "epoch": 7.530500631047539, "step": 8950}, {"loss": 1.0089, "grad_norm": 1.4392317533493042, "learning_rate": 0.0002, "epoch": 7.538914598233067, "step": 8960}, {"loss": 1.0214, "grad_norm": 1.4557991027832031, "learning_rate": 0.0002, "epoch": 7.547328565418595, "step": 8970}, {"loss": 1.0247, "grad_norm": 1.3411110639572144, "learning_rate": 0.0002, "epoch": 7.555742532604123, "step": 8980}, {"loss": 1.0738, "grad_norm": 1.333378791809082, "learning_rate": 0.0002, "epoch": 7.564156499789651, "step": 8990}, {"loss": 1.0429, "grad_norm": 1.4422006607055664, "learning_rate": 0.0002, "epoch": 7.572570466975179, "step": 9000}, {"loss": 1.0401, "grad_norm": 1.2519633769989014, "learning_rate": 0.0002, "epoch": 7.580984434160706, "step": 9010}, {"loss": 1.0028, "grad_norm": 1.3628246784210205, "learning_rate": 0.0002, "epoch": 7.589398401346235, "step": 9020}, {"loss": 0.9883, "grad_norm": 1.35457181930542, "learning_rate": 0.0002, "epoch": 7.597812368531763, "step": 9030}, {"loss": 0.9929, "grad_norm": 1.4441956281661987, "learning_rate": 0.0002, "epoch": 7.606226335717291, "step": 9040}, {"loss": 0.9987, "grad_norm": 1.3812335729599, "learning_rate": 0.0002, "epoch": 7.614640302902819, "step": 9050}, {"loss": 0.9692, "grad_norm": 1.3576860427856445, "learning_rate": 0.0002, "epoch": 7.623054270088347, "step": 9060}, {"loss": 1.0259, "grad_norm": 1.350433588027954, "learning_rate": 0.0002, "epoch": 7.631468237273874, "step": 9070}, {"loss": 1.0292, "grad_norm": 1.3413814306259155, "learning_rate": 0.0002, "epoch": 7.639882204459402, "step": 9080}, {"loss": 1.016, "grad_norm": 1.2727786302566528, "learning_rate": 0.0002, "epoch": 7.6482961716449305, "step": 9090}, {"loss": 1.0046, "grad_norm": 1.1601275205612183, "learning_rate": 0.0002, "epoch": 7.656710138830459, "step": 9100}, {"loss": 1.1032, "grad_norm": 1.5492266416549683, "learning_rate": 0.0002, "epoch": 7.665124106015987, "step": 9110}, {"loss": 1.0174, "grad_norm": 1.4239033460617065, "learning_rate": 0.0002, "epoch": 7.673538073201515, "step": 9120}, {"loss": 0.9972, "grad_norm": 1.4212028980255127, "learning_rate": 0.0002, "epoch": 7.681952040387042, "step": 9130}, {"loss": 1.0802, "grad_norm": 1.116467833518982, "learning_rate": 0.0002, "epoch": 7.69036600757257, "step": 9140}, {"loss": 1.0311, "grad_norm": 1.299910545349121, "learning_rate": 0.0002, "epoch": 7.698779974758098, "step": 9150}, {"loss": 1.0262, "grad_norm": 1.404690146446228, "learning_rate": 0.0002, "epoch": 7.707193941943626, "step": 9160}, {"loss": 0.9633, "grad_norm": 1.383244276046753, "learning_rate": 0.0002, "epoch": 7.7156079091291545, "step": 9170}, {"loss": 1.0563, "grad_norm": 1.5001360177993774, "learning_rate": 0.0002, "epoch": 7.724021876314683, "step": 9180}, {"loss": 1.0731, "grad_norm": 1.4455186128616333, "learning_rate": 0.0002, "epoch": 7.732435843500211, "step": 9190}, {"loss": 1.0667, "grad_norm": 1.294964075088501, "learning_rate": 0.0002, "epoch": 7.740849810685738, "step": 9200}, {"loss": 0.9649, "grad_norm": 1.31305730342865, "learning_rate": 0.0002, "epoch": 7.749263777871266, "step": 9210}, {"loss": 0.9883, "grad_norm": 1.3849674463272095, "learning_rate": 0.0002, "epoch": 7.757677745056794, "step": 9220}, {"loss": 1.0219, "grad_norm": 1.6689352989196777, "learning_rate": 0.0002, "epoch": 7.766091712242322, "step": 9230}, {"loss": 1.03, "grad_norm": 1.416099190711975, "learning_rate": 0.0002, "epoch": 7.77450567942785, "step": 9240}, {"loss": 1.0429, "grad_norm": 1.5212045907974243, "learning_rate": 0.0002, "epoch": 7.7829196466133785, "step": 9250}, {"loss": 1.0607, "grad_norm": 1.3623390197753906, "learning_rate": 0.0002, "epoch": 7.791333613798907, "step": 9260}, {"loss": 1.0469, "grad_norm": 1.304148554801941, "learning_rate": 0.0002, "epoch": 7.799747580984434, "step": 9270}, {"loss": 1.0316, "grad_norm": 1.3833202123641968, "learning_rate": 0.0002, "epoch": 7.808161548169962, "step": 9280}, {"loss": 1.0122, "grad_norm": 1.3440886735916138, "learning_rate": 0.0002, "epoch": 7.81657551535549, "step": 9290}, {"loss": 1.0268, "grad_norm": 1.2798155546188354, "learning_rate": 0.0002, "epoch": 7.824989482541018, "step": 9300}, {"loss": 1.0521, "grad_norm": 1.3755156993865967, "learning_rate": 0.0002, "epoch": 7.833403449726546, "step": 9310}, {"loss": 1.0571, "grad_norm": 1.3145397901535034, "learning_rate": 0.0002, "epoch": 7.841817416912074, "step": 9320}, {"loss": 1.0684, "grad_norm": 1.6102794408798218, "learning_rate": 0.0002, "epoch": 7.850231384097602, "step": 9330}, {"loss": 1.1878, "grad_norm": 1.3959331512451172, "learning_rate": 0.0002, "epoch": 7.85864535128313, "step": 9340}, {"loss": 1.05, "grad_norm": 1.4965628385543823, "learning_rate": 0.0002, "epoch": 7.867059318468658, "step": 9350}, {"loss": 1.085, "grad_norm": 1.194201946258545, "learning_rate": 0.0002, "epoch": 7.875473285654186, "step": 9360}, {"loss": 1.0712, "grad_norm": 1.4831446409225464, "learning_rate": 0.0002, "epoch": 7.883887252839714, "step": 9370}, {"loss": 1.0568, "grad_norm": 1.3473302125930786, "learning_rate": 0.0002, "epoch": 7.892301220025242, "step": 9380}, {"loss": 1.0172, "grad_norm": 1.4373382329940796, "learning_rate": 0.0002, "epoch": 7.9007151872107695, "step": 9390}, {"loss": 0.9892, "grad_norm": 1.4341524839401245, "learning_rate": 0.0002, "epoch": 7.909129154396298, "step": 9400}, {"loss": 1.0428, "grad_norm": 1.3210171461105347, "learning_rate": 0.0002, "epoch": 7.917543121581826, "step": 9410}, {"loss": 1.0543, "grad_norm": 1.2708462476730347, "learning_rate": 0.0002, "epoch": 7.925957088767354, "step": 9420}, {"loss": 1.0789, "grad_norm": 1.4132758378982544, "learning_rate": 0.0002, "epoch": 7.934371055952882, "step": 9430}, {"loss": 1.095, "grad_norm": 1.5193610191345215, "learning_rate": 0.0002, "epoch": 7.94278502313841, "step": 9440}, {"loss": 0.967, "grad_norm": 1.427832841873169, "learning_rate": 0.0002, "epoch": 7.951198990323938, "step": 9450}, {"loss": 1.0052, "grad_norm": 1.380478024482727, "learning_rate": 0.0002, "epoch": 7.959612957509465, "step": 9460}, {"loss": 1.1032, "grad_norm": 1.3083926439285278, "learning_rate": 0.0002, "epoch": 7.9680269246949935, "step": 9470}, {"loss": 1.0883, "grad_norm": 1.3049120903015137, "learning_rate": 0.0002, "epoch": 7.976440891880522, "step": 9480}, {"loss": 1.0123, "grad_norm": 1.42048978805542, "learning_rate": 0.0002, "epoch": 7.98485485906605, "step": 9490}, {"loss": 1.094, "grad_norm": 1.2492578029632568, "learning_rate": 0.0002, "epoch": 7.993268826251578, "step": 9500}]}