diff --git a/.gitattributes b/.gitattributes index 91ed1dcbd4854689b7730daa09a270718fcbc0e9..295d77e2899184fa0a855f0100e981338d9fa84e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -758,3 +758,12 @@ gemma-2-9b-it_int4_flare-multifin-en_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64 gemma-2-9b-it_int4_flare-multifin-en_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-381-sd-10000/checkpoint-41/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_flare-multifin-en_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-381-sd-10000/checkpoint-83/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_flare-multifin-en_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-381-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/README.md b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/adapter_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/adapter_model.safetensors b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..952cf5145a9d7e80dc7491c40196d56467129957 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34589486cf3edd2be3ba9bcb7e87563196b7a9856dd90683bdf986b557d160d3 +size 29500848 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/README.md b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/adapter_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/adapter_model.safetensors b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b5ed45857f07ffbbcce2f9410adbcf2cd60a5c8d --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:883360fd592cf3f1cf085ec03770d65ae5241b54da17c86f6db6f4f70ce499a8 +size 29500848 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/optimizer.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a68155f67b0133f239047f7a2eb145441938bdff --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdd7caf80bbf03bf5747a4ba0cc703529da16bea513682a4690f675e2e20cae3 +size 15064314 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/rng_state.pth b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..025e69ad7f67d6a788b190469f6051bef8281138 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f1375a54d766cb1abeca3dad9485f85b97541cebfc58a1329ef4d99ff69804a +size 14244 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/scheduler.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..62a919e1a561bd1a0770f96e1ccb1a54fd57180b --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae30aebcd5e08c368e71a3a77f80fbed9e73d7d4d684c7458f94879c2a9decfc +size 1064 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/special_tokens_map.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/tokenizer.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/tokenizer.model b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/tokenizer_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/trainer_state.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a872649c2c697c66240121bb952ba312ae591f31 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/trainer_state.json @@ -0,0 +1,1183 @@ +{ + "best_metric": 0.5858802795410156, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1628, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012285012285012284, + "grad_norm": 0.8178550004959106, + "learning_rate": 0.0002, + "loss": 3.5354, + "step": 10 + }, + { + "epoch": 0.02457002457002457, + "grad_norm": 1.0338047742843628, + "learning_rate": 0.0002, + "loss": 2.534, + "step": 20 + }, + { + "epoch": 0.036855036855036855, + "grad_norm": 0.8931729197502136, + "learning_rate": 0.0002, + "loss": 2.1691, + "step": 30 + }, + { + "epoch": 0.04914004914004914, + "grad_norm": 0.9666458964347839, + "learning_rate": 0.0002, + "loss": 1.8813, + "step": 40 + }, + { + "epoch": 0.06142506142506143, + "grad_norm": 1.2691702842712402, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 50 + }, + { + "epoch": 0.07371007371007371, + "grad_norm": 1.0307111740112305, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 60 + }, + { + "epoch": 0.085995085995086, + "grad_norm": 1.1837389469146729, + "learning_rate": 0.0002, + "loss": 1.2987, + "step": 70 + }, + { + "epoch": 0.09828009828009827, + "grad_norm": 1.1481467485427856, + "learning_rate": 0.0002, + "loss": 1.2325, + "step": 80 + }, + { + "epoch": 0.11056511056511056, + "grad_norm": 1.0385297536849976, + "learning_rate": 0.0002, + "loss": 1.1425, + "step": 90 + }, + { + "epoch": 0.12285012285012285, + "grad_norm": 1.125789999961853, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 100 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 0.9630613923072815, + "learning_rate": 0.0002, + "loss": 1.0477, + "step": 110 + }, + { + "epoch": 0.14742014742014742, + "grad_norm": 1.060392141342163, + "learning_rate": 0.0002, + "loss": 1.0074, + "step": 120 + }, + { + "epoch": 0.1597051597051597, + "grad_norm": 1.0986546277999878, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 130 + }, + { + "epoch": 0.171990171990172, + "grad_norm": 1.1713459491729736, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 140 + }, + { + "epoch": 0.18427518427518427, + "grad_norm": 1.1548224687576294, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 150 + }, + { + "epoch": 0.19656019656019655, + "grad_norm": 1.2662502527236938, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 160 + }, + { + "epoch": 0.20884520884520885, + "grad_norm": 1.1521110534667969, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 170 + }, + { + "epoch": 0.22113022113022113, + "grad_norm": 1.1044857501983643, + "learning_rate": 0.0002, + "loss": 0.8931, + "step": 180 + }, + { + "epoch": 0.2334152334152334, + "grad_norm": 0.9770650267601013, + "learning_rate": 0.0002, + "loss": 0.9572, + "step": 190 + }, + { + "epoch": 0.2457002457002457, + "grad_norm": 0.9710931777954102, + "learning_rate": 0.0002, + "loss": 0.881, + "step": 200 + }, + { + "epoch": 0.257985257985258, + "grad_norm": 0.9593933820724487, + "learning_rate": 0.0002, + "loss": 0.9205, + "step": 210 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.003553032875061, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 220 + }, + { + "epoch": 0.28255528255528256, + "grad_norm": 0.9187764525413513, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 230 + }, + { + "epoch": 0.29484029484029484, + "grad_norm": 0.9294946789741516, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 240 + }, + { + "epoch": 0.3071253071253071, + "grad_norm": 0.9537560939788818, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 250 + }, + { + "epoch": 0.3194103194103194, + "grad_norm": 1.00537109375, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 260 + }, + { + "epoch": 0.3316953316953317, + "grad_norm": 0.8775776028633118, + "learning_rate": 0.0002, + "loss": 0.896, + "step": 270 + }, + { + "epoch": 0.343980343980344, + "grad_norm": 0.8316839933395386, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 280 + }, + { + "epoch": 0.35626535626535627, + "grad_norm": 0.8542073965072632, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 290 + }, + { + "epoch": 0.36855036855036855, + "grad_norm": 0.848444402217865, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 300 + }, + { + "epoch": 0.3808353808353808, + "grad_norm": 0.9017520546913147, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 310 + }, + { + "epoch": 0.3931203931203931, + "grad_norm": 0.7672467231750488, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 320 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 0.9109916687011719, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 330 + }, + { + "epoch": 0.4176904176904177, + "grad_norm": 0.8750321269035339, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 340 + }, + { + "epoch": 0.42997542997543, + "grad_norm": 0.7911098599433899, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 350 + }, + { + "epoch": 0.44226044226044225, + "grad_norm": 0.871601402759552, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 360 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.9393917918205261, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 370 + }, + { + "epoch": 0.4668304668304668, + "grad_norm": 0.8260403275489807, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 380 + }, + { + "epoch": 0.47911547911547914, + "grad_norm": 0.9792159199714661, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 390 + }, + { + "epoch": 0.4914004914004914, + "grad_norm": 0.9943315982818604, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 400 + }, + { + "epoch": 0.5036855036855037, + "grad_norm": 0.8999950885772705, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 410 + }, + { + "epoch": 0.515970515970516, + "grad_norm": 0.8348393440246582, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 420 + }, + { + "epoch": 0.5282555282555282, + "grad_norm": 0.7371744513511658, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 430 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.8354107141494751, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 440 + }, + { + "epoch": 0.5528255528255528, + "grad_norm": 0.8553793430328369, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 450 + }, + { + "epoch": 0.5651105651105651, + "grad_norm": 1.0762015581130981, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 460 + }, + { + "epoch": 0.5773955773955773, + "grad_norm": 0.8350747227668762, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 470 + }, + { + "epoch": 0.5896805896805897, + "grad_norm": 0.7819945216178894, + "learning_rate": 0.0002, + "loss": 0.7922, + "step": 480 + }, + { + "epoch": 0.601965601965602, + "grad_norm": 0.8079741597175598, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 490 + }, + { + "epoch": 0.6142506142506142, + "grad_norm": 0.776435911655426, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 500 + }, + { + "epoch": 0.6265356265356266, + "grad_norm": 0.7646855115890503, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 510 + }, + { + "epoch": 0.6388206388206388, + "grad_norm": 0.786396861076355, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 520 + }, + { + "epoch": 0.6511056511056511, + "grad_norm": 0.7016594409942627, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 530 + }, + { + "epoch": 0.6633906633906634, + "grad_norm": 0.8060444593429565, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 540 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.9087467789649963, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 550 + }, + { + "epoch": 0.687960687960688, + "grad_norm": 0.8149628639221191, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 560 + }, + { + "epoch": 0.7002457002457002, + "grad_norm": 0.7493641972541809, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 570 + }, + { + "epoch": 0.7125307125307125, + "grad_norm": 0.7958765625953674, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 580 + }, + { + "epoch": 0.7248157248157249, + "grad_norm": 0.7917273640632629, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 590 + }, + { + "epoch": 0.7371007371007371, + "grad_norm": 0.8040468692779541, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 600 + }, + { + "epoch": 0.7493857493857494, + "grad_norm": 0.8696851134300232, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 610 + }, + { + "epoch": 0.7616707616707616, + "grad_norm": 0.8418059945106506, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 620 + }, + { + "epoch": 0.773955773955774, + "grad_norm": 0.7754243612289429, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 630 + }, + { + "epoch": 0.7862407862407862, + "grad_norm": 0.7639613747596741, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 640 + }, + { + "epoch": 0.7985257985257985, + "grad_norm": 0.7516646385192871, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 650 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.7840844988822937, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 660 + }, + { + "epoch": 0.8230958230958231, + "grad_norm": 0.7657070755958557, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 670 + }, + { + "epoch": 0.8353808353808354, + "grad_norm": 0.7711591720581055, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 680 + }, + { + "epoch": 0.8476658476658476, + "grad_norm": 0.8026325106620789, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 690 + }, + { + "epoch": 0.85995085995086, + "grad_norm": 0.7902713418006897, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 700 + }, + { + "epoch": 0.8722358722358723, + "grad_norm": 0.8212456107139587, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 710 + }, + { + "epoch": 0.8845208845208845, + "grad_norm": 0.7867200970649719, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 720 + }, + { + "epoch": 0.8968058968058968, + "grad_norm": 0.80084627866745, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 730 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.7203794121742249, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 740 + }, + { + "epoch": 0.9213759213759214, + "grad_norm": 0.7598419785499573, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 750 + }, + { + "epoch": 0.9336609336609336, + "grad_norm": 0.7787027359008789, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 760 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.8444012403488159, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 770 + }, + { + "epoch": 0.9582309582309583, + "grad_norm": 0.7388550639152527, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 780 + }, + { + "epoch": 0.9705159705159705, + "grad_norm": 0.7379167079925537, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 790 + }, + { + "epoch": 0.9828009828009828, + "grad_norm": 0.8291640281677246, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 800 + }, + { + "epoch": 0.995085995085995, + "grad_norm": 0.7415094375610352, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 0.703994870185852, + "eval_runtime": 20.2182, + "eval_samples_per_second": 16.371, + "eval_steps_per_second": 2.077, + "step": 814 + }, + { + "epoch": 1.0073710073710074, + "grad_norm": 0.7405961751937866, + "learning_rate": 0.0002, + "loss": 0.6959, + "step": 820 + }, + { + "epoch": 1.0196560196560196, + "grad_norm": 0.8534344434738159, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 830 + }, + { + "epoch": 1.031941031941032, + "grad_norm": 0.7415764331817627, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 840 + }, + { + "epoch": 1.0442260442260443, + "grad_norm": 0.74293053150177, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 850 + }, + { + "epoch": 1.0565110565110565, + "grad_norm": 0.697727382183075, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 860 + }, + { + "epoch": 1.0687960687960687, + "grad_norm": 0.8022570013999939, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 870 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.7545800805091858, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 880 + }, + { + "epoch": 1.0933660933660934, + "grad_norm": 0.8005648255348206, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 890 + }, + { + "epoch": 1.1056511056511056, + "grad_norm": 0.7681778073310852, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 900 + }, + { + "epoch": 1.117936117936118, + "grad_norm": 0.7822468876838684, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 910 + }, + { + "epoch": 1.1302211302211302, + "grad_norm": 0.8324839472770691, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 920 + }, + { + "epoch": 1.1425061425061425, + "grad_norm": 0.8206289410591125, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 930 + }, + { + "epoch": 1.154791154791155, + "grad_norm": 0.786461591720581, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 940 + }, + { + "epoch": 1.1670761670761671, + "grad_norm": 0.8288539052009583, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 950 + }, + { + "epoch": 1.1793611793611793, + "grad_norm": 0.7566865682601929, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 960 + }, + { + "epoch": 1.1916461916461916, + "grad_norm": 0.7761894464492798, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 970 + }, + { + "epoch": 1.203931203931204, + "grad_norm": 0.7608440518379211, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 980 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.799745500087738, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 990 + }, + { + "epoch": 1.2285012285012284, + "grad_norm": 0.8135330677032471, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1000 + }, + { + "epoch": 1.2407862407862407, + "grad_norm": 0.7410391569137573, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 1010 + }, + { + "epoch": 1.253071253071253, + "grad_norm": 0.7826172709465027, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 1020 + }, + { + "epoch": 1.2653562653562653, + "grad_norm": 0.7210677862167358, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 1030 + }, + { + "epoch": 1.2776412776412776, + "grad_norm": 0.7571766972541809, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 1040 + }, + { + "epoch": 1.28992628992629, + "grad_norm": 0.8602666258811951, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 1050 + }, + { + "epoch": 1.3022113022113022, + "grad_norm": 0.8640648722648621, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 1060 + }, + { + "epoch": 1.3144963144963144, + "grad_norm": 0.7289374470710754, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 1070 + }, + { + "epoch": 1.3267813267813269, + "grad_norm": 0.8099908828735352, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 1080 + }, + { + "epoch": 1.339066339066339, + "grad_norm": 0.8623505234718323, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 1090 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.900576114654541, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 1100 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.729603111743927, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 1110 + }, + { + "epoch": 1.375921375921376, + "grad_norm": 0.8350434303283691, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 1120 + }, + { + "epoch": 1.3882063882063882, + "grad_norm": 0.8049437999725342, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 1130 + }, + { + "epoch": 1.4004914004914004, + "grad_norm": 0.8222764134407043, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 1140 + }, + { + "epoch": 1.4127764127764126, + "grad_norm": 0.7949751019477844, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 1150 + }, + { + "epoch": 1.425061425061425, + "grad_norm": 0.8375639915466309, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 1160 + }, + { + "epoch": 1.4373464373464373, + "grad_norm": 0.7261053919792175, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1170 + }, + { + "epoch": 1.4496314496314495, + "grad_norm": 0.6918320655822754, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 1180 + }, + { + "epoch": 1.461916461916462, + "grad_norm": 0.8148727416992188, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 1190 + }, + { + "epoch": 1.4742014742014742, + "grad_norm": 0.7014724612236023, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 1200 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.8110846281051636, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 1210 + }, + { + "epoch": 1.4987714987714988, + "grad_norm": 0.8336407542228699, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 1220 + }, + { + "epoch": 1.511056511056511, + "grad_norm": 0.826996386051178, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 1230 + }, + { + "epoch": 1.5233415233415233, + "grad_norm": 0.7503120303153992, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 1240 + }, + { + "epoch": 1.5356265356265357, + "grad_norm": 0.8297192454338074, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 1250 + }, + { + "epoch": 1.547911547911548, + "grad_norm": 0.7585996985435486, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 1260 + }, + { + "epoch": 1.5601965601965602, + "grad_norm": 0.7530493140220642, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 1270 + }, + { + "epoch": 1.5724815724815726, + "grad_norm": 0.8141939640045166, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 1280 + }, + { + "epoch": 1.5847665847665846, + "grad_norm": 0.6959931254386902, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 1290 + }, + { + "epoch": 1.597051597051597, + "grad_norm": 0.8677428364753723, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 1300 + }, + { + "epoch": 1.6093366093366095, + "grad_norm": 0.8527476787567139, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 1310 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.8462157845497131, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 1320 + }, + { + "epoch": 1.633906633906634, + "grad_norm": 0.9371153712272644, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 1330 + }, + { + "epoch": 1.6461916461916462, + "grad_norm": 0.8408344984054565, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 1340 + }, + { + "epoch": 1.6584766584766584, + "grad_norm": 0.8391859531402588, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 1350 + }, + { + "epoch": 1.6707616707616708, + "grad_norm": 0.7630598545074463, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 1360 + }, + { + "epoch": 1.683046683046683, + "grad_norm": 0.8007895350456238, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 1370 + }, + { + "epoch": 1.6953316953316953, + "grad_norm": 0.7547900080680847, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 1380 + }, + { + "epoch": 1.7076167076167077, + "grad_norm": 0.7779742479324341, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 1390 + }, + { + "epoch": 1.71990171990172, + "grad_norm": 0.712293803691864, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 1400 + }, + { + "epoch": 1.7321867321867321, + "grad_norm": 0.8503297567367554, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 1410 + }, + { + "epoch": 1.7444717444717446, + "grad_norm": 0.8312245607376099, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 1420 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.7758049368858337, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 1430 + }, + { + "epoch": 1.769041769041769, + "grad_norm": 0.8695956468582153, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 1440 + }, + { + "epoch": 1.7813267813267815, + "grad_norm": 0.7785261273384094, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 1450 + }, + { + "epoch": 1.7936117936117935, + "grad_norm": 0.7091802358627319, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 1460 + }, + { + "epoch": 1.805896805896806, + "grad_norm": 0.774146556854248, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 1470 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.8342524170875549, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 1480 + }, + { + "epoch": 1.8304668304668303, + "grad_norm": 0.8087738156318665, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 1490 + }, + { + "epoch": 1.8427518427518428, + "grad_norm": 0.9830479621887207, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 1500 + }, + { + "epoch": 1.855036855036855, + "grad_norm": 0.8537567853927612, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 1510 + }, + { + "epoch": 1.8673218673218672, + "grad_norm": 0.8004562854766846, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 1520 + }, + { + "epoch": 1.8796068796068797, + "grad_norm": 0.8161284327507019, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 1530 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.8688093423843384, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 1540 + }, + { + "epoch": 1.904176904176904, + "grad_norm": 0.8287379741668701, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 1550 + }, + { + "epoch": 1.9164619164619165, + "grad_norm": 0.8050342202186584, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 1560 + }, + { + "epoch": 1.9287469287469288, + "grad_norm": 0.9273895621299744, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 1570 + }, + { + "epoch": 1.941031941031941, + "grad_norm": 0.8416891694068909, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 1580 + }, + { + "epoch": 1.9533169533169534, + "grad_norm": 0.7299820184707642, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 1590 + }, + { + "epoch": 1.9656019656019657, + "grad_norm": 0.7262272834777832, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 1600 + }, + { + "epoch": 1.9778869778869779, + "grad_norm": 0.8649004697799683, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 1610 + }, + { + "epoch": 1.9901719901719903, + "grad_norm": 0.8165444731712341, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 0.5858802795410156, + "eval_runtime": 22.6585, + "eval_samples_per_second": 14.608, + "eval_steps_per_second": 1.854, + "step": 1628 + } + ], + "logging_steps": 10, + "max_steps": 6512, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.989741003354931e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/training_args.bin b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..73a75ebfc12aed51385aab437d91632ee4c20317 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2809e8544b7de8b298d0b325fb6c98eb9f853fd72d7cbae286b6ee1541e6aee9 +size 5560 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/README.md b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/adapter_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/adapter_model.safetensors b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..46eaef9a7fbc312796d355396116da61490c3b13 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f24b12fed305796f863449b57cd0ca3bea1dd442a5386a9d41f4b499b36f2a6 +size 29500848 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/optimizer.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c0d955d1e84189853efefe27584e5e9f8f1f9cd --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6360e0676635c3589863b882bd927880b9ec693755297dfddcc3130175d2487 +size 15064314 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/rng_state.pth b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a019f0003f92dd6e57e5e23ac3a3287143e4b93 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16f0860b0a43c49be22e2b9b0d95e29c108b949a5cb181307a11d81b37adc103 +size 14244 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/scheduler.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..096cfabe688f366275d946a809aca82f5f0b7654 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96a0e23ad59ba3c3d31a165bed1489745fdd8e1e429ffb2f2f32f2c1b0f676dd +size 1064 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/special_tokens_map.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/tokenizer.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/tokenizer.model b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/tokenizer_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/trainer_state.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2f05fcc3bc72f85b8e52398bbc7a2ede782bc717 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/trainer_state.json @@ -0,0 +1,1765 @@ +{ + "best_metric": 0.49752503633499146, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442", + "epoch": 3.0, + "eval_steps": 10, + "global_step": 2442, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012285012285012284, + "grad_norm": 0.8178550004959106, + "learning_rate": 0.0002, + "loss": 3.5354, + "step": 10 + }, + { + "epoch": 0.02457002457002457, + "grad_norm": 1.0338047742843628, + "learning_rate": 0.0002, + "loss": 2.534, + "step": 20 + }, + { + "epoch": 0.036855036855036855, + "grad_norm": 0.8931729197502136, + "learning_rate": 0.0002, + "loss": 2.1691, + "step": 30 + }, + { + "epoch": 0.04914004914004914, + "grad_norm": 0.9666458964347839, + "learning_rate": 0.0002, + "loss": 1.8813, + "step": 40 + }, + { + "epoch": 0.06142506142506143, + "grad_norm": 1.2691702842712402, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 50 + }, + { + "epoch": 0.07371007371007371, + "grad_norm": 1.0307111740112305, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 60 + }, + { + "epoch": 0.085995085995086, + "grad_norm": 1.1837389469146729, + "learning_rate": 0.0002, + "loss": 1.2987, + "step": 70 + }, + { + "epoch": 0.09828009828009827, + "grad_norm": 1.1481467485427856, + "learning_rate": 0.0002, + "loss": 1.2325, + "step": 80 + }, + { + "epoch": 0.11056511056511056, + "grad_norm": 1.0385297536849976, + "learning_rate": 0.0002, + "loss": 1.1425, + "step": 90 + }, + { + "epoch": 0.12285012285012285, + "grad_norm": 1.125789999961853, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 100 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 0.9630613923072815, + "learning_rate": 0.0002, + "loss": 1.0477, + "step": 110 + }, + { + "epoch": 0.14742014742014742, + "grad_norm": 1.060392141342163, + "learning_rate": 0.0002, + "loss": 1.0074, + "step": 120 + }, + { + "epoch": 0.1597051597051597, + "grad_norm": 1.0986546277999878, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 130 + }, + { + "epoch": 0.171990171990172, + "grad_norm": 1.1713459491729736, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 140 + }, + { + "epoch": 0.18427518427518427, + "grad_norm": 1.1548224687576294, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 150 + }, + { + "epoch": 0.19656019656019655, + "grad_norm": 1.2662502527236938, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 160 + }, + { + "epoch": 0.20884520884520885, + "grad_norm": 1.1521110534667969, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 170 + }, + { + "epoch": 0.22113022113022113, + "grad_norm": 1.1044857501983643, + "learning_rate": 0.0002, + "loss": 0.8931, + "step": 180 + }, + { + "epoch": 0.2334152334152334, + "grad_norm": 0.9770650267601013, + "learning_rate": 0.0002, + "loss": 0.9572, + "step": 190 + }, + { + "epoch": 0.2457002457002457, + "grad_norm": 0.9710931777954102, + "learning_rate": 0.0002, + "loss": 0.881, + "step": 200 + }, + { + "epoch": 0.257985257985258, + "grad_norm": 0.9593933820724487, + "learning_rate": 0.0002, + "loss": 0.9205, + "step": 210 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.003553032875061, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 220 + }, + { + "epoch": 0.28255528255528256, + "grad_norm": 0.9187764525413513, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 230 + }, + { + "epoch": 0.29484029484029484, + "grad_norm": 0.9294946789741516, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 240 + }, + { + "epoch": 0.3071253071253071, + "grad_norm": 0.9537560939788818, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 250 + }, + { + "epoch": 0.3194103194103194, + "grad_norm": 1.00537109375, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 260 + }, + { + "epoch": 0.3316953316953317, + "grad_norm": 0.8775776028633118, + "learning_rate": 0.0002, + "loss": 0.896, + "step": 270 + }, + { + "epoch": 0.343980343980344, + "grad_norm": 0.8316839933395386, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 280 + }, + { + "epoch": 0.35626535626535627, + "grad_norm": 0.8542073965072632, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 290 + }, + { + "epoch": 0.36855036855036855, + "grad_norm": 0.848444402217865, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 300 + }, + { + "epoch": 0.3808353808353808, + "grad_norm": 0.9017520546913147, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 310 + }, + { + "epoch": 0.3931203931203931, + "grad_norm": 0.7672467231750488, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 320 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 0.9109916687011719, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 330 + }, + { + "epoch": 0.4176904176904177, + "grad_norm": 0.8750321269035339, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 340 + }, + { + "epoch": 0.42997542997543, + "grad_norm": 0.7911098599433899, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 350 + }, + { + "epoch": 0.44226044226044225, + "grad_norm": 0.871601402759552, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 360 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.9393917918205261, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 370 + }, + { + "epoch": 0.4668304668304668, + "grad_norm": 0.8260403275489807, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 380 + }, + { + "epoch": 0.47911547911547914, + "grad_norm": 0.9792159199714661, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 390 + }, + { + "epoch": 0.4914004914004914, + "grad_norm": 0.9943315982818604, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 400 + }, + { + "epoch": 0.5036855036855037, + "grad_norm": 0.8999950885772705, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 410 + }, + { + "epoch": 0.515970515970516, + "grad_norm": 0.8348393440246582, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 420 + }, + { + "epoch": 0.5282555282555282, + "grad_norm": 0.7371744513511658, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 430 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.8354107141494751, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 440 + }, + { + "epoch": 0.5528255528255528, + "grad_norm": 0.8553793430328369, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 450 + }, + { + "epoch": 0.5651105651105651, + "grad_norm": 1.0762015581130981, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 460 + }, + { + "epoch": 0.5773955773955773, + "grad_norm": 0.8350747227668762, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 470 + }, + { + "epoch": 0.5896805896805897, + "grad_norm": 0.7819945216178894, + "learning_rate": 0.0002, + "loss": 0.7922, + "step": 480 + }, + { + "epoch": 0.601965601965602, + "grad_norm": 0.8079741597175598, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 490 + }, + { + "epoch": 0.6142506142506142, + "grad_norm": 0.776435911655426, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 500 + }, + { + "epoch": 0.6265356265356266, + "grad_norm": 0.7646855115890503, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 510 + }, + { + "epoch": 0.6388206388206388, + "grad_norm": 0.786396861076355, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 520 + }, + { + "epoch": 0.6511056511056511, + "grad_norm": 0.7016594409942627, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 530 + }, + { + "epoch": 0.6633906633906634, + "grad_norm": 0.8060444593429565, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 540 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.9087467789649963, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 550 + }, + { + "epoch": 0.687960687960688, + "grad_norm": 0.8149628639221191, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 560 + }, + { + "epoch": 0.7002457002457002, + "grad_norm": 0.7493641972541809, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 570 + }, + { + "epoch": 0.7125307125307125, + "grad_norm": 0.7958765625953674, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 580 + }, + { + "epoch": 0.7248157248157249, + "grad_norm": 0.7917273640632629, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 590 + }, + { + "epoch": 0.7371007371007371, + "grad_norm": 0.8040468692779541, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 600 + }, + { + "epoch": 0.7493857493857494, + "grad_norm": 0.8696851134300232, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 610 + }, + { + "epoch": 0.7616707616707616, + "grad_norm": 0.8418059945106506, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 620 + }, + { + "epoch": 0.773955773955774, + "grad_norm": 0.7754243612289429, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 630 + }, + { + "epoch": 0.7862407862407862, + "grad_norm": 0.7639613747596741, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 640 + }, + { + "epoch": 0.7985257985257985, + "grad_norm": 0.7516646385192871, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 650 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.7840844988822937, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 660 + }, + { + "epoch": 0.8230958230958231, + "grad_norm": 0.7657070755958557, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 670 + }, + { + "epoch": 0.8353808353808354, + "grad_norm": 0.7711591720581055, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 680 + }, + { + "epoch": 0.8476658476658476, + "grad_norm": 0.8026325106620789, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 690 + }, + { + "epoch": 0.85995085995086, + "grad_norm": 0.7902713418006897, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 700 + }, + { + "epoch": 0.8722358722358723, + "grad_norm": 0.8212456107139587, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 710 + }, + { + "epoch": 0.8845208845208845, + "grad_norm": 0.7867200970649719, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 720 + }, + { + "epoch": 0.8968058968058968, + "grad_norm": 0.80084627866745, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 730 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.7203794121742249, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 740 + }, + { + "epoch": 0.9213759213759214, + "grad_norm": 0.7598419785499573, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 750 + }, + { + "epoch": 0.9336609336609336, + "grad_norm": 0.7787027359008789, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 760 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.8444012403488159, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 770 + }, + { + "epoch": 0.9582309582309583, + "grad_norm": 0.7388550639152527, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 780 + }, + { + "epoch": 0.9705159705159705, + "grad_norm": 0.7379167079925537, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 790 + }, + { + "epoch": 0.9828009828009828, + "grad_norm": 0.8291640281677246, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 800 + }, + { + "epoch": 0.995085995085995, + "grad_norm": 0.7415094375610352, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 0.703994870185852, + "eval_runtime": 20.2182, + "eval_samples_per_second": 16.371, + "eval_steps_per_second": 2.077, + "step": 814 + }, + { + "epoch": 1.0073710073710074, + "grad_norm": 0.7405961751937866, + "learning_rate": 0.0002, + "loss": 0.6959, + "step": 820 + }, + { + "epoch": 1.0196560196560196, + "grad_norm": 0.8534344434738159, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 830 + }, + { + "epoch": 1.031941031941032, + "grad_norm": 0.7415764331817627, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 840 + }, + { + "epoch": 1.0442260442260443, + "grad_norm": 0.74293053150177, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 850 + }, + { + "epoch": 1.0565110565110565, + "grad_norm": 0.697727382183075, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 860 + }, + { + "epoch": 1.0687960687960687, + "grad_norm": 0.8022570013999939, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 870 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.7545800805091858, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 880 + }, + { + "epoch": 1.0933660933660934, + "grad_norm": 0.8005648255348206, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 890 + }, + { + "epoch": 1.1056511056511056, + "grad_norm": 0.7681778073310852, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 900 + }, + { + "epoch": 1.117936117936118, + "grad_norm": 0.7822468876838684, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 910 + }, + { + "epoch": 1.1302211302211302, + "grad_norm": 0.8324839472770691, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 920 + }, + { + "epoch": 1.1425061425061425, + "grad_norm": 0.8206289410591125, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 930 + }, + { + "epoch": 1.154791154791155, + "grad_norm": 0.786461591720581, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 940 + }, + { + "epoch": 1.1670761670761671, + "grad_norm": 0.8288539052009583, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 950 + }, + { + "epoch": 1.1793611793611793, + "grad_norm": 0.7566865682601929, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 960 + }, + { + "epoch": 1.1916461916461916, + "grad_norm": 0.7761894464492798, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 970 + }, + { + "epoch": 1.203931203931204, + "grad_norm": 0.7608440518379211, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 980 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.799745500087738, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 990 + }, + { + "epoch": 1.2285012285012284, + "grad_norm": 0.8135330677032471, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1000 + }, + { + "epoch": 1.2407862407862407, + "grad_norm": 0.7410391569137573, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 1010 + }, + { + "epoch": 1.253071253071253, + "grad_norm": 0.7826172709465027, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 1020 + }, + { + "epoch": 1.2653562653562653, + "grad_norm": 0.7210677862167358, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 1030 + }, + { + "epoch": 1.2776412776412776, + "grad_norm": 0.7571766972541809, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 1040 + }, + { + "epoch": 1.28992628992629, + "grad_norm": 0.8602666258811951, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 1050 + }, + { + "epoch": 1.3022113022113022, + "grad_norm": 0.8640648722648621, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 1060 + }, + { + "epoch": 1.3144963144963144, + "grad_norm": 0.7289374470710754, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 1070 + }, + { + "epoch": 1.3267813267813269, + "grad_norm": 0.8099908828735352, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 1080 + }, + { + "epoch": 1.339066339066339, + "grad_norm": 0.8623505234718323, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 1090 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.900576114654541, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 1100 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.729603111743927, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 1110 + }, + { + "epoch": 1.375921375921376, + "grad_norm": 0.8350434303283691, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 1120 + }, + { + "epoch": 1.3882063882063882, + "grad_norm": 0.8049437999725342, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 1130 + }, + { + "epoch": 1.4004914004914004, + "grad_norm": 0.8222764134407043, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 1140 + }, + { + "epoch": 1.4127764127764126, + "grad_norm": 0.7949751019477844, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 1150 + }, + { + "epoch": 1.425061425061425, + "grad_norm": 0.8375639915466309, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 1160 + }, + { + "epoch": 1.4373464373464373, + "grad_norm": 0.7261053919792175, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1170 + }, + { + "epoch": 1.4496314496314495, + "grad_norm": 0.6918320655822754, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 1180 + }, + { + "epoch": 1.461916461916462, + "grad_norm": 0.8148727416992188, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 1190 + }, + { + "epoch": 1.4742014742014742, + "grad_norm": 0.7014724612236023, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 1200 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.8110846281051636, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 1210 + }, + { + "epoch": 1.4987714987714988, + "grad_norm": 0.8336407542228699, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 1220 + }, + { + "epoch": 1.511056511056511, + "grad_norm": 0.826996386051178, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 1230 + }, + { + "epoch": 1.5233415233415233, + "grad_norm": 0.7503120303153992, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 1240 + }, + { + "epoch": 1.5356265356265357, + "grad_norm": 0.8297192454338074, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 1250 + }, + { + "epoch": 1.547911547911548, + "grad_norm": 0.7585996985435486, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 1260 + }, + { + "epoch": 1.5601965601965602, + "grad_norm": 0.7530493140220642, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 1270 + }, + { + "epoch": 1.5724815724815726, + "grad_norm": 0.8141939640045166, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 1280 + }, + { + "epoch": 1.5847665847665846, + "grad_norm": 0.6959931254386902, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 1290 + }, + { + "epoch": 1.597051597051597, + "grad_norm": 0.8677428364753723, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 1300 + }, + { + "epoch": 1.6093366093366095, + "grad_norm": 0.8527476787567139, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 1310 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.8462157845497131, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 1320 + }, + { + "epoch": 1.633906633906634, + "grad_norm": 0.9371153712272644, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 1330 + }, + { + "epoch": 1.6461916461916462, + "grad_norm": 0.8408344984054565, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 1340 + }, + { + "epoch": 1.6584766584766584, + "grad_norm": 0.8391859531402588, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 1350 + }, + { + "epoch": 1.6707616707616708, + "grad_norm": 0.7630598545074463, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 1360 + }, + { + "epoch": 1.683046683046683, + "grad_norm": 0.8007895350456238, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 1370 + }, + { + "epoch": 1.6953316953316953, + "grad_norm": 0.7547900080680847, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 1380 + }, + { + "epoch": 1.7076167076167077, + "grad_norm": 0.7779742479324341, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 1390 + }, + { + "epoch": 1.71990171990172, + "grad_norm": 0.712293803691864, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 1400 + }, + { + "epoch": 1.7321867321867321, + "grad_norm": 0.8503297567367554, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 1410 + }, + { + "epoch": 1.7444717444717446, + "grad_norm": 0.8312245607376099, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 1420 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.7758049368858337, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 1430 + }, + { + "epoch": 1.769041769041769, + "grad_norm": 0.8695956468582153, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 1440 + }, + { + "epoch": 1.7813267813267815, + "grad_norm": 0.7785261273384094, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 1450 + }, + { + "epoch": 1.7936117936117935, + "grad_norm": 0.7091802358627319, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 1460 + }, + { + "epoch": 1.805896805896806, + "grad_norm": 0.774146556854248, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 1470 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.8342524170875549, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 1480 + }, + { + "epoch": 1.8304668304668303, + "grad_norm": 0.8087738156318665, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 1490 + }, + { + "epoch": 1.8427518427518428, + "grad_norm": 0.9830479621887207, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 1500 + }, + { + "epoch": 1.855036855036855, + "grad_norm": 0.8537567853927612, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 1510 + }, + { + "epoch": 1.8673218673218672, + "grad_norm": 0.8004562854766846, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 1520 + }, + { + "epoch": 1.8796068796068797, + "grad_norm": 0.8161284327507019, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 1530 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.8688093423843384, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 1540 + }, + { + "epoch": 1.904176904176904, + "grad_norm": 0.8287379741668701, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 1550 + }, + { + "epoch": 1.9164619164619165, + "grad_norm": 0.8050342202186584, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 1560 + }, + { + "epoch": 1.9287469287469288, + "grad_norm": 0.9273895621299744, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 1570 + }, + { + "epoch": 1.941031941031941, + "grad_norm": 0.8416891694068909, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 1580 + }, + { + "epoch": 1.9533169533169534, + "grad_norm": 0.7299820184707642, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 1590 + }, + { + "epoch": 1.9656019656019657, + "grad_norm": 0.7262272834777832, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 1600 + }, + { + "epoch": 1.9778869778869779, + "grad_norm": 0.8649004697799683, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 1610 + }, + { + "epoch": 1.9901719901719903, + "grad_norm": 0.8165444731712341, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 0.5858802795410156, + "eval_runtime": 22.6585, + "eval_samples_per_second": 14.608, + "eval_steps_per_second": 1.854, + "step": 1628 + }, + { + "epoch": 2.0024570024570023, + "grad_norm": 0.8142582178115845, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 1630 + }, + { + "epoch": 2.0147420147420148, + "grad_norm": 1.0637224912643433, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 1640 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.8923280239105225, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 1650 + }, + { + "epoch": 2.039312039312039, + "grad_norm": 0.8169175386428833, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 1660 + }, + { + "epoch": 2.0515970515970516, + "grad_norm": 0.8124040365219116, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 1670 + }, + { + "epoch": 2.063882063882064, + "grad_norm": 0.9228773713111877, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 1680 + }, + { + "epoch": 2.076167076167076, + "grad_norm": 0.7216871380805969, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 1690 + }, + { + "epoch": 2.0884520884520885, + "grad_norm": 0.8679503202438354, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 1700 + }, + { + "epoch": 2.100737100737101, + "grad_norm": 0.8627730011940002, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 1710 + }, + { + "epoch": 2.113022113022113, + "grad_norm": 0.9175152778625488, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 1720 + }, + { + "epoch": 2.1253071253071254, + "grad_norm": 0.7930372953414917, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 1730 + }, + { + "epoch": 2.1375921375921374, + "grad_norm": 0.8370155692100525, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 1740 + }, + { + "epoch": 2.14987714987715, + "grad_norm": 0.9121434688568115, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 1750 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.8703579306602478, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 1760 + }, + { + "epoch": 2.1744471744471743, + "grad_norm": 0.9270512461662292, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 1770 + }, + { + "epoch": 2.1867321867321867, + "grad_norm": 0.9372949600219727, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 1780 + }, + { + "epoch": 2.199017199017199, + "grad_norm": 0.8955178260803223, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 1790 + }, + { + "epoch": 2.211302211302211, + "grad_norm": 0.846102237701416, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 1800 + }, + { + "epoch": 2.2235872235872236, + "grad_norm": 0.9186713099479675, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 1810 + }, + { + "epoch": 2.235872235872236, + "grad_norm": 0.7695123553276062, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 1820 + }, + { + "epoch": 2.248157248157248, + "grad_norm": 0.7340332865715027, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 1830 + }, + { + "epoch": 2.2604422604422605, + "grad_norm": 0.8933137655258179, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 1840 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.7705038189888, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 1850 + }, + { + "epoch": 2.285012285012285, + "grad_norm": 0.8396083116531372, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 1860 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 0.7695736289024353, + "learning_rate": 0.0002, + "loss": 0.5335, + "step": 1870 + }, + { + "epoch": 2.30958230958231, + "grad_norm": 0.8535045385360718, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 1880 + }, + { + "epoch": 2.321867321867322, + "grad_norm": 0.8549142479896545, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 1890 + }, + { + "epoch": 2.3341523341523343, + "grad_norm": 0.9124433994293213, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 1900 + }, + { + "epoch": 2.3464373464373462, + "grad_norm": 0.855523943901062, + "learning_rate": 0.0002, + "loss": 0.506, + "step": 1910 + }, + { + "epoch": 2.3587223587223587, + "grad_norm": 0.810878336429596, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 1920 + }, + { + "epoch": 2.371007371007371, + "grad_norm": 0.7409024834632874, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 1930 + }, + { + "epoch": 2.383292383292383, + "grad_norm": 0.8080927729606628, + "learning_rate": 0.0002, + "loss": 0.5045, + "step": 1940 + }, + { + "epoch": 2.3955773955773956, + "grad_norm": 0.9661469459533691, + "learning_rate": 0.0002, + "loss": 0.5032, + "step": 1950 + }, + { + "epoch": 2.407862407862408, + "grad_norm": 0.838766872882843, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 1960 + }, + { + "epoch": 2.42014742014742, + "grad_norm": 0.8737491965293884, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 1970 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.8657792210578918, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 1980 + }, + { + "epoch": 2.444717444717445, + "grad_norm": 0.8883858919143677, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 1990 + }, + { + "epoch": 2.457002457002457, + "grad_norm": 0.8647662997245789, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 2000 + }, + { + "epoch": 2.4692874692874693, + "grad_norm": 0.896037757396698, + "learning_rate": 0.0002, + "loss": 0.518, + "step": 2010 + }, + { + "epoch": 2.4815724815724813, + "grad_norm": 0.8079167008399963, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 2020 + }, + { + "epoch": 2.493857493857494, + "grad_norm": 1.0293292999267578, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 2030 + }, + { + "epoch": 2.506142506142506, + "grad_norm": 0.8459244966506958, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 2040 + }, + { + "epoch": 2.5184275184275187, + "grad_norm": 0.9244982600212097, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 2050 + }, + { + "epoch": 2.5307125307125307, + "grad_norm": 0.8245007991790771, + "learning_rate": 0.0002, + "loss": 0.5006, + "step": 2060 + }, + { + "epoch": 2.542997542997543, + "grad_norm": 0.8869297504425049, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 2070 + }, + { + "epoch": 2.555282555282555, + "grad_norm": 0.8620884418487549, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 2080 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.8387904167175293, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 2090 + }, + { + "epoch": 2.57985257985258, + "grad_norm": 0.8353935480117798, + "learning_rate": 0.0002, + "loss": 0.4974, + "step": 2100 + }, + { + "epoch": 2.592137592137592, + "grad_norm": 1.0136934518814087, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 2110 + }, + { + "epoch": 2.6044226044226044, + "grad_norm": 0.9387392997741699, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 2120 + }, + { + "epoch": 2.616707616707617, + "grad_norm": 0.898697555065155, + "learning_rate": 0.0002, + "loss": 0.4971, + "step": 2130 + }, + { + "epoch": 2.628992628992629, + "grad_norm": 1.0145231485366821, + "learning_rate": 0.0002, + "loss": 0.4981, + "step": 2140 + }, + { + "epoch": 2.6412776412776413, + "grad_norm": 0.8335273265838623, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 2150 + }, + { + "epoch": 2.6535626535626538, + "grad_norm": 1.0198529958724976, + "learning_rate": 0.0002, + "loss": 0.5129, + "step": 2160 + }, + { + "epoch": 2.6658476658476657, + "grad_norm": 0.8353323340415955, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 2170 + }, + { + "epoch": 2.678132678132678, + "grad_norm": 0.8831406831741333, + "learning_rate": 0.0002, + "loss": 0.4818, + "step": 2180 + }, + { + "epoch": 2.69041769041769, + "grad_norm": 0.7182748913764954, + "learning_rate": 0.0002, + "loss": 0.4858, + "step": 2190 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 0.7892552614212036, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 2200 + }, + { + "epoch": 2.714987714987715, + "grad_norm": 1.0144033432006836, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 2210 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 1.0913645029067993, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 2220 + }, + { + "epoch": 2.7395577395577395, + "grad_norm": 1.014394998550415, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 2230 + }, + { + "epoch": 2.751842751842752, + "grad_norm": 0.8118020296096802, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 2240 + }, + { + "epoch": 2.764127764127764, + "grad_norm": 0.9027737379074097, + "learning_rate": 0.0002, + "loss": 0.5088, + "step": 2250 + }, + { + "epoch": 2.7764127764127764, + "grad_norm": 0.8017747402191162, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 2260 + }, + { + "epoch": 2.788697788697789, + "grad_norm": 0.788362979888916, + "learning_rate": 0.0002, + "loss": 0.4957, + "step": 2270 + }, + { + "epoch": 2.800982800982801, + "grad_norm": 0.8338918089866638, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 2280 + }, + { + "epoch": 2.8132678132678133, + "grad_norm": 0.8773167729377747, + "learning_rate": 0.0002, + "loss": 0.4925, + "step": 2290 + }, + { + "epoch": 2.8255528255528253, + "grad_norm": 0.9319674372673035, + "learning_rate": 0.0002, + "loss": 0.4806, + "step": 2300 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 0.8632726073265076, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 2310 + }, + { + "epoch": 2.85012285012285, + "grad_norm": 0.785464882850647, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 2320 + }, + { + "epoch": 2.8624078624078626, + "grad_norm": 0.8159732818603516, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 2330 + }, + { + "epoch": 2.8746928746928746, + "grad_norm": 0.8702368140220642, + "learning_rate": 0.0002, + "loss": 0.4796, + "step": 2340 + }, + { + "epoch": 2.886977886977887, + "grad_norm": 1.0456738471984863, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 2350 + }, + { + "epoch": 2.899262899262899, + "grad_norm": 1.0855203866958618, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 2360 + }, + { + "epoch": 2.9115479115479115, + "grad_norm": 0.9378156065940857, + "learning_rate": 0.0002, + "loss": 0.4758, + "step": 2370 + }, + { + "epoch": 2.923832923832924, + "grad_norm": 0.7390182018280029, + "learning_rate": 0.0002, + "loss": 0.4831, + "step": 2380 + }, + { + "epoch": 2.9361179361179364, + "grad_norm": 0.7667133212089539, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 2390 + }, + { + "epoch": 2.9484029484029484, + "grad_norm": 0.8633476495742798, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 2400 + }, + { + "epoch": 2.960687960687961, + "grad_norm": 1.0821104049682617, + "learning_rate": 0.0002, + "loss": 0.4993, + "step": 2410 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.8911418914794922, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 2420 + }, + { + "epoch": 2.9852579852579852, + "grad_norm": 0.8791135549545288, + "learning_rate": 0.0002, + "loss": 0.4819, + "step": 2430 + }, + { + "epoch": 2.9975429975429977, + "grad_norm": 0.8066530823707581, + "learning_rate": 0.0002, + "loss": 0.4875, + "step": 2440 + }, + { + "epoch": 3.0, + "eval_loss": 0.49752503633499146, + "eval_runtime": 20.2911, + "eval_samples_per_second": 16.313, + "eval_steps_per_second": 2.07, + "step": 2442 + } + ], + "logging_steps": 10, + "max_steps": 6512, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.984611505032397e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/training_args.bin b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..73a75ebfc12aed51385aab437d91632ee4c20317 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2809e8544b7de8b298d0b325fb6c98eb9f853fd72d7cbae286b6ee1541e6aee9 +size 5560 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/README.md b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/adapter_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/adapter_model.safetensors b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..57c25da613bb2eab59bfbe92e5288d113d8e3f61 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aaf02c68747e4e2e3ba81e34517865365822600ec522965f8661c772b87c326 +size 29500848 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/optimizer.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..417c816b8122705daa2420c003716555a4d3e9e7 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1de34fd1171b91bc0438cda2d7077c8f3b303bbff2df442be3951728fa9d62d +size 15064314 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/rng_state.pth b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc18d93e2a50b75675b4a4dc3085ac8f7624db71 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e96e481c03422b79cfdd79b7439da0fa8a60de846a239e6cdcd715515b106b9 +size 14244 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/scheduler.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef656b45339a207e8b5d30d5d36c482c3394e9c6 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:052b1bd34e6c16d4a674059140403503bdd5c083ad416c0d4553a3abac51e2d3 +size 1064 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/special_tokens_map.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/tokenizer.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/tokenizer.model b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/tokenizer_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/trainer_state.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..51b8ef89047f52ce2eb83abfcb96b7830d0aca1c --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/trainer_state.json @@ -0,0 +1,2340 @@ +{ + "best_metric": 0.4401616156101227, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 3256, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012285012285012284, + "grad_norm": 0.8178550004959106, + "learning_rate": 0.0002, + "loss": 3.5354, + "step": 10 + }, + { + "epoch": 0.02457002457002457, + "grad_norm": 1.0338047742843628, + "learning_rate": 0.0002, + "loss": 2.534, + "step": 20 + }, + { + "epoch": 0.036855036855036855, + "grad_norm": 0.8931729197502136, + "learning_rate": 0.0002, + "loss": 2.1691, + "step": 30 + }, + { + "epoch": 0.04914004914004914, + "grad_norm": 0.9666458964347839, + "learning_rate": 0.0002, + "loss": 1.8813, + "step": 40 + }, + { + "epoch": 0.06142506142506143, + "grad_norm": 1.2691702842712402, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 50 + }, + { + "epoch": 0.07371007371007371, + "grad_norm": 1.0307111740112305, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 60 + }, + { + "epoch": 0.085995085995086, + "grad_norm": 1.1837389469146729, + "learning_rate": 0.0002, + "loss": 1.2987, + "step": 70 + }, + { + "epoch": 0.09828009828009827, + "grad_norm": 1.1481467485427856, + "learning_rate": 0.0002, + "loss": 1.2325, + "step": 80 + }, + { + "epoch": 0.11056511056511056, + "grad_norm": 1.0385297536849976, + "learning_rate": 0.0002, + "loss": 1.1425, + "step": 90 + }, + { + "epoch": 0.12285012285012285, + "grad_norm": 1.125789999961853, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 100 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 0.9630613923072815, + "learning_rate": 0.0002, + "loss": 1.0477, + "step": 110 + }, + { + "epoch": 0.14742014742014742, + "grad_norm": 1.060392141342163, + "learning_rate": 0.0002, + "loss": 1.0074, + "step": 120 + }, + { + "epoch": 0.1597051597051597, + "grad_norm": 1.0986546277999878, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 130 + }, + { + "epoch": 0.171990171990172, + "grad_norm": 1.1713459491729736, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 140 + }, + { + "epoch": 0.18427518427518427, + "grad_norm": 1.1548224687576294, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 150 + }, + { + "epoch": 0.19656019656019655, + "grad_norm": 1.2662502527236938, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 160 + }, + { + "epoch": 0.20884520884520885, + "grad_norm": 1.1521110534667969, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 170 + }, + { + "epoch": 0.22113022113022113, + "grad_norm": 1.1044857501983643, + "learning_rate": 0.0002, + "loss": 0.8931, + "step": 180 + }, + { + "epoch": 0.2334152334152334, + "grad_norm": 0.9770650267601013, + "learning_rate": 0.0002, + "loss": 0.9572, + "step": 190 + }, + { + "epoch": 0.2457002457002457, + "grad_norm": 0.9710931777954102, + "learning_rate": 0.0002, + "loss": 0.881, + "step": 200 + }, + { + "epoch": 0.257985257985258, + "grad_norm": 0.9593933820724487, + "learning_rate": 0.0002, + "loss": 0.9205, + "step": 210 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.003553032875061, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 220 + }, + { + "epoch": 0.28255528255528256, + "grad_norm": 0.9187764525413513, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 230 + }, + { + "epoch": 0.29484029484029484, + "grad_norm": 0.9294946789741516, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 240 + }, + { + "epoch": 0.3071253071253071, + "grad_norm": 0.9537560939788818, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 250 + }, + { + "epoch": 0.3194103194103194, + "grad_norm": 1.00537109375, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 260 + }, + { + "epoch": 0.3316953316953317, + "grad_norm": 0.8775776028633118, + "learning_rate": 0.0002, + "loss": 0.896, + "step": 270 + }, + { + "epoch": 0.343980343980344, + "grad_norm": 0.8316839933395386, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 280 + }, + { + "epoch": 0.35626535626535627, + "grad_norm": 0.8542073965072632, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 290 + }, + { + "epoch": 0.36855036855036855, + "grad_norm": 0.848444402217865, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 300 + }, + { + "epoch": 0.3808353808353808, + "grad_norm": 0.9017520546913147, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 310 + }, + { + "epoch": 0.3931203931203931, + "grad_norm": 0.7672467231750488, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 320 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 0.9109916687011719, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 330 + }, + { + "epoch": 0.4176904176904177, + "grad_norm": 0.8750321269035339, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 340 + }, + { + "epoch": 0.42997542997543, + "grad_norm": 0.7911098599433899, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 350 + }, + { + "epoch": 0.44226044226044225, + "grad_norm": 0.871601402759552, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 360 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.9393917918205261, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 370 + }, + { + "epoch": 0.4668304668304668, + "grad_norm": 0.8260403275489807, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 380 + }, + { + "epoch": 0.47911547911547914, + "grad_norm": 0.9792159199714661, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 390 + }, + { + "epoch": 0.4914004914004914, + "grad_norm": 0.9943315982818604, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 400 + }, + { + "epoch": 0.5036855036855037, + "grad_norm": 0.8999950885772705, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 410 + }, + { + "epoch": 0.515970515970516, + "grad_norm": 0.8348393440246582, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 420 + }, + { + "epoch": 0.5282555282555282, + "grad_norm": 0.7371744513511658, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 430 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.8354107141494751, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 440 + }, + { + "epoch": 0.5528255528255528, + "grad_norm": 0.8553793430328369, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 450 + }, + { + "epoch": 0.5651105651105651, + "grad_norm": 1.0762015581130981, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 460 + }, + { + "epoch": 0.5773955773955773, + "grad_norm": 0.8350747227668762, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 470 + }, + { + "epoch": 0.5896805896805897, + "grad_norm": 0.7819945216178894, + "learning_rate": 0.0002, + "loss": 0.7922, + "step": 480 + }, + { + "epoch": 0.601965601965602, + "grad_norm": 0.8079741597175598, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 490 + }, + { + "epoch": 0.6142506142506142, + "grad_norm": 0.776435911655426, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 500 + }, + { + "epoch": 0.6265356265356266, + "grad_norm": 0.7646855115890503, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 510 + }, + { + "epoch": 0.6388206388206388, + "grad_norm": 0.786396861076355, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 520 + }, + { + "epoch": 0.6511056511056511, + "grad_norm": 0.7016594409942627, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 530 + }, + { + "epoch": 0.6633906633906634, + "grad_norm": 0.8060444593429565, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 540 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.9087467789649963, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 550 + }, + { + "epoch": 0.687960687960688, + "grad_norm": 0.8149628639221191, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 560 + }, + { + "epoch": 0.7002457002457002, + "grad_norm": 0.7493641972541809, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 570 + }, + { + "epoch": 0.7125307125307125, + "grad_norm": 0.7958765625953674, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 580 + }, + { + "epoch": 0.7248157248157249, + "grad_norm": 0.7917273640632629, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 590 + }, + { + "epoch": 0.7371007371007371, + "grad_norm": 0.8040468692779541, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 600 + }, + { + "epoch": 0.7493857493857494, + "grad_norm": 0.8696851134300232, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 610 + }, + { + "epoch": 0.7616707616707616, + "grad_norm": 0.8418059945106506, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 620 + }, + { + "epoch": 0.773955773955774, + "grad_norm": 0.7754243612289429, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 630 + }, + { + "epoch": 0.7862407862407862, + "grad_norm": 0.7639613747596741, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 640 + }, + { + "epoch": 0.7985257985257985, + "grad_norm": 0.7516646385192871, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 650 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.7840844988822937, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 660 + }, + { + "epoch": 0.8230958230958231, + "grad_norm": 0.7657070755958557, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 670 + }, + { + "epoch": 0.8353808353808354, + "grad_norm": 0.7711591720581055, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 680 + }, + { + "epoch": 0.8476658476658476, + "grad_norm": 0.8026325106620789, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 690 + }, + { + "epoch": 0.85995085995086, + "grad_norm": 0.7902713418006897, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 700 + }, + { + "epoch": 0.8722358722358723, + "grad_norm": 0.8212456107139587, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 710 + }, + { + "epoch": 0.8845208845208845, + "grad_norm": 0.7867200970649719, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 720 + }, + { + "epoch": 0.8968058968058968, + "grad_norm": 0.80084627866745, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 730 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.7203794121742249, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 740 + }, + { + "epoch": 0.9213759213759214, + "grad_norm": 0.7598419785499573, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 750 + }, + { + "epoch": 0.9336609336609336, + "grad_norm": 0.7787027359008789, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 760 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.8444012403488159, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 770 + }, + { + "epoch": 0.9582309582309583, + "grad_norm": 0.7388550639152527, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 780 + }, + { + "epoch": 0.9705159705159705, + "grad_norm": 0.7379167079925537, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 790 + }, + { + "epoch": 0.9828009828009828, + "grad_norm": 0.8291640281677246, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 800 + }, + { + "epoch": 0.995085995085995, + "grad_norm": 0.7415094375610352, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 0.703994870185852, + "eval_runtime": 20.2182, + "eval_samples_per_second": 16.371, + "eval_steps_per_second": 2.077, + "step": 814 + }, + { + "epoch": 1.0073710073710074, + "grad_norm": 0.7405961751937866, + "learning_rate": 0.0002, + "loss": 0.6959, + "step": 820 + }, + { + "epoch": 1.0196560196560196, + "grad_norm": 0.8534344434738159, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 830 + }, + { + "epoch": 1.031941031941032, + "grad_norm": 0.7415764331817627, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 840 + }, + { + "epoch": 1.0442260442260443, + "grad_norm": 0.74293053150177, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 850 + }, + { + "epoch": 1.0565110565110565, + "grad_norm": 0.697727382183075, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 860 + }, + { + "epoch": 1.0687960687960687, + "grad_norm": 0.8022570013999939, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 870 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.7545800805091858, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 880 + }, + { + "epoch": 1.0933660933660934, + "grad_norm": 0.8005648255348206, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 890 + }, + { + "epoch": 1.1056511056511056, + "grad_norm": 0.7681778073310852, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 900 + }, + { + "epoch": 1.117936117936118, + "grad_norm": 0.7822468876838684, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 910 + }, + { + "epoch": 1.1302211302211302, + "grad_norm": 0.8324839472770691, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 920 + }, + { + "epoch": 1.1425061425061425, + "grad_norm": 0.8206289410591125, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 930 + }, + { + "epoch": 1.154791154791155, + "grad_norm": 0.786461591720581, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 940 + }, + { + "epoch": 1.1670761670761671, + "grad_norm": 0.8288539052009583, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 950 + }, + { + "epoch": 1.1793611793611793, + "grad_norm": 0.7566865682601929, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 960 + }, + { + "epoch": 1.1916461916461916, + "grad_norm": 0.7761894464492798, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 970 + }, + { + "epoch": 1.203931203931204, + "grad_norm": 0.7608440518379211, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 980 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.799745500087738, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 990 + }, + { + "epoch": 1.2285012285012284, + "grad_norm": 0.8135330677032471, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1000 + }, + { + "epoch": 1.2407862407862407, + "grad_norm": 0.7410391569137573, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 1010 + }, + { + "epoch": 1.253071253071253, + "grad_norm": 0.7826172709465027, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 1020 + }, + { + "epoch": 1.2653562653562653, + "grad_norm": 0.7210677862167358, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 1030 + }, + { + "epoch": 1.2776412776412776, + "grad_norm": 0.7571766972541809, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 1040 + }, + { + "epoch": 1.28992628992629, + "grad_norm": 0.8602666258811951, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 1050 + }, + { + "epoch": 1.3022113022113022, + "grad_norm": 0.8640648722648621, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 1060 + }, + { + "epoch": 1.3144963144963144, + "grad_norm": 0.7289374470710754, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 1070 + }, + { + "epoch": 1.3267813267813269, + "grad_norm": 0.8099908828735352, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 1080 + }, + { + "epoch": 1.339066339066339, + "grad_norm": 0.8623505234718323, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 1090 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.900576114654541, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 1100 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.729603111743927, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 1110 + }, + { + "epoch": 1.375921375921376, + "grad_norm": 0.8350434303283691, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 1120 + }, + { + "epoch": 1.3882063882063882, + "grad_norm": 0.8049437999725342, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 1130 + }, + { + "epoch": 1.4004914004914004, + "grad_norm": 0.8222764134407043, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 1140 + }, + { + "epoch": 1.4127764127764126, + "grad_norm": 0.7949751019477844, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 1150 + }, + { + "epoch": 1.425061425061425, + "grad_norm": 0.8375639915466309, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 1160 + }, + { + "epoch": 1.4373464373464373, + "grad_norm": 0.7261053919792175, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1170 + }, + { + "epoch": 1.4496314496314495, + "grad_norm": 0.6918320655822754, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 1180 + }, + { + "epoch": 1.461916461916462, + "grad_norm": 0.8148727416992188, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 1190 + }, + { + "epoch": 1.4742014742014742, + "grad_norm": 0.7014724612236023, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 1200 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.8110846281051636, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 1210 + }, + { + "epoch": 1.4987714987714988, + "grad_norm": 0.8336407542228699, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 1220 + }, + { + "epoch": 1.511056511056511, + "grad_norm": 0.826996386051178, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 1230 + }, + { + "epoch": 1.5233415233415233, + "grad_norm": 0.7503120303153992, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 1240 + }, + { + "epoch": 1.5356265356265357, + "grad_norm": 0.8297192454338074, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 1250 + }, + { + "epoch": 1.547911547911548, + "grad_norm": 0.7585996985435486, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 1260 + }, + { + "epoch": 1.5601965601965602, + "grad_norm": 0.7530493140220642, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 1270 + }, + { + "epoch": 1.5724815724815726, + "grad_norm": 0.8141939640045166, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 1280 + }, + { + "epoch": 1.5847665847665846, + "grad_norm": 0.6959931254386902, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 1290 + }, + { + "epoch": 1.597051597051597, + "grad_norm": 0.8677428364753723, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 1300 + }, + { + "epoch": 1.6093366093366095, + "grad_norm": 0.8527476787567139, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 1310 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.8462157845497131, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 1320 + }, + { + "epoch": 1.633906633906634, + "grad_norm": 0.9371153712272644, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 1330 + }, + { + "epoch": 1.6461916461916462, + "grad_norm": 0.8408344984054565, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 1340 + }, + { + "epoch": 1.6584766584766584, + "grad_norm": 0.8391859531402588, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 1350 + }, + { + "epoch": 1.6707616707616708, + "grad_norm": 0.7630598545074463, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 1360 + }, + { + "epoch": 1.683046683046683, + "grad_norm": 0.8007895350456238, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 1370 + }, + { + "epoch": 1.6953316953316953, + "grad_norm": 0.7547900080680847, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 1380 + }, + { + "epoch": 1.7076167076167077, + "grad_norm": 0.7779742479324341, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 1390 + }, + { + "epoch": 1.71990171990172, + "grad_norm": 0.712293803691864, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 1400 + }, + { + "epoch": 1.7321867321867321, + "grad_norm": 0.8503297567367554, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 1410 + }, + { + "epoch": 1.7444717444717446, + "grad_norm": 0.8312245607376099, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 1420 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.7758049368858337, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 1430 + }, + { + "epoch": 1.769041769041769, + "grad_norm": 0.8695956468582153, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 1440 + }, + { + "epoch": 1.7813267813267815, + "grad_norm": 0.7785261273384094, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 1450 + }, + { + "epoch": 1.7936117936117935, + "grad_norm": 0.7091802358627319, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 1460 + }, + { + "epoch": 1.805896805896806, + "grad_norm": 0.774146556854248, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 1470 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.8342524170875549, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 1480 + }, + { + "epoch": 1.8304668304668303, + "grad_norm": 0.8087738156318665, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 1490 + }, + { + "epoch": 1.8427518427518428, + "grad_norm": 0.9830479621887207, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 1500 + }, + { + "epoch": 1.855036855036855, + "grad_norm": 0.8537567853927612, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 1510 + }, + { + "epoch": 1.8673218673218672, + "grad_norm": 0.8004562854766846, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 1520 + }, + { + "epoch": 1.8796068796068797, + "grad_norm": 0.8161284327507019, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 1530 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.8688093423843384, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 1540 + }, + { + "epoch": 1.904176904176904, + "grad_norm": 0.8287379741668701, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 1550 + }, + { + "epoch": 1.9164619164619165, + "grad_norm": 0.8050342202186584, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 1560 + }, + { + "epoch": 1.9287469287469288, + "grad_norm": 0.9273895621299744, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 1570 + }, + { + "epoch": 1.941031941031941, + "grad_norm": 0.8416891694068909, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 1580 + }, + { + "epoch": 1.9533169533169534, + "grad_norm": 0.7299820184707642, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 1590 + }, + { + "epoch": 1.9656019656019657, + "grad_norm": 0.7262272834777832, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 1600 + }, + { + "epoch": 1.9778869778869779, + "grad_norm": 0.8649004697799683, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 1610 + }, + { + "epoch": 1.9901719901719903, + "grad_norm": 0.8165444731712341, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 0.5858802795410156, + "eval_runtime": 22.6585, + "eval_samples_per_second": 14.608, + "eval_steps_per_second": 1.854, + "step": 1628 + }, + { + "epoch": 2.0024570024570023, + "grad_norm": 0.8142582178115845, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 1630 + }, + { + "epoch": 2.0147420147420148, + "grad_norm": 1.0637224912643433, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 1640 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.8923280239105225, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 1650 + }, + { + "epoch": 2.039312039312039, + "grad_norm": 0.8169175386428833, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 1660 + }, + { + "epoch": 2.0515970515970516, + "grad_norm": 0.8124040365219116, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 1670 + }, + { + "epoch": 2.063882063882064, + "grad_norm": 0.9228773713111877, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 1680 + }, + { + "epoch": 2.076167076167076, + "grad_norm": 0.7216871380805969, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 1690 + }, + { + "epoch": 2.0884520884520885, + "grad_norm": 0.8679503202438354, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 1700 + }, + { + "epoch": 2.100737100737101, + "grad_norm": 0.8627730011940002, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 1710 + }, + { + "epoch": 2.113022113022113, + "grad_norm": 0.9175152778625488, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 1720 + }, + { + "epoch": 2.1253071253071254, + "grad_norm": 0.7930372953414917, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 1730 + }, + { + "epoch": 2.1375921375921374, + "grad_norm": 0.8370155692100525, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 1740 + }, + { + "epoch": 2.14987714987715, + "grad_norm": 0.9121434688568115, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 1750 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.8703579306602478, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 1760 + }, + { + "epoch": 2.1744471744471743, + "grad_norm": 0.9270512461662292, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 1770 + }, + { + "epoch": 2.1867321867321867, + "grad_norm": 0.9372949600219727, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 1780 + }, + { + "epoch": 2.199017199017199, + "grad_norm": 0.8955178260803223, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 1790 + }, + { + "epoch": 2.211302211302211, + "grad_norm": 0.846102237701416, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 1800 + }, + { + "epoch": 2.2235872235872236, + "grad_norm": 0.9186713099479675, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 1810 + }, + { + "epoch": 2.235872235872236, + "grad_norm": 0.7695123553276062, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 1820 + }, + { + "epoch": 2.248157248157248, + "grad_norm": 0.7340332865715027, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 1830 + }, + { + "epoch": 2.2604422604422605, + "grad_norm": 0.8933137655258179, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 1840 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.7705038189888, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 1850 + }, + { + "epoch": 2.285012285012285, + "grad_norm": 0.8396083116531372, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 1860 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 0.7695736289024353, + "learning_rate": 0.0002, + "loss": 0.5335, + "step": 1870 + }, + { + "epoch": 2.30958230958231, + "grad_norm": 0.8535045385360718, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 1880 + }, + { + "epoch": 2.321867321867322, + "grad_norm": 0.8549142479896545, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 1890 + }, + { + "epoch": 2.3341523341523343, + "grad_norm": 0.9124433994293213, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 1900 + }, + { + "epoch": 2.3464373464373462, + "grad_norm": 0.855523943901062, + "learning_rate": 0.0002, + "loss": 0.506, + "step": 1910 + }, + { + "epoch": 2.3587223587223587, + "grad_norm": 0.810878336429596, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 1920 + }, + { + "epoch": 2.371007371007371, + "grad_norm": 0.7409024834632874, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 1930 + }, + { + "epoch": 2.383292383292383, + "grad_norm": 0.8080927729606628, + "learning_rate": 0.0002, + "loss": 0.5045, + "step": 1940 + }, + { + "epoch": 2.3955773955773956, + "grad_norm": 0.9661469459533691, + "learning_rate": 0.0002, + "loss": 0.5032, + "step": 1950 + }, + { + "epoch": 2.407862407862408, + "grad_norm": 0.838766872882843, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 1960 + }, + { + "epoch": 2.42014742014742, + "grad_norm": 0.8737491965293884, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 1970 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.8657792210578918, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 1980 + }, + { + "epoch": 2.444717444717445, + "grad_norm": 0.8883858919143677, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 1990 + }, + { + "epoch": 2.457002457002457, + "grad_norm": 0.8647662997245789, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 2000 + }, + { + "epoch": 2.4692874692874693, + "grad_norm": 0.896037757396698, + "learning_rate": 0.0002, + "loss": 0.518, + "step": 2010 + }, + { + "epoch": 2.4815724815724813, + "grad_norm": 0.8079167008399963, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 2020 + }, + { + "epoch": 2.493857493857494, + "grad_norm": 1.0293292999267578, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 2030 + }, + { + "epoch": 2.506142506142506, + "grad_norm": 0.8459244966506958, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 2040 + }, + { + "epoch": 2.5184275184275187, + "grad_norm": 0.9244982600212097, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 2050 + }, + { + "epoch": 2.5307125307125307, + "grad_norm": 0.8245007991790771, + "learning_rate": 0.0002, + "loss": 0.5006, + "step": 2060 + }, + { + "epoch": 2.542997542997543, + "grad_norm": 0.8869297504425049, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 2070 + }, + { + "epoch": 2.555282555282555, + "grad_norm": 0.8620884418487549, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 2080 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.8387904167175293, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 2090 + }, + { + "epoch": 2.57985257985258, + "grad_norm": 0.8353935480117798, + "learning_rate": 0.0002, + "loss": 0.4974, + "step": 2100 + }, + { + "epoch": 2.592137592137592, + "grad_norm": 1.0136934518814087, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 2110 + }, + { + "epoch": 2.6044226044226044, + "grad_norm": 0.9387392997741699, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 2120 + }, + { + "epoch": 2.616707616707617, + "grad_norm": 0.898697555065155, + "learning_rate": 0.0002, + "loss": 0.4971, + "step": 2130 + }, + { + "epoch": 2.628992628992629, + "grad_norm": 1.0145231485366821, + "learning_rate": 0.0002, + "loss": 0.4981, + "step": 2140 + }, + { + "epoch": 2.6412776412776413, + "grad_norm": 0.8335273265838623, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 2150 + }, + { + "epoch": 2.6535626535626538, + "grad_norm": 1.0198529958724976, + "learning_rate": 0.0002, + "loss": 0.5129, + "step": 2160 + }, + { + "epoch": 2.6658476658476657, + "grad_norm": 0.8353323340415955, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 2170 + }, + { + "epoch": 2.678132678132678, + "grad_norm": 0.8831406831741333, + "learning_rate": 0.0002, + "loss": 0.4818, + "step": 2180 + }, + { + "epoch": 2.69041769041769, + "grad_norm": 0.7182748913764954, + "learning_rate": 0.0002, + "loss": 0.4858, + "step": 2190 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 0.7892552614212036, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 2200 + }, + { + "epoch": 2.714987714987715, + "grad_norm": 1.0144033432006836, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 2210 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 1.0913645029067993, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 2220 + }, + { + "epoch": 2.7395577395577395, + "grad_norm": 1.014394998550415, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 2230 + }, + { + "epoch": 2.751842751842752, + "grad_norm": 0.8118020296096802, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 2240 + }, + { + "epoch": 2.764127764127764, + "grad_norm": 0.9027737379074097, + "learning_rate": 0.0002, + "loss": 0.5088, + "step": 2250 + }, + { + "epoch": 2.7764127764127764, + "grad_norm": 0.8017747402191162, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 2260 + }, + { + "epoch": 2.788697788697789, + "grad_norm": 0.788362979888916, + "learning_rate": 0.0002, + "loss": 0.4957, + "step": 2270 + }, + { + "epoch": 2.800982800982801, + "grad_norm": 0.8338918089866638, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 2280 + }, + { + "epoch": 2.8132678132678133, + "grad_norm": 0.8773167729377747, + "learning_rate": 0.0002, + "loss": 0.4925, + "step": 2290 + }, + { + "epoch": 2.8255528255528253, + "grad_norm": 0.9319674372673035, + "learning_rate": 0.0002, + "loss": 0.4806, + "step": 2300 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 0.8632726073265076, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 2310 + }, + { + "epoch": 2.85012285012285, + "grad_norm": 0.785464882850647, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 2320 + }, + { + "epoch": 2.8624078624078626, + "grad_norm": 0.8159732818603516, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 2330 + }, + { + "epoch": 2.8746928746928746, + "grad_norm": 0.8702368140220642, + "learning_rate": 0.0002, + "loss": 0.4796, + "step": 2340 + }, + { + "epoch": 2.886977886977887, + "grad_norm": 1.0456738471984863, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 2350 + }, + { + "epoch": 2.899262899262899, + "grad_norm": 1.0855203866958618, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 2360 + }, + { + "epoch": 2.9115479115479115, + "grad_norm": 0.9378156065940857, + "learning_rate": 0.0002, + "loss": 0.4758, + "step": 2370 + }, + { + "epoch": 2.923832923832924, + "grad_norm": 0.7390182018280029, + "learning_rate": 0.0002, + "loss": 0.4831, + "step": 2380 + }, + { + "epoch": 2.9361179361179364, + "grad_norm": 0.7667133212089539, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 2390 + }, + { + "epoch": 2.9484029484029484, + "grad_norm": 0.8633476495742798, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 2400 + }, + { + "epoch": 2.960687960687961, + "grad_norm": 1.0821104049682617, + "learning_rate": 0.0002, + "loss": 0.4993, + "step": 2410 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.8911418914794922, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 2420 + }, + { + "epoch": 2.9852579852579852, + "grad_norm": 0.8791135549545288, + "learning_rate": 0.0002, + "loss": 0.4819, + "step": 2430 + }, + { + "epoch": 2.9975429975429977, + "grad_norm": 0.8066530823707581, + "learning_rate": 0.0002, + "loss": 0.4875, + "step": 2440 + }, + { + "epoch": 3.0, + "eval_loss": 0.49752503633499146, + "eval_runtime": 20.2911, + "eval_samples_per_second": 16.313, + "eval_steps_per_second": 2.07, + "step": 2442 + }, + { + "epoch": 3.0098280098280097, + "grad_norm": 0.7644656896591187, + "learning_rate": 0.0002, + "loss": 0.4362, + "step": 2450 + }, + { + "epoch": 3.022113022113022, + "grad_norm": 0.9077525734901428, + "learning_rate": 0.0002, + "loss": 0.4363, + "step": 2460 + }, + { + "epoch": 3.0343980343980346, + "grad_norm": 0.7859287261962891, + "learning_rate": 0.0002, + "loss": 0.422, + "step": 2470 + }, + { + "epoch": 3.0466830466830466, + "grad_norm": 1.1200323104858398, + "learning_rate": 0.0002, + "loss": 0.4574, + "step": 2480 + }, + { + "epoch": 3.058968058968059, + "grad_norm": 0.7570453882217407, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 2490 + }, + { + "epoch": 3.0712530712530715, + "grad_norm": 0.9450915455818176, + "learning_rate": 0.0002, + "loss": 0.4351, + "step": 2500 + }, + { + "epoch": 3.0835380835380835, + "grad_norm": 0.8303545117378235, + "learning_rate": 0.0002, + "loss": 0.4343, + "step": 2510 + }, + { + "epoch": 3.095823095823096, + "grad_norm": 0.8864443898200989, + "learning_rate": 0.0002, + "loss": 0.4308, + "step": 2520 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 0.945324718952179, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 2530 + }, + { + "epoch": 3.1203931203931203, + "grad_norm": 1.0562494993209839, + "learning_rate": 0.0002, + "loss": 0.4345, + "step": 2540 + }, + { + "epoch": 3.1326781326781328, + "grad_norm": 0.8607500195503235, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 2550 + }, + { + "epoch": 3.1449631449631448, + "grad_norm": 0.8719640374183655, + "learning_rate": 0.0002, + "loss": 0.456, + "step": 2560 + }, + { + "epoch": 3.157248157248157, + "grad_norm": 0.8647059202194214, + "learning_rate": 0.0002, + "loss": 0.4469, + "step": 2570 + }, + { + "epoch": 3.1695331695331697, + "grad_norm": 0.8346507549285889, + "learning_rate": 0.0002, + "loss": 0.4483, + "step": 2580 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 1.0208854675292969, + "learning_rate": 0.0002, + "loss": 0.4331, + "step": 2590 + }, + { + "epoch": 3.194103194103194, + "grad_norm": 0.7064385414123535, + "learning_rate": 0.0002, + "loss": 0.435, + "step": 2600 + }, + { + "epoch": 3.2063882063882065, + "grad_norm": 0.927347719669342, + "learning_rate": 0.0002, + "loss": 0.4541, + "step": 2610 + }, + { + "epoch": 3.2186732186732185, + "grad_norm": 0.943517804145813, + "learning_rate": 0.0002, + "loss": 0.4561, + "step": 2620 + }, + { + "epoch": 3.230958230958231, + "grad_norm": 0.7837198376655579, + "learning_rate": 0.0002, + "loss": 0.4225, + "step": 2630 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 0.7752765417098999, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 2640 + }, + { + "epoch": 3.2555282555282554, + "grad_norm": 0.8578953146934509, + "learning_rate": 0.0002, + "loss": 0.4468, + "step": 2650 + }, + { + "epoch": 3.267813267813268, + "grad_norm": 1.0209529399871826, + "learning_rate": 0.0002, + "loss": 0.4393, + "step": 2660 + }, + { + "epoch": 3.2800982800982803, + "grad_norm": 0.9069030284881592, + "learning_rate": 0.0002, + "loss": 0.4517, + "step": 2670 + }, + { + "epoch": 3.2923832923832923, + "grad_norm": 0.8454729318618774, + "learning_rate": 0.0002, + "loss": 0.4262, + "step": 2680 + }, + { + "epoch": 3.3046683046683047, + "grad_norm": 0.8253099322319031, + "learning_rate": 0.0002, + "loss": 0.4349, + "step": 2690 + }, + { + "epoch": 3.3169533169533167, + "grad_norm": 0.8765934109687805, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 2700 + }, + { + "epoch": 3.329238329238329, + "grad_norm": 0.8149126172065735, + "learning_rate": 0.0002, + "loss": 0.4518, + "step": 2710 + }, + { + "epoch": 3.3415233415233416, + "grad_norm": 0.8820102214813232, + "learning_rate": 0.0002, + "loss": 0.4437, + "step": 2720 + }, + { + "epoch": 3.3538083538083536, + "grad_norm": 0.8813952803611755, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 2730 + }, + { + "epoch": 3.366093366093366, + "grad_norm": 1.0338447093963623, + "learning_rate": 0.0002, + "loss": 0.4396, + "step": 2740 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 0.8780209422111511, + "learning_rate": 0.0002, + "loss": 0.4468, + "step": 2750 + }, + { + "epoch": 3.3906633906633905, + "grad_norm": 0.9017151594161987, + "learning_rate": 0.0002, + "loss": 0.441, + "step": 2760 + }, + { + "epoch": 3.402948402948403, + "grad_norm": 0.8647638559341431, + "learning_rate": 0.0002, + "loss": 0.446, + "step": 2770 + }, + { + "epoch": 3.4152334152334154, + "grad_norm": 0.8298183679580688, + "learning_rate": 0.0002, + "loss": 0.4131, + "step": 2780 + }, + { + "epoch": 3.4275184275184274, + "grad_norm": 0.9298108816146851, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 2790 + }, + { + "epoch": 3.43980343980344, + "grad_norm": 0.8909980058670044, + "learning_rate": 0.0002, + "loss": 0.4145, + "step": 2800 + }, + { + "epoch": 3.4520884520884523, + "grad_norm": 0.8027496933937073, + "learning_rate": 0.0002, + "loss": 0.4148, + "step": 2810 + }, + { + "epoch": 3.4643734643734643, + "grad_norm": 0.8766195774078369, + "learning_rate": 0.0002, + "loss": 0.4244, + "step": 2820 + }, + { + "epoch": 3.4766584766584767, + "grad_norm": 0.8194443583488464, + "learning_rate": 0.0002, + "loss": 0.4292, + "step": 2830 + }, + { + "epoch": 3.488943488943489, + "grad_norm": 0.9862873554229736, + "learning_rate": 0.0002, + "loss": 0.4305, + "step": 2840 + }, + { + "epoch": 3.501228501228501, + "grad_norm": 0.8755377531051636, + "learning_rate": 0.0002, + "loss": 0.4393, + "step": 2850 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 0.7300266027450562, + "learning_rate": 0.0002, + "loss": 0.4231, + "step": 2860 + }, + { + "epoch": 3.5257985257985256, + "grad_norm": 0.8342461585998535, + "learning_rate": 0.0002, + "loss": 0.4278, + "step": 2870 + }, + { + "epoch": 3.538083538083538, + "grad_norm": 0.8624151349067688, + "learning_rate": 0.0002, + "loss": 0.4395, + "step": 2880 + }, + { + "epoch": 3.5503685503685505, + "grad_norm": 0.8931261301040649, + "learning_rate": 0.0002, + "loss": 0.4064, + "step": 2890 + }, + { + "epoch": 3.562653562653563, + "grad_norm": 0.8617086410522461, + "learning_rate": 0.0002, + "loss": 0.4358, + "step": 2900 + }, + { + "epoch": 3.574938574938575, + "grad_norm": 0.8754099607467651, + "learning_rate": 0.0002, + "loss": 0.419, + "step": 2910 + }, + { + "epoch": 3.5872235872235874, + "grad_norm": 0.8345834612846375, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 2920 + }, + { + "epoch": 3.5995085995085994, + "grad_norm": 1.1414062976837158, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 2930 + }, + { + "epoch": 3.611793611793612, + "grad_norm": 0.994860053062439, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 2940 + }, + { + "epoch": 3.6240786240786242, + "grad_norm": 1.19268000125885, + "learning_rate": 0.0002, + "loss": 0.4386, + "step": 2950 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.8399543762207031, + "learning_rate": 0.0002, + "loss": 0.4029, + "step": 2960 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 0.9873217940330505, + "learning_rate": 0.0002, + "loss": 0.4432, + "step": 2970 + }, + { + "epoch": 3.6609336609336607, + "grad_norm": 0.9116013646125793, + "learning_rate": 0.0002, + "loss": 0.4308, + "step": 2980 + }, + { + "epoch": 3.673218673218673, + "grad_norm": 0.9503833651542664, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 2990 + }, + { + "epoch": 3.6855036855036856, + "grad_norm": 0.9401112794876099, + "learning_rate": 0.0002, + "loss": 0.4306, + "step": 3000 + }, + { + "epoch": 3.697788697788698, + "grad_norm": 1.00745689868927, + "learning_rate": 0.0002, + "loss": 0.4333, + "step": 3010 + }, + { + "epoch": 3.71007371007371, + "grad_norm": 1.0553191900253296, + "learning_rate": 0.0002, + "loss": 0.432, + "step": 3020 + }, + { + "epoch": 3.7223587223587224, + "grad_norm": 1.0226953029632568, + "learning_rate": 0.0002, + "loss": 0.4321, + "step": 3030 + }, + { + "epoch": 3.7346437346437344, + "grad_norm": 1.085554838180542, + "learning_rate": 0.0002, + "loss": 0.418, + "step": 3040 + }, + { + "epoch": 3.746928746928747, + "grad_norm": 0.9948731064796448, + "learning_rate": 0.0002, + "loss": 0.4196, + "step": 3050 + }, + { + "epoch": 3.7592137592137593, + "grad_norm": 0.9328727126121521, + "learning_rate": 0.0002, + "loss": 0.4281, + "step": 3060 + }, + { + "epoch": 3.7714987714987718, + "grad_norm": 1.0533266067504883, + "learning_rate": 0.0002, + "loss": 0.4284, + "step": 3070 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 0.8213809132575989, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 3080 + }, + { + "epoch": 3.796068796068796, + "grad_norm": 0.8941594362258911, + "learning_rate": 0.0002, + "loss": 0.4348, + "step": 3090 + }, + { + "epoch": 3.808353808353808, + "grad_norm": 0.8324518203735352, + "learning_rate": 0.0002, + "loss": 0.4266, + "step": 3100 + }, + { + "epoch": 3.8206388206388207, + "grad_norm": 0.8811233639717102, + "learning_rate": 0.0002, + "loss": 0.4227, + "step": 3110 + }, + { + "epoch": 3.832923832923833, + "grad_norm": 0.8781470060348511, + "learning_rate": 0.0002, + "loss": 0.4195, + "step": 3120 + }, + { + "epoch": 3.845208845208845, + "grad_norm": 0.8994116187095642, + "learning_rate": 0.0002, + "loss": 0.4277, + "step": 3130 + }, + { + "epoch": 3.8574938574938575, + "grad_norm": 0.8605017066001892, + "learning_rate": 0.0002, + "loss": 0.4149, + "step": 3140 + }, + { + "epoch": 3.8697788697788695, + "grad_norm": 0.8966400027275085, + "learning_rate": 0.0002, + "loss": 0.4023, + "step": 3150 + }, + { + "epoch": 3.882063882063882, + "grad_norm": 0.8856554627418518, + "learning_rate": 0.0002, + "loss": 0.4245, + "step": 3160 + }, + { + "epoch": 3.8943488943488944, + "grad_norm": 0.8971620798110962, + "learning_rate": 0.0002, + "loss": 0.4101, + "step": 3170 + }, + { + "epoch": 3.906633906633907, + "grad_norm": 0.9807813167572021, + "learning_rate": 0.0002, + "loss": 0.3993, + "step": 3180 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 0.8614121675491333, + "learning_rate": 0.0002, + "loss": 0.4258, + "step": 3190 + }, + { + "epoch": 3.9312039312039313, + "grad_norm": 0.989171028137207, + "learning_rate": 0.0002, + "loss": 0.4115, + "step": 3200 + }, + { + "epoch": 3.9434889434889433, + "grad_norm": 0.8168872594833374, + "learning_rate": 0.0002, + "loss": 0.4182, + "step": 3210 + }, + { + "epoch": 3.9557739557739557, + "grad_norm": 0.8109386563301086, + "learning_rate": 0.0002, + "loss": 0.4112, + "step": 3220 + }, + { + "epoch": 3.968058968058968, + "grad_norm": 1.0175853967666626, + "learning_rate": 0.0002, + "loss": 0.4165, + "step": 3230 + }, + { + "epoch": 3.98034398034398, + "grad_norm": 0.936143159866333, + "learning_rate": 0.0002, + "loss": 0.4146, + "step": 3240 + }, + { + "epoch": 3.9926289926289926, + "grad_norm": 0.9557915925979614, + "learning_rate": 0.0002, + "loss": 0.4163, + "step": 3250 + }, + { + "epoch": 4.0, + "eval_loss": 0.4401616156101227, + "eval_runtime": 20.8047, + "eval_samples_per_second": 15.91, + "eval_steps_per_second": 2.019, + "step": 3256 + } + ], + "logging_steps": 10, + "max_steps": 6512, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.979482006709862e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/training_args.bin b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..73a75ebfc12aed51385aab437d91632ee4c20317 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2809e8544b7de8b298d0b325fb6c98eb9f853fd72d7cbae286b6ee1541e6aee9 +size 5560 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/README.md b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/adapter_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/adapter_model.safetensors b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4dda251d96712ace49699a3e1887545dcaab5b60 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b154e3c9ff723f8964d80fb6716a2a5235fdd4102c7877cb3e9efa7fc1697b4 +size 29500848 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/optimizer.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9deb5105b82a76bcdcfc50284cccef1da2185187 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeb292829920897a5d39a38ed5a6be6013bb568f65f6dd09c3fc76d9afbe4b17 +size 15064314 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/rng_state.pth b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..60ff9b2a12b630e20a5573d41a49c10355cab5c6 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f690b2b370d5088d36b551f18fabfa9ce1a4948c7688bca4601e27707c6a691 +size 14244 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/scheduler.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd4d3dc7ab541de64c7cd8d5eddfe0c7f7c2daea --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:658ed4b5e4b2f574242efa0fcb5a3d72b4a37a60e9d7c28715ade14b38c49030 +size 1064 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/special_tokens_map.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/tokenizer.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/tokenizer.model b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/tokenizer_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/trainer_state.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f56a06a08c681a18e89a79f45f6f5a1430dbdb11 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/trainer_state.json @@ -0,0 +1,2922 @@ +{ + "best_metric": 0.4017423093318939, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070", + "epoch": 5.0, + "eval_steps": 10, + "global_step": 4070, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012285012285012284, + "grad_norm": 0.8178550004959106, + "learning_rate": 0.0002, + "loss": 3.5354, + "step": 10 + }, + { + "epoch": 0.02457002457002457, + "grad_norm": 1.0338047742843628, + "learning_rate": 0.0002, + "loss": 2.534, + "step": 20 + }, + { + "epoch": 0.036855036855036855, + "grad_norm": 0.8931729197502136, + "learning_rate": 0.0002, + "loss": 2.1691, + "step": 30 + }, + { + "epoch": 0.04914004914004914, + "grad_norm": 0.9666458964347839, + "learning_rate": 0.0002, + "loss": 1.8813, + "step": 40 + }, + { + "epoch": 0.06142506142506143, + "grad_norm": 1.2691702842712402, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 50 + }, + { + "epoch": 0.07371007371007371, + "grad_norm": 1.0307111740112305, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 60 + }, + { + "epoch": 0.085995085995086, + "grad_norm": 1.1837389469146729, + "learning_rate": 0.0002, + "loss": 1.2987, + "step": 70 + }, + { + "epoch": 0.09828009828009827, + "grad_norm": 1.1481467485427856, + "learning_rate": 0.0002, + "loss": 1.2325, + "step": 80 + }, + { + "epoch": 0.11056511056511056, + "grad_norm": 1.0385297536849976, + "learning_rate": 0.0002, + "loss": 1.1425, + "step": 90 + }, + { + "epoch": 0.12285012285012285, + "grad_norm": 1.125789999961853, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 100 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 0.9630613923072815, + "learning_rate": 0.0002, + "loss": 1.0477, + "step": 110 + }, + { + "epoch": 0.14742014742014742, + "grad_norm": 1.060392141342163, + "learning_rate": 0.0002, + "loss": 1.0074, + "step": 120 + }, + { + "epoch": 0.1597051597051597, + "grad_norm": 1.0986546277999878, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 130 + }, + { + "epoch": 0.171990171990172, + "grad_norm": 1.1713459491729736, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 140 + }, + { + "epoch": 0.18427518427518427, + "grad_norm": 1.1548224687576294, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 150 + }, + { + "epoch": 0.19656019656019655, + "grad_norm": 1.2662502527236938, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 160 + }, + { + "epoch": 0.20884520884520885, + "grad_norm": 1.1521110534667969, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 170 + }, + { + "epoch": 0.22113022113022113, + "grad_norm": 1.1044857501983643, + "learning_rate": 0.0002, + "loss": 0.8931, + "step": 180 + }, + { + "epoch": 0.2334152334152334, + "grad_norm": 0.9770650267601013, + "learning_rate": 0.0002, + "loss": 0.9572, + "step": 190 + }, + { + "epoch": 0.2457002457002457, + "grad_norm": 0.9710931777954102, + "learning_rate": 0.0002, + "loss": 0.881, + "step": 200 + }, + { + "epoch": 0.257985257985258, + "grad_norm": 0.9593933820724487, + "learning_rate": 0.0002, + "loss": 0.9205, + "step": 210 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.003553032875061, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 220 + }, + { + "epoch": 0.28255528255528256, + "grad_norm": 0.9187764525413513, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 230 + }, + { + "epoch": 0.29484029484029484, + "grad_norm": 0.9294946789741516, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 240 + }, + { + "epoch": 0.3071253071253071, + "grad_norm": 0.9537560939788818, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 250 + }, + { + "epoch": 0.3194103194103194, + "grad_norm": 1.00537109375, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 260 + }, + { + "epoch": 0.3316953316953317, + "grad_norm": 0.8775776028633118, + "learning_rate": 0.0002, + "loss": 0.896, + "step": 270 + }, + { + "epoch": 0.343980343980344, + "grad_norm": 0.8316839933395386, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 280 + }, + { + "epoch": 0.35626535626535627, + "grad_norm": 0.8542073965072632, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 290 + }, + { + "epoch": 0.36855036855036855, + "grad_norm": 0.848444402217865, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 300 + }, + { + "epoch": 0.3808353808353808, + "grad_norm": 0.9017520546913147, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 310 + }, + { + "epoch": 0.3931203931203931, + "grad_norm": 0.7672467231750488, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 320 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 0.9109916687011719, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 330 + }, + { + "epoch": 0.4176904176904177, + "grad_norm": 0.8750321269035339, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 340 + }, + { + "epoch": 0.42997542997543, + "grad_norm": 0.7911098599433899, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 350 + }, + { + "epoch": 0.44226044226044225, + "grad_norm": 0.871601402759552, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 360 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.9393917918205261, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 370 + }, + { + "epoch": 0.4668304668304668, + "grad_norm": 0.8260403275489807, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 380 + }, + { + "epoch": 0.47911547911547914, + "grad_norm": 0.9792159199714661, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 390 + }, + { + "epoch": 0.4914004914004914, + "grad_norm": 0.9943315982818604, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 400 + }, + { + "epoch": 0.5036855036855037, + "grad_norm": 0.8999950885772705, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 410 + }, + { + "epoch": 0.515970515970516, + "grad_norm": 0.8348393440246582, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 420 + }, + { + "epoch": 0.5282555282555282, + "grad_norm": 0.7371744513511658, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 430 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.8354107141494751, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 440 + }, + { + "epoch": 0.5528255528255528, + "grad_norm": 0.8553793430328369, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 450 + }, + { + "epoch": 0.5651105651105651, + "grad_norm": 1.0762015581130981, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 460 + }, + { + "epoch": 0.5773955773955773, + "grad_norm": 0.8350747227668762, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 470 + }, + { + "epoch": 0.5896805896805897, + "grad_norm": 0.7819945216178894, + "learning_rate": 0.0002, + "loss": 0.7922, + "step": 480 + }, + { + "epoch": 0.601965601965602, + "grad_norm": 0.8079741597175598, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 490 + }, + { + "epoch": 0.6142506142506142, + "grad_norm": 0.776435911655426, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 500 + }, + { + "epoch": 0.6265356265356266, + "grad_norm": 0.7646855115890503, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 510 + }, + { + "epoch": 0.6388206388206388, + "grad_norm": 0.786396861076355, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 520 + }, + { + "epoch": 0.6511056511056511, + "grad_norm": 0.7016594409942627, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 530 + }, + { + "epoch": 0.6633906633906634, + "grad_norm": 0.8060444593429565, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 540 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.9087467789649963, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 550 + }, + { + "epoch": 0.687960687960688, + "grad_norm": 0.8149628639221191, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 560 + }, + { + "epoch": 0.7002457002457002, + "grad_norm": 0.7493641972541809, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 570 + }, + { + "epoch": 0.7125307125307125, + "grad_norm": 0.7958765625953674, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 580 + }, + { + "epoch": 0.7248157248157249, + "grad_norm": 0.7917273640632629, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 590 + }, + { + "epoch": 0.7371007371007371, + "grad_norm": 0.8040468692779541, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 600 + }, + { + "epoch": 0.7493857493857494, + "grad_norm": 0.8696851134300232, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 610 + }, + { + "epoch": 0.7616707616707616, + "grad_norm": 0.8418059945106506, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 620 + }, + { + "epoch": 0.773955773955774, + "grad_norm": 0.7754243612289429, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 630 + }, + { + "epoch": 0.7862407862407862, + "grad_norm": 0.7639613747596741, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 640 + }, + { + "epoch": 0.7985257985257985, + "grad_norm": 0.7516646385192871, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 650 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.7840844988822937, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 660 + }, + { + "epoch": 0.8230958230958231, + "grad_norm": 0.7657070755958557, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 670 + }, + { + "epoch": 0.8353808353808354, + "grad_norm": 0.7711591720581055, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 680 + }, + { + "epoch": 0.8476658476658476, + "grad_norm": 0.8026325106620789, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 690 + }, + { + "epoch": 0.85995085995086, + "grad_norm": 0.7902713418006897, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 700 + }, + { + "epoch": 0.8722358722358723, + "grad_norm": 0.8212456107139587, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 710 + }, + { + "epoch": 0.8845208845208845, + "grad_norm": 0.7867200970649719, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 720 + }, + { + "epoch": 0.8968058968058968, + "grad_norm": 0.80084627866745, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 730 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.7203794121742249, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 740 + }, + { + "epoch": 0.9213759213759214, + "grad_norm": 0.7598419785499573, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 750 + }, + { + "epoch": 0.9336609336609336, + "grad_norm": 0.7787027359008789, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 760 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.8444012403488159, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 770 + }, + { + "epoch": 0.9582309582309583, + "grad_norm": 0.7388550639152527, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 780 + }, + { + "epoch": 0.9705159705159705, + "grad_norm": 0.7379167079925537, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 790 + }, + { + "epoch": 0.9828009828009828, + "grad_norm": 0.8291640281677246, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 800 + }, + { + "epoch": 0.995085995085995, + "grad_norm": 0.7415094375610352, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 0.703994870185852, + "eval_runtime": 20.2182, + "eval_samples_per_second": 16.371, + "eval_steps_per_second": 2.077, + "step": 814 + }, + { + "epoch": 1.0073710073710074, + "grad_norm": 0.7405961751937866, + "learning_rate": 0.0002, + "loss": 0.6959, + "step": 820 + }, + { + "epoch": 1.0196560196560196, + "grad_norm": 0.8534344434738159, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 830 + }, + { + "epoch": 1.031941031941032, + "grad_norm": 0.7415764331817627, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 840 + }, + { + "epoch": 1.0442260442260443, + "grad_norm": 0.74293053150177, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 850 + }, + { + "epoch": 1.0565110565110565, + "grad_norm": 0.697727382183075, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 860 + }, + { + "epoch": 1.0687960687960687, + "grad_norm": 0.8022570013999939, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 870 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.7545800805091858, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 880 + }, + { + "epoch": 1.0933660933660934, + "grad_norm": 0.8005648255348206, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 890 + }, + { + "epoch": 1.1056511056511056, + "grad_norm": 0.7681778073310852, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 900 + }, + { + "epoch": 1.117936117936118, + "grad_norm": 0.7822468876838684, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 910 + }, + { + "epoch": 1.1302211302211302, + "grad_norm": 0.8324839472770691, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 920 + }, + { + "epoch": 1.1425061425061425, + "grad_norm": 0.8206289410591125, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 930 + }, + { + "epoch": 1.154791154791155, + "grad_norm": 0.786461591720581, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 940 + }, + { + "epoch": 1.1670761670761671, + "grad_norm": 0.8288539052009583, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 950 + }, + { + "epoch": 1.1793611793611793, + "grad_norm": 0.7566865682601929, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 960 + }, + { + "epoch": 1.1916461916461916, + "grad_norm": 0.7761894464492798, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 970 + }, + { + "epoch": 1.203931203931204, + "grad_norm": 0.7608440518379211, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 980 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.799745500087738, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 990 + }, + { + "epoch": 1.2285012285012284, + "grad_norm": 0.8135330677032471, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1000 + }, + { + "epoch": 1.2407862407862407, + "grad_norm": 0.7410391569137573, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 1010 + }, + { + "epoch": 1.253071253071253, + "grad_norm": 0.7826172709465027, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 1020 + }, + { + "epoch": 1.2653562653562653, + "grad_norm": 0.7210677862167358, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 1030 + }, + { + "epoch": 1.2776412776412776, + "grad_norm": 0.7571766972541809, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 1040 + }, + { + "epoch": 1.28992628992629, + "grad_norm": 0.8602666258811951, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 1050 + }, + { + "epoch": 1.3022113022113022, + "grad_norm": 0.8640648722648621, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 1060 + }, + { + "epoch": 1.3144963144963144, + "grad_norm": 0.7289374470710754, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 1070 + }, + { + "epoch": 1.3267813267813269, + "grad_norm": 0.8099908828735352, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 1080 + }, + { + "epoch": 1.339066339066339, + "grad_norm": 0.8623505234718323, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 1090 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.900576114654541, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 1100 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.729603111743927, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 1110 + }, + { + "epoch": 1.375921375921376, + "grad_norm": 0.8350434303283691, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 1120 + }, + { + "epoch": 1.3882063882063882, + "grad_norm": 0.8049437999725342, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 1130 + }, + { + "epoch": 1.4004914004914004, + "grad_norm": 0.8222764134407043, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 1140 + }, + { + "epoch": 1.4127764127764126, + "grad_norm": 0.7949751019477844, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 1150 + }, + { + "epoch": 1.425061425061425, + "grad_norm": 0.8375639915466309, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 1160 + }, + { + "epoch": 1.4373464373464373, + "grad_norm": 0.7261053919792175, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1170 + }, + { + "epoch": 1.4496314496314495, + "grad_norm": 0.6918320655822754, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 1180 + }, + { + "epoch": 1.461916461916462, + "grad_norm": 0.8148727416992188, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 1190 + }, + { + "epoch": 1.4742014742014742, + "grad_norm": 0.7014724612236023, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 1200 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.8110846281051636, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 1210 + }, + { + "epoch": 1.4987714987714988, + "grad_norm": 0.8336407542228699, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 1220 + }, + { + "epoch": 1.511056511056511, + "grad_norm": 0.826996386051178, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 1230 + }, + { + "epoch": 1.5233415233415233, + "grad_norm": 0.7503120303153992, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 1240 + }, + { + "epoch": 1.5356265356265357, + "grad_norm": 0.8297192454338074, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 1250 + }, + { + "epoch": 1.547911547911548, + "grad_norm": 0.7585996985435486, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 1260 + }, + { + "epoch": 1.5601965601965602, + "grad_norm": 0.7530493140220642, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 1270 + }, + { + "epoch": 1.5724815724815726, + "grad_norm": 0.8141939640045166, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 1280 + }, + { + "epoch": 1.5847665847665846, + "grad_norm": 0.6959931254386902, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 1290 + }, + { + "epoch": 1.597051597051597, + "grad_norm": 0.8677428364753723, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 1300 + }, + { + "epoch": 1.6093366093366095, + "grad_norm": 0.8527476787567139, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 1310 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.8462157845497131, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 1320 + }, + { + "epoch": 1.633906633906634, + "grad_norm": 0.9371153712272644, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 1330 + }, + { + "epoch": 1.6461916461916462, + "grad_norm": 0.8408344984054565, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 1340 + }, + { + "epoch": 1.6584766584766584, + "grad_norm": 0.8391859531402588, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 1350 + }, + { + "epoch": 1.6707616707616708, + "grad_norm": 0.7630598545074463, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 1360 + }, + { + "epoch": 1.683046683046683, + "grad_norm": 0.8007895350456238, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 1370 + }, + { + "epoch": 1.6953316953316953, + "grad_norm": 0.7547900080680847, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 1380 + }, + { + "epoch": 1.7076167076167077, + "grad_norm": 0.7779742479324341, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 1390 + }, + { + "epoch": 1.71990171990172, + "grad_norm": 0.712293803691864, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 1400 + }, + { + "epoch": 1.7321867321867321, + "grad_norm": 0.8503297567367554, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 1410 + }, + { + "epoch": 1.7444717444717446, + "grad_norm": 0.8312245607376099, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 1420 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.7758049368858337, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 1430 + }, + { + "epoch": 1.769041769041769, + "grad_norm": 0.8695956468582153, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 1440 + }, + { + "epoch": 1.7813267813267815, + "grad_norm": 0.7785261273384094, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 1450 + }, + { + "epoch": 1.7936117936117935, + "grad_norm": 0.7091802358627319, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 1460 + }, + { + "epoch": 1.805896805896806, + "grad_norm": 0.774146556854248, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 1470 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.8342524170875549, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 1480 + }, + { + "epoch": 1.8304668304668303, + "grad_norm": 0.8087738156318665, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 1490 + }, + { + "epoch": 1.8427518427518428, + "grad_norm": 0.9830479621887207, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 1500 + }, + { + "epoch": 1.855036855036855, + "grad_norm": 0.8537567853927612, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 1510 + }, + { + "epoch": 1.8673218673218672, + "grad_norm": 0.8004562854766846, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 1520 + }, + { + "epoch": 1.8796068796068797, + "grad_norm": 0.8161284327507019, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 1530 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.8688093423843384, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 1540 + }, + { + "epoch": 1.904176904176904, + "grad_norm": 0.8287379741668701, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 1550 + }, + { + "epoch": 1.9164619164619165, + "grad_norm": 0.8050342202186584, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 1560 + }, + { + "epoch": 1.9287469287469288, + "grad_norm": 0.9273895621299744, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 1570 + }, + { + "epoch": 1.941031941031941, + "grad_norm": 0.8416891694068909, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 1580 + }, + { + "epoch": 1.9533169533169534, + "grad_norm": 0.7299820184707642, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 1590 + }, + { + "epoch": 1.9656019656019657, + "grad_norm": 0.7262272834777832, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 1600 + }, + { + "epoch": 1.9778869778869779, + "grad_norm": 0.8649004697799683, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 1610 + }, + { + "epoch": 1.9901719901719903, + "grad_norm": 0.8165444731712341, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 0.5858802795410156, + "eval_runtime": 22.6585, + "eval_samples_per_second": 14.608, + "eval_steps_per_second": 1.854, + "step": 1628 + }, + { + "epoch": 2.0024570024570023, + "grad_norm": 0.8142582178115845, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 1630 + }, + { + "epoch": 2.0147420147420148, + "grad_norm": 1.0637224912643433, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 1640 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.8923280239105225, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 1650 + }, + { + "epoch": 2.039312039312039, + "grad_norm": 0.8169175386428833, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 1660 + }, + { + "epoch": 2.0515970515970516, + "grad_norm": 0.8124040365219116, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 1670 + }, + { + "epoch": 2.063882063882064, + "grad_norm": 0.9228773713111877, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 1680 + }, + { + "epoch": 2.076167076167076, + "grad_norm": 0.7216871380805969, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 1690 + }, + { + "epoch": 2.0884520884520885, + "grad_norm": 0.8679503202438354, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 1700 + }, + { + "epoch": 2.100737100737101, + "grad_norm": 0.8627730011940002, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 1710 + }, + { + "epoch": 2.113022113022113, + "grad_norm": 0.9175152778625488, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 1720 + }, + { + "epoch": 2.1253071253071254, + "grad_norm": 0.7930372953414917, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 1730 + }, + { + "epoch": 2.1375921375921374, + "grad_norm": 0.8370155692100525, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 1740 + }, + { + "epoch": 2.14987714987715, + "grad_norm": 0.9121434688568115, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 1750 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.8703579306602478, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 1760 + }, + { + "epoch": 2.1744471744471743, + "grad_norm": 0.9270512461662292, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 1770 + }, + { + "epoch": 2.1867321867321867, + "grad_norm": 0.9372949600219727, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 1780 + }, + { + "epoch": 2.199017199017199, + "grad_norm": 0.8955178260803223, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 1790 + }, + { + "epoch": 2.211302211302211, + "grad_norm": 0.846102237701416, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 1800 + }, + { + "epoch": 2.2235872235872236, + "grad_norm": 0.9186713099479675, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 1810 + }, + { + "epoch": 2.235872235872236, + "grad_norm": 0.7695123553276062, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 1820 + }, + { + "epoch": 2.248157248157248, + "grad_norm": 0.7340332865715027, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 1830 + }, + { + "epoch": 2.2604422604422605, + "grad_norm": 0.8933137655258179, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 1840 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.7705038189888, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 1850 + }, + { + "epoch": 2.285012285012285, + "grad_norm": 0.8396083116531372, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 1860 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 0.7695736289024353, + "learning_rate": 0.0002, + "loss": 0.5335, + "step": 1870 + }, + { + "epoch": 2.30958230958231, + "grad_norm": 0.8535045385360718, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 1880 + }, + { + "epoch": 2.321867321867322, + "grad_norm": 0.8549142479896545, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 1890 + }, + { + "epoch": 2.3341523341523343, + "grad_norm": 0.9124433994293213, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 1900 + }, + { + "epoch": 2.3464373464373462, + "grad_norm": 0.855523943901062, + "learning_rate": 0.0002, + "loss": 0.506, + "step": 1910 + }, + { + "epoch": 2.3587223587223587, + "grad_norm": 0.810878336429596, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 1920 + }, + { + "epoch": 2.371007371007371, + "grad_norm": 0.7409024834632874, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 1930 + }, + { + "epoch": 2.383292383292383, + "grad_norm": 0.8080927729606628, + "learning_rate": 0.0002, + "loss": 0.5045, + "step": 1940 + }, + { + "epoch": 2.3955773955773956, + "grad_norm": 0.9661469459533691, + "learning_rate": 0.0002, + "loss": 0.5032, + "step": 1950 + }, + { + "epoch": 2.407862407862408, + "grad_norm": 0.838766872882843, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 1960 + }, + { + "epoch": 2.42014742014742, + "grad_norm": 0.8737491965293884, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 1970 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.8657792210578918, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 1980 + }, + { + "epoch": 2.444717444717445, + "grad_norm": 0.8883858919143677, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 1990 + }, + { + "epoch": 2.457002457002457, + "grad_norm": 0.8647662997245789, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 2000 + }, + { + "epoch": 2.4692874692874693, + "grad_norm": 0.896037757396698, + "learning_rate": 0.0002, + "loss": 0.518, + "step": 2010 + }, + { + "epoch": 2.4815724815724813, + "grad_norm": 0.8079167008399963, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 2020 + }, + { + "epoch": 2.493857493857494, + "grad_norm": 1.0293292999267578, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 2030 + }, + { + "epoch": 2.506142506142506, + "grad_norm": 0.8459244966506958, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 2040 + }, + { + "epoch": 2.5184275184275187, + "grad_norm": 0.9244982600212097, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 2050 + }, + { + "epoch": 2.5307125307125307, + "grad_norm": 0.8245007991790771, + "learning_rate": 0.0002, + "loss": 0.5006, + "step": 2060 + }, + { + "epoch": 2.542997542997543, + "grad_norm": 0.8869297504425049, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 2070 + }, + { + "epoch": 2.555282555282555, + "grad_norm": 0.8620884418487549, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 2080 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.8387904167175293, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 2090 + }, + { + "epoch": 2.57985257985258, + "grad_norm": 0.8353935480117798, + "learning_rate": 0.0002, + "loss": 0.4974, + "step": 2100 + }, + { + "epoch": 2.592137592137592, + "grad_norm": 1.0136934518814087, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 2110 + }, + { + "epoch": 2.6044226044226044, + "grad_norm": 0.9387392997741699, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 2120 + }, + { + "epoch": 2.616707616707617, + "grad_norm": 0.898697555065155, + "learning_rate": 0.0002, + "loss": 0.4971, + "step": 2130 + }, + { + "epoch": 2.628992628992629, + "grad_norm": 1.0145231485366821, + "learning_rate": 0.0002, + "loss": 0.4981, + "step": 2140 + }, + { + "epoch": 2.6412776412776413, + "grad_norm": 0.8335273265838623, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 2150 + }, + { + "epoch": 2.6535626535626538, + "grad_norm": 1.0198529958724976, + "learning_rate": 0.0002, + "loss": 0.5129, + "step": 2160 + }, + { + "epoch": 2.6658476658476657, + "grad_norm": 0.8353323340415955, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 2170 + }, + { + "epoch": 2.678132678132678, + "grad_norm": 0.8831406831741333, + "learning_rate": 0.0002, + "loss": 0.4818, + "step": 2180 + }, + { + "epoch": 2.69041769041769, + "grad_norm": 0.7182748913764954, + "learning_rate": 0.0002, + "loss": 0.4858, + "step": 2190 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 0.7892552614212036, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 2200 + }, + { + "epoch": 2.714987714987715, + "grad_norm": 1.0144033432006836, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 2210 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 1.0913645029067993, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 2220 + }, + { + "epoch": 2.7395577395577395, + "grad_norm": 1.014394998550415, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 2230 + }, + { + "epoch": 2.751842751842752, + "grad_norm": 0.8118020296096802, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 2240 + }, + { + "epoch": 2.764127764127764, + "grad_norm": 0.9027737379074097, + "learning_rate": 0.0002, + "loss": 0.5088, + "step": 2250 + }, + { + "epoch": 2.7764127764127764, + "grad_norm": 0.8017747402191162, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 2260 + }, + { + "epoch": 2.788697788697789, + "grad_norm": 0.788362979888916, + "learning_rate": 0.0002, + "loss": 0.4957, + "step": 2270 + }, + { + "epoch": 2.800982800982801, + "grad_norm": 0.8338918089866638, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 2280 + }, + { + "epoch": 2.8132678132678133, + "grad_norm": 0.8773167729377747, + "learning_rate": 0.0002, + "loss": 0.4925, + "step": 2290 + }, + { + "epoch": 2.8255528255528253, + "grad_norm": 0.9319674372673035, + "learning_rate": 0.0002, + "loss": 0.4806, + "step": 2300 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 0.8632726073265076, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 2310 + }, + { + "epoch": 2.85012285012285, + "grad_norm": 0.785464882850647, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 2320 + }, + { + "epoch": 2.8624078624078626, + "grad_norm": 0.8159732818603516, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 2330 + }, + { + "epoch": 2.8746928746928746, + "grad_norm": 0.8702368140220642, + "learning_rate": 0.0002, + "loss": 0.4796, + "step": 2340 + }, + { + "epoch": 2.886977886977887, + "grad_norm": 1.0456738471984863, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 2350 + }, + { + "epoch": 2.899262899262899, + "grad_norm": 1.0855203866958618, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 2360 + }, + { + "epoch": 2.9115479115479115, + "grad_norm": 0.9378156065940857, + "learning_rate": 0.0002, + "loss": 0.4758, + "step": 2370 + }, + { + "epoch": 2.923832923832924, + "grad_norm": 0.7390182018280029, + "learning_rate": 0.0002, + "loss": 0.4831, + "step": 2380 + }, + { + "epoch": 2.9361179361179364, + "grad_norm": 0.7667133212089539, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 2390 + }, + { + "epoch": 2.9484029484029484, + "grad_norm": 0.8633476495742798, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 2400 + }, + { + "epoch": 2.960687960687961, + "grad_norm": 1.0821104049682617, + "learning_rate": 0.0002, + "loss": 0.4993, + "step": 2410 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.8911418914794922, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 2420 + }, + { + "epoch": 2.9852579852579852, + "grad_norm": 0.8791135549545288, + "learning_rate": 0.0002, + "loss": 0.4819, + "step": 2430 + }, + { + "epoch": 2.9975429975429977, + "grad_norm": 0.8066530823707581, + "learning_rate": 0.0002, + "loss": 0.4875, + "step": 2440 + }, + { + "epoch": 3.0, + "eval_loss": 0.49752503633499146, + "eval_runtime": 20.2911, + "eval_samples_per_second": 16.313, + "eval_steps_per_second": 2.07, + "step": 2442 + }, + { + "epoch": 3.0098280098280097, + "grad_norm": 0.7644656896591187, + "learning_rate": 0.0002, + "loss": 0.4362, + "step": 2450 + }, + { + "epoch": 3.022113022113022, + "grad_norm": 0.9077525734901428, + "learning_rate": 0.0002, + "loss": 0.4363, + "step": 2460 + }, + { + "epoch": 3.0343980343980346, + "grad_norm": 0.7859287261962891, + "learning_rate": 0.0002, + "loss": 0.422, + "step": 2470 + }, + { + "epoch": 3.0466830466830466, + "grad_norm": 1.1200323104858398, + "learning_rate": 0.0002, + "loss": 0.4574, + "step": 2480 + }, + { + "epoch": 3.058968058968059, + "grad_norm": 0.7570453882217407, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 2490 + }, + { + "epoch": 3.0712530712530715, + "grad_norm": 0.9450915455818176, + "learning_rate": 0.0002, + "loss": 0.4351, + "step": 2500 + }, + { + "epoch": 3.0835380835380835, + "grad_norm": 0.8303545117378235, + "learning_rate": 0.0002, + "loss": 0.4343, + "step": 2510 + }, + { + "epoch": 3.095823095823096, + "grad_norm": 0.8864443898200989, + "learning_rate": 0.0002, + "loss": 0.4308, + "step": 2520 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 0.945324718952179, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 2530 + }, + { + "epoch": 3.1203931203931203, + "grad_norm": 1.0562494993209839, + "learning_rate": 0.0002, + "loss": 0.4345, + "step": 2540 + }, + { + "epoch": 3.1326781326781328, + "grad_norm": 0.8607500195503235, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 2550 + }, + { + "epoch": 3.1449631449631448, + "grad_norm": 0.8719640374183655, + "learning_rate": 0.0002, + "loss": 0.456, + "step": 2560 + }, + { + "epoch": 3.157248157248157, + "grad_norm": 0.8647059202194214, + "learning_rate": 0.0002, + "loss": 0.4469, + "step": 2570 + }, + { + "epoch": 3.1695331695331697, + "grad_norm": 0.8346507549285889, + "learning_rate": 0.0002, + "loss": 0.4483, + "step": 2580 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 1.0208854675292969, + "learning_rate": 0.0002, + "loss": 0.4331, + "step": 2590 + }, + { + "epoch": 3.194103194103194, + "grad_norm": 0.7064385414123535, + "learning_rate": 0.0002, + "loss": 0.435, + "step": 2600 + }, + { + "epoch": 3.2063882063882065, + "grad_norm": 0.927347719669342, + "learning_rate": 0.0002, + "loss": 0.4541, + "step": 2610 + }, + { + "epoch": 3.2186732186732185, + "grad_norm": 0.943517804145813, + "learning_rate": 0.0002, + "loss": 0.4561, + "step": 2620 + }, + { + "epoch": 3.230958230958231, + "grad_norm": 0.7837198376655579, + "learning_rate": 0.0002, + "loss": 0.4225, + "step": 2630 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 0.7752765417098999, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 2640 + }, + { + "epoch": 3.2555282555282554, + "grad_norm": 0.8578953146934509, + "learning_rate": 0.0002, + "loss": 0.4468, + "step": 2650 + }, + { + "epoch": 3.267813267813268, + "grad_norm": 1.0209529399871826, + "learning_rate": 0.0002, + "loss": 0.4393, + "step": 2660 + }, + { + "epoch": 3.2800982800982803, + "grad_norm": 0.9069030284881592, + "learning_rate": 0.0002, + "loss": 0.4517, + "step": 2670 + }, + { + "epoch": 3.2923832923832923, + "grad_norm": 0.8454729318618774, + "learning_rate": 0.0002, + "loss": 0.4262, + "step": 2680 + }, + { + "epoch": 3.3046683046683047, + "grad_norm": 0.8253099322319031, + "learning_rate": 0.0002, + "loss": 0.4349, + "step": 2690 + }, + { + "epoch": 3.3169533169533167, + "grad_norm": 0.8765934109687805, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 2700 + }, + { + "epoch": 3.329238329238329, + "grad_norm": 0.8149126172065735, + "learning_rate": 0.0002, + "loss": 0.4518, + "step": 2710 + }, + { + "epoch": 3.3415233415233416, + "grad_norm": 0.8820102214813232, + "learning_rate": 0.0002, + "loss": 0.4437, + "step": 2720 + }, + { + "epoch": 3.3538083538083536, + "grad_norm": 0.8813952803611755, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 2730 + }, + { + "epoch": 3.366093366093366, + "grad_norm": 1.0338447093963623, + "learning_rate": 0.0002, + "loss": 0.4396, + "step": 2740 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 0.8780209422111511, + "learning_rate": 0.0002, + "loss": 0.4468, + "step": 2750 + }, + { + "epoch": 3.3906633906633905, + "grad_norm": 0.9017151594161987, + "learning_rate": 0.0002, + "loss": 0.441, + "step": 2760 + }, + { + "epoch": 3.402948402948403, + "grad_norm": 0.8647638559341431, + "learning_rate": 0.0002, + "loss": 0.446, + "step": 2770 + }, + { + "epoch": 3.4152334152334154, + "grad_norm": 0.8298183679580688, + "learning_rate": 0.0002, + "loss": 0.4131, + "step": 2780 + }, + { + "epoch": 3.4275184275184274, + "grad_norm": 0.9298108816146851, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 2790 + }, + { + "epoch": 3.43980343980344, + "grad_norm": 0.8909980058670044, + "learning_rate": 0.0002, + "loss": 0.4145, + "step": 2800 + }, + { + "epoch": 3.4520884520884523, + "grad_norm": 0.8027496933937073, + "learning_rate": 0.0002, + "loss": 0.4148, + "step": 2810 + }, + { + "epoch": 3.4643734643734643, + "grad_norm": 0.8766195774078369, + "learning_rate": 0.0002, + "loss": 0.4244, + "step": 2820 + }, + { + "epoch": 3.4766584766584767, + "grad_norm": 0.8194443583488464, + "learning_rate": 0.0002, + "loss": 0.4292, + "step": 2830 + }, + { + "epoch": 3.488943488943489, + "grad_norm": 0.9862873554229736, + "learning_rate": 0.0002, + "loss": 0.4305, + "step": 2840 + }, + { + "epoch": 3.501228501228501, + "grad_norm": 0.8755377531051636, + "learning_rate": 0.0002, + "loss": 0.4393, + "step": 2850 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 0.7300266027450562, + "learning_rate": 0.0002, + "loss": 0.4231, + "step": 2860 + }, + { + "epoch": 3.5257985257985256, + "grad_norm": 0.8342461585998535, + "learning_rate": 0.0002, + "loss": 0.4278, + "step": 2870 + }, + { + "epoch": 3.538083538083538, + "grad_norm": 0.8624151349067688, + "learning_rate": 0.0002, + "loss": 0.4395, + "step": 2880 + }, + { + "epoch": 3.5503685503685505, + "grad_norm": 0.8931261301040649, + "learning_rate": 0.0002, + "loss": 0.4064, + "step": 2890 + }, + { + "epoch": 3.562653562653563, + "grad_norm": 0.8617086410522461, + "learning_rate": 0.0002, + "loss": 0.4358, + "step": 2900 + }, + { + "epoch": 3.574938574938575, + "grad_norm": 0.8754099607467651, + "learning_rate": 0.0002, + "loss": 0.419, + "step": 2910 + }, + { + "epoch": 3.5872235872235874, + "grad_norm": 0.8345834612846375, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 2920 + }, + { + "epoch": 3.5995085995085994, + "grad_norm": 1.1414062976837158, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 2930 + }, + { + "epoch": 3.611793611793612, + "grad_norm": 0.994860053062439, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 2940 + }, + { + "epoch": 3.6240786240786242, + "grad_norm": 1.19268000125885, + "learning_rate": 0.0002, + "loss": 0.4386, + "step": 2950 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.8399543762207031, + "learning_rate": 0.0002, + "loss": 0.4029, + "step": 2960 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 0.9873217940330505, + "learning_rate": 0.0002, + "loss": 0.4432, + "step": 2970 + }, + { + "epoch": 3.6609336609336607, + "grad_norm": 0.9116013646125793, + "learning_rate": 0.0002, + "loss": 0.4308, + "step": 2980 + }, + { + "epoch": 3.673218673218673, + "grad_norm": 0.9503833651542664, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 2990 + }, + { + "epoch": 3.6855036855036856, + "grad_norm": 0.9401112794876099, + "learning_rate": 0.0002, + "loss": 0.4306, + "step": 3000 + }, + { + "epoch": 3.697788697788698, + "grad_norm": 1.00745689868927, + "learning_rate": 0.0002, + "loss": 0.4333, + "step": 3010 + }, + { + "epoch": 3.71007371007371, + "grad_norm": 1.0553191900253296, + "learning_rate": 0.0002, + "loss": 0.432, + "step": 3020 + }, + { + "epoch": 3.7223587223587224, + "grad_norm": 1.0226953029632568, + "learning_rate": 0.0002, + "loss": 0.4321, + "step": 3030 + }, + { + "epoch": 3.7346437346437344, + "grad_norm": 1.085554838180542, + "learning_rate": 0.0002, + "loss": 0.418, + "step": 3040 + }, + { + "epoch": 3.746928746928747, + "grad_norm": 0.9948731064796448, + "learning_rate": 0.0002, + "loss": 0.4196, + "step": 3050 + }, + { + "epoch": 3.7592137592137593, + "grad_norm": 0.9328727126121521, + "learning_rate": 0.0002, + "loss": 0.4281, + "step": 3060 + }, + { + "epoch": 3.7714987714987718, + "grad_norm": 1.0533266067504883, + "learning_rate": 0.0002, + "loss": 0.4284, + "step": 3070 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 0.8213809132575989, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 3080 + }, + { + "epoch": 3.796068796068796, + "grad_norm": 0.8941594362258911, + "learning_rate": 0.0002, + "loss": 0.4348, + "step": 3090 + }, + { + "epoch": 3.808353808353808, + "grad_norm": 0.8324518203735352, + "learning_rate": 0.0002, + "loss": 0.4266, + "step": 3100 + }, + { + "epoch": 3.8206388206388207, + "grad_norm": 0.8811233639717102, + "learning_rate": 0.0002, + "loss": 0.4227, + "step": 3110 + }, + { + "epoch": 3.832923832923833, + "grad_norm": 0.8781470060348511, + "learning_rate": 0.0002, + "loss": 0.4195, + "step": 3120 + }, + { + "epoch": 3.845208845208845, + "grad_norm": 0.8994116187095642, + "learning_rate": 0.0002, + "loss": 0.4277, + "step": 3130 + }, + { + "epoch": 3.8574938574938575, + "grad_norm": 0.8605017066001892, + "learning_rate": 0.0002, + "loss": 0.4149, + "step": 3140 + }, + { + "epoch": 3.8697788697788695, + "grad_norm": 0.8966400027275085, + "learning_rate": 0.0002, + "loss": 0.4023, + "step": 3150 + }, + { + "epoch": 3.882063882063882, + "grad_norm": 0.8856554627418518, + "learning_rate": 0.0002, + "loss": 0.4245, + "step": 3160 + }, + { + "epoch": 3.8943488943488944, + "grad_norm": 0.8971620798110962, + "learning_rate": 0.0002, + "loss": 0.4101, + "step": 3170 + }, + { + "epoch": 3.906633906633907, + "grad_norm": 0.9807813167572021, + "learning_rate": 0.0002, + "loss": 0.3993, + "step": 3180 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 0.8614121675491333, + "learning_rate": 0.0002, + "loss": 0.4258, + "step": 3190 + }, + { + "epoch": 3.9312039312039313, + "grad_norm": 0.989171028137207, + "learning_rate": 0.0002, + "loss": 0.4115, + "step": 3200 + }, + { + "epoch": 3.9434889434889433, + "grad_norm": 0.8168872594833374, + "learning_rate": 0.0002, + "loss": 0.4182, + "step": 3210 + }, + { + "epoch": 3.9557739557739557, + "grad_norm": 0.8109386563301086, + "learning_rate": 0.0002, + "loss": 0.4112, + "step": 3220 + }, + { + "epoch": 3.968058968058968, + "grad_norm": 1.0175853967666626, + "learning_rate": 0.0002, + "loss": 0.4165, + "step": 3230 + }, + { + "epoch": 3.98034398034398, + "grad_norm": 0.936143159866333, + "learning_rate": 0.0002, + "loss": 0.4146, + "step": 3240 + }, + { + "epoch": 3.9926289926289926, + "grad_norm": 0.9557915925979614, + "learning_rate": 0.0002, + "loss": 0.4163, + "step": 3250 + }, + { + "epoch": 4.0, + "eval_loss": 0.4401616156101227, + "eval_runtime": 20.8047, + "eval_samples_per_second": 15.91, + "eval_steps_per_second": 2.019, + "step": 3256 + }, + { + "epoch": 4.004914004914005, + "grad_norm": 0.7590614557266235, + "learning_rate": 0.0002, + "loss": 0.408, + "step": 3260 + }, + { + "epoch": 4.017199017199017, + "grad_norm": 0.8920791149139404, + "learning_rate": 0.0002, + "loss": 0.4001, + "step": 3270 + }, + { + "epoch": 4.0294840294840295, + "grad_norm": 0.8640421628952026, + "learning_rate": 0.0002, + "loss": 0.3789, + "step": 3280 + }, + { + "epoch": 4.041769041769042, + "grad_norm": 0.9074113965034485, + "learning_rate": 0.0002, + "loss": 0.3791, + "step": 3290 + }, + { + "epoch": 4.054054054054054, + "grad_norm": 1.0600885152816772, + "learning_rate": 0.0002, + "loss": 0.3728, + "step": 3300 + }, + { + "epoch": 4.066339066339066, + "grad_norm": 0.9682773351669312, + "learning_rate": 0.0002, + "loss": 0.3857, + "step": 3310 + }, + { + "epoch": 4.078624078624078, + "grad_norm": 0.9326395392417908, + "learning_rate": 0.0002, + "loss": 0.4007, + "step": 3320 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 0.8886597156524658, + "learning_rate": 0.0002, + "loss": 0.3823, + "step": 3330 + }, + { + "epoch": 4.103194103194103, + "grad_norm": 1.032205581665039, + "learning_rate": 0.0002, + "loss": 0.3929, + "step": 3340 + }, + { + "epoch": 4.115479115479116, + "grad_norm": 0.8669408559799194, + "learning_rate": 0.0002, + "loss": 0.3836, + "step": 3350 + }, + { + "epoch": 4.127764127764128, + "grad_norm": 0.8250347971916199, + "learning_rate": 0.0002, + "loss": 0.3866, + "step": 3360 + }, + { + "epoch": 4.14004914004914, + "grad_norm": 0.7919842600822449, + "learning_rate": 0.0002, + "loss": 0.3826, + "step": 3370 + }, + { + "epoch": 4.152334152334152, + "grad_norm": 1.045682430267334, + "learning_rate": 0.0002, + "loss": 0.3838, + "step": 3380 + }, + { + "epoch": 4.164619164619165, + "grad_norm": 0.6873571276664734, + "learning_rate": 0.0002, + "loss": 0.3796, + "step": 3390 + }, + { + "epoch": 4.176904176904177, + "grad_norm": 1.0227675437927246, + "learning_rate": 0.0002, + "loss": 0.3942, + "step": 3400 + }, + { + "epoch": 4.1891891891891895, + "grad_norm": 0.9167711734771729, + "learning_rate": 0.0002, + "loss": 0.3788, + "step": 3410 + }, + { + "epoch": 4.201474201474202, + "grad_norm": 1.0598796606063843, + "learning_rate": 0.0002, + "loss": 0.3792, + "step": 3420 + }, + { + "epoch": 4.2137592137592135, + "grad_norm": 0.8581843972206116, + "learning_rate": 0.0002, + "loss": 0.3955, + "step": 3430 + }, + { + "epoch": 4.226044226044226, + "grad_norm": 0.8862360119819641, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 3440 + }, + { + "epoch": 4.238329238329238, + "grad_norm": 1.0248323678970337, + "learning_rate": 0.0002, + "loss": 0.3889, + "step": 3450 + }, + { + "epoch": 4.250614250614251, + "grad_norm": 0.8746261596679688, + "learning_rate": 0.0002, + "loss": 0.3827, + "step": 3460 + }, + { + "epoch": 4.262899262899263, + "grad_norm": 0.7442536354064941, + "learning_rate": 0.0002, + "loss": 0.3949, + "step": 3470 + }, + { + "epoch": 4.275184275184275, + "grad_norm": 0.8295119404792786, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 3480 + }, + { + "epoch": 4.287469287469287, + "grad_norm": 1.0634245872497559, + "learning_rate": 0.0002, + "loss": 0.3895, + "step": 3490 + }, + { + "epoch": 4.2997542997543, + "grad_norm": 0.9554621577262878, + "learning_rate": 0.0002, + "loss": 0.3955, + "step": 3500 + }, + { + "epoch": 4.312039312039312, + "grad_norm": 1.0191723108291626, + "learning_rate": 0.0002, + "loss": 0.3826, + "step": 3510 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 0.8573611378669739, + "learning_rate": 0.0002, + "loss": 0.3828, + "step": 3520 + }, + { + "epoch": 4.336609336609337, + "grad_norm": 0.9082390069961548, + "learning_rate": 0.0002, + "loss": 0.3869, + "step": 3530 + }, + { + "epoch": 4.348894348894349, + "grad_norm": 0.8650212287902832, + "learning_rate": 0.0002, + "loss": 0.3902, + "step": 3540 + }, + { + "epoch": 4.361179361179361, + "grad_norm": 0.7186297178268433, + "learning_rate": 0.0002, + "loss": 0.3915, + "step": 3550 + }, + { + "epoch": 4.3734643734643734, + "grad_norm": 0.9750986695289612, + "learning_rate": 0.0002, + "loss": 0.3861, + "step": 3560 + }, + { + "epoch": 4.385749385749386, + "grad_norm": 1.0710467100143433, + "learning_rate": 0.0002, + "loss": 0.3967, + "step": 3570 + }, + { + "epoch": 4.398034398034398, + "grad_norm": 0.7974869012832642, + "learning_rate": 0.0002, + "loss": 0.3774, + "step": 3580 + }, + { + "epoch": 4.41031941031941, + "grad_norm": 0.9405913949012756, + "learning_rate": 0.0002, + "loss": 0.3738, + "step": 3590 + }, + { + "epoch": 4.422604422604422, + "grad_norm": 0.9393602609634399, + "learning_rate": 0.0002, + "loss": 0.3982, + "step": 3600 + }, + { + "epoch": 4.434889434889435, + "grad_norm": 1.0798007249832153, + "learning_rate": 0.0002, + "loss": 0.3913, + "step": 3610 + }, + { + "epoch": 4.447174447174447, + "grad_norm": 0.9226186275482178, + "learning_rate": 0.0002, + "loss": 0.3682, + "step": 3620 + }, + { + "epoch": 4.45945945945946, + "grad_norm": 1.1046524047851562, + "learning_rate": 0.0002, + "loss": 0.3742, + "step": 3630 + }, + { + "epoch": 4.471744471744472, + "grad_norm": 0.8848567605018616, + "learning_rate": 0.0002, + "loss": 0.3886, + "step": 3640 + }, + { + "epoch": 4.484029484029484, + "grad_norm": 0.8913224339485168, + "learning_rate": 0.0002, + "loss": 0.3848, + "step": 3650 + }, + { + "epoch": 4.496314496314496, + "grad_norm": 0.8497583270072937, + "learning_rate": 0.0002, + "loss": 0.3731, + "step": 3660 + }, + { + "epoch": 4.5085995085995085, + "grad_norm": 0.8263831734657288, + "learning_rate": 0.0002, + "loss": 0.3804, + "step": 3670 + }, + { + "epoch": 4.520884520884521, + "grad_norm": 0.8470269441604614, + "learning_rate": 0.0002, + "loss": 0.3815, + "step": 3680 + }, + { + "epoch": 4.533169533169533, + "grad_norm": 0.860038161277771, + "learning_rate": 0.0002, + "loss": 0.3774, + "step": 3690 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.8898552656173706, + "learning_rate": 0.0002, + "loss": 0.3817, + "step": 3700 + }, + { + "epoch": 4.557739557739557, + "grad_norm": 0.8152070641517639, + "learning_rate": 0.0002, + "loss": 0.3776, + "step": 3710 + }, + { + "epoch": 4.57002457002457, + "grad_norm": 0.7847675085067749, + "learning_rate": 0.0002, + "loss": 0.383, + "step": 3720 + }, + { + "epoch": 4.582309582309582, + "grad_norm": 0.9625533819198608, + "learning_rate": 0.0002, + "loss": 0.3791, + "step": 3730 + }, + { + "epoch": 4.594594594594595, + "grad_norm": 0.9097456336021423, + "learning_rate": 0.0002, + "loss": 0.3699, + "step": 3740 + }, + { + "epoch": 4.606879606879607, + "grad_norm": 0.871329128742218, + "learning_rate": 0.0002, + "loss": 0.3673, + "step": 3750 + }, + { + "epoch": 4.61916461916462, + "grad_norm": 0.9879975914955139, + "learning_rate": 0.0002, + "loss": 0.3725, + "step": 3760 + }, + { + "epoch": 4.631449631449631, + "grad_norm": 0.8636731505393982, + "learning_rate": 0.0002, + "loss": 0.3827, + "step": 3770 + }, + { + "epoch": 4.643734643734644, + "grad_norm": 1.0488964319229126, + "learning_rate": 0.0002, + "loss": 0.3755, + "step": 3780 + }, + { + "epoch": 4.656019656019656, + "grad_norm": 0.7637056112289429, + "learning_rate": 0.0002, + "loss": 0.3738, + "step": 3790 + }, + { + "epoch": 4.6683046683046685, + "grad_norm": 0.8507546186447144, + "learning_rate": 0.0002, + "loss": 0.3676, + "step": 3800 + }, + { + "epoch": 4.680589680589681, + "grad_norm": 1.0216856002807617, + "learning_rate": 0.0002, + "loss": 0.3852, + "step": 3810 + }, + { + "epoch": 4.6928746928746925, + "grad_norm": 1.026343822479248, + "learning_rate": 0.0002, + "loss": 0.3751, + "step": 3820 + }, + { + "epoch": 4.705159705159705, + "grad_norm": 0.8311620950698853, + "learning_rate": 0.0002, + "loss": 0.3687, + "step": 3830 + }, + { + "epoch": 4.717444717444717, + "grad_norm": 0.7770653367042542, + "learning_rate": 0.0002, + "loss": 0.3771, + "step": 3840 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 0.7616215348243713, + "learning_rate": 0.0002, + "loss": 0.37, + "step": 3850 + }, + { + "epoch": 4.742014742014742, + "grad_norm": 1.0377072095870972, + "learning_rate": 0.0002, + "loss": 0.3927, + "step": 3860 + }, + { + "epoch": 4.754299754299755, + "grad_norm": 0.9713505506515503, + "learning_rate": 0.0002, + "loss": 0.3832, + "step": 3870 + }, + { + "epoch": 4.766584766584766, + "grad_norm": 0.8803321719169617, + "learning_rate": 0.0002, + "loss": 0.3722, + "step": 3880 + }, + { + "epoch": 4.778869778869779, + "grad_norm": 0.885535478591919, + "learning_rate": 0.0002, + "loss": 0.3756, + "step": 3890 + }, + { + "epoch": 4.791154791154791, + "grad_norm": 1.0877983570098877, + "learning_rate": 0.0002, + "loss": 0.3714, + "step": 3900 + }, + { + "epoch": 4.803439803439804, + "grad_norm": 0.7875366806983948, + "learning_rate": 0.0002, + "loss": 0.3879, + "step": 3910 + }, + { + "epoch": 4.815724815724816, + "grad_norm": 0.8550102114677429, + "learning_rate": 0.0002, + "loss": 0.3591, + "step": 3920 + }, + { + "epoch": 4.828009828009828, + "grad_norm": 1.0217846632003784, + "learning_rate": 0.0002, + "loss": 0.3716, + "step": 3930 + }, + { + "epoch": 4.84029484029484, + "grad_norm": 0.7315713167190552, + "learning_rate": 0.0002, + "loss": 0.3649, + "step": 3940 + }, + { + "epoch": 4.8525798525798525, + "grad_norm": 0.8924923539161682, + "learning_rate": 0.0002, + "loss": 0.3879, + "step": 3950 + }, + { + "epoch": 4.864864864864865, + "grad_norm": 0.9730218052864075, + "learning_rate": 0.0002, + "loss": 0.3669, + "step": 3960 + }, + { + "epoch": 4.877149877149877, + "grad_norm": 0.9202003479003906, + "learning_rate": 0.0002, + "loss": 0.3705, + "step": 3970 + }, + { + "epoch": 4.88943488943489, + "grad_norm": 0.8173081874847412, + "learning_rate": 0.0002, + "loss": 0.3617, + "step": 3980 + }, + { + "epoch": 4.901719901719901, + "grad_norm": 0.7178564667701721, + "learning_rate": 0.0002, + "loss": 0.37, + "step": 3990 + }, + { + "epoch": 4.914004914004914, + "grad_norm": 0.913684606552124, + "learning_rate": 0.0002, + "loss": 0.3768, + "step": 4000 + }, + { + "epoch": 4.926289926289926, + "grad_norm": 0.8817896842956543, + "learning_rate": 0.0002, + "loss": 0.3755, + "step": 4010 + }, + { + "epoch": 4.938574938574939, + "grad_norm": 0.7652186751365662, + "learning_rate": 0.0002, + "loss": 0.3676, + "step": 4020 + }, + { + "epoch": 4.950859950859951, + "grad_norm": 0.8828630447387695, + "learning_rate": 0.0002, + "loss": 0.3699, + "step": 4030 + }, + { + "epoch": 4.963144963144963, + "grad_norm": 1.0878605842590332, + "learning_rate": 0.0002, + "loss": 0.3672, + "step": 4040 + }, + { + "epoch": 4.975429975429975, + "grad_norm": 1.0845288038253784, + "learning_rate": 0.0002, + "loss": 0.3656, + "step": 4050 + }, + { + "epoch": 4.987714987714988, + "grad_norm": 0.8431115746498108, + "learning_rate": 0.0002, + "loss": 0.365, + "step": 4060 + }, + { + "epoch": 5.0, + "grad_norm": 0.8320387601852417, + "learning_rate": 0.0002, + "loss": 0.3693, + "step": 4070 + }, + { + "epoch": 5.0, + "eval_loss": 0.4017423093318939, + "eval_runtime": 20.8466, + "eval_samples_per_second": 15.878, + "eval_steps_per_second": 2.015, + "step": 4070 + } + ], + "logging_steps": 10, + "max_steps": 6512, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.974352508387328e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/training_args.bin b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..73a75ebfc12aed51385aab437d91632ee4c20317 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2809e8544b7de8b298d0b325fb6c98eb9f853fd72d7cbae286b6ee1541e6aee9 +size 5560 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/README.md b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/adapter_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/adapter_model.safetensors b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..baf6e9f88a0c9d2a9eefbfd9a9a9fb3432594c5b --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b71f08f91e7f50824a7b20915352bfe127f0aa2a852d5e31428e46126e1dedf +size 29500848 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/optimizer.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4283d8b55c14ba4bcbfd4417e3b1cdbaf4788bb9 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed23e6dc5cbc64d25655776d5050752680cf8a6479df149be0120c5af55e41d2 +size 15064314 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/rng_state.pth b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0e1bf1b9933f9db104c66a846d993e818a981fb6 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b89bfaad1caf651786ff92906ce5950787dae1956b71d26219ecdc6457e0c501 +size 14244 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/scheduler.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f42d47f14a3fb30b00dc602a4c235bfb0a15c115 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32595ef5e315cec2f7af25630a1aa9d39273a49cd98efb5d698e0b329b0096be +size 1064 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/special_tokens_map.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/tokenizer.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/tokenizer.model b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/tokenizer_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/trainer_state.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d47cd64a02a98a650c32189036fd2c70459aab42 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/trainer_state.json @@ -0,0 +1,3497 @@ +{ + "best_metric": 0.3778059184551239, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 4884, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012285012285012284, + "grad_norm": 0.8178550004959106, + "learning_rate": 0.0002, + "loss": 3.5354, + "step": 10 + }, + { + "epoch": 0.02457002457002457, + "grad_norm": 1.0338047742843628, + "learning_rate": 0.0002, + "loss": 2.534, + "step": 20 + }, + { + "epoch": 0.036855036855036855, + "grad_norm": 0.8931729197502136, + "learning_rate": 0.0002, + "loss": 2.1691, + "step": 30 + }, + { + "epoch": 0.04914004914004914, + "grad_norm": 0.9666458964347839, + "learning_rate": 0.0002, + "loss": 1.8813, + "step": 40 + }, + { + "epoch": 0.06142506142506143, + "grad_norm": 1.2691702842712402, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 50 + }, + { + "epoch": 0.07371007371007371, + "grad_norm": 1.0307111740112305, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 60 + }, + { + "epoch": 0.085995085995086, + "grad_norm": 1.1837389469146729, + "learning_rate": 0.0002, + "loss": 1.2987, + "step": 70 + }, + { + "epoch": 0.09828009828009827, + "grad_norm": 1.1481467485427856, + "learning_rate": 0.0002, + "loss": 1.2325, + "step": 80 + }, + { + "epoch": 0.11056511056511056, + "grad_norm": 1.0385297536849976, + "learning_rate": 0.0002, + "loss": 1.1425, + "step": 90 + }, + { + "epoch": 0.12285012285012285, + "grad_norm": 1.125789999961853, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 100 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 0.9630613923072815, + "learning_rate": 0.0002, + "loss": 1.0477, + "step": 110 + }, + { + "epoch": 0.14742014742014742, + "grad_norm": 1.060392141342163, + "learning_rate": 0.0002, + "loss": 1.0074, + "step": 120 + }, + { + "epoch": 0.1597051597051597, + "grad_norm": 1.0986546277999878, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 130 + }, + { + "epoch": 0.171990171990172, + "grad_norm": 1.1713459491729736, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 140 + }, + { + "epoch": 0.18427518427518427, + "grad_norm": 1.1548224687576294, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 150 + }, + { + "epoch": 0.19656019656019655, + "grad_norm": 1.2662502527236938, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 160 + }, + { + "epoch": 0.20884520884520885, + "grad_norm": 1.1521110534667969, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 170 + }, + { + "epoch": 0.22113022113022113, + "grad_norm": 1.1044857501983643, + "learning_rate": 0.0002, + "loss": 0.8931, + "step": 180 + }, + { + "epoch": 0.2334152334152334, + "grad_norm": 0.9770650267601013, + "learning_rate": 0.0002, + "loss": 0.9572, + "step": 190 + }, + { + "epoch": 0.2457002457002457, + "grad_norm": 0.9710931777954102, + "learning_rate": 0.0002, + "loss": 0.881, + "step": 200 + }, + { + "epoch": 0.257985257985258, + "grad_norm": 0.9593933820724487, + "learning_rate": 0.0002, + "loss": 0.9205, + "step": 210 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.003553032875061, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 220 + }, + { + "epoch": 0.28255528255528256, + "grad_norm": 0.9187764525413513, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 230 + }, + { + "epoch": 0.29484029484029484, + "grad_norm": 0.9294946789741516, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 240 + }, + { + "epoch": 0.3071253071253071, + "grad_norm": 0.9537560939788818, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 250 + }, + { + "epoch": 0.3194103194103194, + "grad_norm": 1.00537109375, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 260 + }, + { + "epoch": 0.3316953316953317, + "grad_norm": 0.8775776028633118, + "learning_rate": 0.0002, + "loss": 0.896, + "step": 270 + }, + { + "epoch": 0.343980343980344, + "grad_norm": 0.8316839933395386, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 280 + }, + { + "epoch": 0.35626535626535627, + "grad_norm": 0.8542073965072632, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 290 + }, + { + "epoch": 0.36855036855036855, + "grad_norm": 0.848444402217865, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 300 + }, + { + "epoch": 0.3808353808353808, + "grad_norm": 0.9017520546913147, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 310 + }, + { + "epoch": 0.3931203931203931, + "grad_norm": 0.7672467231750488, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 320 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 0.9109916687011719, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 330 + }, + { + "epoch": 0.4176904176904177, + "grad_norm": 0.8750321269035339, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 340 + }, + { + "epoch": 0.42997542997543, + "grad_norm": 0.7911098599433899, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 350 + }, + { + "epoch": 0.44226044226044225, + "grad_norm": 0.871601402759552, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 360 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.9393917918205261, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 370 + }, + { + "epoch": 0.4668304668304668, + "grad_norm": 0.8260403275489807, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 380 + }, + { + "epoch": 0.47911547911547914, + "grad_norm": 0.9792159199714661, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 390 + }, + { + "epoch": 0.4914004914004914, + "grad_norm": 0.9943315982818604, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 400 + }, + { + "epoch": 0.5036855036855037, + "grad_norm": 0.8999950885772705, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 410 + }, + { + "epoch": 0.515970515970516, + "grad_norm": 0.8348393440246582, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 420 + }, + { + "epoch": 0.5282555282555282, + "grad_norm": 0.7371744513511658, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 430 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.8354107141494751, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 440 + }, + { + "epoch": 0.5528255528255528, + "grad_norm": 0.8553793430328369, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 450 + }, + { + "epoch": 0.5651105651105651, + "grad_norm": 1.0762015581130981, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 460 + }, + { + "epoch": 0.5773955773955773, + "grad_norm": 0.8350747227668762, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 470 + }, + { + "epoch": 0.5896805896805897, + "grad_norm": 0.7819945216178894, + "learning_rate": 0.0002, + "loss": 0.7922, + "step": 480 + }, + { + "epoch": 0.601965601965602, + "grad_norm": 0.8079741597175598, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 490 + }, + { + "epoch": 0.6142506142506142, + "grad_norm": 0.776435911655426, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 500 + }, + { + "epoch": 0.6265356265356266, + "grad_norm": 0.7646855115890503, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 510 + }, + { + "epoch": 0.6388206388206388, + "grad_norm": 0.786396861076355, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 520 + }, + { + "epoch": 0.6511056511056511, + "grad_norm": 0.7016594409942627, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 530 + }, + { + "epoch": 0.6633906633906634, + "grad_norm": 0.8060444593429565, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 540 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.9087467789649963, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 550 + }, + { + "epoch": 0.687960687960688, + "grad_norm": 0.8149628639221191, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 560 + }, + { + "epoch": 0.7002457002457002, + "grad_norm": 0.7493641972541809, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 570 + }, + { + "epoch": 0.7125307125307125, + "grad_norm": 0.7958765625953674, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 580 + }, + { + "epoch": 0.7248157248157249, + "grad_norm": 0.7917273640632629, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 590 + }, + { + "epoch": 0.7371007371007371, + "grad_norm": 0.8040468692779541, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 600 + }, + { + "epoch": 0.7493857493857494, + "grad_norm": 0.8696851134300232, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 610 + }, + { + "epoch": 0.7616707616707616, + "grad_norm": 0.8418059945106506, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 620 + }, + { + "epoch": 0.773955773955774, + "grad_norm": 0.7754243612289429, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 630 + }, + { + "epoch": 0.7862407862407862, + "grad_norm": 0.7639613747596741, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 640 + }, + { + "epoch": 0.7985257985257985, + "grad_norm": 0.7516646385192871, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 650 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.7840844988822937, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 660 + }, + { + "epoch": 0.8230958230958231, + "grad_norm": 0.7657070755958557, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 670 + }, + { + "epoch": 0.8353808353808354, + "grad_norm": 0.7711591720581055, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 680 + }, + { + "epoch": 0.8476658476658476, + "grad_norm": 0.8026325106620789, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 690 + }, + { + "epoch": 0.85995085995086, + "grad_norm": 0.7902713418006897, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 700 + }, + { + "epoch": 0.8722358722358723, + "grad_norm": 0.8212456107139587, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 710 + }, + { + "epoch": 0.8845208845208845, + "grad_norm": 0.7867200970649719, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 720 + }, + { + "epoch": 0.8968058968058968, + "grad_norm": 0.80084627866745, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 730 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.7203794121742249, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 740 + }, + { + "epoch": 0.9213759213759214, + "grad_norm": 0.7598419785499573, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 750 + }, + { + "epoch": 0.9336609336609336, + "grad_norm": 0.7787027359008789, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 760 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.8444012403488159, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 770 + }, + { + "epoch": 0.9582309582309583, + "grad_norm": 0.7388550639152527, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 780 + }, + { + "epoch": 0.9705159705159705, + "grad_norm": 0.7379167079925537, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 790 + }, + { + "epoch": 0.9828009828009828, + "grad_norm": 0.8291640281677246, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 800 + }, + { + "epoch": 0.995085995085995, + "grad_norm": 0.7415094375610352, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 0.703994870185852, + "eval_runtime": 20.2182, + "eval_samples_per_second": 16.371, + "eval_steps_per_second": 2.077, + "step": 814 + }, + { + "epoch": 1.0073710073710074, + "grad_norm": 0.7405961751937866, + "learning_rate": 0.0002, + "loss": 0.6959, + "step": 820 + }, + { + "epoch": 1.0196560196560196, + "grad_norm": 0.8534344434738159, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 830 + }, + { + "epoch": 1.031941031941032, + "grad_norm": 0.7415764331817627, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 840 + }, + { + "epoch": 1.0442260442260443, + "grad_norm": 0.74293053150177, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 850 + }, + { + "epoch": 1.0565110565110565, + "grad_norm": 0.697727382183075, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 860 + }, + { + "epoch": 1.0687960687960687, + "grad_norm": 0.8022570013999939, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 870 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.7545800805091858, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 880 + }, + { + "epoch": 1.0933660933660934, + "grad_norm": 0.8005648255348206, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 890 + }, + { + "epoch": 1.1056511056511056, + "grad_norm": 0.7681778073310852, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 900 + }, + { + "epoch": 1.117936117936118, + "grad_norm": 0.7822468876838684, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 910 + }, + { + "epoch": 1.1302211302211302, + "grad_norm": 0.8324839472770691, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 920 + }, + { + "epoch": 1.1425061425061425, + "grad_norm": 0.8206289410591125, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 930 + }, + { + "epoch": 1.154791154791155, + "grad_norm": 0.786461591720581, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 940 + }, + { + "epoch": 1.1670761670761671, + "grad_norm": 0.8288539052009583, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 950 + }, + { + "epoch": 1.1793611793611793, + "grad_norm": 0.7566865682601929, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 960 + }, + { + "epoch": 1.1916461916461916, + "grad_norm": 0.7761894464492798, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 970 + }, + { + "epoch": 1.203931203931204, + "grad_norm": 0.7608440518379211, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 980 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.799745500087738, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 990 + }, + { + "epoch": 1.2285012285012284, + "grad_norm": 0.8135330677032471, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1000 + }, + { + "epoch": 1.2407862407862407, + "grad_norm": 0.7410391569137573, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 1010 + }, + { + "epoch": 1.253071253071253, + "grad_norm": 0.7826172709465027, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 1020 + }, + { + "epoch": 1.2653562653562653, + "grad_norm": 0.7210677862167358, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 1030 + }, + { + "epoch": 1.2776412776412776, + "grad_norm": 0.7571766972541809, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 1040 + }, + { + "epoch": 1.28992628992629, + "grad_norm": 0.8602666258811951, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 1050 + }, + { + "epoch": 1.3022113022113022, + "grad_norm": 0.8640648722648621, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 1060 + }, + { + "epoch": 1.3144963144963144, + "grad_norm": 0.7289374470710754, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 1070 + }, + { + "epoch": 1.3267813267813269, + "grad_norm": 0.8099908828735352, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 1080 + }, + { + "epoch": 1.339066339066339, + "grad_norm": 0.8623505234718323, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 1090 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.900576114654541, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 1100 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.729603111743927, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 1110 + }, + { + "epoch": 1.375921375921376, + "grad_norm": 0.8350434303283691, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 1120 + }, + { + "epoch": 1.3882063882063882, + "grad_norm": 0.8049437999725342, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 1130 + }, + { + "epoch": 1.4004914004914004, + "grad_norm": 0.8222764134407043, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 1140 + }, + { + "epoch": 1.4127764127764126, + "grad_norm": 0.7949751019477844, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 1150 + }, + { + "epoch": 1.425061425061425, + "grad_norm": 0.8375639915466309, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 1160 + }, + { + "epoch": 1.4373464373464373, + "grad_norm": 0.7261053919792175, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1170 + }, + { + "epoch": 1.4496314496314495, + "grad_norm": 0.6918320655822754, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 1180 + }, + { + "epoch": 1.461916461916462, + "grad_norm": 0.8148727416992188, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 1190 + }, + { + "epoch": 1.4742014742014742, + "grad_norm": 0.7014724612236023, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 1200 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.8110846281051636, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 1210 + }, + { + "epoch": 1.4987714987714988, + "grad_norm": 0.8336407542228699, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 1220 + }, + { + "epoch": 1.511056511056511, + "grad_norm": 0.826996386051178, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 1230 + }, + { + "epoch": 1.5233415233415233, + "grad_norm": 0.7503120303153992, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 1240 + }, + { + "epoch": 1.5356265356265357, + "grad_norm": 0.8297192454338074, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 1250 + }, + { + "epoch": 1.547911547911548, + "grad_norm": 0.7585996985435486, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 1260 + }, + { + "epoch": 1.5601965601965602, + "grad_norm": 0.7530493140220642, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 1270 + }, + { + "epoch": 1.5724815724815726, + "grad_norm": 0.8141939640045166, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 1280 + }, + { + "epoch": 1.5847665847665846, + "grad_norm": 0.6959931254386902, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 1290 + }, + { + "epoch": 1.597051597051597, + "grad_norm": 0.8677428364753723, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 1300 + }, + { + "epoch": 1.6093366093366095, + "grad_norm": 0.8527476787567139, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 1310 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.8462157845497131, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 1320 + }, + { + "epoch": 1.633906633906634, + "grad_norm": 0.9371153712272644, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 1330 + }, + { + "epoch": 1.6461916461916462, + "grad_norm": 0.8408344984054565, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 1340 + }, + { + "epoch": 1.6584766584766584, + "grad_norm": 0.8391859531402588, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 1350 + }, + { + "epoch": 1.6707616707616708, + "grad_norm": 0.7630598545074463, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 1360 + }, + { + "epoch": 1.683046683046683, + "grad_norm": 0.8007895350456238, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 1370 + }, + { + "epoch": 1.6953316953316953, + "grad_norm": 0.7547900080680847, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 1380 + }, + { + "epoch": 1.7076167076167077, + "grad_norm": 0.7779742479324341, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 1390 + }, + { + "epoch": 1.71990171990172, + "grad_norm": 0.712293803691864, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 1400 + }, + { + "epoch": 1.7321867321867321, + "grad_norm": 0.8503297567367554, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 1410 + }, + { + "epoch": 1.7444717444717446, + "grad_norm": 0.8312245607376099, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 1420 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.7758049368858337, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 1430 + }, + { + "epoch": 1.769041769041769, + "grad_norm": 0.8695956468582153, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 1440 + }, + { + "epoch": 1.7813267813267815, + "grad_norm": 0.7785261273384094, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 1450 + }, + { + "epoch": 1.7936117936117935, + "grad_norm": 0.7091802358627319, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 1460 + }, + { + "epoch": 1.805896805896806, + "grad_norm": 0.774146556854248, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 1470 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.8342524170875549, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 1480 + }, + { + "epoch": 1.8304668304668303, + "grad_norm": 0.8087738156318665, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 1490 + }, + { + "epoch": 1.8427518427518428, + "grad_norm": 0.9830479621887207, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 1500 + }, + { + "epoch": 1.855036855036855, + "grad_norm": 0.8537567853927612, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 1510 + }, + { + "epoch": 1.8673218673218672, + "grad_norm": 0.8004562854766846, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 1520 + }, + { + "epoch": 1.8796068796068797, + "grad_norm": 0.8161284327507019, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 1530 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.8688093423843384, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 1540 + }, + { + "epoch": 1.904176904176904, + "grad_norm": 0.8287379741668701, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 1550 + }, + { + "epoch": 1.9164619164619165, + "grad_norm": 0.8050342202186584, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 1560 + }, + { + "epoch": 1.9287469287469288, + "grad_norm": 0.9273895621299744, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 1570 + }, + { + "epoch": 1.941031941031941, + "grad_norm": 0.8416891694068909, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 1580 + }, + { + "epoch": 1.9533169533169534, + "grad_norm": 0.7299820184707642, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 1590 + }, + { + "epoch": 1.9656019656019657, + "grad_norm": 0.7262272834777832, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 1600 + }, + { + "epoch": 1.9778869778869779, + "grad_norm": 0.8649004697799683, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 1610 + }, + { + "epoch": 1.9901719901719903, + "grad_norm": 0.8165444731712341, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 0.5858802795410156, + "eval_runtime": 22.6585, + "eval_samples_per_second": 14.608, + "eval_steps_per_second": 1.854, + "step": 1628 + }, + { + "epoch": 2.0024570024570023, + "grad_norm": 0.8142582178115845, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 1630 + }, + { + "epoch": 2.0147420147420148, + "grad_norm": 1.0637224912643433, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 1640 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.8923280239105225, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 1650 + }, + { + "epoch": 2.039312039312039, + "grad_norm": 0.8169175386428833, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 1660 + }, + { + "epoch": 2.0515970515970516, + "grad_norm": 0.8124040365219116, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 1670 + }, + { + "epoch": 2.063882063882064, + "grad_norm": 0.9228773713111877, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 1680 + }, + { + "epoch": 2.076167076167076, + "grad_norm": 0.7216871380805969, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 1690 + }, + { + "epoch": 2.0884520884520885, + "grad_norm": 0.8679503202438354, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 1700 + }, + { + "epoch": 2.100737100737101, + "grad_norm": 0.8627730011940002, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 1710 + }, + { + "epoch": 2.113022113022113, + "grad_norm": 0.9175152778625488, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 1720 + }, + { + "epoch": 2.1253071253071254, + "grad_norm": 0.7930372953414917, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 1730 + }, + { + "epoch": 2.1375921375921374, + "grad_norm": 0.8370155692100525, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 1740 + }, + { + "epoch": 2.14987714987715, + "grad_norm": 0.9121434688568115, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 1750 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.8703579306602478, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 1760 + }, + { + "epoch": 2.1744471744471743, + "grad_norm": 0.9270512461662292, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 1770 + }, + { + "epoch": 2.1867321867321867, + "grad_norm": 0.9372949600219727, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 1780 + }, + { + "epoch": 2.199017199017199, + "grad_norm": 0.8955178260803223, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 1790 + }, + { + "epoch": 2.211302211302211, + "grad_norm": 0.846102237701416, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 1800 + }, + { + "epoch": 2.2235872235872236, + "grad_norm": 0.9186713099479675, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 1810 + }, + { + "epoch": 2.235872235872236, + "grad_norm": 0.7695123553276062, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 1820 + }, + { + "epoch": 2.248157248157248, + "grad_norm": 0.7340332865715027, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 1830 + }, + { + "epoch": 2.2604422604422605, + "grad_norm": 0.8933137655258179, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 1840 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.7705038189888, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 1850 + }, + { + "epoch": 2.285012285012285, + "grad_norm": 0.8396083116531372, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 1860 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 0.7695736289024353, + "learning_rate": 0.0002, + "loss": 0.5335, + "step": 1870 + }, + { + "epoch": 2.30958230958231, + "grad_norm": 0.8535045385360718, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 1880 + }, + { + "epoch": 2.321867321867322, + "grad_norm": 0.8549142479896545, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 1890 + }, + { + "epoch": 2.3341523341523343, + "grad_norm": 0.9124433994293213, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 1900 + }, + { + "epoch": 2.3464373464373462, + "grad_norm": 0.855523943901062, + "learning_rate": 0.0002, + "loss": 0.506, + "step": 1910 + }, + { + "epoch": 2.3587223587223587, + "grad_norm": 0.810878336429596, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 1920 + }, + { + "epoch": 2.371007371007371, + "grad_norm": 0.7409024834632874, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 1930 + }, + { + "epoch": 2.383292383292383, + "grad_norm": 0.8080927729606628, + "learning_rate": 0.0002, + "loss": 0.5045, + "step": 1940 + }, + { + "epoch": 2.3955773955773956, + "grad_norm": 0.9661469459533691, + "learning_rate": 0.0002, + "loss": 0.5032, + "step": 1950 + }, + { + "epoch": 2.407862407862408, + "grad_norm": 0.838766872882843, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 1960 + }, + { + "epoch": 2.42014742014742, + "grad_norm": 0.8737491965293884, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 1970 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.8657792210578918, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 1980 + }, + { + "epoch": 2.444717444717445, + "grad_norm": 0.8883858919143677, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 1990 + }, + { + "epoch": 2.457002457002457, + "grad_norm": 0.8647662997245789, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 2000 + }, + { + "epoch": 2.4692874692874693, + "grad_norm": 0.896037757396698, + "learning_rate": 0.0002, + "loss": 0.518, + "step": 2010 + }, + { + "epoch": 2.4815724815724813, + "grad_norm": 0.8079167008399963, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 2020 + }, + { + "epoch": 2.493857493857494, + "grad_norm": 1.0293292999267578, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 2030 + }, + { + "epoch": 2.506142506142506, + "grad_norm": 0.8459244966506958, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 2040 + }, + { + "epoch": 2.5184275184275187, + "grad_norm": 0.9244982600212097, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 2050 + }, + { + "epoch": 2.5307125307125307, + "grad_norm": 0.8245007991790771, + "learning_rate": 0.0002, + "loss": 0.5006, + "step": 2060 + }, + { + "epoch": 2.542997542997543, + "grad_norm": 0.8869297504425049, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 2070 + }, + { + "epoch": 2.555282555282555, + "grad_norm": 0.8620884418487549, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 2080 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.8387904167175293, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 2090 + }, + { + "epoch": 2.57985257985258, + "grad_norm": 0.8353935480117798, + "learning_rate": 0.0002, + "loss": 0.4974, + "step": 2100 + }, + { + "epoch": 2.592137592137592, + "grad_norm": 1.0136934518814087, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 2110 + }, + { + "epoch": 2.6044226044226044, + "grad_norm": 0.9387392997741699, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 2120 + }, + { + "epoch": 2.616707616707617, + "grad_norm": 0.898697555065155, + "learning_rate": 0.0002, + "loss": 0.4971, + "step": 2130 + }, + { + "epoch": 2.628992628992629, + "grad_norm": 1.0145231485366821, + "learning_rate": 0.0002, + "loss": 0.4981, + "step": 2140 + }, + { + "epoch": 2.6412776412776413, + "grad_norm": 0.8335273265838623, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 2150 + }, + { + "epoch": 2.6535626535626538, + "grad_norm": 1.0198529958724976, + "learning_rate": 0.0002, + "loss": 0.5129, + "step": 2160 + }, + { + "epoch": 2.6658476658476657, + "grad_norm": 0.8353323340415955, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 2170 + }, + { + "epoch": 2.678132678132678, + "grad_norm": 0.8831406831741333, + "learning_rate": 0.0002, + "loss": 0.4818, + "step": 2180 + }, + { + "epoch": 2.69041769041769, + "grad_norm": 0.7182748913764954, + "learning_rate": 0.0002, + "loss": 0.4858, + "step": 2190 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 0.7892552614212036, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 2200 + }, + { + "epoch": 2.714987714987715, + "grad_norm": 1.0144033432006836, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 2210 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 1.0913645029067993, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 2220 + }, + { + "epoch": 2.7395577395577395, + "grad_norm": 1.014394998550415, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 2230 + }, + { + "epoch": 2.751842751842752, + "grad_norm": 0.8118020296096802, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 2240 + }, + { + "epoch": 2.764127764127764, + "grad_norm": 0.9027737379074097, + "learning_rate": 0.0002, + "loss": 0.5088, + "step": 2250 + }, + { + "epoch": 2.7764127764127764, + "grad_norm": 0.8017747402191162, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 2260 + }, + { + "epoch": 2.788697788697789, + "grad_norm": 0.788362979888916, + "learning_rate": 0.0002, + "loss": 0.4957, + "step": 2270 + }, + { + "epoch": 2.800982800982801, + "grad_norm": 0.8338918089866638, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 2280 + }, + { + "epoch": 2.8132678132678133, + "grad_norm": 0.8773167729377747, + "learning_rate": 0.0002, + "loss": 0.4925, + "step": 2290 + }, + { + "epoch": 2.8255528255528253, + "grad_norm": 0.9319674372673035, + "learning_rate": 0.0002, + "loss": 0.4806, + "step": 2300 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 0.8632726073265076, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 2310 + }, + { + "epoch": 2.85012285012285, + "grad_norm": 0.785464882850647, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 2320 + }, + { + "epoch": 2.8624078624078626, + "grad_norm": 0.8159732818603516, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 2330 + }, + { + "epoch": 2.8746928746928746, + "grad_norm": 0.8702368140220642, + "learning_rate": 0.0002, + "loss": 0.4796, + "step": 2340 + }, + { + "epoch": 2.886977886977887, + "grad_norm": 1.0456738471984863, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 2350 + }, + { + "epoch": 2.899262899262899, + "grad_norm": 1.0855203866958618, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 2360 + }, + { + "epoch": 2.9115479115479115, + "grad_norm": 0.9378156065940857, + "learning_rate": 0.0002, + "loss": 0.4758, + "step": 2370 + }, + { + "epoch": 2.923832923832924, + "grad_norm": 0.7390182018280029, + "learning_rate": 0.0002, + "loss": 0.4831, + "step": 2380 + }, + { + "epoch": 2.9361179361179364, + "grad_norm": 0.7667133212089539, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 2390 + }, + { + "epoch": 2.9484029484029484, + "grad_norm": 0.8633476495742798, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 2400 + }, + { + "epoch": 2.960687960687961, + "grad_norm": 1.0821104049682617, + "learning_rate": 0.0002, + "loss": 0.4993, + "step": 2410 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.8911418914794922, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 2420 + }, + { + "epoch": 2.9852579852579852, + "grad_norm": 0.8791135549545288, + "learning_rate": 0.0002, + "loss": 0.4819, + "step": 2430 + }, + { + "epoch": 2.9975429975429977, + "grad_norm": 0.8066530823707581, + "learning_rate": 0.0002, + "loss": 0.4875, + "step": 2440 + }, + { + "epoch": 3.0, + "eval_loss": 0.49752503633499146, + "eval_runtime": 20.2911, + "eval_samples_per_second": 16.313, + "eval_steps_per_second": 2.07, + "step": 2442 + }, + { + "epoch": 3.0098280098280097, + "grad_norm": 0.7644656896591187, + "learning_rate": 0.0002, + "loss": 0.4362, + "step": 2450 + }, + { + "epoch": 3.022113022113022, + "grad_norm": 0.9077525734901428, + "learning_rate": 0.0002, + "loss": 0.4363, + "step": 2460 + }, + { + "epoch": 3.0343980343980346, + "grad_norm": 0.7859287261962891, + "learning_rate": 0.0002, + "loss": 0.422, + "step": 2470 + }, + { + "epoch": 3.0466830466830466, + "grad_norm": 1.1200323104858398, + "learning_rate": 0.0002, + "loss": 0.4574, + "step": 2480 + }, + { + "epoch": 3.058968058968059, + "grad_norm": 0.7570453882217407, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 2490 + }, + { + "epoch": 3.0712530712530715, + "grad_norm": 0.9450915455818176, + "learning_rate": 0.0002, + "loss": 0.4351, + "step": 2500 + }, + { + "epoch": 3.0835380835380835, + "grad_norm": 0.8303545117378235, + "learning_rate": 0.0002, + "loss": 0.4343, + "step": 2510 + }, + { + "epoch": 3.095823095823096, + "grad_norm": 0.8864443898200989, + "learning_rate": 0.0002, + "loss": 0.4308, + "step": 2520 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 0.945324718952179, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 2530 + }, + { + "epoch": 3.1203931203931203, + "grad_norm": 1.0562494993209839, + "learning_rate": 0.0002, + "loss": 0.4345, + "step": 2540 + }, + { + "epoch": 3.1326781326781328, + "grad_norm": 0.8607500195503235, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 2550 + }, + { + "epoch": 3.1449631449631448, + "grad_norm": 0.8719640374183655, + "learning_rate": 0.0002, + "loss": 0.456, + "step": 2560 + }, + { + "epoch": 3.157248157248157, + "grad_norm": 0.8647059202194214, + "learning_rate": 0.0002, + "loss": 0.4469, + "step": 2570 + }, + { + "epoch": 3.1695331695331697, + "grad_norm": 0.8346507549285889, + "learning_rate": 0.0002, + "loss": 0.4483, + "step": 2580 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 1.0208854675292969, + "learning_rate": 0.0002, + "loss": 0.4331, + "step": 2590 + }, + { + "epoch": 3.194103194103194, + "grad_norm": 0.7064385414123535, + "learning_rate": 0.0002, + "loss": 0.435, + "step": 2600 + }, + { + "epoch": 3.2063882063882065, + "grad_norm": 0.927347719669342, + "learning_rate": 0.0002, + "loss": 0.4541, + "step": 2610 + }, + { + "epoch": 3.2186732186732185, + "grad_norm": 0.943517804145813, + "learning_rate": 0.0002, + "loss": 0.4561, + "step": 2620 + }, + { + "epoch": 3.230958230958231, + "grad_norm": 0.7837198376655579, + "learning_rate": 0.0002, + "loss": 0.4225, + "step": 2630 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 0.7752765417098999, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 2640 + }, + { + "epoch": 3.2555282555282554, + "grad_norm": 0.8578953146934509, + "learning_rate": 0.0002, + "loss": 0.4468, + "step": 2650 + }, + { + "epoch": 3.267813267813268, + "grad_norm": 1.0209529399871826, + "learning_rate": 0.0002, + "loss": 0.4393, + "step": 2660 + }, + { + "epoch": 3.2800982800982803, + "grad_norm": 0.9069030284881592, + "learning_rate": 0.0002, + "loss": 0.4517, + "step": 2670 + }, + { + "epoch": 3.2923832923832923, + "grad_norm": 0.8454729318618774, + "learning_rate": 0.0002, + "loss": 0.4262, + "step": 2680 + }, + { + "epoch": 3.3046683046683047, + "grad_norm": 0.8253099322319031, + "learning_rate": 0.0002, + "loss": 0.4349, + "step": 2690 + }, + { + "epoch": 3.3169533169533167, + "grad_norm": 0.8765934109687805, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 2700 + }, + { + "epoch": 3.329238329238329, + "grad_norm": 0.8149126172065735, + "learning_rate": 0.0002, + "loss": 0.4518, + "step": 2710 + }, + { + "epoch": 3.3415233415233416, + "grad_norm": 0.8820102214813232, + "learning_rate": 0.0002, + "loss": 0.4437, + "step": 2720 + }, + { + "epoch": 3.3538083538083536, + "grad_norm": 0.8813952803611755, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 2730 + }, + { + "epoch": 3.366093366093366, + "grad_norm": 1.0338447093963623, + "learning_rate": 0.0002, + "loss": 0.4396, + "step": 2740 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 0.8780209422111511, + "learning_rate": 0.0002, + "loss": 0.4468, + "step": 2750 + }, + { + "epoch": 3.3906633906633905, + "grad_norm": 0.9017151594161987, + "learning_rate": 0.0002, + "loss": 0.441, + "step": 2760 + }, + { + "epoch": 3.402948402948403, + "grad_norm": 0.8647638559341431, + "learning_rate": 0.0002, + "loss": 0.446, + "step": 2770 + }, + { + "epoch": 3.4152334152334154, + "grad_norm": 0.8298183679580688, + "learning_rate": 0.0002, + "loss": 0.4131, + "step": 2780 + }, + { + "epoch": 3.4275184275184274, + "grad_norm": 0.9298108816146851, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 2790 + }, + { + "epoch": 3.43980343980344, + "grad_norm": 0.8909980058670044, + "learning_rate": 0.0002, + "loss": 0.4145, + "step": 2800 + }, + { + "epoch": 3.4520884520884523, + "grad_norm": 0.8027496933937073, + "learning_rate": 0.0002, + "loss": 0.4148, + "step": 2810 + }, + { + "epoch": 3.4643734643734643, + "grad_norm": 0.8766195774078369, + "learning_rate": 0.0002, + "loss": 0.4244, + "step": 2820 + }, + { + "epoch": 3.4766584766584767, + "grad_norm": 0.8194443583488464, + "learning_rate": 0.0002, + "loss": 0.4292, + "step": 2830 + }, + { + "epoch": 3.488943488943489, + "grad_norm": 0.9862873554229736, + "learning_rate": 0.0002, + "loss": 0.4305, + "step": 2840 + }, + { + "epoch": 3.501228501228501, + "grad_norm": 0.8755377531051636, + "learning_rate": 0.0002, + "loss": 0.4393, + "step": 2850 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 0.7300266027450562, + "learning_rate": 0.0002, + "loss": 0.4231, + "step": 2860 + }, + { + "epoch": 3.5257985257985256, + "grad_norm": 0.8342461585998535, + "learning_rate": 0.0002, + "loss": 0.4278, + "step": 2870 + }, + { + "epoch": 3.538083538083538, + "grad_norm": 0.8624151349067688, + "learning_rate": 0.0002, + "loss": 0.4395, + "step": 2880 + }, + { + "epoch": 3.5503685503685505, + "grad_norm": 0.8931261301040649, + "learning_rate": 0.0002, + "loss": 0.4064, + "step": 2890 + }, + { + "epoch": 3.562653562653563, + "grad_norm": 0.8617086410522461, + "learning_rate": 0.0002, + "loss": 0.4358, + "step": 2900 + }, + { + "epoch": 3.574938574938575, + "grad_norm": 0.8754099607467651, + "learning_rate": 0.0002, + "loss": 0.419, + "step": 2910 + }, + { + "epoch": 3.5872235872235874, + "grad_norm": 0.8345834612846375, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 2920 + }, + { + "epoch": 3.5995085995085994, + "grad_norm": 1.1414062976837158, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 2930 + }, + { + "epoch": 3.611793611793612, + "grad_norm": 0.994860053062439, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 2940 + }, + { + "epoch": 3.6240786240786242, + "grad_norm": 1.19268000125885, + "learning_rate": 0.0002, + "loss": 0.4386, + "step": 2950 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.8399543762207031, + "learning_rate": 0.0002, + "loss": 0.4029, + "step": 2960 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 0.9873217940330505, + "learning_rate": 0.0002, + "loss": 0.4432, + "step": 2970 + }, + { + "epoch": 3.6609336609336607, + "grad_norm": 0.9116013646125793, + "learning_rate": 0.0002, + "loss": 0.4308, + "step": 2980 + }, + { + "epoch": 3.673218673218673, + "grad_norm": 0.9503833651542664, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 2990 + }, + { + "epoch": 3.6855036855036856, + "grad_norm": 0.9401112794876099, + "learning_rate": 0.0002, + "loss": 0.4306, + "step": 3000 + }, + { + "epoch": 3.697788697788698, + "grad_norm": 1.00745689868927, + "learning_rate": 0.0002, + "loss": 0.4333, + "step": 3010 + }, + { + "epoch": 3.71007371007371, + "grad_norm": 1.0553191900253296, + "learning_rate": 0.0002, + "loss": 0.432, + "step": 3020 + }, + { + "epoch": 3.7223587223587224, + "grad_norm": 1.0226953029632568, + "learning_rate": 0.0002, + "loss": 0.4321, + "step": 3030 + }, + { + "epoch": 3.7346437346437344, + "grad_norm": 1.085554838180542, + "learning_rate": 0.0002, + "loss": 0.418, + "step": 3040 + }, + { + "epoch": 3.746928746928747, + "grad_norm": 0.9948731064796448, + "learning_rate": 0.0002, + "loss": 0.4196, + "step": 3050 + }, + { + "epoch": 3.7592137592137593, + "grad_norm": 0.9328727126121521, + "learning_rate": 0.0002, + "loss": 0.4281, + "step": 3060 + }, + { + "epoch": 3.7714987714987718, + "grad_norm": 1.0533266067504883, + "learning_rate": 0.0002, + "loss": 0.4284, + "step": 3070 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 0.8213809132575989, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 3080 + }, + { + "epoch": 3.796068796068796, + "grad_norm": 0.8941594362258911, + "learning_rate": 0.0002, + "loss": 0.4348, + "step": 3090 + }, + { + "epoch": 3.808353808353808, + "grad_norm": 0.8324518203735352, + "learning_rate": 0.0002, + "loss": 0.4266, + "step": 3100 + }, + { + "epoch": 3.8206388206388207, + "grad_norm": 0.8811233639717102, + "learning_rate": 0.0002, + "loss": 0.4227, + "step": 3110 + }, + { + "epoch": 3.832923832923833, + "grad_norm": 0.8781470060348511, + "learning_rate": 0.0002, + "loss": 0.4195, + "step": 3120 + }, + { + "epoch": 3.845208845208845, + "grad_norm": 0.8994116187095642, + "learning_rate": 0.0002, + "loss": 0.4277, + "step": 3130 + }, + { + "epoch": 3.8574938574938575, + "grad_norm": 0.8605017066001892, + "learning_rate": 0.0002, + "loss": 0.4149, + "step": 3140 + }, + { + "epoch": 3.8697788697788695, + "grad_norm": 0.8966400027275085, + "learning_rate": 0.0002, + "loss": 0.4023, + "step": 3150 + }, + { + "epoch": 3.882063882063882, + "grad_norm": 0.8856554627418518, + "learning_rate": 0.0002, + "loss": 0.4245, + "step": 3160 + }, + { + "epoch": 3.8943488943488944, + "grad_norm": 0.8971620798110962, + "learning_rate": 0.0002, + "loss": 0.4101, + "step": 3170 + }, + { + "epoch": 3.906633906633907, + "grad_norm": 0.9807813167572021, + "learning_rate": 0.0002, + "loss": 0.3993, + "step": 3180 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 0.8614121675491333, + "learning_rate": 0.0002, + "loss": 0.4258, + "step": 3190 + }, + { + "epoch": 3.9312039312039313, + "grad_norm": 0.989171028137207, + "learning_rate": 0.0002, + "loss": 0.4115, + "step": 3200 + }, + { + "epoch": 3.9434889434889433, + "grad_norm": 0.8168872594833374, + "learning_rate": 0.0002, + "loss": 0.4182, + "step": 3210 + }, + { + "epoch": 3.9557739557739557, + "grad_norm": 0.8109386563301086, + "learning_rate": 0.0002, + "loss": 0.4112, + "step": 3220 + }, + { + "epoch": 3.968058968058968, + "grad_norm": 1.0175853967666626, + "learning_rate": 0.0002, + "loss": 0.4165, + "step": 3230 + }, + { + "epoch": 3.98034398034398, + "grad_norm": 0.936143159866333, + "learning_rate": 0.0002, + "loss": 0.4146, + "step": 3240 + }, + { + "epoch": 3.9926289926289926, + "grad_norm": 0.9557915925979614, + "learning_rate": 0.0002, + "loss": 0.4163, + "step": 3250 + }, + { + "epoch": 4.0, + "eval_loss": 0.4401616156101227, + "eval_runtime": 20.8047, + "eval_samples_per_second": 15.91, + "eval_steps_per_second": 2.019, + "step": 3256 + }, + { + "epoch": 4.004914004914005, + "grad_norm": 0.7590614557266235, + "learning_rate": 0.0002, + "loss": 0.408, + "step": 3260 + }, + { + "epoch": 4.017199017199017, + "grad_norm": 0.8920791149139404, + "learning_rate": 0.0002, + "loss": 0.4001, + "step": 3270 + }, + { + "epoch": 4.0294840294840295, + "grad_norm": 0.8640421628952026, + "learning_rate": 0.0002, + "loss": 0.3789, + "step": 3280 + }, + { + "epoch": 4.041769041769042, + "grad_norm": 0.9074113965034485, + "learning_rate": 0.0002, + "loss": 0.3791, + "step": 3290 + }, + { + "epoch": 4.054054054054054, + "grad_norm": 1.0600885152816772, + "learning_rate": 0.0002, + "loss": 0.3728, + "step": 3300 + }, + { + "epoch": 4.066339066339066, + "grad_norm": 0.9682773351669312, + "learning_rate": 0.0002, + "loss": 0.3857, + "step": 3310 + }, + { + "epoch": 4.078624078624078, + "grad_norm": 0.9326395392417908, + "learning_rate": 0.0002, + "loss": 0.4007, + "step": 3320 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 0.8886597156524658, + "learning_rate": 0.0002, + "loss": 0.3823, + "step": 3330 + }, + { + "epoch": 4.103194103194103, + "grad_norm": 1.032205581665039, + "learning_rate": 0.0002, + "loss": 0.3929, + "step": 3340 + }, + { + "epoch": 4.115479115479116, + "grad_norm": 0.8669408559799194, + "learning_rate": 0.0002, + "loss": 0.3836, + "step": 3350 + }, + { + "epoch": 4.127764127764128, + "grad_norm": 0.8250347971916199, + "learning_rate": 0.0002, + "loss": 0.3866, + "step": 3360 + }, + { + "epoch": 4.14004914004914, + "grad_norm": 0.7919842600822449, + "learning_rate": 0.0002, + "loss": 0.3826, + "step": 3370 + }, + { + "epoch": 4.152334152334152, + "grad_norm": 1.045682430267334, + "learning_rate": 0.0002, + "loss": 0.3838, + "step": 3380 + }, + { + "epoch": 4.164619164619165, + "grad_norm": 0.6873571276664734, + "learning_rate": 0.0002, + "loss": 0.3796, + "step": 3390 + }, + { + "epoch": 4.176904176904177, + "grad_norm": 1.0227675437927246, + "learning_rate": 0.0002, + "loss": 0.3942, + "step": 3400 + }, + { + "epoch": 4.1891891891891895, + "grad_norm": 0.9167711734771729, + "learning_rate": 0.0002, + "loss": 0.3788, + "step": 3410 + }, + { + "epoch": 4.201474201474202, + "grad_norm": 1.0598796606063843, + "learning_rate": 0.0002, + "loss": 0.3792, + "step": 3420 + }, + { + "epoch": 4.2137592137592135, + "grad_norm": 0.8581843972206116, + "learning_rate": 0.0002, + "loss": 0.3955, + "step": 3430 + }, + { + "epoch": 4.226044226044226, + "grad_norm": 0.8862360119819641, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 3440 + }, + { + "epoch": 4.238329238329238, + "grad_norm": 1.0248323678970337, + "learning_rate": 0.0002, + "loss": 0.3889, + "step": 3450 + }, + { + "epoch": 4.250614250614251, + "grad_norm": 0.8746261596679688, + "learning_rate": 0.0002, + "loss": 0.3827, + "step": 3460 + }, + { + "epoch": 4.262899262899263, + "grad_norm": 0.7442536354064941, + "learning_rate": 0.0002, + "loss": 0.3949, + "step": 3470 + }, + { + "epoch": 4.275184275184275, + "grad_norm": 0.8295119404792786, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 3480 + }, + { + "epoch": 4.287469287469287, + "grad_norm": 1.0634245872497559, + "learning_rate": 0.0002, + "loss": 0.3895, + "step": 3490 + }, + { + "epoch": 4.2997542997543, + "grad_norm": 0.9554621577262878, + "learning_rate": 0.0002, + "loss": 0.3955, + "step": 3500 + }, + { + "epoch": 4.312039312039312, + "grad_norm": 1.0191723108291626, + "learning_rate": 0.0002, + "loss": 0.3826, + "step": 3510 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 0.8573611378669739, + "learning_rate": 0.0002, + "loss": 0.3828, + "step": 3520 + }, + { + "epoch": 4.336609336609337, + "grad_norm": 0.9082390069961548, + "learning_rate": 0.0002, + "loss": 0.3869, + "step": 3530 + }, + { + "epoch": 4.348894348894349, + "grad_norm": 0.8650212287902832, + "learning_rate": 0.0002, + "loss": 0.3902, + "step": 3540 + }, + { + "epoch": 4.361179361179361, + "grad_norm": 0.7186297178268433, + "learning_rate": 0.0002, + "loss": 0.3915, + "step": 3550 + }, + { + "epoch": 4.3734643734643734, + "grad_norm": 0.9750986695289612, + "learning_rate": 0.0002, + "loss": 0.3861, + "step": 3560 + }, + { + "epoch": 4.385749385749386, + "grad_norm": 1.0710467100143433, + "learning_rate": 0.0002, + "loss": 0.3967, + "step": 3570 + }, + { + "epoch": 4.398034398034398, + "grad_norm": 0.7974869012832642, + "learning_rate": 0.0002, + "loss": 0.3774, + "step": 3580 + }, + { + "epoch": 4.41031941031941, + "grad_norm": 0.9405913949012756, + "learning_rate": 0.0002, + "loss": 0.3738, + "step": 3590 + }, + { + "epoch": 4.422604422604422, + "grad_norm": 0.9393602609634399, + "learning_rate": 0.0002, + "loss": 0.3982, + "step": 3600 + }, + { + "epoch": 4.434889434889435, + "grad_norm": 1.0798007249832153, + "learning_rate": 0.0002, + "loss": 0.3913, + "step": 3610 + }, + { + "epoch": 4.447174447174447, + "grad_norm": 0.9226186275482178, + "learning_rate": 0.0002, + "loss": 0.3682, + "step": 3620 + }, + { + "epoch": 4.45945945945946, + "grad_norm": 1.1046524047851562, + "learning_rate": 0.0002, + "loss": 0.3742, + "step": 3630 + }, + { + "epoch": 4.471744471744472, + "grad_norm": 0.8848567605018616, + "learning_rate": 0.0002, + "loss": 0.3886, + "step": 3640 + }, + { + "epoch": 4.484029484029484, + "grad_norm": 0.8913224339485168, + "learning_rate": 0.0002, + "loss": 0.3848, + "step": 3650 + }, + { + "epoch": 4.496314496314496, + "grad_norm": 0.8497583270072937, + "learning_rate": 0.0002, + "loss": 0.3731, + "step": 3660 + }, + { + "epoch": 4.5085995085995085, + "grad_norm": 0.8263831734657288, + "learning_rate": 0.0002, + "loss": 0.3804, + "step": 3670 + }, + { + "epoch": 4.520884520884521, + "grad_norm": 0.8470269441604614, + "learning_rate": 0.0002, + "loss": 0.3815, + "step": 3680 + }, + { + "epoch": 4.533169533169533, + "grad_norm": 0.860038161277771, + "learning_rate": 0.0002, + "loss": 0.3774, + "step": 3690 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.8898552656173706, + "learning_rate": 0.0002, + "loss": 0.3817, + "step": 3700 + }, + { + "epoch": 4.557739557739557, + "grad_norm": 0.8152070641517639, + "learning_rate": 0.0002, + "loss": 0.3776, + "step": 3710 + }, + { + "epoch": 4.57002457002457, + "grad_norm": 0.7847675085067749, + "learning_rate": 0.0002, + "loss": 0.383, + "step": 3720 + }, + { + "epoch": 4.582309582309582, + "grad_norm": 0.9625533819198608, + "learning_rate": 0.0002, + "loss": 0.3791, + "step": 3730 + }, + { + "epoch": 4.594594594594595, + "grad_norm": 0.9097456336021423, + "learning_rate": 0.0002, + "loss": 0.3699, + "step": 3740 + }, + { + "epoch": 4.606879606879607, + "grad_norm": 0.871329128742218, + "learning_rate": 0.0002, + "loss": 0.3673, + "step": 3750 + }, + { + "epoch": 4.61916461916462, + "grad_norm": 0.9879975914955139, + "learning_rate": 0.0002, + "loss": 0.3725, + "step": 3760 + }, + { + "epoch": 4.631449631449631, + "grad_norm": 0.8636731505393982, + "learning_rate": 0.0002, + "loss": 0.3827, + "step": 3770 + }, + { + "epoch": 4.643734643734644, + "grad_norm": 1.0488964319229126, + "learning_rate": 0.0002, + "loss": 0.3755, + "step": 3780 + }, + { + "epoch": 4.656019656019656, + "grad_norm": 0.7637056112289429, + "learning_rate": 0.0002, + "loss": 0.3738, + "step": 3790 + }, + { + "epoch": 4.6683046683046685, + "grad_norm": 0.8507546186447144, + "learning_rate": 0.0002, + "loss": 0.3676, + "step": 3800 + }, + { + "epoch": 4.680589680589681, + "grad_norm": 1.0216856002807617, + "learning_rate": 0.0002, + "loss": 0.3852, + "step": 3810 + }, + { + "epoch": 4.6928746928746925, + "grad_norm": 1.026343822479248, + "learning_rate": 0.0002, + "loss": 0.3751, + "step": 3820 + }, + { + "epoch": 4.705159705159705, + "grad_norm": 0.8311620950698853, + "learning_rate": 0.0002, + "loss": 0.3687, + "step": 3830 + }, + { + "epoch": 4.717444717444717, + "grad_norm": 0.7770653367042542, + "learning_rate": 0.0002, + "loss": 0.3771, + "step": 3840 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 0.7616215348243713, + "learning_rate": 0.0002, + "loss": 0.37, + "step": 3850 + }, + { + "epoch": 4.742014742014742, + "grad_norm": 1.0377072095870972, + "learning_rate": 0.0002, + "loss": 0.3927, + "step": 3860 + }, + { + "epoch": 4.754299754299755, + "grad_norm": 0.9713505506515503, + "learning_rate": 0.0002, + "loss": 0.3832, + "step": 3870 + }, + { + "epoch": 4.766584766584766, + "grad_norm": 0.8803321719169617, + "learning_rate": 0.0002, + "loss": 0.3722, + "step": 3880 + }, + { + "epoch": 4.778869778869779, + "grad_norm": 0.885535478591919, + "learning_rate": 0.0002, + "loss": 0.3756, + "step": 3890 + }, + { + "epoch": 4.791154791154791, + "grad_norm": 1.0877983570098877, + "learning_rate": 0.0002, + "loss": 0.3714, + "step": 3900 + }, + { + "epoch": 4.803439803439804, + "grad_norm": 0.7875366806983948, + "learning_rate": 0.0002, + "loss": 0.3879, + "step": 3910 + }, + { + "epoch": 4.815724815724816, + "grad_norm": 0.8550102114677429, + "learning_rate": 0.0002, + "loss": 0.3591, + "step": 3920 + }, + { + "epoch": 4.828009828009828, + "grad_norm": 1.0217846632003784, + "learning_rate": 0.0002, + "loss": 0.3716, + "step": 3930 + }, + { + "epoch": 4.84029484029484, + "grad_norm": 0.7315713167190552, + "learning_rate": 0.0002, + "loss": 0.3649, + "step": 3940 + }, + { + "epoch": 4.8525798525798525, + "grad_norm": 0.8924923539161682, + "learning_rate": 0.0002, + "loss": 0.3879, + "step": 3950 + }, + { + "epoch": 4.864864864864865, + "grad_norm": 0.9730218052864075, + "learning_rate": 0.0002, + "loss": 0.3669, + "step": 3960 + }, + { + "epoch": 4.877149877149877, + "grad_norm": 0.9202003479003906, + "learning_rate": 0.0002, + "loss": 0.3705, + "step": 3970 + }, + { + "epoch": 4.88943488943489, + "grad_norm": 0.8173081874847412, + "learning_rate": 0.0002, + "loss": 0.3617, + "step": 3980 + }, + { + "epoch": 4.901719901719901, + "grad_norm": 0.7178564667701721, + "learning_rate": 0.0002, + "loss": 0.37, + "step": 3990 + }, + { + "epoch": 4.914004914004914, + "grad_norm": 0.913684606552124, + "learning_rate": 0.0002, + "loss": 0.3768, + "step": 4000 + }, + { + "epoch": 4.926289926289926, + "grad_norm": 0.8817896842956543, + "learning_rate": 0.0002, + "loss": 0.3755, + "step": 4010 + }, + { + "epoch": 4.938574938574939, + "grad_norm": 0.7652186751365662, + "learning_rate": 0.0002, + "loss": 0.3676, + "step": 4020 + }, + { + "epoch": 4.950859950859951, + "grad_norm": 0.8828630447387695, + "learning_rate": 0.0002, + "loss": 0.3699, + "step": 4030 + }, + { + "epoch": 4.963144963144963, + "grad_norm": 1.0878605842590332, + "learning_rate": 0.0002, + "loss": 0.3672, + "step": 4040 + }, + { + "epoch": 4.975429975429975, + "grad_norm": 1.0845288038253784, + "learning_rate": 0.0002, + "loss": 0.3656, + "step": 4050 + }, + { + "epoch": 4.987714987714988, + "grad_norm": 0.8431115746498108, + "learning_rate": 0.0002, + "loss": 0.365, + "step": 4060 + }, + { + "epoch": 5.0, + "grad_norm": 0.8320387601852417, + "learning_rate": 0.0002, + "loss": 0.3693, + "step": 4070 + }, + { + "epoch": 5.0, + "eval_loss": 0.4017423093318939, + "eval_runtime": 20.8466, + "eval_samples_per_second": 15.878, + "eval_steps_per_second": 2.015, + "step": 4070 + }, + { + "epoch": 5.012285012285012, + "grad_norm": 0.8639023900032043, + "learning_rate": 0.0002, + "loss": 0.3425, + "step": 4080 + }, + { + "epoch": 5.024570024570025, + "grad_norm": 0.7123713493347168, + "learning_rate": 0.0002, + "loss": 0.3458, + "step": 4090 + }, + { + "epoch": 5.036855036855036, + "grad_norm": 0.9886922836303711, + "learning_rate": 0.0002, + "loss": 0.3404, + "step": 4100 + }, + { + "epoch": 5.049140049140049, + "grad_norm": 0.7880306243896484, + "learning_rate": 0.0002, + "loss": 0.3529, + "step": 4110 + }, + { + "epoch": 5.061425061425061, + "grad_norm": 0.7488741874694824, + "learning_rate": 0.0002, + "loss": 0.3406, + "step": 4120 + }, + { + "epoch": 5.073710073710074, + "grad_norm": 0.9359086751937866, + "learning_rate": 0.0002, + "loss": 0.3542, + "step": 4130 + }, + { + "epoch": 5.085995085995086, + "grad_norm": 0.9401527047157288, + "learning_rate": 0.0002, + "loss": 0.3471, + "step": 4140 + }, + { + "epoch": 5.098280098280099, + "grad_norm": 0.8396275043487549, + "learning_rate": 0.0002, + "loss": 0.3566, + "step": 4150 + }, + { + "epoch": 5.11056511056511, + "grad_norm": 0.7132664918899536, + "learning_rate": 0.0002, + "loss": 0.3416, + "step": 4160 + }, + { + "epoch": 5.122850122850123, + "grad_norm": 0.843708872795105, + "learning_rate": 0.0002, + "loss": 0.3457, + "step": 4170 + }, + { + "epoch": 5.135135135135135, + "grad_norm": 0.8733304738998413, + "learning_rate": 0.0002, + "loss": 0.3399, + "step": 4180 + }, + { + "epoch": 5.1474201474201475, + "grad_norm": 0.9064375162124634, + "learning_rate": 0.0002, + "loss": 0.3501, + "step": 4190 + }, + { + "epoch": 5.15970515970516, + "grad_norm": 0.900770902633667, + "learning_rate": 0.0002, + "loss": 0.3455, + "step": 4200 + }, + { + "epoch": 5.171990171990172, + "grad_norm": 0.863853394985199, + "learning_rate": 0.0002, + "loss": 0.3475, + "step": 4210 + }, + { + "epoch": 5.184275184275184, + "grad_norm": 0.767134964466095, + "learning_rate": 0.0002, + "loss": 0.3497, + "step": 4220 + }, + { + "epoch": 5.196560196560196, + "grad_norm": 0.7518735527992249, + "learning_rate": 0.0002, + "loss": 0.3527, + "step": 4230 + }, + { + "epoch": 5.208845208845209, + "grad_norm": 0.8040947914123535, + "learning_rate": 0.0002, + "loss": 0.3369, + "step": 4240 + }, + { + "epoch": 5.221130221130221, + "grad_norm": 0.7827144265174866, + "learning_rate": 0.0002, + "loss": 0.3496, + "step": 4250 + }, + { + "epoch": 5.233415233415234, + "grad_norm": 0.7306333184242249, + "learning_rate": 0.0002, + "loss": 0.3442, + "step": 4260 + }, + { + "epoch": 5.245700245700245, + "grad_norm": 1.0963380336761475, + "learning_rate": 0.0002, + "loss": 0.3553, + "step": 4270 + }, + { + "epoch": 5.257985257985258, + "grad_norm": 0.8200454711914062, + "learning_rate": 0.0002, + "loss": 0.3462, + "step": 4280 + }, + { + "epoch": 5.27027027027027, + "grad_norm": 0.8666796684265137, + "learning_rate": 0.0002, + "loss": 0.3509, + "step": 4290 + }, + { + "epoch": 5.282555282555283, + "grad_norm": 0.7862894535064697, + "learning_rate": 0.0002, + "loss": 0.3423, + "step": 4300 + }, + { + "epoch": 5.294840294840295, + "grad_norm": 0.8163095712661743, + "learning_rate": 0.0002, + "loss": 0.3623, + "step": 4310 + }, + { + "epoch": 5.3071253071253075, + "grad_norm": 0.8069050908088684, + "learning_rate": 0.0002, + "loss": 0.34, + "step": 4320 + }, + { + "epoch": 5.319410319410319, + "grad_norm": 0.7858486175537109, + "learning_rate": 0.0002, + "loss": 0.3532, + "step": 4330 + }, + { + "epoch": 5.3316953316953315, + "grad_norm": 0.950339674949646, + "learning_rate": 0.0002, + "loss": 0.3435, + "step": 4340 + }, + { + "epoch": 5.343980343980344, + "grad_norm": 0.9056477546691895, + "learning_rate": 0.0002, + "loss": 0.3498, + "step": 4350 + }, + { + "epoch": 5.356265356265356, + "grad_norm": 0.9619399905204773, + "learning_rate": 0.0002, + "loss": 0.3538, + "step": 4360 + }, + { + "epoch": 5.368550368550369, + "grad_norm": 0.9778652191162109, + "learning_rate": 0.0002, + "loss": 0.3455, + "step": 4370 + }, + { + "epoch": 5.38083538083538, + "grad_norm": 0.6919555068016052, + "learning_rate": 0.0002, + "loss": 0.3498, + "step": 4380 + }, + { + "epoch": 5.393120393120393, + "grad_norm": 0.8121668696403503, + "learning_rate": 0.0002, + "loss": 0.3426, + "step": 4390 + }, + { + "epoch": 5.405405405405405, + "grad_norm": 0.8481289148330688, + "learning_rate": 0.0002, + "loss": 0.3442, + "step": 4400 + }, + { + "epoch": 5.417690417690418, + "grad_norm": 0.8727408647537231, + "learning_rate": 0.0002, + "loss": 0.345, + "step": 4410 + }, + { + "epoch": 5.42997542997543, + "grad_norm": 0.8920271396636963, + "learning_rate": 0.0002, + "loss": 0.3554, + "step": 4420 + }, + { + "epoch": 5.442260442260443, + "grad_norm": 0.7758749723434448, + "learning_rate": 0.0002, + "loss": 0.3409, + "step": 4430 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 0.8847506642341614, + "learning_rate": 0.0002, + "loss": 0.3483, + "step": 4440 + }, + { + "epoch": 5.466830466830467, + "grad_norm": 0.9760470390319824, + "learning_rate": 0.0002, + "loss": 0.3557, + "step": 4450 + }, + { + "epoch": 5.479115479115479, + "grad_norm": 0.8940271139144897, + "learning_rate": 0.0002, + "loss": 0.3536, + "step": 4460 + }, + { + "epoch": 5.4914004914004915, + "grad_norm": 0.8668502569198608, + "learning_rate": 0.0002, + "loss": 0.3577, + "step": 4470 + }, + { + "epoch": 5.503685503685504, + "grad_norm": 0.9097439050674438, + "learning_rate": 0.0002, + "loss": 0.3462, + "step": 4480 + }, + { + "epoch": 5.515970515970516, + "grad_norm": 0.8217208981513977, + "learning_rate": 0.0002, + "loss": 0.3417, + "step": 4490 + }, + { + "epoch": 5.528255528255528, + "grad_norm": 0.7853189706802368, + "learning_rate": 0.0002, + "loss": 0.3482, + "step": 4500 + }, + { + "epoch": 5.54054054054054, + "grad_norm": 1.1113477945327759, + "learning_rate": 0.0002, + "loss": 0.3479, + "step": 4510 + }, + { + "epoch": 5.552825552825553, + "grad_norm": 0.8637538552284241, + "learning_rate": 0.0002, + "loss": 0.3553, + "step": 4520 + }, + { + "epoch": 5.565110565110565, + "grad_norm": 1.0230066776275635, + "learning_rate": 0.0002, + "loss": 0.3403, + "step": 4530 + }, + { + "epoch": 5.577395577395578, + "grad_norm": 0.8972793817520142, + "learning_rate": 0.0002, + "loss": 0.3588, + "step": 4540 + }, + { + "epoch": 5.58968058968059, + "grad_norm": 0.7950642704963684, + "learning_rate": 0.0002, + "loss": 0.3428, + "step": 4550 + }, + { + "epoch": 5.601965601965602, + "grad_norm": 1.113753318786621, + "learning_rate": 0.0002, + "loss": 0.3468, + "step": 4560 + }, + { + "epoch": 5.614250614250614, + "grad_norm": 0.7842669486999512, + "learning_rate": 0.0002, + "loss": 0.3354, + "step": 4570 + }, + { + "epoch": 5.6265356265356266, + "grad_norm": 0.9713512063026428, + "learning_rate": 0.0002, + "loss": 0.3419, + "step": 4580 + }, + { + "epoch": 5.638820638820639, + "grad_norm": 0.9451650977134705, + "learning_rate": 0.0002, + "loss": 0.3502, + "step": 4590 + }, + { + "epoch": 5.651105651105651, + "grad_norm": 1.055484414100647, + "learning_rate": 0.0002, + "loss": 0.3416, + "step": 4600 + }, + { + "epoch": 5.663390663390663, + "grad_norm": 0.8408507704734802, + "learning_rate": 0.0002, + "loss": 0.3436, + "step": 4610 + }, + { + "epoch": 5.675675675675675, + "grad_norm": 1.0293926000595093, + "learning_rate": 0.0002, + "loss": 0.3619, + "step": 4620 + }, + { + "epoch": 5.687960687960688, + "grad_norm": 0.7198245525360107, + "learning_rate": 0.0002, + "loss": 0.3484, + "step": 4630 + }, + { + "epoch": 5.7002457002457, + "grad_norm": 0.7564466595649719, + "learning_rate": 0.0002, + "loss": 0.3563, + "step": 4640 + }, + { + "epoch": 5.712530712530713, + "grad_norm": 0.7980002760887146, + "learning_rate": 0.0002, + "loss": 0.3435, + "step": 4650 + }, + { + "epoch": 5.724815724815725, + "grad_norm": 0.8685088753700256, + "learning_rate": 0.0002, + "loss": 0.3478, + "step": 4660 + }, + { + "epoch": 5.737100737100737, + "grad_norm": 0.8816949129104614, + "learning_rate": 0.0002, + "loss": 0.3692, + "step": 4670 + }, + { + "epoch": 5.749385749385749, + "grad_norm": 0.7154731750488281, + "learning_rate": 0.0002, + "loss": 0.3462, + "step": 4680 + }, + { + "epoch": 5.761670761670762, + "grad_norm": 0.9430679678916931, + "learning_rate": 0.0002, + "loss": 0.3503, + "step": 4690 + }, + { + "epoch": 5.773955773955774, + "grad_norm": 0.7640151381492615, + "learning_rate": 0.0002, + "loss": 0.3439, + "step": 4700 + }, + { + "epoch": 5.7862407862407865, + "grad_norm": 1.0920690298080444, + "learning_rate": 0.0002, + "loss": 0.3444, + "step": 4710 + }, + { + "epoch": 5.798525798525798, + "grad_norm": 0.9362104535102844, + "learning_rate": 0.0002, + "loss": 0.3356, + "step": 4720 + }, + { + "epoch": 5.8108108108108105, + "grad_norm": 0.8392294645309448, + "learning_rate": 0.0002, + "loss": 0.339, + "step": 4730 + }, + { + "epoch": 5.823095823095823, + "grad_norm": 0.9893582463264465, + "learning_rate": 0.0002, + "loss": 0.3488, + "step": 4740 + }, + { + "epoch": 5.835380835380835, + "grad_norm": 0.6985510587692261, + "learning_rate": 0.0002, + "loss": 0.3446, + "step": 4750 + }, + { + "epoch": 5.847665847665848, + "grad_norm": 0.8906862735748291, + "learning_rate": 0.0002, + "loss": 0.3534, + "step": 4760 + }, + { + "epoch": 5.85995085995086, + "grad_norm": 0.8036413192749023, + "learning_rate": 0.0002, + "loss": 0.3481, + "step": 4770 + }, + { + "epoch": 5.872235872235873, + "grad_norm": 0.9948155283927917, + "learning_rate": 0.0002, + "loss": 0.3326, + "step": 4780 + }, + { + "epoch": 5.884520884520884, + "grad_norm": 0.8618432283401489, + "learning_rate": 0.0002, + "loss": 0.3385, + "step": 4790 + }, + { + "epoch": 5.896805896805897, + "grad_norm": 1.0422909259796143, + "learning_rate": 0.0002, + "loss": 0.3302, + "step": 4800 + }, + { + "epoch": 5.909090909090909, + "grad_norm": 1.1892569065093994, + "learning_rate": 0.0002, + "loss": 0.3448, + "step": 4810 + }, + { + "epoch": 5.921375921375922, + "grad_norm": 1.1459916830062866, + "learning_rate": 0.0002, + "loss": 0.3506, + "step": 4820 + }, + { + "epoch": 5.933660933660933, + "grad_norm": 1.056235909461975, + "learning_rate": 0.0002, + "loss": 0.3387, + "step": 4830 + }, + { + "epoch": 5.945945945945946, + "grad_norm": 0.8517277240753174, + "learning_rate": 0.0002, + "loss": 0.344, + "step": 4840 + }, + { + "epoch": 5.958230958230958, + "grad_norm": 0.8153380751609802, + "learning_rate": 0.0002, + "loss": 0.3421, + "step": 4850 + }, + { + "epoch": 5.9705159705159705, + "grad_norm": 0.7907533049583435, + "learning_rate": 0.0002, + "loss": 0.3409, + "step": 4860 + }, + { + "epoch": 5.982800982800983, + "grad_norm": 0.8443069458007812, + "learning_rate": 0.0002, + "loss": 0.3337, + "step": 4870 + }, + { + "epoch": 5.995085995085995, + "grad_norm": 0.8711344003677368, + "learning_rate": 0.0002, + "loss": 0.3351, + "step": 4880 + }, + { + "epoch": 6.0, + "eval_loss": 0.3778059184551239, + "eval_runtime": 20.6858, + "eval_samples_per_second": 16.001, + "eval_steps_per_second": 2.03, + "step": 4884 + } + ], + "logging_steps": 10, + "max_steps": 6512, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.969223010064794e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/training_args.bin b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..73a75ebfc12aed51385aab437d91632ee4c20317 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2809e8544b7de8b298d0b325fb6c98eb9f853fd72d7cbae286b6ee1541e6aee9 +size 5560 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/README.md b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/adapter_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/adapter_model.safetensors b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bbbac78111bbbebee8bd59f069df28eae1d558eb --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac100f45a0c499add65357c09818dd6d6facc45332df9c690ab5f0ea75b7ca5f +size 29500848 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/optimizer.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c514b4857d536d6dbb6f6c6a1a6aeca2dbbd2c56 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44697f15165884ed87cc7c2f9a1856f13b076d9841d5d132b9ccec5c555f65e7 +size 15064314 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/rng_state.pth b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..184c86fd018ad198f1d67c1e0e9dfa3593fc3856 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e363cf2a25a5afacf998f3c2bdefd67fef7ceac03284ee2232e8d6ea3bf8af5 +size 14244 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/scheduler.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..eefe24e13423ad1b109be1301438e1c95a1b4427 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a03df00df238c3a31bf602f5e2bbbee686d5176e795f07fdbbed25e7e6b01ca +size 1064 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/special_tokens_map.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/tokenizer.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/tokenizer.model b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/tokenizer_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/trainer_state.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e11ba2d077be714be514fcb203c720357accabc8 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/trainer_state.json @@ -0,0 +1,4072 @@ +{ + "best_metric": 0.3616626560688019, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698", + "epoch": 7.0, + "eval_steps": 10, + "global_step": 5698, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012285012285012284, + "grad_norm": 0.8178550004959106, + "learning_rate": 0.0002, + "loss": 3.5354, + "step": 10 + }, + { + "epoch": 0.02457002457002457, + "grad_norm": 1.0338047742843628, + "learning_rate": 0.0002, + "loss": 2.534, + "step": 20 + }, + { + "epoch": 0.036855036855036855, + "grad_norm": 0.8931729197502136, + "learning_rate": 0.0002, + "loss": 2.1691, + "step": 30 + }, + { + "epoch": 0.04914004914004914, + "grad_norm": 0.9666458964347839, + "learning_rate": 0.0002, + "loss": 1.8813, + "step": 40 + }, + { + "epoch": 0.06142506142506143, + "grad_norm": 1.2691702842712402, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 50 + }, + { + "epoch": 0.07371007371007371, + "grad_norm": 1.0307111740112305, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 60 + }, + { + "epoch": 0.085995085995086, + "grad_norm": 1.1837389469146729, + "learning_rate": 0.0002, + "loss": 1.2987, + "step": 70 + }, + { + "epoch": 0.09828009828009827, + "grad_norm": 1.1481467485427856, + "learning_rate": 0.0002, + "loss": 1.2325, + "step": 80 + }, + { + "epoch": 0.11056511056511056, + "grad_norm": 1.0385297536849976, + "learning_rate": 0.0002, + "loss": 1.1425, + "step": 90 + }, + { + "epoch": 0.12285012285012285, + "grad_norm": 1.125789999961853, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 100 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 0.9630613923072815, + "learning_rate": 0.0002, + "loss": 1.0477, + "step": 110 + }, + { + "epoch": 0.14742014742014742, + "grad_norm": 1.060392141342163, + "learning_rate": 0.0002, + "loss": 1.0074, + "step": 120 + }, + { + "epoch": 0.1597051597051597, + "grad_norm": 1.0986546277999878, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 130 + }, + { + "epoch": 0.171990171990172, + "grad_norm": 1.1713459491729736, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 140 + }, + { + "epoch": 0.18427518427518427, + "grad_norm": 1.1548224687576294, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 150 + }, + { + "epoch": 0.19656019656019655, + "grad_norm": 1.2662502527236938, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 160 + }, + { + "epoch": 0.20884520884520885, + "grad_norm": 1.1521110534667969, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 170 + }, + { + "epoch": 0.22113022113022113, + "grad_norm": 1.1044857501983643, + "learning_rate": 0.0002, + "loss": 0.8931, + "step": 180 + }, + { + "epoch": 0.2334152334152334, + "grad_norm": 0.9770650267601013, + "learning_rate": 0.0002, + "loss": 0.9572, + "step": 190 + }, + { + "epoch": 0.2457002457002457, + "grad_norm": 0.9710931777954102, + "learning_rate": 0.0002, + "loss": 0.881, + "step": 200 + }, + { + "epoch": 0.257985257985258, + "grad_norm": 0.9593933820724487, + "learning_rate": 0.0002, + "loss": 0.9205, + "step": 210 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.003553032875061, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 220 + }, + { + "epoch": 0.28255528255528256, + "grad_norm": 0.9187764525413513, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 230 + }, + { + "epoch": 0.29484029484029484, + "grad_norm": 0.9294946789741516, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 240 + }, + { + "epoch": 0.3071253071253071, + "grad_norm": 0.9537560939788818, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 250 + }, + { + "epoch": 0.3194103194103194, + "grad_norm": 1.00537109375, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 260 + }, + { + "epoch": 0.3316953316953317, + "grad_norm": 0.8775776028633118, + "learning_rate": 0.0002, + "loss": 0.896, + "step": 270 + }, + { + "epoch": 0.343980343980344, + "grad_norm": 0.8316839933395386, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 280 + }, + { + "epoch": 0.35626535626535627, + "grad_norm": 0.8542073965072632, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 290 + }, + { + "epoch": 0.36855036855036855, + "grad_norm": 0.848444402217865, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 300 + }, + { + "epoch": 0.3808353808353808, + "grad_norm": 0.9017520546913147, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 310 + }, + { + "epoch": 0.3931203931203931, + "grad_norm": 0.7672467231750488, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 320 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 0.9109916687011719, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 330 + }, + { + "epoch": 0.4176904176904177, + "grad_norm": 0.8750321269035339, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 340 + }, + { + "epoch": 0.42997542997543, + "grad_norm": 0.7911098599433899, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 350 + }, + { + "epoch": 0.44226044226044225, + "grad_norm": 0.871601402759552, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 360 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.9393917918205261, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 370 + }, + { + "epoch": 0.4668304668304668, + "grad_norm": 0.8260403275489807, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 380 + }, + { + "epoch": 0.47911547911547914, + "grad_norm": 0.9792159199714661, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 390 + }, + { + "epoch": 0.4914004914004914, + "grad_norm": 0.9943315982818604, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 400 + }, + { + "epoch": 0.5036855036855037, + "grad_norm": 0.8999950885772705, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 410 + }, + { + "epoch": 0.515970515970516, + "grad_norm": 0.8348393440246582, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 420 + }, + { + "epoch": 0.5282555282555282, + "grad_norm": 0.7371744513511658, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 430 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.8354107141494751, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 440 + }, + { + "epoch": 0.5528255528255528, + "grad_norm": 0.8553793430328369, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 450 + }, + { + "epoch": 0.5651105651105651, + "grad_norm": 1.0762015581130981, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 460 + }, + { + "epoch": 0.5773955773955773, + "grad_norm": 0.8350747227668762, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 470 + }, + { + "epoch": 0.5896805896805897, + "grad_norm": 0.7819945216178894, + "learning_rate": 0.0002, + "loss": 0.7922, + "step": 480 + }, + { + "epoch": 0.601965601965602, + "grad_norm": 0.8079741597175598, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 490 + }, + { + "epoch": 0.6142506142506142, + "grad_norm": 0.776435911655426, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 500 + }, + { + "epoch": 0.6265356265356266, + "grad_norm": 0.7646855115890503, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 510 + }, + { + "epoch": 0.6388206388206388, + "grad_norm": 0.786396861076355, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 520 + }, + { + "epoch": 0.6511056511056511, + "grad_norm": 0.7016594409942627, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 530 + }, + { + "epoch": 0.6633906633906634, + "grad_norm": 0.8060444593429565, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 540 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.9087467789649963, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 550 + }, + { + "epoch": 0.687960687960688, + "grad_norm": 0.8149628639221191, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 560 + }, + { + "epoch": 0.7002457002457002, + "grad_norm": 0.7493641972541809, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 570 + }, + { + "epoch": 0.7125307125307125, + "grad_norm": 0.7958765625953674, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 580 + }, + { + "epoch": 0.7248157248157249, + "grad_norm": 0.7917273640632629, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 590 + }, + { + "epoch": 0.7371007371007371, + "grad_norm": 0.8040468692779541, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 600 + }, + { + "epoch": 0.7493857493857494, + "grad_norm": 0.8696851134300232, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 610 + }, + { + "epoch": 0.7616707616707616, + "grad_norm": 0.8418059945106506, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 620 + }, + { + "epoch": 0.773955773955774, + "grad_norm": 0.7754243612289429, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 630 + }, + { + "epoch": 0.7862407862407862, + "grad_norm": 0.7639613747596741, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 640 + }, + { + "epoch": 0.7985257985257985, + "grad_norm": 0.7516646385192871, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 650 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.7840844988822937, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 660 + }, + { + "epoch": 0.8230958230958231, + "grad_norm": 0.7657070755958557, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 670 + }, + { + "epoch": 0.8353808353808354, + "grad_norm": 0.7711591720581055, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 680 + }, + { + "epoch": 0.8476658476658476, + "grad_norm": 0.8026325106620789, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 690 + }, + { + "epoch": 0.85995085995086, + "grad_norm": 0.7902713418006897, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 700 + }, + { + "epoch": 0.8722358722358723, + "grad_norm": 0.8212456107139587, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 710 + }, + { + "epoch": 0.8845208845208845, + "grad_norm": 0.7867200970649719, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 720 + }, + { + "epoch": 0.8968058968058968, + "grad_norm": 0.80084627866745, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 730 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.7203794121742249, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 740 + }, + { + "epoch": 0.9213759213759214, + "grad_norm": 0.7598419785499573, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 750 + }, + { + "epoch": 0.9336609336609336, + "grad_norm": 0.7787027359008789, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 760 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.8444012403488159, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 770 + }, + { + "epoch": 0.9582309582309583, + "grad_norm": 0.7388550639152527, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 780 + }, + { + "epoch": 0.9705159705159705, + "grad_norm": 0.7379167079925537, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 790 + }, + { + "epoch": 0.9828009828009828, + "grad_norm": 0.8291640281677246, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 800 + }, + { + "epoch": 0.995085995085995, + "grad_norm": 0.7415094375610352, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 0.703994870185852, + "eval_runtime": 20.2182, + "eval_samples_per_second": 16.371, + "eval_steps_per_second": 2.077, + "step": 814 + }, + { + "epoch": 1.0073710073710074, + "grad_norm": 0.7405961751937866, + "learning_rate": 0.0002, + "loss": 0.6959, + "step": 820 + }, + { + "epoch": 1.0196560196560196, + "grad_norm": 0.8534344434738159, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 830 + }, + { + "epoch": 1.031941031941032, + "grad_norm": 0.7415764331817627, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 840 + }, + { + "epoch": 1.0442260442260443, + "grad_norm": 0.74293053150177, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 850 + }, + { + "epoch": 1.0565110565110565, + "grad_norm": 0.697727382183075, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 860 + }, + { + "epoch": 1.0687960687960687, + "grad_norm": 0.8022570013999939, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 870 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.7545800805091858, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 880 + }, + { + "epoch": 1.0933660933660934, + "grad_norm": 0.8005648255348206, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 890 + }, + { + "epoch": 1.1056511056511056, + "grad_norm": 0.7681778073310852, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 900 + }, + { + "epoch": 1.117936117936118, + "grad_norm": 0.7822468876838684, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 910 + }, + { + "epoch": 1.1302211302211302, + "grad_norm": 0.8324839472770691, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 920 + }, + { + "epoch": 1.1425061425061425, + "grad_norm": 0.8206289410591125, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 930 + }, + { + "epoch": 1.154791154791155, + "grad_norm": 0.786461591720581, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 940 + }, + { + "epoch": 1.1670761670761671, + "grad_norm": 0.8288539052009583, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 950 + }, + { + "epoch": 1.1793611793611793, + "grad_norm": 0.7566865682601929, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 960 + }, + { + "epoch": 1.1916461916461916, + "grad_norm": 0.7761894464492798, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 970 + }, + { + "epoch": 1.203931203931204, + "grad_norm": 0.7608440518379211, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 980 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.799745500087738, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 990 + }, + { + "epoch": 1.2285012285012284, + "grad_norm": 0.8135330677032471, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1000 + }, + { + "epoch": 1.2407862407862407, + "grad_norm": 0.7410391569137573, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 1010 + }, + { + "epoch": 1.253071253071253, + "grad_norm": 0.7826172709465027, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 1020 + }, + { + "epoch": 1.2653562653562653, + "grad_norm": 0.7210677862167358, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 1030 + }, + { + "epoch": 1.2776412776412776, + "grad_norm": 0.7571766972541809, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 1040 + }, + { + "epoch": 1.28992628992629, + "grad_norm": 0.8602666258811951, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 1050 + }, + { + "epoch": 1.3022113022113022, + "grad_norm": 0.8640648722648621, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 1060 + }, + { + "epoch": 1.3144963144963144, + "grad_norm": 0.7289374470710754, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 1070 + }, + { + "epoch": 1.3267813267813269, + "grad_norm": 0.8099908828735352, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 1080 + }, + { + "epoch": 1.339066339066339, + "grad_norm": 0.8623505234718323, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 1090 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.900576114654541, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 1100 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.729603111743927, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 1110 + }, + { + "epoch": 1.375921375921376, + "grad_norm": 0.8350434303283691, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 1120 + }, + { + "epoch": 1.3882063882063882, + "grad_norm": 0.8049437999725342, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 1130 + }, + { + "epoch": 1.4004914004914004, + "grad_norm": 0.8222764134407043, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 1140 + }, + { + "epoch": 1.4127764127764126, + "grad_norm": 0.7949751019477844, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 1150 + }, + { + "epoch": 1.425061425061425, + "grad_norm": 0.8375639915466309, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 1160 + }, + { + "epoch": 1.4373464373464373, + "grad_norm": 0.7261053919792175, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1170 + }, + { + "epoch": 1.4496314496314495, + "grad_norm": 0.6918320655822754, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 1180 + }, + { + "epoch": 1.461916461916462, + "grad_norm": 0.8148727416992188, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 1190 + }, + { + "epoch": 1.4742014742014742, + "grad_norm": 0.7014724612236023, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 1200 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.8110846281051636, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 1210 + }, + { + "epoch": 1.4987714987714988, + "grad_norm": 0.8336407542228699, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 1220 + }, + { + "epoch": 1.511056511056511, + "grad_norm": 0.826996386051178, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 1230 + }, + { + "epoch": 1.5233415233415233, + "grad_norm": 0.7503120303153992, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 1240 + }, + { + "epoch": 1.5356265356265357, + "grad_norm": 0.8297192454338074, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 1250 + }, + { + "epoch": 1.547911547911548, + "grad_norm": 0.7585996985435486, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 1260 + }, + { + "epoch": 1.5601965601965602, + "grad_norm": 0.7530493140220642, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 1270 + }, + { + "epoch": 1.5724815724815726, + "grad_norm": 0.8141939640045166, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 1280 + }, + { + "epoch": 1.5847665847665846, + "grad_norm": 0.6959931254386902, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 1290 + }, + { + "epoch": 1.597051597051597, + "grad_norm": 0.8677428364753723, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 1300 + }, + { + "epoch": 1.6093366093366095, + "grad_norm": 0.8527476787567139, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 1310 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.8462157845497131, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 1320 + }, + { + "epoch": 1.633906633906634, + "grad_norm": 0.9371153712272644, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 1330 + }, + { + "epoch": 1.6461916461916462, + "grad_norm": 0.8408344984054565, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 1340 + }, + { + "epoch": 1.6584766584766584, + "grad_norm": 0.8391859531402588, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 1350 + }, + { + "epoch": 1.6707616707616708, + "grad_norm": 0.7630598545074463, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 1360 + }, + { + "epoch": 1.683046683046683, + "grad_norm": 0.8007895350456238, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 1370 + }, + { + "epoch": 1.6953316953316953, + "grad_norm": 0.7547900080680847, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 1380 + }, + { + "epoch": 1.7076167076167077, + "grad_norm": 0.7779742479324341, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 1390 + }, + { + "epoch": 1.71990171990172, + "grad_norm": 0.712293803691864, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 1400 + }, + { + "epoch": 1.7321867321867321, + "grad_norm": 0.8503297567367554, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 1410 + }, + { + "epoch": 1.7444717444717446, + "grad_norm": 0.8312245607376099, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 1420 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.7758049368858337, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 1430 + }, + { + "epoch": 1.769041769041769, + "grad_norm": 0.8695956468582153, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 1440 + }, + { + "epoch": 1.7813267813267815, + "grad_norm": 0.7785261273384094, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 1450 + }, + { + "epoch": 1.7936117936117935, + "grad_norm": 0.7091802358627319, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 1460 + }, + { + "epoch": 1.805896805896806, + "grad_norm": 0.774146556854248, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 1470 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.8342524170875549, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 1480 + }, + { + "epoch": 1.8304668304668303, + "grad_norm": 0.8087738156318665, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 1490 + }, + { + "epoch": 1.8427518427518428, + "grad_norm": 0.9830479621887207, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 1500 + }, + { + "epoch": 1.855036855036855, + "grad_norm": 0.8537567853927612, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 1510 + }, + { + "epoch": 1.8673218673218672, + "grad_norm": 0.8004562854766846, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 1520 + }, + { + "epoch": 1.8796068796068797, + "grad_norm": 0.8161284327507019, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 1530 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.8688093423843384, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 1540 + }, + { + "epoch": 1.904176904176904, + "grad_norm": 0.8287379741668701, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 1550 + }, + { + "epoch": 1.9164619164619165, + "grad_norm": 0.8050342202186584, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 1560 + }, + { + "epoch": 1.9287469287469288, + "grad_norm": 0.9273895621299744, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 1570 + }, + { + "epoch": 1.941031941031941, + "grad_norm": 0.8416891694068909, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 1580 + }, + { + "epoch": 1.9533169533169534, + "grad_norm": 0.7299820184707642, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 1590 + }, + { + "epoch": 1.9656019656019657, + "grad_norm": 0.7262272834777832, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 1600 + }, + { + "epoch": 1.9778869778869779, + "grad_norm": 0.8649004697799683, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 1610 + }, + { + "epoch": 1.9901719901719903, + "grad_norm": 0.8165444731712341, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 0.5858802795410156, + "eval_runtime": 22.6585, + "eval_samples_per_second": 14.608, + "eval_steps_per_second": 1.854, + "step": 1628 + }, + { + "epoch": 2.0024570024570023, + "grad_norm": 0.8142582178115845, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 1630 + }, + { + "epoch": 2.0147420147420148, + "grad_norm": 1.0637224912643433, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 1640 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.8923280239105225, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 1650 + }, + { + "epoch": 2.039312039312039, + "grad_norm": 0.8169175386428833, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 1660 + }, + { + "epoch": 2.0515970515970516, + "grad_norm": 0.8124040365219116, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 1670 + }, + { + "epoch": 2.063882063882064, + "grad_norm": 0.9228773713111877, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 1680 + }, + { + "epoch": 2.076167076167076, + "grad_norm": 0.7216871380805969, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 1690 + }, + { + "epoch": 2.0884520884520885, + "grad_norm": 0.8679503202438354, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 1700 + }, + { + "epoch": 2.100737100737101, + "grad_norm": 0.8627730011940002, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 1710 + }, + { + "epoch": 2.113022113022113, + "grad_norm": 0.9175152778625488, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 1720 + }, + { + "epoch": 2.1253071253071254, + "grad_norm": 0.7930372953414917, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 1730 + }, + { + "epoch": 2.1375921375921374, + "grad_norm": 0.8370155692100525, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 1740 + }, + { + "epoch": 2.14987714987715, + "grad_norm": 0.9121434688568115, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 1750 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.8703579306602478, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 1760 + }, + { + "epoch": 2.1744471744471743, + "grad_norm": 0.9270512461662292, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 1770 + }, + { + "epoch": 2.1867321867321867, + "grad_norm": 0.9372949600219727, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 1780 + }, + { + "epoch": 2.199017199017199, + "grad_norm": 0.8955178260803223, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 1790 + }, + { + "epoch": 2.211302211302211, + "grad_norm": 0.846102237701416, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 1800 + }, + { + "epoch": 2.2235872235872236, + "grad_norm": 0.9186713099479675, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 1810 + }, + { + "epoch": 2.235872235872236, + "grad_norm": 0.7695123553276062, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 1820 + }, + { + "epoch": 2.248157248157248, + "grad_norm": 0.7340332865715027, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 1830 + }, + { + "epoch": 2.2604422604422605, + "grad_norm": 0.8933137655258179, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 1840 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.7705038189888, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 1850 + }, + { + "epoch": 2.285012285012285, + "grad_norm": 0.8396083116531372, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 1860 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 0.7695736289024353, + "learning_rate": 0.0002, + "loss": 0.5335, + "step": 1870 + }, + { + "epoch": 2.30958230958231, + "grad_norm": 0.8535045385360718, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 1880 + }, + { + "epoch": 2.321867321867322, + "grad_norm": 0.8549142479896545, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 1890 + }, + { + "epoch": 2.3341523341523343, + "grad_norm": 0.9124433994293213, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 1900 + }, + { + "epoch": 2.3464373464373462, + "grad_norm": 0.855523943901062, + "learning_rate": 0.0002, + "loss": 0.506, + "step": 1910 + }, + { + "epoch": 2.3587223587223587, + "grad_norm": 0.810878336429596, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 1920 + }, + { + "epoch": 2.371007371007371, + "grad_norm": 0.7409024834632874, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 1930 + }, + { + "epoch": 2.383292383292383, + "grad_norm": 0.8080927729606628, + "learning_rate": 0.0002, + "loss": 0.5045, + "step": 1940 + }, + { + "epoch": 2.3955773955773956, + "grad_norm": 0.9661469459533691, + "learning_rate": 0.0002, + "loss": 0.5032, + "step": 1950 + }, + { + "epoch": 2.407862407862408, + "grad_norm": 0.838766872882843, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 1960 + }, + { + "epoch": 2.42014742014742, + "grad_norm": 0.8737491965293884, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 1970 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.8657792210578918, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 1980 + }, + { + "epoch": 2.444717444717445, + "grad_norm": 0.8883858919143677, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 1990 + }, + { + "epoch": 2.457002457002457, + "grad_norm": 0.8647662997245789, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 2000 + }, + { + "epoch": 2.4692874692874693, + "grad_norm": 0.896037757396698, + "learning_rate": 0.0002, + "loss": 0.518, + "step": 2010 + }, + { + "epoch": 2.4815724815724813, + "grad_norm": 0.8079167008399963, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 2020 + }, + { + "epoch": 2.493857493857494, + "grad_norm": 1.0293292999267578, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 2030 + }, + { + "epoch": 2.506142506142506, + "grad_norm": 0.8459244966506958, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 2040 + }, + { + "epoch": 2.5184275184275187, + "grad_norm": 0.9244982600212097, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 2050 + }, + { + "epoch": 2.5307125307125307, + "grad_norm": 0.8245007991790771, + "learning_rate": 0.0002, + "loss": 0.5006, + "step": 2060 + }, + { + "epoch": 2.542997542997543, + "grad_norm": 0.8869297504425049, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 2070 + }, + { + "epoch": 2.555282555282555, + "grad_norm": 0.8620884418487549, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 2080 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.8387904167175293, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 2090 + }, + { + "epoch": 2.57985257985258, + "grad_norm": 0.8353935480117798, + "learning_rate": 0.0002, + "loss": 0.4974, + "step": 2100 + }, + { + "epoch": 2.592137592137592, + "grad_norm": 1.0136934518814087, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 2110 + }, + { + "epoch": 2.6044226044226044, + "grad_norm": 0.9387392997741699, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 2120 + }, + { + "epoch": 2.616707616707617, + "grad_norm": 0.898697555065155, + "learning_rate": 0.0002, + "loss": 0.4971, + "step": 2130 + }, + { + "epoch": 2.628992628992629, + "grad_norm": 1.0145231485366821, + "learning_rate": 0.0002, + "loss": 0.4981, + "step": 2140 + }, + { + "epoch": 2.6412776412776413, + "grad_norm": 0.8335273265838623, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 2150 + }, + { + "epoch": 2.6535626535626538, + "grad_norm": 1.0198529958724976, + "learning_rate": 0.0002, + "loss": 0.5129, + "step": 2160 + }, + { + "epoch": 2.6658476658476657, + "grad_norm": 0.8353323340415955, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 2170 + }, + { + "epoch": 2.678132678132678, + "grad_norm": 0.8831406831741333, + "learning_rate": 0.0002, + "loss": 0.4818, + "step": 2180 + }, + { + "epoch": 2.69041769041769, + "grad_norm": 0.7182748913764954, + "learning_rate": 0.0002, + "loss": 0.4858, + "step": 2190 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 0.7892552614212036, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 2200 + }, + { + "epoch": 2.714987714987715, + "grad_norm": 1.0144033432006836, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 2210 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 1.0913645029067993, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 2220 + }, + { + "epoch": 2.7395577395577395, + "grad_norm": 1.014394998550415, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 2230 + }, + { + "epoch": 2.751842751842752, + "grad_norm": 0.8118020296096802, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 2240 + }, + { + "epoch": 2.764127764127764, + "grad_norm": 0.9027737379074097, + "learning_rate": 0.0002, + "loss": 0.5088, + "step": 2250 + }, + { + "epoch": 2.7764127764127764, + "grad_norm": 0.8017747402191162, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 2260 + }, + { + "epoch": 2.788697788697789, + "grad_norm": 0.788362979888916, + "learning_rate": 0.0002, + "loss": 0.4957, + "step": 2270 + }, + { + "epoch": 2.800982800982801, + "grad_norm": 0.8338918089866638, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 2280 + }, + { + "epoch": 2.8132678132678133, + "grad_norm": 0.8773167729377747, + "learning_rate": 0.0002, + "loss": 0.4925, + "step": 2290 + }, + { + "epoch": 2.8255528255528253, + "grad_norm": 0.9319674372673035, + "learning_rate": 0.0002, + "loss": 0.4806, + "step": 2300 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 0.8632726073265076, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 2310 + }, + { + "epoch": 2.85012285012285, + "grad_norm": 0.785464882850647, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 2320 + }, + { + "epoch": 2.8624078624078626, + "grad_norm": 0.8159732818603516, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 2330 + }, + { + "epoch": 2.8746928746928746, + "grad_norm": 0.8702368140220642, + "learning_rate": 0.0002, + "loss": 0.4796, + "step": 2340 + }, + { + "epoch": 2.886977886977887, + "grad_norm": 1.0456738471984863, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 2350 + }, + { + "epoch": 2.899262899262899, + "grad_norm": 1.0855203866958618, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 2360 + }, + { + "epoch": 2.9115479115479115, + "grad_norm": 0.9378156065940857, + "learning_rate": 0.0002, + "loss": 0.4758, + "step": 2370 + }, + { + "epoch": 2.923832923832924, + "grad_norm": 0.7390182018280029, + "learning_rate": 0.0002, + "loss": 0.4831, + "step": 2380 + }, + { + "epoch": 2.9361179361179364, + "grad_norm": 0.7667133212089539, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 2390 + }, + { + "epoch": 2.9484029484029484, + "grad_norm": 0.8633476495742798, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 2400 + }, + { + "epoch": 2.960687960687961, + "grad_norm": 1.0821104049682617, + "learning_rate": 0.0002, + "loss": 0.4993, + "step": 2410 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.8911418914794922, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 2420 + }, + { + "epoch": 2.9852579852579852, + "grad_norm": 0.8791135549545288, + "learning_rate": 0.0002, + "loss": 0.4819, + "step": 2430 + }, + { + "epoch": 2.9975429975429977, + "grad_norm": 0.8066530823707581, + "learning_rate": 0.0002, + "loss": 0.4875, + "step": 2440 + }, + { + "epoch": 3.0, + "eval_loss": 0.49752503633499146, + "eval_runtime": 20.2911, + "eval_samples_per_second": 16.313, + "eval_steps_per_second": 2.07, + "step": 2442 + }, + { + "epoch": 3.0098280098280097, + "grad_norm": 0.7644656896591187, + "learning_rate": 0.0002, + "loss": 0.4362, + "step": 2450 + }, + { + "epoch": 3.022113022113022, + "grad_norm": 0.9077525734901428, + "learning_rate": 0.0002, + "loss": 0.4363, + "step": 2460 + }, + { + "epoch": 3.0343980343980346, + "grad_norm": 0.7859287261962891, + "learning_rate": 0.0002, + "loss": 0.422, + "step": 2470 + }, + { + "epoch": 3.0466830466830466, + "grad_norm": 1.1200323104858398, + "learning_rate": 0.0002, + "loss": 0.4574, + "step": 2480 + }, + { + "epoch": 3.058968058968059, + "grad_norm": 0.7570453882217407, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 2490 + }, + { + "epoch": 3.0712530712530715, + "grad_norm": 0.9450915455818176, + "learning_rate": 0.0002, + "loss": 0.4351, + "step": 2500 + }, + { + "epoch": 3.0835380835380835, + "grad_norm": 0.8303545117378235, + "learning_rate": 0.0002, + "loss": 0.4343, + "step": 2510 + }, + { + "epoch": 3.095823095823096, + "grad_norm": 0.8864443898200989, + "learning_rate": 0.0002, + "loss": 0.4308, + "step": 2520 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 0.945324718952179, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 2530 + }, + { + "epoch": 3.1203931203931203, + "grad_norm": 1.0562494993209839, + "learning_rate": 0.0002, + "loss": 0.4345, + "step": 2540 + }, + { + "epoch": 3.1326781326781328, + "grad_norm": 0.8607500195503235, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 2550 + }, + { + "epoch": 3.1449631449631448, + "grad_norm": 0.8719640374183655, + "learning_rate": 0.0002, + "loss": 0.456, + "step": 2560 + }, + { + "epoch": 3.157248157248157, + "grad_norm": 0.8647059202194214, + "learning_rate": 0.0002, + "loss": 0.4469, + "step": 2570 + }, + { + "epoch": 3.1695331695331697, + "grad_norm": 0.8346507549285889, + "learning_rate": 0.0002, + "loss": 0.4483, + "step": 2580 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 1.0208854675292969, + "learning_rate": 0.0002, + "loss": 0.4331, + "step": 2590 + }, + { + "epoch": 3.194103194103194, + "grad_norm": 0.7064385414123535, + "learning_rate": 0.0002, + "loss": 0.435, + "step": 2600 + }, + { + "epoch": 3.2063882063882065, + "grad_norm": 0.927347719669342, + "learning_rate": 0.0002, + "loss": 0.4541, + "step": 2610 + }, + { + "epoch": 3.2186732186732185, + "grad_norm": 0.943517804145813, + "learning_rate": 0.0002, + "loss": 0.4561, + "step": 2620 + }, + { + "epoch": 3.230958230958231, + "grad_norm": 0.7837198376655579, + "learning_rate": 0.0002, + "loss": 0.4225, + "step": 2630 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 0.7752765417098999, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 2640 + }, + { + "epoch": 3.2555282555282554, + "grad_norm": 0.8578953146934509, + "learning_rate": 0.0002, + "loss": 0.4468, + "step": 2650 + }, + { + "epoch": 3.267813267813268, + "grad_norm": 1.0209529399871826, + "learning_rate": 0.0002, + "loss": 0.4393, + "step": 2660 + }, + { + "epoch": 3.2800982800982803, + "grad_norm": 0.9069030284881592, + "learning_rate": 0.0002, + "loss": 0.4517, + "step": 2670 + }, + { + "epoch": 3.2923832923832923, + "grad_norm": 0.8454729318618774, + "learning_rate": 0.0002, + "loss": 0.4262, + "step": 2680 + }, + { + "epoch": 3.3046683046683047, + "grad_norm": 0.8253099322319031, + "learning_rate": 0.0002, + "loss": 0.4349, + "step": 2690 + }, + { + "epoch": 3.3169533169533167, + "grad_norm": 0.8765934109687805, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 2700 + }, + { + "epoch": 3.329238329238329, + "grad_norm": 0.8149126172065735, + "learning_rate": 0.0002, + "loss": 0.4518, + "step": 2710 + }, + { + "epoch": 3.3415233415233416, + "grad_norm": 0.8820102214813232, + "learning_rate": 0.0002, + "loss": 0.4437, + "step": 2720 + }, + { + "epoch": 3.3538083538083536, + "grad_norm": 0.8813952803611755, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 2730 + }, + { + "epoch": 3.366093366093366, + "grad_norm": 1.0338447093963623, + "learning_rate": 0.0002, + "loss": 0.4396, + "step": 2740 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 0.8780209422111511, + "learning_rate": 0.0002, + "loss": 0.4468, + "step": 2750 + }, + { + "epoch": 3.3906633906633905, + "grad_norm": 0.9017151594161987, + "learning_rate": 0.0002, + "loss": 0.441, + "step": 2760 + }, + { + "epoch": 3.402948402948403, + "grad_norm": 0.8647638559341431, + "learning_rate": 0.0002, + "loss": 0.446, + "step": 2770 + }, + { + "epoch": 3.4152334152334154, + "grad_norm": 0.8298183679580688, + "learning_rate": 0.0002, + "loss": 0.4131, + "step": 2780 + }, + { + "epoch": 3.4275184275184274, + "grad_norm": 0.9298108816146851, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 2790 + }, + { + "epoch": 3.43980343980344, + "grad_norm": 0.8909980058670044, + "learning_rate": 0.0002, + "loss": 0.4145, + "step": 2800 + }, + { + "epoch": 3.4520884520884523, + "grad_norm": 0.8027496933937073, + "learning_rate": 0.0002, + "loss": 0.4148, + "step": 2810 + }, + { + "epoch": 3.4643734643734643, + "grad_norm": 0.8766195774078369, + "learning_rate": 0.0002, + "loss": 0.4244, + "step": 2820 + }, + { + "epoch": 3.4766584766584767, + "grad_norm": 0.8194443583488464, + "learning_rate": 0.0002, + "loss": 0.4292, + "step": 2830 + }, + { + "epoch": 3.488943488943489, + "grad_norm": 0.9862873554229736, + "learning_rate": 0.0002, + "loss": 0.4305, + "step": 2840 + }, + { + "epoch": 3.501228501228501, + "grad_norm": 0.8755377531051636, + "learning_rate": 0.0002, + "loss": 0.4393, + "step": 2850 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 0.7300266027450562, + "learning_rate": 0.0002, + "loss": 0.4231, + "step": 2860 + }, + { + "epoch": 3.5257985257985256, + "grad_norm": 0.8342461585998535, + "learning_rate": 0.0002, + "loss": 0.4278, + "step": 2870 + }, + { + "epoch": 3.538083538083538, + "grad_norm": 0.8624151349067688, + "learning_rate": 0.0002, + "loss": 0.4395, + "step": 2880 + }, + { + "epoch": 3.5503685503685505, + "grad_norm": 0.8931261301040649, + "learning_rate": 0.0002, + "loss": 0.4064, + "step": 2890 + }, + { + "epoch": 3.562653562653563, + "grad_norm": 0.8617086410522461, + "learning_rate": 0.0002, + "loss": 0.4358, + "step": 2900 + }, + { + "epoch": 3.574938574938575, + "grad_norm": 0.8754099607467651, + "learning_rate": 0.0002, + "loss": 0.419, + "step": 2910 + }, + { + "epoch": 3.5872235872235874, + "grad_norm": 0.8345834612846375, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 2920 + }, + { + "epoch": 3.5995085995085994, + "grad_norm": 1.1414062976837158, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 2930 + }, + { + "epoch": 3.611793611793612, + "grad_norm": 0.994860053062439, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 2940 + }, + { + "epoch": 3.6240786240786242, + "grad_norm": 1.19268000125885, + "learning_rate": 0.0002, + "loss": 0.4386, + "step": 2950 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.8399543762207031, + "learning_rate": 0.0002, + "loss": 0.4029, + "step": 2960 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 0.9873217940330505, + "learning_rate": 0.0002, + "loss": 0.4432, + "step": 2970 + }, + { + "epoch": 3.6609336609336607, + "grad_norm": 0.9116013646125793, + "learning_rate": 0.0002, + "loss": 0.4308, + "step": 2980 + }, + { + "epoch": 3.673218673218673, + "grad_norm": 0.9503833651542664, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 2990 + }, + { + "epoch": 3.6855036855036856, + "grad_norm": 0.9401112794876099, + "learning_rate": 0.0002, + "loss": 0.4306, + "step": 3000 + }, + { + "epoch": 3.697788697788698, + "grad_norm": 1.00745689868927, + "learning_rate": 0.0002, + "loss": 0.4333, + "step": 3010 + }, + { + "epoch": 3.71007371007371, + "grad_norm": 1.0553191900253296, + "learning_rate": 0.0002, + "loss": 0.432, + "step": 3020 + }, + { + "epoch": 3.7223587223587224, + "grad_norm": 1.0226953029632568, + "learning_rate": 0.0002, + "loss": 0.4321, + "step": 3030 + }, + { + "epoch": 3.7346437346437344, + "grad_norm": 1.085554838180542, + "learning_rate": 0.0002, + "loss": 0.418, + "step": 3040 + }, + { + "epoch": 3.746928746928747, + "grad_norm": 0.9948731064796448, + "learning_rate": 0.0002, + "loss": 0.4196, + "step": 3050 + }, + { + "epoch": 3.7592137592137593, + "grad_norm": 0.9328727126121521, + "learning_rate": 0.0002, + "loss": 0.4281, + "step": 3060 + }, + { + "epoch": 3.7714987714987718, + "grad_norm": 1.0533266067504883, + "learning_rate": 0.0002, + "loss": 0.4284, + "step": 3070 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 0.8213809132575989, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 3080 + }, + { + "epoch": 3.796068796068796, + "grad_norm": 0.8941594362258911, + "learning_rate": 0.0002, + "loss": 0.4348, + "step": 3090 + }, + { + "epoch": 3.808353808353808, + "grad_norm": 0.8324518203735352, + "learning_rate": 0.0002, + "loss": 0.4266, + "step": 3100 + }, + { + "epoch": 3.8206388206388207, + "grad_norm": 0.8811233639717102, + "learning_rate": 0.0002, + "loss": 0.4227, + "step": 3110 + }, + { + "epoch": 3.832923832923833, + "grad_norm": 0.8781470060348511, + "learning_rate": 0.0002, + "loss": 0.4195, + "step": 3120 + }, + { + "epoch": 3.845208845208845, + "grad_norm": 0.8994116187095642, + "learning_rate": 0.0002, + "loss": 0.4277, + "step": 3130 + }, + { + "epoch": 3.8574938574938575, + "grad_norm": 0.8605017066001892, + "learning_rate": 0.0002, + "loss": 0.4149, + "step": 3140 + }, + { + "epoch": 3.8697788697788695, + "grad_norm": 0.8966400027275085, + "learning_rate": 0.0002, + "loss": 0.4023, + "step": 3150 + }, + { + "epoch": 3.882063882063882, + "grad_norm": 0.8856554627418518, + "learning_rate": 0.0002, + "loss": 0.4245, + "step": 3160 + }, + { + "epoch": 3.8943488943488944, + "grad_norm": 0.8971620798110962, + "learning_rate": 0.0002, + "loss": 0.4101, + "step": 3170 + }, + { + "epoch": 3.906633906633907, + "grad_norm": 0.9807813167572021, + "learning_rate": 0.0002, + "loss": 0.3993, + "step": 3180 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 0.8614121675491333, + "learning_rate": 0.0002, + "loss": 0.4258, + "step": 3190 + }, + { + "epoch": 3.9312039312039313, + "grad_norm": 0.989171028137207, + "learning_rate": 0.0002, + "loss": 0.4115, + "step": 3200 + }, + { + "epoch": 3.9434889434889433, + "grad_norm": 0.8168872594833374, + "learning_rate": 0.0002, + "loss": 0.4182, + "step": 3210 + }, + { + "epoch": 3.9557739557739557, + "grad_norm": 0.8109386563301086, + "learning_rate": 0.0002, + "loss": 0.4112, + "step": 3220 + }, + { + "epoch": 3.968058968058968, + "grad_norm": 1.0175853967666626, + "learning_rate": 0.0002, + "loss": 0.4165, + "step": 3230 + }, + { + "epoch": 3.98034398034398, + "grad_norm": 0.936143159866333, + "learning_rate": 0.0002, + "loss": 0.4146, + "step": 3240 + }, + { + "epoch": 3.9926289926289926, + "grad_norm": 0.9557915925979614, + "learning_rate": 0.0002, + "loss": 0.4163, + "step": 3250 + }, + { + "epoch": 4.0, + "eval_loss": 0.4401616156101227, + "eval_runtime": 20.8047, + "eval_samples_per_second": 15.91, + "eval_steps_per_second": 2.019, + "step": 3256 + }, + { + "epoch": 4.004914004914005, + "grad_norm": 0.7590614557266235, + "learning_rate": 0.0002, + "loss": 0.408, + "step": 3260 + }, + { + "epoch": 4.017199017199017, + "grad_norm": 0.8920791149139404, + "learning_rate": 0.0002, + "loss": 0.4001, + "step": 3270 + }, + { + "epoch": 4.0294840294840295, + "grad_norm": 0.8640421628952026, + "learning_rate": 0.0002, + "loss": 0.3789, + "step": 3280 + }, + { + "epoch": 4.041769041769042, + "grad_norm": 0.9074113965034485, + "learning_rate": 0.0002, + "loss": 0.3791, + "step": 3290 + }, + { + "epoch": 4.054054054054054, + "grad_norm": 1.0600885152816772, + "learning_rate": 0.0002, + "loss": 0.3728, + "step": 3300 + }, + { + "epoch": 4.066339066339066, + "grad_norm": 0.9682773351669312, + "learning_rate": 0.0002, + "loss": 0.3857, + "step": 3310 + }, + { + "epoch": 4.078624078624078, + "grad_norm": 0.9326395392417908, + "learning_rate": 0.0002, + "loss": 0.4007, + "step": 3320 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 0.8886597156524658, + "learning_rate": 0.0002, + "loss": 0.3823, + "step": 3330 + }, + { + "epoch": 4.103194103194103, + "grad_norm": 1.032205581665039, + "learning_rate": 0.0002, + "loss": 0.3929, + "step": 3340 + }, + { + "epoch": 4.115479115479116, + "grad_norm": 0.8669408559799194, + "learning_rate": 0.0002, + "loss": 0.3836, + "step": 3350 + }, + { + "epoch": 4.127764127764128, + "grad_norm": 0.8250347971916199, + "learning_rate": 0.0002, + "loss": 0.3866, + "step": 3360 + }, + { + "epoch": 4.14004914004914, + "grad_norm": 0.7919842600822449, + "learning_rate": 0.0002, + "loss": 0.3826, + "step": 3370 + }, + { + "epoch": 4.152334152334152, + "grad_norm": 1.045682430267334, + "learning_rate": 0.0002, + "loss": 0.3838, + "step": 3380 + }, + { + "epoch": 4.164619164619165, + "grad_norm": 0.6873571276664734, + "learning_rate": 0.0002, + "loss": 0.3796, + "step": 3390 + }, + { + "epoch": 4.176904176904177, + "grad_norm": 1.0227675437927246, + "learning_rate": 0.0002, + "loss": 0.3942, + "step": 3400 + }, + { + "epoch": 4.1891891891891895, + "grad_norm": 0.9167711734771729, + "learning_rate": 0.0002, + "loss": 0.3788, + "step": 3410 + }, + { + "epoch": 4.201474201474202, + "grad_norm": 1.0598796606063843, + "learning_rate": 0.0002, + "loss": 0.3792, + "step": 3420 + }, + { + "epoch": 4.2137592137592135, + "grad_norm": 0.8581843972206116, + "learning_rate": 0.0002, + "loss": 0.3955, + "step": 3430 + }, + { + "epoch": 4.226044226044226, + "grad_norm": 0.8862360119819641, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 3440 + }, + { + "epoch": 4.238329238329238, + "grad_norm": 1.0248323678970337, + "learning_rate": 0.0002, + "loss": 0.3889, + "step": 3450 + }, + { + "epoch": 4.250614250614251, + "grad_norm": 0.8746261596679688, + "learning_rate": 0.0002, + "loss": 0.3827, + "step": 3460 + }, + { + "epoch": 4.262899262899263, + "grad_norm": 0.7442536354064941, + "learning_rate": 0.0002, + "loss": 0.3949, + "step": 3470 + }, + { + "epoch": 4.275184275184275, + "grad_norm": 0.8295119404792786, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 3480 + }, + { + "epoch": 4.287469287469287, + "grad_norm": 1.0634245872497559, + "learning_rate": 0.0002, + "loss": 0.3895, + "step": 3490 + }, + { + "epoch": 4.2997542997543, + "grad_norm": 0.9554621577262878, + "learning_rate": 0.0002, + "loss": 0.3955, + "step": 3500 + }, + { + "epoch": 4.312039312039312, + "grad_norm": 1.0191723108291626, + "learning_rate": 0.0002, + "loss": 0.3826, + "step": 3510 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 0.8573611378669739, + "learning_rate": 0.0002, + "loss": 0.3828, + "step": 3520 + }, + { + "epoch": 4.336609336609337, + "grad_norm": 0.9082390069961548, + "learning_rate": 0.0002, + "loss": 0.3869, + "step": 3530 + }, + { + "epoch": 4.348894348894349, + "grad_norm": 0.8650212287902832, + "learning_rate": 0.0002, + "loss": 0.3902, + "step": 3540 + }, + { + "epoch": 4.361179361179361, + "grad_norm": 0.7186297178268433, + "learning_rate": 0.0002, + "loss": 0.3915, + "step": 3550 + }, + { + "epoch": 4.3734643734643734, + "grad_norm": 0.9750986695289612, + "learning_rate": 0.0002, + "loss": 0.3861, + "step": 3560 + }, + { + "epoch": 4.385749385749386, + "grad_norm": 1.0710467100143433, + "learning_rate": 0.0002, + "loss": 0.3967, + "step": 3570 + }, + { + "epoch": 4.398034398034398, + "grad_norm": 0.7974869012832642, + "learning_rate": 0.0002, + "loss": 0.3774, + "step": 3580 + }, + { + "epoch": 4.41031941031941, + "grad_norm": 0.9405913949012756, + "learning_rate": 0.0002, + "loss": 0.3738, + "step": 3590 + }, + { + "epoch": 4.422604422604422, + "grad_norm": 0.9393602609634399, + "learning_rate": 0.0002, + "loss": 0.3982, + "step": 3600 + }, + { + "epoch": 4.434889434889435, + "grad_norm": 1.0798007249832153, + "learning_rate": 0.0002, + "loss": 0.3913, + "step": 3610 + }, + { + "epoch": 4.447174447174447, + "grad_norm": 0.9226186275482178, + "learning_rate": 0.0002, + "loss": 0.3682, + "step": 3620 + }, + { + "epoch": 4.45945945945946, + "grad_norm": 1.1046524047851562, + "learning_rate": 0.0002, + "loss": 0.3742, + "step": 3630 + }, + { + "epoch": 4.471744471744472, + "grad_norm": 0.8848567605018616, + "learning_rate": 0.0002, + "loss": 0.3886, + "step": 3640 + }, + { + "epoch": 4.484029484029484, + "grad_norm": 0.8913224339485168, + "learning_rate": 0.0002, + "loss": 0.3848, + "step": 3650 + }, + { + "epoch": 4.496314496314496, + "grad_norm": 0.8497583270072937, + "learning_rate": 0.0002, + "loss": 0.3731, + "step": 3660 + }, + { + "epoch": 4.5085995085995085, + "grad_norm": 0.8263831734657288, + "learning_rate": 0.0002, + "loss": 0.3804, + "step": 3670 + }, + { + "epoch": 4.520884520884521, + "grad_norm": 0.8470269441604614, + "learning_rate": 0.0002, + "loss": 0.3815, + "step": 3680 + }, + { + "epoch": 4.533169533169533, + "grad_norm": 0.860038161277771, + "learning_rate": 0.0002, + "loss": 0.3774, + "step": 3690 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.8898552656173706, + "learning_rate": 0.0002, + "loss": 0.3817, + "step": 3700 + }, + { + "epoch": 4.557739557739557, + "grad_norm": 0.8152070641517639, + "learning_rate": 0.0002, + "loss": 0.3776, + "step": 3710 + }, + { + "epoch": 4.57002457002457, + "grad_norm": 0.7847675085067749, + "learning_rate": 0.0002, + "loss": 0.383, + "step": 3720 + }, + { + "epoch": 4.582309582309582, + "grad_norm": 0.9625533819198608, + "learning_rate": 0.0002, + "loss": 0.3791, + "step": 3730 + }, + { + "epoch": 4.594594594594595, + "grad_norm": 0.9097456336021423, + "learning_rate": 0.0002, + "loss": 0.3699, + "step": 3740 + }, + { + "epoch": 4.606879606879607, + "grad_norm": 0.871329128742218, + "learning_rate": 0.0002, + "loss": 0.3673, + "step": 3750 + }, + { + "epoch": 4.61916461916462, + "grad_norm": 0.9879975914955139, + "learning_rate": 0.0002, + "loss": 0.3725, + "step": 3760 + }, + { + "epoch": 4.631449631449631, + "grad_norm": 0.8636731505393982, + "learning_rate": 0.0002, + "loss": 0.3827, + "step": 3770 + }, + { + "epoch": 4.643734643734644, + "grad_norm": 1.0488964319229126, + "learning_rate": 0.0002, + "loss": 0.3755, + "step": 3780 + }, + { + "epoch": 4.656019656019656, + "grad_norm": 0.7637056112289429, + "learning_rate": 0.0002, + "loss": 0.3738, + "step": 3790 + }, + { + "epoch": 4.6683046683046685, + "grad_norm": 0.8507546186447144, + "learning_rate": 0.0002, + "loss": 0.3676, + "step": 3800 + }, + { + "epoch": 4.680589680589681, + "grad_norm": 1.0216856002807617, + "learning_rate": 0.0002, + "loss": 0.3852, + "step": 3810 + }, + { + "epoch": 4.6928746928746925, + "grad_norm": 1.026343822479248, + "learning_rate": 0.0002, + "loss": 0.3751, + "step": 3820 + }, + { + "epoch": 4.705159705159705, + "grad_norm": 0.8311620950698853, + "learning_rate": 0.0002, + "loss": 0.3687, + "step": 3830 + }, + { + "epoch": 4.717444717444717, + "grad_norm": 0.7770653367042542, + "learning_rate": 0.0002, + "loss": 0.3771, + "step": 3840 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 0.7616215348243713, + "learning_rate": 0.0002, + "loss": 0.37, + "step": 3850 + }, + { + "epoch": 4.742014742014742, + "grad_norm": 1.0377072095870972, + "learning_rate": 0.0002, + "loss": 0.3927, + "step": 3860 + }, + { + "epoch": 4.754299754299755, + "grad_norm": 0.9713505506515503, + "learning_rate": 0.0002, + "loss": 0.3832, + "step": 3870 + }, + { + "epoch": 4.766584766584766, + "grad_norm": 0.8803321719169617, + "learning_rate": 0.0002, + "loss": 0.3722, + "step": 3880 + }, + { + "epoch": 4.778869778869779, + "grad_norm": 0.885535478591919, + "learning_rate": 0.0002, + "loss": 0.3756, + "step": 3890 + }, + { + "epoch": 4.791154791154791, + "grad_norm": 1.0877983570098877, + "learning_rate": 0.0002, + "loss": 0.3714, + "step": 3900 + }, + { + "epoch": 4.803439803439804, + "grad_norm": 0.7875366806983948, + "learning_rate": 0.0002, + "loss": 0.3879, + "step": 3910 + }, + { + "epoch": 4.815724815724816, + "grad_norm": 0.8550102114677429, + "learning_rate": 0.0002, + "loss": 0.3591, + "step": 3920 + }, + { + "epoch": 4.828009828009828, + "grad_norm": 1.0217846632003784, + "learning_rate": 0.0002, + "loss": 0.3716, + "step": 3930 + }, + { + "epoch": 4.84029484029484, + "grad_norm": 0.7315713167190552, + "learning_rate": 0.0002, + "loss": 0.3649, + "step": 3940 + }, + { + "epoch": 4.8525798525798525, + "grad_norm": 0.8924923539161682, + "learning_rate": 0.0002, + "loss": 0.3879, + "step": 3950 + }, + { + "epoch": 4.864864864864865, + "grad_norm": 0.9730218052864075, + "learning_rate": 0.0002, + "loss": 0.3669, + "step": 3960 + }, + { + "epoch": 4.877149877149877, + "grad_norm": 0.9202003479003906, + "learning_rate": 0.0002, + "loss": 0.3705, + "step": 3970 + }, + { + "epoch": 4.88943488943489, + "grad_norm": 0.8173081874847412, + "learning_rate": 0.0002, + "loss": 0.3617, + "step": 3980 + }, + { + "epoch": 4.901719901719901, + "grad_norm": 0.7178564667701721, + "learning_rate": 0.0002, + "loss": 0.37, + "step": 3990 + }, + { + "epoch": 4.914004914004914, + "grad_norm": 0.913684606552124, + "learning_rate": 0.0002, + "loss": 0.3768, + "step": 4000 + }, + { + "epoch": 4.926289926289926, + "grad_norm": 0.8817896842956543, + "learning_rate": 0.0002, + "loss": 0.3755, + "step": 4010 + }, + { + "epoch": 4.938574938574939, + "grad_norm": 0.7652186751365662, + "learning_rate": 0.0002, + "loss": 0.3676, + "step": 4020 + }, + { + "epoch": 4.950859950859951, + "grad_norm": 0.8828630447387695, + "learning_rate": 0.0002, + "loss": 0.3699, + "step": 4030 + }, + { + "epoch": 4.963144963144963, + "grad_norm": 1.0878605842590332, + "learning_rate": 0.0002, + "loss": 0.3672, + "step": 4040 + }, + { + "epoch": 4.975429975429975, + "grad_norm": 1.0845288038253784, + "learning_rate": 0.0002, + "loss": 0.3656, + "step": 4050 + }, + { + "epoch": 4.987714987714988, + "grad_norm": 0.8431115746498108, + "learning_rate": 0.0002, + "loss": 0.365, + "step": 4060 + }, + { + "epoch": 5.0, + "grad_norm": 0.8320387601852417, + "learning_rate": 0.0002, + "loss": 0.3693, + "step": 4070 + }, + { + "epoch": 5.0, + "eval_loss": 0.4017423093318939, + "eval_runtime": 20.8466, + "eval_samples_per_second": 15.878, + "eval_steps_per_second": 2.015, + "step": 4070 + }, + { + "epoch": 5.012285012285012, + "grad_norm": 0.8639023900032043, + "learning_rate": 0.0002, + "loss": 0.3425, + "step": 4080 + }, + { + "epoch": 5.024570024570025, + "grad_norm": 0.7123713493347168, + "learning_rate": 0.0002, + "loss": 0.3458, + "step": 4090 + }, + { + "epoch": 5.036855036855036, + "grad_norm": 0.9886922836303711, + "learning_rate": 0.0002, + "loss": 0.3404, + "step": 4100 + }, + { + "epoch": 5.049140049140049, + "grad_norm": 0.7880306243896484, + "learning_rate": 0.0002, + "loss": 0.3529, + "step": 4110 + }, + { + "epoch": 5.061425061425061, + "grad_norm": 0.7488741874694824, + "learning_rate": 0.0002, + "loss": 0.3406, + "step": 4120 + }, + { + "epoch": 5.073710073710074, + "grad_norm": 0.9359086751937866, + "learning_rate": 0.0002, + "loss": 0.3542, + "step": 4130 + }, + { + "epoch": 5.085995085995086, + "grad_norm": 0.9401527047157288, + "learning_rate": 0.0002, + "loss": 0.3471, + "step": 4140 + }, + { + "epoch": 5.098280098280099, + "grad_norm": 0.8396275043487549, + "learning_rate": 0.0002, + "loss": 0.3566, + "step": 4150 + }, + { + "epoch": 5.11056511056511, + "grad_norm": 0.7132664918899536, + "learning_rate": 0.0002, + "loss": 0.3416, + "step": 4160 + }, + { + "epoch": 5.122850122850123, + "grad_norm": 0.843708872795105, + "learning_rate": 0.0002, + "loss": 0.3457, + "step": 4170 + }, + { + "epoch": 5.135135135135135, + "grad_norm": 0.8733304738998413, + "learning_rate": 0.0002, + "loss": 0.3399, + "step": 4180 + }, + { + "epoch": 5.1474201474201475, + "grad_norm": 0.9064375162124634, + "learning_rate": 0.0002, + "loss": 0.3501, + "step": 4190 + }, + { + "epoch": 5.15970515970516, + "grad_norm": 0.900770902633667, + "learning_rate": 0.0002, + "loss": 0.3455, + "step": 4200 + }, + { + "epoch": 5.171990171990172, + "grad_norm": 0.863853394985199, + "learning_rate": 0.0002, + "loss": 0.3475, + "step": 4210 + }, + { + "epoch": 5.184275184275184, + "grad_norm": 0.767134964466095, + "learning_rate": 0.0002, + "loss": 0.3497, + "step": 4220 + }, + { + "epoch": 5.196560196560196, + "grad_norm": 0.7518735527992249, + "learning_rate": 0.0002, + "loss": 0.3527, + "step": 4230 + }, + { + "epoch": 5.208845208845209, + "grad_norm": 0.8040947914123535, + "learning_rate": 0.0002, + "loss": 0.3369, + "step": 4240 + }, + { + "epoch": 5.221130221130221, + "grad_norm": 0.7827144265174866, + "learning_rate": 0.0002, + "loss": 0.3496, + "step": 4250 + }, + { + "epoch": 5.233415233415234, + "grad_norm": 0.7306333184242249, + "learning_rate": 0.0002, + "loss": 0.3442, + "step": 4260 + }, + { + "epoch": 5.245700245700245, + "grad_norm": 1.0963380336761475, + "learning_rate": 0.0002, + "loss": 0.3553, + "step": 4270 + }, + { + "epoch": 5.257985257985258, + "grad_norm": 0.8200454711914062, + "learning_rate": 0.0002, + "loss": 0.3462, + "step": 4280 + }, + { + "epoch": 5.27027027027027, + "grad_norm": 0.8666796684265137, + "learning_rate": 0.0002, + "loss": 0.3509, + "step": 4290 + }, + { + "epoch": 5.282555282555283, + "grad_norm": 0.7862894535064697, + "learning_rate": 0.0002, + "loss": 0.3423, + "step": 4300 + }, + { + "epoch": 5.294840294840295, + "grad_norm": 0.8163095712661743, + "learning_rate": 0.0002, + "loss": 0.3623, + "step": 4310 + }, + { + "epoch": 5.3071253071253075, + "grad_norm": 0.8069050908088684, + "learning_rate": 0.0002, + "loss": 0.34, + "step": 4320 + }, + { + "epoch": 5.319410319410319, + "grad_norm": 0.7858486175537109, + "learning_rate": 0.0002, + "loss": 0.3532, + "step": 4330 + }, + { + "epoch": 5.3316953316953315, + "grad_norm": 0.950339674949646, + "learning_rate": 0.0002, + "loss": 0.3435, + "step": 4340 + }, + { + "epoch": 5.343980343980344, + "grad_norm": 0.9056477546691895, + "learning_rate": 0.0002, + "loss": 0.3498, + "step": 4350 + }, + { + "epoch": 5.356265356265356, + "grad_norm": 0.9619399905204773, + "learning_rate": 0.0002, + "loss": 0.3538, + "step": 4360 + }, + { + "epoch": 5.368550368550369, + "grad_norm": 0.9778652191162109, + "learning_rate": 0.0002, + "loss": 0.3455, + "step": 4370 + }, + { + "epoch": 5.38083538083538, + "grad_norm": 0.6919555068016052, + "learning_rate": 0.0002, + "loss": 0.3498, + "step": 4380 + }, + { + "epoch": 5.393120393120393, + "grad_norm": 0.8121668696403503, + "learning_rate": 0.0002, + "loss": 0.3426, + "step": 4390 + }, + { + "epoch": 5.405405405405405, + "grad_norm": 0.8481289148330688, + "learning_rate": 0.0002, + "loss": 0.3442, + "step": 4400 + }, + { + "epoch": 5.417690417690418, + "grad_norm": 0.8727408647537231, + "learning_rate": 0.0002, + "loss": 0.345, + "step": 4410 + }, + { + "epoch": 5.42997542997543, + "grad_norm": 0.8920271396636963, + "learning_rate": 0.0002, + "loss": 0.3554, + "step": 4420 + }, + { + "epoch": 5.442260442260443, + "grad_norm": 0.7758749723434448, + "learning_rate": 0.0002, + "loss": 0.3409, + "step": 4430 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 0.8847506642341614, + "learning_rate": 0.0002, + "loss": 0.3483, + "step": 4440 + }, + { + "epoch": 5.466830466830467, + "grad_norm": 0.9760470390319824, + "learning_rate": 0.0002, + "loss": 0.3557, + "step": 4450 + }, + { + "epoch": 5.479115479115479, + "grad_norm": 0.8940271139144897, + "learning_rate": 0.0002, + "loss": 0.3536, + "step": 4460 + }, + { + "epoch": 5.4914004914004915, + "grad_norm": 0.8668502569198608, + "learning_rate": 0.0002, + "loss": 0.3577, + "step": 4470 + }, + { + "epoch": 5.503685503685504, + "grad_norm": 0.9097439050674438, + "learning_rate": 0.0002, + "loss": 0.3462, + "step": 4480 + }, + { + "epoch": 5.515970515970516, + "grad_norm": 0.8217208981513977, + "learning_rate": 0.0002, + "loss": 0.3417, + "step": 4490 + }, + { + "epoch": 5.528255528255528, + "grad_norm": 0.7853189706802368, + "learning_rate": 0.0002, + "loss": 0.3482, + "step": 4500 + }, + { + "epoch": 5.54054054054054, + "grad_norm": 1.1113477945327759, + "learning_rate": 0.0002, + "loss": 0.3479, + "step": 4510 + }, + { + "epoch": 5.552825552825553, + "grad_norm": 0.8637538552284241, + "learning_rate": 0.0002, + "loss": 0.3553, + "step": 4520 + }, + { + "epoch": 5.565110565110565, + "grad_norm": 1.0230066776275635, + "learning_rate": 0.0002, + "loss": 0.3403, + "step": 4530 + }, + { + "epoch": 5.577395577395578, + "grad_norm": 0.8972793817520142, + "learning_rate": 0.0002, + "loss": 0.3588, + "step": 4540 + }, + { + "epoch": 5.58968058968059, + "grad_norm": 0.7950642704963684, + "learning_rate": 0.0002, + "loss": 0.3428, + "step": 4550 + }, + { + "epoch": 5.601965601965602, + "grad_norm": 1.113753318786621, + "learning_rate": 0.0002, + "loss": 0.3468, + "step": 4560 + }, + { + "epoch": 5.614250614250614, + "grad_norm": 0.7842669486999512, + "learning_rate": 0.0002, + "loss": 0.3354, + "step": 4570 + }, + { + "epoch": 5.6265356265356266, + "grad_norm": 0.9713512063026428, + "learning_rate": 0.0002, + "loss": 0.3419, + "step": 4580 + }, + { + "epoch": 5.638820638820639, + "grad_norm": 0.9451650977134705, + "learning_rate": 0.0002, + "loss": 0.3502, + "step": 4590 + }, + { + "epoch": 5.651105651105651, + "grad_norm": 1.055484414100647, + "learning_rate": 0.0002, + "loss": 0.3416, + "step": 4600 + }, + { + "epoch": 5.663390663390663, + "grad_norm": 0.8408507704734802, + "learning_rate": 0.0002, + "loss": 0.3436, + "step": 4610 + }, + { + "epoch": 5.675675675675675, + "grad_norm": 1.0293926000595093, + "learning_rate": 0.0002, + "loss": 0.3619, + "step": 4620 + }, + { + "epoch": 5.687960687960688, + "grad_norm": 0.7198245525360107, + "learning_rate": 0.0002, + "loss": 0.3484, + "step": 4630 + }, + { + "epoch": 5.7002457002457, + "grad_norm": 0.7564466595649719, + "learning_rate": 0.0002, + "loss": 0.3563, + "step": 4640 + }, + { + "epoch": 5.712530712530713, + "grad_norm": 0.7980002760887146, + "learning_rate": 0.0002, + "loss": 0.3435, + "step": 4650 + }, + { + "epoch": 5.724815724815725, + "grad_norm": 0.8685088753700256, + "learning_rate": 0.0002, + "loss": 0.3478, + "step": 4660 + }, + { + "epoch": 5.737100737100737, + "grad_norm": 0.8816949129104614, + "learning_rate": 0.0002, + "loss": 0.3692, + "step": 4670 + }, + { + "epoch": 5.749385749385749, + "grad_norm": 0.7154731750488281, + "learning_rate": 0.0002, + "loss": 0.3462, + "step": 4680 + }, + { + "epoch": 5.761670761670762, + "grad_norm": 0.9430679678916931, + "learning_rate": 0.0002, + "loss": 0.3503, + "step": 4690 + }, + { + "epoch": 5.773955773955774, + "grad_norm": 0.7640151381492615, + "learning_rate": 0.0002, + "loss": 0.3439, + "step": 4700 + }, + { + "epoch": 5.7862407862407865, + "grad_norm": 1.0920690298080444, + "learning_rate": 0.0002, + "loss": 0.3444, + "step": 4710 + }, + { + "epoch": 5.798525798525798, + "grad_norm": 0.9362104535102844, + "learning_rate": 0.0002, + "loss": 0.3356, + "step": 4720 + }, + { + "epoch": 5.8108108108108105, + "grad_norm": 0.8392294645309448, + "learning_rate": 0.0002, + "loss": 0.339, + "step": 4730 + }, + { + "epoch": 5.823095823095823, + "grad_norm": 0.9893582463264465, + "learning_rate": 0.0002, + "loss": 0.3488, + "step": 4740 + }, + { + "epoch": 5.835380835380835, + "grad_norm": 0.6985510587692261, + "learning_rate": 0.0002, + "loss": 0.3446, + "step": 4750 + }, + { + "epoch": 5.847665847665848, + "grad_norm": 0.8906862735748291, + "learning_rate": 0.0002, + "loss": 0.3534, + "step": 4760 + }, + { + "epoch": 5.85995085995086, + "grad_norm": 0.8036413192749023, + "learning_rate": 0.0002, + "loss": 0.3481, + "step": 4770 + }, + { + "epoch": 5.872235872235873, + "grad_norm": 0.9948155283927917, + "learning_rate": 0.0002, + "loss": 0.3326, + "step": 4780 + }, + { + "epoch": 5.884520884520884, + "grad_norm": 0.8618432283401489, + "learning_rate": 0.0002, + "loss": 0.3385, + "step": 4790 + }, + { + "epoch": 5.896805896805897, + "grad_norm": 1.0422909259796143, + "learning_rate": 0.0002, + "loss": 0.3302, + "step": 4800 + }, + { + "epoch": 5.909090909090909, + "grad_norm": 1.1892569065093994, + "learning_rate": 0.0002, + "loss": 0.3448, + "step": 4810 + }, + { + "epoch": 5.921375921375922, + "grad_norm": 1.1459916830062866, + "learning_rate": 0.0002, + "loss": 0.3506, + "step": 4820 + }, + { + "epoch": 5.933660933660933, + "grad_norm": 1.056235909461975, + "learning_rate": 0.0002, + "loss": 0.3387, + "step": 4830 + }, + { + "epoch": 5.945945945945946, + "grad_norm": 0.8517277240753174, + "learning_rate": 0.0002, + "loss": 0.344, + "step": 4840 + }, + { + "epoch": 5.958230958230958, + "grad_norm": 0.8153380751609802, + "learning_rate": 0.0002, + "loss": 0.3421, + "step": 4850 + }, + { + "epoch": 5.9705159705159705, + "grad_norm": 0.7907533049583435, + "learning_rate": 0.0002, + "loss": 0.3409, + "step": 4860 + }, + { + "epoch": 5.982800982800983, + "grad_norm": 0.8443069458007812, + "learning_rate": 0.0002, + "loss": 0.3337, + "step": 4870 + }, + { + "epoch": 5.995085995085995, + "grad_norm": 0.8711344003677368, + "learning_rate": 0.0002, + "loss": 0.3351, + "step": 4880 + }, + { + "epoch": 6.0, + "eval_loss": 0.3778059184551239, + "eval_runtime": 20.6858, + "eval_samples_per_second": 16.001, + "eval_steps_per_second": 2.03, + "step": 4884 + }, + { + "epoch": 6.007371007371007, + "grad_norm": 0.7697948813438416, + "learning_rate": 0.0002, + "loss": 0.3244, + "step": 4890 + }, + { + "epoch": 6.019656019656019, + "grad_norm": 0.7734108567237854, + "learning_rate": 0.0002, + "loss": 0.3118, + "step": 4900 + }, + { + "epoch": 6.031941031941032, + "grad_norm": 0.7173922657966614, + "learning_rate": 0.0002, + "loss": 0.3242, + "step": 4910 + }, + { + "epoch": 6.044226044226044, + "grad_norm": 1.062118649482727, + "learning_rate": 0.0002, + "loss": 0.3159, + "step": 4920 + }, + { + "epoch": 6.056511056511057, + "grad_norm": 0.746422529220581, + "learning_rate": 0.0002, + "loss": 0.3361, + "step": 4930 + }, + { + "epoch": 6.068796068796069, + "grad_norm": 0.8549448251724243, + "learning_rate": 0.0002, + "loss": 0.3204, + "step": 4940 + }, + { + "epoch": 6.081081081081081, + "grad_norm": 0.9405432939529419, + "learning_rate": 0.0002, + "loss": 0.3236, + "step": 4950 + }, + { + "epoch": 6.093366093366093, + "grad_norm": 0.752382755279541, + "learning_rate": 0.0002, + "loss": 0.3278, + "step": 4960 + }, + { + "epoch": 6.105651105651106, + "grad_norm": 0.820332407951355, + "learning_rate": 0.0002, + "loss": 0.3204, + "step": 4970 + }, + { + "epoch": 6.117936117936118, + "grad_norm": 0.8701449036598206, + "learning_rate": 0.0002, + "loss": 0.3192, + "step": 4980 + }, + { + "epoch": 6.1302211302211305, + "grad_norm": 0.8192865252494812, + "learning_rate": 0.0002, + "loss": 0.321, + "step": 4990 + }, + { + "epoch": 6.142506142506143, + "grad_norm": 1.0016303062438965, + "learning_rate": 0.0002, + "loss": 0.3295, + "step": 5000 + }, + { + "epoch": 6.1547911547911545, + "grad_norm": 0.9194409251213074, + "learning_rate": 0.0002, + "loss": 0.3352, + "step": 5010 + }, + { + "epoch": 6.167076167076167, + "grad_norm": 0.9319757223129272, + "learning_rate": 0.0002, + "loss": 0.3205, + "step": 5020 + }, + { + "epoch": 6.179361179361179, + "grad_norm": 0.8737656474113464, + "learning_rate": 0.0002, + "loss": 0.3256, + "step": 5030 + }, + { + "epoch": 6.191646191646192, + "grad_norm": 0.8736537098884583, + "learning_rate": 0.0002, + "loss": 0.3221, + "step": 5040 + }, + { + "epoch": 6.203931203931204, + "grad_norm": 0.9301430583000183, + "learning_rate": 0.0002, + "loss": 0.3265, + "step": 5050 + }, + { + "epoch": 6.216216216216216, + "grad_norm": 0.7717130780220032, + "learning_rate": 0.0002, + "loss": 0.3285, + "step": 5060 + }, + { + "epoch": 6.228501228501228, + "grad_norm": 0.6709604859352112, + "learning_rate": 0.0002, + "loss": 0.3192, + "step": 5070 + }, + { + "epoch": 6.240786240786241, + "grad_norm": 0.879374086856842, + "learning_rate": 0.0002, + "loss": 0.3352, + "step": 5080 + }, + { + "epoch": 6.253071253071253, + "grad_norm": 0.9136955738067627, + "learning_rate": 0.0002, + "loss": 0.329, + "step": 5090 + }, + { + "epoch": 6.2653562653562656, + "grad_norm": 0.795177161693573, + "learning_rate": 0.0002, + "loss": 0.3228, + "step": 5100 + }, + { + "epoch": 6.277641277641278, + "grad_norm": 1.0412259101867676, + "learning_rate": 0.0002, + "loss": 0.3273, + "step": 5110 + }, + { + "epoch": 6.2899262899262895, + "grad_norm": 0.7382524013519287, + "learning_rate": 0.0002, + "loss": 0.3221, + "step": 5120 + }, + { + "epoch": 6.302211302211302, + "grad_norm": 0.8818480968475342, + "learning_rate": 0.0002, + "loss": 0.3102, + "step": 5130 + }, + { + "epoch": 6.314496314496314, + "grad_norm": 0.7865153551101685, + "learning_rate": 0.0002, + "loss": 0.3316, + "step": 5140 + }, + { + "epoch": 6.326781326781327, + "grad_norm": 0.9166486859321594, + "learning_rate": 0.0002, + "loss": 0.3264, + "step": 5150 + }, + { + "epoch": 6.339066339066339, + "grad_norm": 0.6655149459838867, + "learning_rate": 0.0002, + "loss": 0.33, + "step": 5160 + }, + { + "epoch": 6.351351351351352, + "grad_norm": 0.7762818336486816, + "learning_rate": 0.0002, + "loss": 0.3359, + "step": 5170 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 0.8057235479354858, + "learning_rate": 0.0002, + "loss": 0.3244, + "step": 5180 + }, + { + "epoch": 6.375921375921376, + "grad_norm": 0.8186984062194824, + "learning_rate": 0.0002, + "loss": 0.3167, + "step": 5190 + }, + { + "epoch": 6.388206388206388, + "grad_norm": 0.8669573068618774, + "learning_rate": 0.0002, + "loss": 0.3289, + "step": 5200 + }, + { + "epoch": 6.400491400491401, + "grad_norm": 0.8904402852058411, + "learning_rate": 0.0002, + "loss": 0.3313, + "step": 5210 + }, + { + "epoch": 6.412776412776413, + "grad_norm": 0.9250359535217285, + "learning_rate": 0.0002, + "loss": 0.3187, + "step": 5220 + }, + { + "epoch": 6.4250614250614255, + "grad_norm": 0.8718299269676208, + "learning_rate": 0.0002, + "loss": 0.3229, + "step": 5230 + }, + { + "epoch": 6.437346437346437, + "grad_norm": 0.8156430125236511, + "learning_rate": 0.0002, + "loss": 0.3214, + "step": 5240 + }, + { + "epoch": 6.4496314496314495, + "grad_norm": 0.7759218215942383, + "learning_rate": 0.0002, + "loss": 0.3244, + "step": 5250 + }, + { + "epoch": 6.461916461916462, + "grad_norm": 0.8137310743331909, + "learning_rate": 0.0002, + "loss": 0.3298, + "step": 5260 + }, + { + "epoch": 6.474201474201474, + "grad_norm": 0.8121917843818665, + "learning_rate": 0.0002, + "loss": 0.3275, + "step": 5270 + }, + { + "epoch": 6.486486486486487, + "grad_norm": 0.8178010582923889, + "learning_rate": 0.0002, + "loss": 0.3201, + "step": 5280 + }, + { + "epoch": 6.498771498771498, + "grad_norm": 1.1806302070617676, + "learning_rate": 0.0002, + "loss": 0.3271, + "step": 5290 + }, + { + "epoch": 6.511056511056511, + "grad_norm": 0.8255127668380737, + "learning_rate": 0.0002, + "loss": 0.3231, + "step": 5300 + }, + { + "epoch": 6.523341523341523, + "grad_norm": 0.8006690740585327, + "learning_rate": 0.0002, + "loss": 0.3227, + "step": 5310 + }, + { + "epoch": 6.535626535626536, + "grad_norm": 0.9932374358177185, + "learning_rate": 0.0002, + "loss": 0.3262, + "step": 5320 + }, + { + "epoch": 6.547911547911548, + "grad_norm": 0.8973969221115112, + "learning_rate": 0.0002, + "loss": 0.3291, + "step": 5330 + }, + { + "epoch": 6.560196560196561, + "grad_norm": 0.7359915971755981, + "learning_rate": 0.0002, + "loss": 0.3146, + "step": 5340 + }, + { + "epoch": 6.572481572481572, + "grad_norm": 0.9941133856773376, + "learning_rate": 0.0002, + "loss": 0.3308, + "step": 5350 + }, + { + "epoch": 6.584766584766585, + "grad_norm": 0.9008874893188477, + "learning_rate": 0.0002, + "loss": 0.3202, + "step": 5360 + }, + { + "epoch": 6.597051597051597, + "grad_norm": 1.309710144996643, + "learning_rate": 0.0002, + "loss": 0.3271, + "step": 5370 + }, + { + "epoch": 6.6093366093366095, + "grad_norm": 0.797768235206604, + "learning_rate": 0.0002, + "loss": 0.3177, + "step": 5380 + }, + { + "epoch": 6.621621621621622, + "grad_norm": 0.8507353663444519, + "learning_rate": 0.0002, + "loss": 0.3218, + "step": 5390 + }, + { + "epoch": 6.6339066339066335, + "grad_norm": 0.9628674983978271, + "learning_rate": 0.0002, + "loss": 0.3204, + "step": 5400 + }, + { + "epoch": 6.646191646191646, + "grad_norm": 0.6989983320236206, + "learning_rate": 0.0002, + "loss": 0.3155, + "step": 5410 + }, + { + "epoch": 6.658476658476658, + "grad_norm": 0.9505863189697266, + "learning_rate": 0.0002, + "loss": 0.3197, + "step": 5420 + }, + { + "epoch": 6.670761670761671, + "grad_norm": 0.8058171272277832, + "learning_rate": 0.0002, + "loss": 0.3259, + "step": 5430 + }, + { + "epoch": 6.683046683046683, + "grad_norm": 0.8476499915122986, + "learning_rate": 0.0002, + "loss": 0.3248, + "step": 5440 + }, + { + "epoch": 6.695331695331696, + "grad_norm": 0.8503309488296509, + "learning_rate": 0.0002, + "loss": 0.326, + "step": 5450 + }, + { + "epoch": 6.707616707616707, + "grad_norm": 0.919566810131073, + "learning_rate": 0.0002, + "loss": 0.3218, + "step": 5460 + }, + { + "epoch": 6.71990171990172, + "grad_norm": 0.7741201519966125, + "learning_rate": 0.0002, + "loss": 0.3218, + "step": 5470 + }, + { + "epoch": 6.732186732186732, + "grad_norm": 0.8432701826095581, + "learning_rate": 0.0002, + "loss": 0.329, + "step": 5480 + }, + { + "epoch": 6.744471744471745, + "grad_norm": 1.0183148384094238, + "learning_rate": 0.0002, + "loss": 0.3284, + "step": 5490 + }, + { + "epoch": 6.756756756756757, + "grad_norm": 0.8491143584251404, + "learning_rate": 0.0002, + "loss": 0.3312, + "step": 5500 + }, + { + "epoch": 6.769041769041769, + "grad_norm": 0.9586310386657715, + "learning_rate": 0.0002, + "loss": 0.3208, + "step": 5510 + }, + { + "epoch": 6.781326781326781, + "grad_norm": 0.7936097383499146, + "learning_rate": 0.0002, + "loss": 0.3305, + "step": 5520 + }, + { + "epoch": 6.7936117936117935, + "grad_norm": 0.7875059247016907, + "learning_rate": 0.0002, + "loss": 0.318, + "step": 5530 + }, + { + "epoch": 6.805896805896806, + "grad_norm": 0.8136157393455505, + "learning_rate": 0.0002, + "loss": 0.3234, + "step": 5540 + }, + { + "epoch": 6.818181818181818, + "grad_norm": 0.837213933467865, + "learning_rate": 0.0002, + "loss": 0.3161, + "step": 5550 + }, + { + "epoch": 6.830466830466831, + "grad_norm": 0.6812925338745117, + "learning_rate": 0.0002, + "loss": 0.3153, + "step": 5560 + }, + { + "epoch": 6.842751842751843, + "grad_norm": 0.7309592962265015, + "learning_rate": 0.0002, + "loss": 0.3139, + "step": 5570 + }, + { + "epoch": 6.855036855036855, + "grad_norm": 0.6905979514122009, + "learning_rate": 0.0002, + "loss": 0.3126, + "step": 5580 + }, + { + "epoch": 6.867321867321867, + "grad_norm": 1.1768406629562378, + "learning_rate": 0.0002, + "loss": 0.3291, + "step": 5590 + }, + { + "epoch": 6.87960687960688, + "grad_norm": 0.7618567943572998, + "learning_rate": 0.0002, + "loss": 0.3193, + "step": 5600 + }, + { + "epoch": 6.891891891891892, + "grad_norm": 0.7930929660797119, + "learning_rate": 0.0002, + "loss": 0.3296, + "step": 5610 + }, + { + "epoch": 6.9041769041769046, + "grad_norm": 0.7931787371635437, + "learning_rate": 0.0002, + "loss": 0.3241, + "step": 5620 + }, + { + "epoch": 6.916461916461916, + "grad_norm": 0.6366972923278809, + "learning_rate": 0.0002, + "loss": 0.3215, + "step": 5630 + }, + { + "epoch": 6.9287469287469285, + "grad_norm": 0.7782737612724304, + "learning_rate": 0.0002, + "loss": 0.3264, + "step": 5640 + }, + { + "epoch": 6.941031941031941, + "grad_norm": 0.8643787503242493, + "learning_rate": 0.0002, + "loss": 0.3186, + "step": 5650 + }, + { + "epoch": 6.953316953316953, + "grad_norm": 1.0843733549118042, + "learning_rate": 0.0002, + "loss": 0.3285, + "step": 5660 + }, + { + "epoch": 6.965601965601966, + "grad_norm": 0.71319180727005, + "learning_rate": 0.0002, + "loss": 0.3163, + "step": 5670 + }, + { + "epoch": 6.977886977886978, + "grad_norm": 0.976536750793457, + "learning_rate": 0.0002, + "loss": 0.3196, + "step": 5680 + }, + { + "epoch": 6.99017199017199, + "grad_norm": 0.9221968054771423, + "learning_rate": 0.0002, + "loss": 0.3255, + "step": 5690 + }, + { + "epoch": 7.0, + "eval_loss": 0.3616626560688019, + "eval_runtime": 20.8747, + "eval_samples_per_second": 15.857, + "eval_steps_per_second": 2.012, + "step": 5698 + } + ], + "logging_steps": 10, + "max_steps": 6512, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.964093511742259e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/training_args.bin b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..73a75ebfc12aed51385aab437d91632ee4c20317 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2809e8544b7de8b298d0b325fb6c98eb9f853fd72d7cbae286b6ee1541e6aee9 +size 5560 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/README.md b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/adapter_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/adapter_model.safetensors b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..952cf5145a9d7e80dc7491c40196d56467129957 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34589486cf3edd2be3ba9bcb7e87563196b7a9856dd90683bdf986b557d160d3 +size 29500848 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/optimizer.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f1bf087da1004aab0e3f30c9e9fef9b06a5a3f8 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06ba0cfae53e85afd2511561a38a6f578a5bc0a036eecc8e2bd842ec8dbb8679 +size 15064314 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/rng_state.pth b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..459298bb39602affb600d0808d13fa7d1ce72240 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:430471ab88b23b53bb64769e9f9baf75a650e45afe6ed9ee8dbf521fa822e78e +size 14244 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/scheduler.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebe13cf064eed1a29ad4b1fc8ba5b1a8bce7e6cf --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85386f3e84bf3fcbddc99632d468aef6e17dd9860afd628d6b9c360a7a9b8cfe +size 1064 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/special_tokens_map.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/tokenizer.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/tokenizer.model b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/tokenizer_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/trainer_state.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..af95cf5b26586ddc33c849b4bbaba306e5398ca9 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/trainer_state.json @@ -0,0 +1,4654 @@ +{ + "best_metric": 0.3476468026638031, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512", + "epoch": 8.0, + "eval_steps": 10, + "global_step": 6512, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012285012285012284, + "grad_norm": 0.8178550004959106, + "learning_rate": 0.0002, + "loss": 3.5354, + "step": 10 + }, + { + "epoch": 0.02457002457002457, + "grad_norm": 1.0338047742843628, + "learning_rate": 0.0002, + "loss": 2.534, + "step": 20 + }, + { + "epoch": 0.036855036855036855, + "grad_norm": 0.8931729197502136, + "learning_rate": 0.0002, + "loss": 2.1691, + "step": 30 + }, + { + "epoch": 0.04914004914004914, + "grad_norm": 0.9666458964347839, + "learning_rate": 0.0002, + "loss": 1.8813, + "step": 40 + }, + { + "epoch": 0.06142506142506143, + "grad_norm": 1.2691702842712402, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 50 + }, + { + "epoch": 0.07371007371007371, + "grad_norm": 1.0307111740112305, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 60 + }, + { + "epoch": 0.085995085995086, + "grad_norm": 1.1837389469146729, + "learning_rate": 0.0002, + "loss": 1.2987, + "step": 70 + }, + { + "epoch": 0.09828009828009827, + "grad_norm": 1.1481467485427856, + "learning_rate": 0.0002, + "loss": 1.2325, + "step": 80 + }, + { + "epoch": 0.11056511056511056, + "grad_norm": 1.0385297536849976, + "learning_rate": 0.0002, + "loss": 1.1425, + "step": 90 + }, + { + "epoch": 0.12285012285012285, + "grad_norm": 1.125789999961853, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 100 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 0.9630613923072815, + "learning_rate": 0.0002, + "loss": 1.0477, + "step": 110 + }, + { + "epoch": 0.14742014742014742, + "grad_norm": 1.060392141342163, + "learning_rate": 0.0002, + "loss": 1.0074, + "step": 120 + }, + { + "epoch": 0.1597051597051597, + "grad_norm": 1.0986546277999878, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 130 + }, + { + "epoch": 0.171990171990172, + "grad_norm": 1.1713459491729736, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 140 + }, + { + "epoch": 0.18427518427518427, + "grad_norm": 1.1548224687576294, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 150 + }, + { + "epoch": 0.19656019656019655, + "grad_norm": 1.2662502527236938, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 160 + }, + { + "epoch": 0.20884520884520885, + "grad_norm": 1.1521110534667969, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 170 + }, + { + "epoch": 0.22113022113022113, + "grad_norm": 1.1044857501983643, + "learning_rate": 0.0002, + "loss": 0.8931, + "step": 180 + }, + { + "epoch": 0.2334152334152334, + "grad_norm": 0.9770650267601013, + "learning_rate": 0.0002, + "loss": 0.9572, + "step": 190 + }, + { + "epoch": 0.2457002457002457, + "grad_norm": 0.9710931777954102, + "learning_rate": 0.0002, + "loss": 0.881, + "step": 200 + }, + { + "epoch": 0.257985257985258, + "grad_norm": 0.9593933820724487, + "learning_rate": 0.0002, + "loss": 0.9205, + "step": 210 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.003553032875061, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 220 + }, + { + "epoch": 0.28255528255528256, + "grad_norm": 0.9187764525413513, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 230 + }, + { + "epoch": 0.29484029484029484, + "grad_norm": 0.9294946789741516, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 240 + }, + { + "epoch": 0.3071253071253071, + "grad_norm": 0.9537560939788818, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 250 + }, + { + "epoch": 0.3194103194103194, + "grad_norm": 1.00537109375, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 260 + }, + { + "epoch": 0.3316953316953317, + "grad_norm": 0.8775776028633118, + "learning_rate": 0.0002, + "loss": 0.896, + "step": 270 + }, + { + "epoch": 0.343980343980344, + "grad_norm": 0.8316839933395386, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 280 + }, + { + "epoch": 0.35626535626535627, + "grad_norm": 0.8542073965072632, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 290 + }, + { + "epoch": 0.36855036855036855, + "grad_norm": 0.848444402217865, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 300 + }, + { + "epoch": 0.3808353808353808, + "grad_norm": 0.9017520546913147, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 310 + }, + { + "epoch": 0.3931203931203931, + "grad_norm": 0.7672467231750488, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 320 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 0.9109916687011719, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 330 + }, + { + "epoch": 0.4176904176904177, + "grad_norm": 0.8750321269035339, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 340 + }, + { + "epoch": 0.42997542997543, + "grad_norm": 0.7911098599433899, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 350 + }, + { + "epoch": 0.44226044226044225, + "grad_norm": 0.871601402759552, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 360 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.9393917918205261, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 370 + }, + { + "epoch": 0.4668304668304668, + "grad_norm": 0.8260403275489807, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 380 + }, + { + "epoch": 0.47911547911547914, + "grad_norm": 0.9792159199714661, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 390 + }, + { + "epoch": 0.4914004914004914, + "grad_norm": 0.9943315982818604, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 400 + }, + { + "epoch": 0.5036855036855037, + "grad_norm": 0.8999950885772705, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 410 + }, + { + "epoch": 0.515970515970516, + "grad_norm": 0.8348393440246582, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 420 + }, + { + "epoch": 0.5282555282555282, + "grad_norm": 0.7371744513511658, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 430 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.8354107141494751, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 440 + }, + { + "epoch": 0.5528255528255528, + "grad_norm": 0.8553793430328369, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 450 + }, + { + "epoch": 0.5651105651105651, + "grad_norm": 1.0762015581130981, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 460 + }, + { + "epoch": 0.5773955773955773, + "grad_norm": 0.8350747227668762, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 470 + }, + { + "epoch": 0.5896805896805897, + "grad_norm": 0.7819945216178894, + "learning_rate": 0.0002, + "loss": 0.7922, + "step": 480 + }, + { + "epoch": 0.601965601965602, + "grad_norm": 0.8079741597175598, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 490 + }, + { + "epoch": 0.6142506142506142, + "grad_norm": 0.776435911655426, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 500 + }, + { + "epoch": 0.6265356265356266, + "grad_norm": 0.7646855115890503, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 510 + }, + { + "epoch": 0.6388206388206388, + "grad_norm": 0.786396861076355, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 520 + }, + { + "epoch": 0.6511056511056511, + "grad_norm": 0.7016594409942627, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 530 + }, + { + "epoch": 0.6633906633906634, + "grad_norm": 0.8060444593429565, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 540 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.9087467789649963, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 550 + }, + { + "epoch": 0.687960687960688, + "grad_norm": 0.8149628639221191, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 560 + }, + { + "epoch": 0.7002457002457002, + "grad_norm": 0.7493641972541809, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 570 + }, + { + "epoch": 0.7125307125307125, + "grad_norm": 0.7958765625953674, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 580 + }, + { + "epoch": 0.7248157248157249, + "grad_norm": 0.7917273640632629, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 590 + }, + { + "epoch": 0.7371007371007371, + "grad_norm": 0.8040468692779541, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 600 + }, + { + "epoch": 0.7493857493857494, + "grad_norm": 0.8696851134300232, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 610 + }, + { + "epoch": 0.7616707616707616, + "grad_norm": 0.8418059945106506, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 620 + }, + { + "epoch": 0.773955773955774, + "grad_norm": 0.7754243612289429, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 630 + }, + { + "epoch": 0.7862407862407862, + "grad_norm": 0.7639613747596741, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 640 + }, + { + "epoch": 0.7985257985257985, + "grad_norm": 0.7516646385192871, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 650 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.7840844988822937, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 660 + }, + { + "epoch": 0.8230958230958231, + "grad_norm": 0.7657070755958557, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 670 + }, + { + "epoch": 0.8353808353808354, + "grad_norm": 0.7711591720581055, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 680 + }, + { + "epoch": 0.8476658476658476, + "grad_norm": 0.8026325106620789, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 690 + }, + { + "epoch": 0.85995085995086, + "grad_norm": 0.7902713418006897, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 700 + }, + { + "epoch": 0.8722358722358723, + "grad_norm": 0.8212456107139587, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 710 + }, + { + "epoch": 0.8845208845208845, + "grad_norm": 0.7867200970649719, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 720 + }, + { + "epoch": 0.8968058968058968, + "grad_norm": 0.80084627866745, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 730 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.7203794121742249, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 740 + }, + { + "epoch": 0.9213759213759214, + "grad_norm": 0.7598419785499573, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 750 + }, + { + "epoch": 0.9336609336609336, + "grad_norm": 0.7787027359008789, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 760 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.8444012403488159, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 770 + }, + { + "epoch": 0.9582309582309583, + "grad_norm": 0.7388550639152527, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 780 + }, + { + "epoch": 0.9705159705159705, + "grad_norm": 0.7379167079925537, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 790 + }, + { + "epoch": 0.9828009828009828, + "grad_norm": 0.8291640281677246, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 800 + }, + { + "epoch": 0.995085995085995, + "grad_norm": 0.7415094375610352, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 0.703994870185852, + "eval_runtime": 20.2182, + "eval_samples_per_second": 16.371, + "eval_steps_per_second": 2.077, + "step": 814 + }, + { + "epoch": 1.0073710073710074, + "grad_norm": 0.7405961751937866, + "learning_rate": 0.0002, + "loss": 0.6959, + "step": 820 + }, + { + "epoch": 1.0196560196560196, + "grad_norm": 0.8534344434738159, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 830 + }, + { + "epoch": 1.031941031941032, + "grad_norm": 0.7415764331817627, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 840 + }, + { + "epoch": 1.0442260442260443, + "grad_norm": 0.74293053150177, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 850 + }, + { + "epoch": 1.0565110565110565, + "grad_norm": 0.697727382183075, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 860 + }, + { + "epoch": 1.0687960687960687, + "grad_norm": 0.8022570013999939, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 870 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.7545800805091858, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 880 + }, + { + "epoch": 1.0933660933660934, + "grad_norm": 0.8005648255348206, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 890 + }, + { + "epoch": 1.1056511056511056, + "grad_norm": 0.7681778073310852, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 900 + }, + { + "epoch": 1.117936117936118, + "grad_norm": 0.7822468876838684, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 910 + }, + { + "epoch": 1.1302211302211302, + "grad_norm": 0.8324839472770691, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 920 + }, + { + "epoch": 1.1425061425061425, + "grad_norm": 0.8206289410591125, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 930 + }, + { + "epoch": 1.154791154791155, + "grad_norm": 0.786461591720581, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 940 + }, + { + "epoch": 1.1670761670761671, + "grad_norm": 0.8288539052009583, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 950 + }, + { + "epoch": 1.1793611793611793, + "grad_norm": 0.7566865682601929, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 960 + }, + { + "epoch": 1.1916461916461916, + "grad_norm": 0.7761894464492798, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 970 + }, + { + "epoch": 1.203931203931204, + "grad_norm": 0.7608440518379211, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 980 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.799745500087738, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 990 + }, + { + "epoch": 1.2285012285012284, + "grad_norm": 0.8135330677032471, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1000 + }, + { + "epoch": 1.2407862407862407, + "grad_norm": 0.7410391569137573, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 1010 + }, + { + "epoch": 1.253071253071253, + "grad_norm": 0.7826172709465027, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 1020 + }, + { + "epoch": 1.2653562653562653, + "grad_norm": 0.7210677862167358, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 1030 + }, + { + "epoch": 1.2776412776412776, + "grad_norm": 0.7571766972541809, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 1040 + }, + { + "epoch": 1.28992628992629, + "grad_norm": 0.8602666258811951, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 1050 + }, + { + "epoch": 1.3022113022113022, + "grad_norm": 0.8640648722648621, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 1060 + }, + { + "epoch": 1.3144963144963144, + "grad_norm": 0.7289374470710754, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 1070 + }, + { + "epoch": 1.3267813267813269, + "grad_norm": 0.8099908828735352, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 1080 + }, + { + "epoch": 1.339066339066339, + "grad_norm": 0.8623505234718323, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 1090 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.900576114654541, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 1100 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.729603111743927, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 1110 + }, + { + "epoch": 1.375921375921376, + "grad_norm": 0.8350434303283691, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 1120 + }, + { + "epoch": 1.3882063882063882, + "grad_norm": 0.8049437999725342, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 1130 + }, + { + "epoch": 1.4004914004914004, + "grad_norm": 0.8222764134407043, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 1140 + }, + { + "epoch": 1.4127764127764126, + "grad_norm": 0.7949751019477844, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 1150 + }, + { + "epoch": 1.425061425061425, + "grad_norm": 0.8375639915466309, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 1160 + }, + { + "epoch": 1.4373464373464373, + "grad_norm": 0.7261053919792175, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 1170 + }, + { + "epoch": 1.4496314496314495, + "grad_norm": 0.6918320655822754, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 1180 + }, + { + "epoch": 1.461916461916462, + "grad_norm": 0.8148727416992188, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 1190 + }, + { + "epoch": 1.4742014742014742, + "grad_norm": 0.7014724612236023, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 1200 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.8110846281051636, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 1210 + }, + { + "epoch": 1.4987714987714988, + "grad_norm": 0.8336407542228699, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 1220 + }, + { + "epoch": 1.511056511056511, + "grad_norm": 0.826996386051178, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 1230 + }, + { + "epoch": 1.5233415233415233, + "grad_norm": 0.7503120303153992, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 1240 + }, + { + "epoch": 1.5356265356265357, + "grad_norm": 0.8297192454338074, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 1250 + }, + { + "epoch": 1.547911547911548, + "grad_norm": 0.7585996985435486, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 1260 + }, + { + "epoch": 1.5601965601965602, + "grad_norm": 0.7530493140220642, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 1270 + }, + { + "epoch": 1.5724815724815726, + "grad_norm": 0.8141939640045166, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 1280 + }, + { + "epoch": 1.5847665847665846, + "grad_norm": 0.6959931254386902, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 1290 + }, + { + "epoch": 1.597051597051597, + "grad_norm": 0.8677428364753723, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 1300 + }, + { + "epoch": 1.6093366093366095, + "grad_norm": 0.8527476787567139, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 1310 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.8462157845497131, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 1320 + }, + { + "epoch": 1.633906633906634, + "grad_norm": 0.9371153712272644, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 1330 + }, + { + "epoch": 1.6461916461916462, + "grad_norm": 0.8408344984054565, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 1340 + }, + { + "epoch": 1.6584766584766584, + "grad_norm": 0.8391859531402588, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 1350 + }, + { + "epoch": 1.6707616707616708, + "grad_norm": 0.7630598545074463, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 1360 + }, + { + "epoch": 1.683046683046683, + "grad_norm": 0.8007895350456238, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 1370 + }, + { + "epoch": 1.6953316953316953, + "grad_norm": 0.7547900080680847, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 1380 + }, + { + "epoch": 1.7076167076167077, + "grad_norm": 0.7779742479324341, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 1390 + }, + { + "epoch": 1.71990171990172, + "grad_norm": 0.712293803691864, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 1400 + }, + { + "epoch": 1.7321867321867321, + "grad_norm": 0.8503297567367554, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 1410 + }, + { + "epoch": 1.7444717444717446, + "grad_norm": 0.8312245607376099, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 1420 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.7758049368858337, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 1430 + }, + { + "epoch": 1.769041769041769, + "grad_norm": 0.8695956468582153, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 1440 + }, + { + "epoch": 1.7813267813267815, + "grad_norm": 0.7785261273384094, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 1450 + }, + { + "epoch": 1.7936117936117935, + "grad_norm": 0.7091802358627319, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 1460 + }, + { + "epoch": 1.805896805896806, + "grad_norm": 0.774146556854248, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 1470 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.8342524170875549, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 1480 + }, + { + "epoch": 1.8304668304668303, + "grad_norm": 0.8087738156318665, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 1490 + }, + { + "epoch": 1.8427518427518428, + "grad_norm": 0.9830479621887207, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 1500 + }, + { + "epoch": 1.855036855036855, + "grad_norm": 0.8537567853927612, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 1510 + }, + { + "epoch": 1.8673218673218672, + "grad_norm": 0.8004562854766846, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 1520 + }, + { + "epoch": 1.8796068796068797, + "grad_norm": 0.8161284327507019, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 1530 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.8688093423843384, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 1540 + }, + { + "epoch": 1.904176904176904, + "grad_norm": 0.8287379741668701, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 1550 + }, + { + "epoch": 1.9164619164619165, + "grad_norm": 0.8050342202186584, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 1560 + }, + { + "epoch": 1.9287469287469288, + "grad_norm": 0.9273895621299744, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 1570 + }, + { + "epoch": 1.941031941031941, + "grad_norm": 0.8416891694068909, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 1580 + }, + { + "epoch": 1.9533169533169534, + "grad_norm": 0.7299820184707642, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 1590 + }, + { + "epoch": 1.9656019656019657, + "grad_norm": 0.7262272834777832, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 1600 + }, + { + "epoch": 1.9778869778869779, + "grad_norm": 0.8649004697799683, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 1610 + }, + { + "epoch": 1.9901719901719903, + "grad_norm": 0.8165444731712341, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 0.5858802795410156, + "eval_runtime": 22.6585, + "eval_samples_per_second": 14.608, + "eval_steps_per_second": 1.854, + "step": 1628 + }, + { + "epoch": 2.0024570024570023, + "grad_norm": 0.8142582178115845, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 1630 + }, + { + "epoch": 2.0147420147420148, + "grad_norm": 1.0637224912643433, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 1640 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.8923280239105225, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 1650 + }, + { + "epoch": 2.039312039312039, + "grad_norm": 0.8169175386428833, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 1660 + }, + { + "epoch": 2.0515970515970516, + "grad_norm": 0.8124040365219116, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 1670 + }, + { + "epoch": 2.063882063882064, + "grad_norm": 0.9228773713111877, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 1680 + }, + { + "epoch": 2.076167076167076, + "grad_norm": 0.7216871380805969, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 1690 + }, + { + "epoch": 2.0884520884520885, + "grad_norm": 0.8679503202438354, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 1700 + }, + { + "epoch": 2.100737100737101, + "grad_norm": 0.8627730011940002, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 1710 + }, + { + "epoch": 2.113022113022113, + "grad_norm": 0.9175152778625488, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 1720 + }, + { + "epoch": 2.1253071253071254, + "grad_norm": 0.7930372953414917, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 1730 + }, + { + "epoch": 2.1375921375921374, + "grad_norm": 0.8370155692100525, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 1740 + }, + { + "epoch": 2.14987714987715, + "grad_norm": 0.9121434688568115, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 1750 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.8703579306602478, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 1760 + }, + { + "epoch": 2.1744471744471743, + "grad_norm": 0.9270512461662292, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 1770 + }, + { + "epoch": 2.1867321867321867, + "grad_norm": 0.9372949600219727, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 1780 + }, + { + "epoch": 2.199017199017199, + "grad_norm": 0.8955178260803223, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 1790 + }, + { + "epoch": 2.211302211302211, + "grad_norm": 0.846102237701416, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 1800 + }, + { + "epoch": 2.2235872235872236, + "grad_norm": 0.9186713099479675, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 1810 + }, + { + "epoch": 2.235872235872236, + "grad_norm": 0.7695123553276062, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 1820 + }, + { + "epoch": 2.248157248157248, + "grad_norm": 0.7340332865715027, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 1830 + }, + { + "epoch": 2.2604422604422605, + "grad_norm": 0.8933137655258179, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 1840 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.7705038189888, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 1850 + }, + { + "epoch": 2.285012285012285, + "grad_norm": 0.8396083116531372, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 1860 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 0.7695736289024353, + "learning_rate": 0.0002, + "loss": 0.5335, + "step": 1870 + }, + { + "epoch": 2.30958230958231, + "grad_norm": 0.8535045385360718, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 1880 + }, + { + "epoch": 2.321867321867322, + "grad_norm": 0.8549142479896545, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 1890 + }, + { + "epoch": 2.3341523341523343, + "grad_norm": 0.9124433994293213, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 1900 + }, + { + "epoch": 2.3464373464373462, + "grad_norm": 0.855523943901062, + "learning_rate": 0.0002, + "loss": 0.506, + "step": 1910 + }, + { + "epoch": 2.3587223587223587, + "grad_norm": 0.810878336429596, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 1920 + }, + { + "epoch": 2.371007371007371, + "grad_norm": 0.7409024834632874, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 1930 + }, + { + "epoch": 2.383292383292383, + "grad_norm": 0.8080927729606628, + "learning_rate": 0.0002, + "loss": 0.5045, + "step": 1940 + }, + { + "epoch": 2.3955773955773956, + "grad_norm": 0.9661469459533691, + "learning_rate": 0.0002, + "loss": 0.5032, + "step": 1950 + }, + { + "epoch": 2.407862407862408, + "grad_norm": 0.838766872882843, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 1960 + }, + { + "epoch": 2.42014742014742, + "grad_norm": 0.8737491965293884, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 1970 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.8657792210578918, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 1980 + }, + { + "epoch": 2.444717444717445, + "grad_norm": 0.8883858919143677, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 1990 + }, + { + "epoch": 2.457002457002457, + "grad_norm": 0.8647662997245789, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 2000 + }, + { + "epoch": 2.4692874692874693, + "grad_norm": 0.896037757396698, + "learning_rate": 0.0002, + "loss": 0.518, + "step": 2010 + }, + { + "epoch": 2.4815724815724813, + "grad_norm": 0.8079167008399963, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 2020 + }, + { + "epoch": 2.493857493857494, + "grad_norm": 1.0293292999267578, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 2030 + }, + { + "epoch": 2.506142506142506, + "grad_norm": 0.8459244966506958, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 2040 + }, + { + "epoch": 2.5184275184275187, + "grad_norm": 0.9244982600212097, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 2050 + }, + { + "epoch": 2.5307125307125307, + "grad_norm": 0.8245007991790771, + "learning_rate": 0.0002, + "loss": 0.5006, + "step": 2060 + }, + { + "epoch": 2.542997542997543, + "grad_norm": 0.8869297504425049, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 2070 + }, + { + "epoch": 2.555282555282555, + "grad_norm": 0.8620884418487549, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 2080 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.8387904167175293, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 2090 + }, + { + "epoch": 2.57985257985258, + "grad_norm": 0.8353935480117798, + "learning_rate": 0.0002, + "loss": 0.4974, + "step": 2100 + }, + { + "epoch": 2.592137592137592, + "grad_norm": 1.0136934518814087, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 2110 + }, + { + "epoch": 2.6044226044226044, + "grad_norm": 0.9387392997741699, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 2120 + }, + { + "epoch": 2.616707616707617, + "grad_norm": 0.898697555065155, + "learning_rate": 0.0002, + "loss": 0.4971, + "step": 2130 + }, + { + "epoch": 2.628992628992629, + "grad_norm": 1.0145231485366821, + "learning_rate": 0.0002, + "loss": 0.4981, + "step": 2140 + }, + { + "epoch": 2.6412776412776413, + "grad_norm": 0.8335273265838623, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 2150 + }, + { + "epoch": 2.6535626535626538, + "grad_norm": 1.0198529958724976, + "learning_rate": 0.0002, + "loss": 0.5129, + "step": 2160 + }, + { + "epoch": 2.6658476658476657, + "grad_norm": 0.8353323340415955, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 2170 + }, + { + "epoch": 2.678132678132678, + "grad_norm": 0.8831406831741333, + "learning_rate": 0.0002, + "loss": 0.4818, + "step": 2180 + }, + { + "epoch": 2.69041769041769, + "grad_norm": 0.7182748913764954, + "learning_rate": 0.0002, + "loss": 0.4858, + "step": 2190 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 0.7892552614212036, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 2200 + }, + { + "epoch": 2.714987714987715, + "grad_norm": 1.0144033432006836, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 2210 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 1.0913645029067993, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 2220 + }, + { + "epoch": 2.7395577395577395, + "grad_norm": 1.014394998550415, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 2230 + }, + { + "epoch": 2.751842751842752, + "grad_norm": 0.8118020296096802, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 2240 + }, + { + "epoch": 2.764127764127764, + "grad_norm": 0.9027737379074097, + "learning_rate": 0.0002, + "loss": 0.5088, + "step": 2250 + }, + { + "epoch": 2.7764127764127764, + "grad_norm": 0.8017747402191162, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 2260 + }, + { + "epoch": 2.788697788697789, + "grad_norm": 0.788362979888916, + "learning_rate": 0.0002, + "loss": 0.4957, + "step": 2270 + }, + { + "epoch": 2.800982800982801, + "grad_norm": 0.8338918089866638, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 2280 + }, + { + "epoch": 2.8132678132678133, + "grad_norm": 0.8773167729377747, + "learning_rate": 0.0002, + "loss": 0.4925, + "step": 2290 + }, + { + "epoch": 2.8255528255528253, + "grad_norm": 0.9319674372673035, + "learning_rate": 0.0002, + "loss": 0.4806, + "step": 2300 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 0.8632726073265076, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 2310 + }, + { + "epoch": 2.85012285012285, + "grad_norm": 0.785464882850647, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 2320 + }, + { + "epoch": 2.8624078624078626, + "grad_norm": 0.8159732818603516, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 2330 + }, + { + "epoch": 2.8746928746928746, + "grad_norm": 0.8702368140220642, + "learning_rate": 0.0002, + "loss": 0.4796, + "step": 2340 + }, + { + "epoch": 2.886977886977887, + "grad_norm": 1.0456738471984863, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 2350 + }, + { + "epoch": 2.899262899262899, + "grad_norm": 1.0855203866958618, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 2360 + }, + { + "epoch": 2.9115479115479115, + "grad_norm": 0.9378156065940857, + "learning_rate": 0.0002, + "loss": 0.4758, + "step": 2370 + }, + { + "epoch": 2.923832923832924, + "grad_norm": 0.7390182018280029, + "learning_rate": 0.0002, + "loss": 0.4831, + "step": 2380 + }, + { + "epoch": 2.9361179361179364, + "grad_norm": 0.7667133212089539, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 2390 + }, + { + "epoch": 2.9484029484029484, + "grad_norm": 0.8633476495742798, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 2400 + }, + { + "epoch": 2.960687960687961, + "grad_norm": 1.0821104049682617, + "learning_rate": 0.0002, + "loss": 0.4993, + "step": 2410 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.8911418914794922, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 2420 + }, + { + "epoch": 2.9852579852579852, + "grad_norm": 0.8791135549545288, + "learning_rate": 0.0002, + "loss": 0.4819, + "step": 2430 + }, + { + "epoch": 2.9975429975429977, + "grad_norm": 0.8066530823707581, + "learning_rate": 0.0002, + "loss": 0.4875, + "step": 2440 + }, + { + "epoch": 3.0, + "eval_loss": 0.49752503633499146, + "eval_runtime": 20.2911, + "eval_samples_per_second": 16.313, + "eval_steps_per_second": 2.07, + "step": 2442 + }, + { + "epoch": 3.0098280098280097, + "grad_norm": 0.7644656896591187, + "learning_rate": 0.0002, + "loss": 0.4362, + "step": 2450 + }, + { + "epoch": 3.022113022113022, + "grad_norm": 0.9077525734901428, + "learning_rate": 0.0002, + "loss": 0.4363, + "step": 2460 + }, + { + "epoch": 3.0343980343980346, + "grad_norm": 0.7859287261962891, + "learning_rate": 0.0002, + "loss": 0.422, + "step": 2470 + }, + { + "epoch": 3.0466830466830466, + "grad_norm": 1.1200323104858398, + "learning_rate": 0.0002, + "loss": 0.4574, + "step": 2480 + }, + { + "epoch": 3.058968058968059, + "grad_norm": 0.7570453882217407, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 2490 + }, + { + "epoch": 3.0712530712530715, + "grad_norm": 0.9450915455818176, + "learning_rate": 0.0002, + "loss": 0.4351, + "step": 2500 + }, + { + "epoch": 3.0835380835380835, + "grad_norm": 0.8303545117378235, + "learning_rate": 0.0002, + "loss": 0.4343, + "step": 2510 + }, + { + "epoch": 3.095823095823096, + "grad_norm": 0.8864443898200989, + "learning_rate": 0.0002, + "loss": 0.4308, + "step": 2520 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 0.945324718952179, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 2530 + }, + { + "epoch": 3.1203931203931203, + "grad_norm": 1.0562494993209839, + "learning_rate": 0.0002, + "loss": 0.4345, + "step": 2540 + }, + { + "epoch": 3.1326781326781328, + "grad_norm": 0.8607500195503235, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 2550 + }, + { + "epoch": 3.1449631449631448, + "grad_norm": 0.8719640374183655, + "learning_rate": 0.0002, + "loss": 0.456, + "step": 2560 + }, + { + "epoch": 3.157248157248157, + "grad_norm": 0.8647059202194214, + "learning_rate": 0.0002, + "loss": 0.4469, + "step": 2570 + }, + { + "epoch": 3.1695331695331697, + "grad_norm": 0.8346507549285889, + "learning_rate": 0.0002, + "loss": 0.4483, + "step": 2580 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 1.0208854675292969, + "learning_rate": 0.0002, + "loss": 0.4331, + "step": 2590 + }, + { + "epoch": 3.194103194103194, + "grad_norm": 0.7064385414123535, + "learning_rate": 0.0002, + "loss": 0.435, + "step": 2600 + }, + { + "epoch": 3.2063882063882065, + "grad_norm": 0.927347719669342, + "learning_rate": 0.0002, + "loss": 0.4541, + "step": 2610 + }, + { + "epoch": 3.2186732186732185, + "grad_norm": 0.943517804145813, + "learning_rate": 0.0002, + "loss": 0.4561, + "step": 2620 + }, + { + "epoch": 3.230958230958231, + "grad_norm": 0.7837198376655579, + "learning_rate": 0.0002, + "loss": 0.4225, + "step": 2630 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 0.7752765417098999, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 2640 + }, + { + "epoch": 3.2555282555282554, + "grad_norm": 0.8578953146934509, + "learning_rate": 0.0002, + "loss": 0.4468, + "step": 2650 + }, + { + "epoch": 3.267813267813268, + "grad_norm": 1.0209529399871826, + "learning_rate": 0.0002, + "loss": 0.4393, + "step": 2660 + }, + { + "epoch": 3.2800982800982803, + "grad_norm": 0.9069030284881592, + "learning_rate": 0.0002, + "loss": 0.4517, + "step": 2670 + }, + { + "epoch": 3.2923832923832923, + "grad_norm": 0.8454729318618774, + "learning_rate": 0.0002, + "loss": 0.4262, + "step": 2680 + }, + { + "epoch": 3.3046683046683047, + "grad_norm": 0.8253099322319031, + "learning_rate": 0.0002, + "loss": 0.4349, + "step": 2690 + }, + { + "epoch": 3.3169533169533167, + "grad_norm": 0.8765934109687805, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 2700 + }, + { + "epoch": 3.329238329238329, + "grad_norm": 0.8149126172065735, + "learning_rate": 0.0002, + "loss": 0.4518, + "step": 2710 + }, + { + "epoch": 3.3415233415233416, + "grad_norm": 0.8820102214813232, + "learning_rate": 0.0002, + "loss": 0.4437, + "step": 2720 + }, + { + "epoch": 3.3538083538083536, + "grad_norm": 0.8813952803611755, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 2730 + }, + { + "epoch": 3.366093366093366, + "grad_norm": 1.0338447093963623, + "learning_rate": 0.0002, + "loss": 0.4396, + "step": 2740 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 0.8780209422111511, + "learning_rate": 0.0002, + "loss": 0.4468, + "step": 2750 + }, + { + "epoch": 3.3906633906633905, + "grad_norm": 0.9017151594161987, + "learning_rate": 0.0002, + "loss": 0.441, + "step": 2760 + }, + { + "epoch": 3.402948402948403, + "grad_norm": 0.8647638559341431, + "learning_rate": 0.0002, + "loss": 0.446, + "step": 2770 + }, + { + "epoch": 3.4152334152334154, + "grad_norm": 0.8298183679580688, + "learning_rate": 0.0002, + "loss": 0.4131, + "step": 2780 + }, + { + "epoch": 3.4275184275184274, + "grad_norm": 0.9298108816146851, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 2790 + }, + { + "epoch": 3.43980343980344, + "grad_norm": 0.8909980058670044, + "learning_rate": 0.0002, + "loss": 0.4145, + "step": 2800 + }, + { + "epoch": 3.4520884520884523, + "grad_norm": 0.8027496933937073, + "learning_rate": 0.0002, + "loss": 0.4148, + "step": 2810 + }, + { + "epoch": 3.4643734643734643, + "grad_norm": 0.8766195774078369, + "learning_rate": 0.0002, + "loss": 0.4244, + "step": 2820 + }, + { + "epoch": 3.4766584766584767, + "grad_norm": 0.8194443583488464, + "learning_rate": 0.0002, + "loss": 0.4292, + "step": 2830 + }, + { + "epoch": 3.488943488943489, + "grad_norm": 0.9862873554229736, + "learning_rate": 0.0002, + "loss": 0.4305, + "step": 2840 + }, + { + "epoch": 3.501228501228501, + "grad_norm": 0.8755377531051636, + "learning_rate": 0.0002, + "loss": 0.4393, + "step": 2850 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 0.7300266027450562, + "learning_rate": 0.0002, + "loss": 0.4231, + "step": 2860 + }, + { + "epoch": 3.5257985257985256, + "grad_norm": 0.8342461585998535, + "learning_rate": 0.0002, + "loss": 0.4278, + "step": 2870 + }, + { + "epoch": 3.538083538083538, + "grad_norm": 0.8624151349067688, + "learning_rate": 0.0002, + "loss": 0.4395, + "step": 2880 + }, + { + "epoch": 3.5503685503685505, + "grad_norm": 0.8931261301040649, + "learning_rate": 0.0002, + "loss": 0.4064, + "step": 2890 + }, + { + "epoch": 3.562653562653563, + "grad_norm": 0.8617086410522461, + "learning_rate": 0.0002, + "loss": 0.4358, + "step": 2900 + }, + { + "epoch": 3.574938574938575, + "grad_norm": 0.8754099607467651, + "learning_rate": 0.0002, + "loss": 0.419, + "step": 2910 + }, + { + "epoch": 3.5872235872235874, + "grad_norm": 0.8345834612846375, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 2920 + }, + { + "epoch": 3.5995085995085994, + "grad_norm": 1.1414062976837158, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 2930 + }, + { + "epoch": 3.611793611793612, + "grad_norm": 0.994860053062439, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 2940 + }, + { + "epoch": 3.6240786240786242, + "grad_norm": 1.19268000125885, + "learning_rate": 0.0002, + "loss": 0.4386, + "step": 2950 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.8399543762207031, + "learning_rate": 0.0002, + "loss": 0.4029, + "step": 2960 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 0.9873217940330505, + "learning_rate": 0.0002, + "loss": 0.4432, + "step": 2970 + }, + { + "epoch": 3.6609336609336607, + "grad_norm": 0.9116013646125793, + "learning_rate": 0.0002, + "loss": 0.4308, + "step": 2980 + }, + { + "epoch": 3.673218673218673, + "grad_norm": 0.9503833651542664, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 2990 + }, + { + "epoch": 3.6855036855036856, + "grad_norm": 0.9401112794876099, + "learning_rate": 0.0002, + "loss": 0.4306, + "step": 3000 + }, + { + "epoch": 3.697788697788698, + "grad_norm": 1.00745689868927, + "learning_rate": 0.0002, + "loss": 0.4333, + "step": 3010 + }, + { + "epoch": 3.71007371007371, + "grad_norm": 1.0553191900253296, + "learning_rate": 0.0002, + "loss": 0.432, + "step": 3020 + }, + { + "epoch": 3.7223587223587224, + "grad_norm": 1.0226953029632568, + "learning_rate": 0.0002, + "loss": 0.4321, + "step": 3030 + }, + { + "epoch": 3.7346437346437344, + "grad_norm": 1.085554838180542, + "learning_rate": 0.0002, + "loss": 0.418, + "step": 3040 + }, + { + "epoch": 3.746928746928747, + "grad_norm": 0.9948731064796448, + "learning_rate": 0.0002, + "loss": 0.4196, + "step": 3050 + }, + { + "epoch": 3.7592137592137593, + "grad_norm": 0.9328727126121521, + "learning_rate": 0.0002, + "loss": 0.4281, + "step": 3060 + }, + { + "epoch": 3.7714987714987718, + "grad_norm": 1.0533266067504883, + "learning_rate": 0.0002, + "loss": 0.4284, + "step": 3070 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 0.8213809132575989, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 3080 + }, + { + "epoch": 3.796068796068796, + "grad_norm": 0.8941594362258911, + "learning_rate": 0.0002, + "loss": 0.4348, + "step": 3090 + }, + { + "epoch": 3.808353808353808, + "grad_norm": 0.8324518203735352, + "learning_rate": 0.0002, + "loss": 0.4266, + "step": 3100 + }, + { + "epoch": 3.8206388206388207, + "grad_norm": 0.8811233639717102, + "learning_rate": 0.0002, + "loss": 0.4227, + "step": 3110 + }, + { + "epoch": 3.832923832923833, + "grad_norm": 0.8781470060348511, + "learning_rate": 0.0002, + "loss": 0.4195, + "step": 3120 + }, + { + "epoch": 3.845208845208845, + "grad_norm": 0.8994116187095642, + "learning_rate": 0.0002, + "loss": 0.4277, + "step": 3130 + }, + { + "epoch": 3.8574938574938575, + "grad_norm": 0.8605017066001892, + "learning_rate": 0.0002, + "loss": 0.4149, + "step": 3140 + }, + { + "epoch": 3.8697788697788695, + "grad_norm": 0.8966400027275085, + "learning_rate": 0.0002, + "loss": 0.4023, + "step": 3150 + }, + { + "epoch": 3.882063882063882, + "grad_norm": 0.8856554627418518, + "learning_rate": 0.0002, + "loss": 0.4245, + "step": 3160 + }, + { + "epoch": 3.8943488943488944, + "grad_norm": 0.8971620798110962, + "learning_rate": 0.0002, + "loss": 0.4101, + "step": 3170 + }, + { + "epoch": 3.906633906633907, + "grad_norm": 0.9807813167572021, + "learning_rate": 0.0002, + "loss": 0.3993, + "step": 3180 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 0.8614121675491333, + "learning_rate": 0.0002, + "loss": 0.4258, + "step": 3190 + }, + { + "epoch": 3.9312039312039313, + "grad_norm": 0.989171028137207, + "learning_rate": 0.0002, + "loss": 0.4115, + "step": 3200 + }, + { + "epoch": 3.9434889434889433, + "grad_norm": 0.8168872594833374, + "learning_rate": 0.0002, + "loss": 0.4182, + "step": 3210 + }, + { + "epoch": 3.9557739557739557, + "grad_norm": 0.8109386563301086, + "learning_rate": 0.0002, + "loss": 0.4112, + "step": 3220 + }, + { + "epoch": 3.968058968058968, + "grad_norm": 1.0175853967666626, + "learning_rate": 0.0002, + "loss": 0.4165, + "step": 3230 + }, + { + "epoch": 3.98034398034398, + "grad_norm": 0.936143159866333, + "learning_rate": 0.0002, + "loss": 0.4146, + "step": 3240 + }, + { + "epoch": 3.9926289926289926, + "grad_norm": 0.9557915925979614, + "learning_rate": 0.0002, + "loss": 0.4163, + "step": 3250 + }, + { + "epoch": 4.0, + "eval_loss": 0.4401616156101227, + "eval_runtime": 20.8047, + "eval_samples_per_second": 15.91, + "eval_steps_per_second": 2.019, + "step": 3256 + }, + { + "epoch": 4.004914004914005, + "grad_norm": 0.7590614557266235, + "learning_rate": 0.0002, + "loss": 0.408, + "step": 3260 + }, + { + "epoch": 4.017199017199017, + "grad_norm": 0.8920791149139404, + "learning_rate": 0.0002, + "loss": 0.4001, + "step": 3270 + }, + { + "epoch": 4.0294840294840295, + "grad_norm": 0.8640421628952026, + "learning_rate": 0.0002, + "loss": 0.3789, + "step": 3280 + }, + { + "epoch": 4.041769041769042, + "grad_norm": 0.9074113965034485, + "learning_rate": 0.0002, + "loss": 0.3791, + "step": 3290 + }, + { + "epoch": 4.054054054054054, + "grad_norm": 1.0600885152816772, + "learning_rate": 0.0002, + "loss": 0.3728, + "step": 3300 + }, + { + "epoch": 4.066339066339066, + "grad_norm": 0.9682773351669312, + "learning_rate": 0.0002, + "loss": 0.3857, + "step": 3310 + }, + { + "epoch": 4.078624078624078, + "grad_norm": 0.9326395392417908, + "learning_rate": 0.0002, + "loss": 0.4007, + "step": 3320 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 0.8886597156524658, + "learning_rate": 0.0002, + "loss": 0.3823, + "step": 3330 + }, + { + "epoch": 4.103194103194103, + "grad_norm": 1.032205581665039, + "learning_rate": 0.0002, + "loss": 0.3929, + "step": 3340 + }, + { + "epoch": 4.115479115479116, + "grad_norm": 0.8669408559799194, + "learning_rate": 0.0002, + "loss": 0.3836, + "step": 3350 + }, + { + "epoch": 4.127764127764128, + "grad_norm": 0.8250347971916199, + "learning_rate": 0.0002, + "loss": 0.3866, + "step": 3360 + }, + { + "epoch": 4.14004914004914, + "grad_norm": 0.7919842600822449, + "learning_rate": 0.0002, + "loss": 0.3826, + "step": 3370 + }, + { + "epoch": 4.152334152334152, + "grad_norm": 1.045682430267334, + "learning_rate": 0.0002, + "loss": 0.3838, + "step": 3380 + }, + { + "epoch": 4.164619164619165, + "grad_norm": 0.6873571276664734, + "learning_rate": 0.0002, + "loss": 0.3796, + "step": 3390 + }, + { + "epoch": 4.176904176904177, + "grad_norm": 1.0227675437927246, + "learning_rate": 0.0002, + "loss": 0.3942, + "step": 3400 + }, + { + "epoch": 4.1891891891891895, + "grad_norm": 0.9167711734771729, + "learning_rate": 0.0002, + "loss": 0.3788, + "step": 3410 + }, + { + "epoch": 4.201474201474202, + "grad_norm": 1.0598796606063843, + "learning_rate": 0.0002, + "loss": 0.3792, + "step": 3420 + }, + { + "epoch": 4.2137592137592135, + "grad_norm": 0.8581843972206116, + "learning_rate": 0.0002, + "loss": 0.3955, + "step": 3430 + }, + { + "epoch": 4.226044226044226, + "grad_norm": 0.8862360119819641, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 3440 + }, + { + "epoch": 4.238329238329238, + "grad_norm": 1.0248323678970337, + "learning_rate": 0.0002, + "loss": 0.3889, + "step": 3450 + }, + { + "epoch": 4.250614250614251, + "grad_norm": 0.8746261596679688, + "learning_rate": 0.0002, + "loss": 0.3827, + "step": 3460 + }, + { + "epoch": 4.262899262899263, + "grad_norm": 0.7442536354064941, + "learning_rate": 0.0002, + "loss": 0.3949, + "step": 3470 + }, + { + "epoch": 4.275184275184275, + "grad_norm": 0.8295119404792786, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 3480 + }, + { + "epoch": 4.287469287469287, + "grad_norm": 1.0634245872497559, + "learning_rate": 0.0002, + "loss": 0.3895, + "step": 3490 + }, + { + "epoch": 4.2997542997543, + "grad_norm": 0.9554621577262878, + "learning_rate": 0.0002, + "loss": 0.3955, + "step": 3500 + }, + { + "epoch": 4.312039312039312, + "grad_norm": 1.0191723108291626, + "learning_rate": 0.0002, + "loss": 0.3826, + "step": 3510 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 0.8573611378669739, + "learning_rate": 0.0002, + "loss": 0.3828, + "step": 3520 + }, + { + "epoch": 4.336609336609337, + "grad_norm": 0.9082390069961548, + "learning_rate": 0.0002, + "loss": 0.3869, + "step": 3530 + }, + { + "epoch": 4.348894348894349, + "grad_norm": 0.8650212287902832, + "learning_rate": 0.0002, + "loss": 0.3902, + "step": 3540 + }, + { + "epoch": 4.361179361179361, + "grad_norm": 0.7186297178268433, + "learning_rate": 0.0002, + "loss": 0.3915, + "step": 3550 + }, + { + "epoch": 4.3734643734643734, + "grad_norm": 0.9750986695289612, + "learning_rate": 0.0002, + "loss": 0.3861, + "step": 3560 + }, + { + "epoch": 4.385749385749386, + "grad_norm": 1.0710467100143433, + "learning_rate": 0.0002, + "loss": 0.3967, + "step": 3570 + }, + { + "epoch": 4.398034398034398, + "grad_norm": 0.7974869012832642, + "learning_rate": 0.0002, + "loss": 0.3774, + "step": 3580 + }, + { + "epoch": 4.41031941031941, + "grad_norm": 0.9405913949012756, + "learning_rate": 0.0002, + "loss": 0.3738, + "step": 3590 + }, + { + "epoch": 4.422604422604422, + "grad_norm": 0.9393602609634399, + "learning_rate": 0.0002, + "loss": 0.3982, + "step": 3600 + }, + { + "epoch": 4.434889434889435, + "grad_norm": 1.0798007249832153, + "learning_rate": 0.0002, + "loss": 0.3913, + "step": 3610 + }, + { + "epoch": 4.447174447174447, + "grad_norm": 0.9226186275482178, + "learning_rate": 0.0002, + "loss": 0.3682, + "step": 3620 + }, + { + "epoch": 4.45945945945946, + "grad_norm": 1.1046524047851562, + "learning_rate": 0.0002, + "loss": 0.3742, + "step": 3630 + }, + { + "epoch": 4.471744471744472, + "grad_norm": 0.8848567605018616, + "learning_rate": 0.0002, + "loss": 0.3886, + "step": 3640 + }, + { + "epoch": 4.484029484029484, + "grad_norm": 0.8913224339485168, + "learning_rate": 0.0002, + "loss": 0.3848, + "step": 3650 + }, + { + "epoch": 4.496314496314496, + "grad_norm": 0.8497583270072937, + "learning_rate": 0.0002, + "loss": 0.3731, + "step": 3660 + }, + { + "epoch": 4.5085995085995085, + "grad_norm": 0.8263831734657288, + "learning_rate": 0.0002, + "loss": 0.3804, + "step": 3670 + }, + { + "epoch": 4.520884520884521, + "grad_norm": 0.8470269441604614, + "learning_rate": 0.0002, + "loss": 0.3815, + "step": 3680 + }, + { + "epoch": 4.533169533169533, + "grad_norm": 0.860038161277771, + "learning_rate": 0.0002, + "loss": 0.3774, + "step": 3690 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.8898552656173706, + "learning_rate": 0.0002, + "loss": 0.3817, + "step": 3700 + }, + { + "epoch": 4.557739557739557, + "grad_norm": 0.8152070641517639, + "learning_rate": 0.0002, + "loss": 0.3776, + "step": 3710 + }, + { + "epoch": 4.57002457002457, + "grad_norm": 0.7847675085067749, + "learning_rate": 0.0002, + "loss": 0.383, + "step": 3720 + }, + { + "epoch": 4.582309582309582, + "grad_norm": 0.9625533819198608, + "learning_rate": 0.0002, + "loss": 0.3791, + "step": 3730 + }, + { + "epoch": 4.594594594594595, + "grad_norm": 0.9097456336021423, + "learning_rate": 0.0002, + "loss": 0.3699, + "step": 3740 + }, + { + "epoch": 4.606879606879607, + "grad_norm": 0.871329128742218, + "learning_rate": 0.0002, + "loss": 0.3673, + "step": 3750 + }, + { + "epoch": 4.61916461916462, + "grad_norm": 0.9879975914955139, + "learning_rate": 0.0002, + "loss": 0.3725, + "step": 3760 + }, + { + "epoch": 4.631449631449631, + "grad_norm": 0.8636731505393982, + "learning_rate": 0.0002, + "loss": 0.3827, + "step": 3770 + }, + { + "epoch": 4.643734643734644, + "grad_norm": 1.0488964319229126, + "learning_rate": 0.0002, + "loss": 0.3755, + "step": 3780 + }, + { + "epoch": 4.656019656019656, + "grad_norm": 0.7637056112289429, + "learning_rate": 0.0002, + "loss": 0.3738, + "step": 3790 + }, + { + "epoch": 4.6683046683046685, + "grad_norm": 0.8507546186447144, + "learning_rate": 0.0002, + "loss": 0.3676, + "step": 3800 + }, + { + "epoch": 4.680589680589681, + "grad_norm": 1.0216856002807617, + "learning_rate": 0.0002, + "loss": 0.3852, + "step": 3810 + }, + { + "epoch": 4.6928746928746925, + "grad_norm": 1.026343822479248, + "learning_rate": 0.0002, + "loss": 0.3751, + "step": 3820 + }, + { + "epoch": 4.705159705159705, + "grad_norm": 0.8311620950698853, + "learning_rate": 0.0002, + "loss": 0.3687, + "step": 3830 + }, + { + "epoch": 4.717444717444717, + "grad_norm": 0.7770653367042542, + "learning_rate": 0.0002, + "loss": 0.3771, + "step": 3840 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 0.7616215348243713, + "learning_rate": 0.0002, + "loss": 0.37, + "step": 3850 + }, + { + "epoch": 4.742014742014742, + "grad_norm": 1.0377072095870972, + "learning_rate": 0.0002, + "loss": 0.3927, + "step": 3860 + }, + { + "epoch": 4.754299754299755, + "grad_norm": 0.9713505506515503, + "learning_rate": 0.0002, + "loss": 0.3832, + "step": 3870 + }, + { + "epoch": 4.766584766584766, + "grad_norm": 0.8803321719169617, + "learning_rate": 0.0002, + "loss": 0.3722, + "step": 3880 + }, + { + "epoch": 4.778869778869779, + "grad_norm": 0.885535478591919, + "learning_rate": 0.0002, + "loss": 0.3756, + "step": 3890 + }, + { + "epoch": 4.791154791154791, + "grad_norm": 1.0877983570098877, + "learning_rate": 0.0002, + "loss": 0.3714, + "step": 3900 + }, + { + "epoch": 4.803439803439804, + "grad_norm": 0.7875366806983948, + "learning_rate": 0.0002, + "loss": 0.3879, + "step": 3910 + }, + { + "epoch": 4.815724815724816, + "grad_norm": 0.8550102114677429, + "learning_rate": 0.0002, + "loss": 0.3591, + "step": 3920 + }, + { + "epoch": 4.828009828009828, + "grad_norm": 1.0217846632003784, + "learning_rate": 0.0002, + "loss": 0.3716, + "step": 3930 + }, + { + "epoch": 4.84029484029484, + "grad_norm": 0.7315713167190552, + "learning_rate": 0.0002, + "loss": 0.3649, + "step": 3940 + }, + { + "epoch": 4.8525798525798525, + "grad_norm": 0.8924923539161682, + "learning_rate": 0.0002, + "loss": 0.3879, + "step": 3950 + }, + { + "epoch": 4.864864864864865, + "grad_norm": 0.9730218052864075, + "learning_rate": 0.0002, + "loss": 0.3669, + "step": 3960 + }, + { + "epoch": 4.877149877149877, + "grad_norm": 0.9202003479003906, + "learning_rate": 0.0002, + "loss": 0.3705, + "step": 3970 + }, + { + "epoch": 4.88943488943489, + "grad_norm": 0.8173081874847412, + "learning_rate": 0.0002, + "loss": 0.3617, + "step": 3980 + }, + { + "epoch": 4.901719901719901, + "grad_norm": 0.7178564667701721, + "learning_rate": 0.0002, + "loss": 0.37, + "step": 3990 + }, + { + "epoch": 4.914004914004914, + "grad_norm": 0.913684606552124, + "learning_rate": 0.0002, + "loss": 0.3768, + "step": 4000 + }, + { + "epoch": 4.926289926289926, + "grad_norm": 0.8817896842956543, + "learning_rate": 0.0002, + "loss": 0.3755, + "step": 4010 + }, + { + "epoch": 4.938574938574939, + "grad_norm": 0.7652186751365662, + "learning_rate": 0.0002, + "loss": 0.3676, + "step": 4020 + }, + { + "epoch": 4.950859950859951, + "grad_norm": 0.8828630447387695, + "learning_rate": 0.0002, + "loss": 0.3699, + "step": 4030 + }, + { + "epoch": 4.963144963144963, + "grad_norm": 1.0878605842590332, + "learning_rate": 0.0002, + "loss": 0.3672, + "step": 4040 + }, + { + "epoch": 4.975429975429975, + "grad_norm": 1.0845288038253784, + "learning_rate": 0.0002, + "loss": 0.3656, + "step": 4050 + }, + { + "epoch": 4.987714987714988, + "grad_norm": 0.8431115746498108, + "learning_rate": 0.0002, + "loss": 0.365, + "step": 4060 + }, + { + "epoch": 5.0, + "grad_norm": 0.8320387601852417, + "learning_rate": 0.0002, + "loss": 0.3693, + "step": 4070 + }, + { + "epoch": 5.0, + "eval_loss": 0.4017423093318939, + "eval_runtime": 20.8466, + "eval_samples_per_second": 15.878, + "eval_steps_per_second": 2.015, + "step": 4070 + }, + { + "epoch": 5.012285012285012, + "grad_norm": 0.8639023900032043, + "learning_rate": 0.0002, + "loss": 0.3425, + "step": 4080 + }, + { + "epoch": 5.024570024570025, + "grad_norm": 0.7123713493347168, + "learning_rate": 0.0002, + "loss": 0.3458, + "step": 4090 + }, + { + "epoch": 5.036855036855036, + "grad_norm": 0.9886922836303711, + "learning_rate": 0.0002, + "loss": 0.3404, + "step": 4100 + }, + { + "epoch": 5.049140049140049, + "grad_norm": 0.7880306243896484, + "learning_rate": 0.0002, + "loss": 0.3529, + "step": 4110 + }, + { + "epoch": 5.061425061425061, + "grad_norm": 0.7488741874694824, + "learning_rate": 0.0002, + "loss": 0.3406, + "step": 4120 + }, + { + "epoch": 5.073710073710074, + "grad_norm": 0.9359086751937866, + "learning_rate": 0.0002, + "loss": 0.3542, + "step": 4130 + }, + { + "epoch": 5.085995085995086, + "grad_norm": 0.9401527047157288, + "learning_rate": 0.0002, + "loss": 0.3471, + "step": 4140 + }, + { + "epoch": 5.098280098280099, + "grad_norm": 0.8396275043487549, + "learning_rate": 0.0002, + "loss": 0.3566, + "step": 4150 + }, + { + "epoch": 5.11056511056511, + "grad_norm": 0.7132664918899536, + "learning_rate": 0.0002, + "loss": 0.3416, + "step": 4160 + }, + { + "epoch": 5.122850122850123, + "grad_norm": 0.843708872795105, + "learning_rate": 0.0002, + "loss": 0.3457, + "step": 4170 + }, + { + "epoch": 5.135135135135135, + "grad_norm": 0.8733304738998413, + "learning_rate": 0.0002, + "loss": 0.3399, + "step": 4180 + }, + { + "epoch": 5.1474201474201475, + "grad_norm": 0.9064375162124634, + "learning_rate": 0.0002, + "loss": 0.3501, + "step": 4190 + }, + { + "epoch": 5.15970515970516, + "grad_norm": 0.900770902633667, + "learning_rate": 0.0002, + "loss": 0.3455, + "step": 4200 + }, + { + "epoch": 5.171990171990172, + "grad_norm": 0.863853394985199, + "learning_rate": 0.0002, + "loss": 0.3475, + "step": 4210 + }, + { + "epoch": 5.184275184275184, + "grad_norm": 0.767134964466095, + "learning_rate": 0.0002, + "loss": 0.3497, + "step": 4220 + }, + { + "epoch": 5.196560196560196, + "grad_norm": 0.7518735527992249, + "learning_rate": 0.0002, + "loss": 0.3527, + "step": 4230 + }, + { + "epoch": 5.208845208845209, + "grad_norm": 0.8040947914123535, + "learning_rate": 0.0002, + "loss": 0.3369, + "step": 4240 + }, + { + "epoch": 5.221130221130221, + "grad_norm": 0.7827144265174866, + "learning_rate": 0.0002, + "loss": 0.3496, + "step": 4250 + }, + { + "epoch": 5.233415233415234, + "grad_norm": 0.7306333184242249, + "learning_rate": 0.0002, + "loss": 0.3442, + "step": 4260 + }, + { + "epoch": 5.245700245700245, + "grad_norm": 1.0963380336761475, + "learning_rate": 0.0002, + "loss": 0.3553, + "step": 4270 + }, + { + "epoch": 5.257985257985258, + "grad_norm": 0.8200454711914062, + "learning_rate": 0.0002, + "loss": 0.3462, + "step": 4280 + }, + { + "epoch": 5.27027027027027, + "grad_norm": 0.8666796684265137, + "learning_rate": 0.0002, + "loss": 0.3509, + "step": 4290 + }, + { + "epoch": 5.282555282555283, + "grad_norm": 0.7862894535064697, + "learning_rate": 0.0002, + "loss": 0.3423, + "step": 4300 + }, + { + "epoch": 5.294840294840295, + "grad_norm": 0.8163095712661743, + "learning_rate": 0.0002, + "loss": 0.3623, + "step": 4310 + }, + { + "epoch": 5.3071253071253075, + "grad_norm": 0.8069050908088684, + "learning_rate": 0.0002, + "loss": 0.34, + "step": 4320 + }, + { + "epoch": 5.319410319410319, + "grad_norm": 0.7858486175537109, + "learning_rate": 0.0002, + "loss": 0.3532, + "step": 4330 + }, + { + "epoch": 5.3316953316953315, + "grad_norm": 0.950339674949646, + "learning_rate": 0.0002, + "loss": 0.3435, + "step": 4340 + }, + { + "epoch": 5.343980343980344, + "grad_norm": 0.9056477546691895, + "learning_rate": 0.0002, + "loss": 0.3498, + "step": 4350 + }, + { + "epoch": 5.356265356265356, + "grad_norm": 0.9619399905204773, + "learning_rate": 0.0002, + "loss": 0.3538, + "step": 4360 + }, + { + "epoch": 5.368550368550369, + "grad_norm": 0.9778652191162109, + "learning_rate": 0.0002, + "loss": 0.3455, + "step": 4370 + }, + { + "epoch": 5.38083538083538, + "grad_norm": 0.6919555068016052, + "learning_rate": 0.0002, + "loss": 0.3498, + "step": 4380 + }, + { + "epoch": 5.393120393120393, + "grad_norm": 0.8121668696403503, + "learning_rate": 0.0002, + "loss": 0.3426, + "step": 4390 + }, + { + "epoch": 5.405405405405405, + "grad_norm": 0.8481289148330688, + "learning_rate": 0.0002, + "loss": 0.3442, + "step": 4400 + }, + { + "epoch": 5.417690417690418, + "grad_norm": 0.8727408647537231, + "learning_rate": 0.0002, + "loss": 0.345, + "step": 4410 + }, + { + "epoch": 5.42997542997543, + "grad_norm": 0.8920271396636963, + "learning_rate": 0.0002, + "loss": 0.3554, + "step": 4420 + }, + { + "epoch": 5.442260442260443, + "grad_norm": 0.7758749723434448, + "learning_rate": 0.0002, + "loss": 0.3409, + "step": 4430 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 0.8847506642341614, + "learning_rate": 0.0002, + "loss": 0.3483, + "step": 4440 + }, + { + "epoch": 5.466830466830467, + "grad_norm": 0.9760470390319824, + "learning_rate": 0.0002, + "loss": 0.3557, + "step": 4450 + }, + { + "epoch": 5.479115479115479, + "grad_norm": 0.8940271139144897, + "learning_rate": 0.0002, + "loss": 0.3536, + "step": 4460 + }, + { + "epoch": 5.4914004914004915, + "grad_norm": 0.8668502569198608, + "learning_rate": 0.0002, + "loss": 0.3577, + "step": 4470 + }, + { + "epoch": 5.503685503685504, + "grad_norm": 0.9097439050674438, + "learning_rate": 0.0002, + "loss": 0.3462, + "step": 4480 + }, + { + "epoch": 5.515970515970516, + "grad_norm": 0.8217208981513977, + "learning_rate": 0.0002, + "loss": 0.3417, + "step": 4490 + }, + { + "epoch": 5.528255528255528, + "grad_norm": 0.7853189706802368, + "learning_rate": 0.0002, + "loss": 0.3482, + "step": 4500 + }, + { + "epoch": 5.54054054054054, + "grad_norm": 1.1113477945327759, + "learning_rate": 0.0002, + "loss": 0.3479, + "step": 4510 + }, + { + "epoch": 5.552825552825553, + "grad_norm": 0.8637538552284241, + "learning_rate": 0.0002, + "loss": 0.3553, + "step": 4520 + }, + { + "epoch": 5.565110565110565, + "grad_norm": 1.0230066776275635, + "learning_rate": 0.0002, + "loss": 0.3403, + "step": 4530 + }, + { + "epoch": 5.577395577395578, + "grad_norm": 0.8972793817520142, + "learning_rate": 0.0002, + "loss": 0.3588, + "step": 4540 + }, + { + "epoch": 5.58968058968059, + "grad_norm": 0.7950642704963684, + "learning_rate": 0.0002, + "loss": 0.3428, + "step": 4550 + }, + { + "epoch": 5.601965601965602, + "grad_norm": 1.113753318786621, + "learning_rate": 0.0002, + "loss": 0.3468, + "step": 4560 + }, + { + "epoch": 5.614250614250614, + "grad_norm": 0.7842669486999512, + "learning_rate": 0.0002, + "loss": 0.3354, + "step": 4570 + }, + { + "epoch": 5.6265356265356266, + "grad_norm": 0.9713512063026428, + "learning_rate": 0.0002, + "loss": 0.3419, + "step": 4580 + }, + { + "epoch": 5.638820638820639, + "grad_norm": 0.9451650977134705, + "learning_rate": 0.0002, + "loss": 0.3502, + "step": 4590 + }, + { + "epoch": 5.651105651105651, + "grad_norm": 1.055484414100647, + "learning_rate": 0.0002, + "loss": 0.3416, + "step": 4600 + }, + { + "epoch": 5.663390663390663, + "grad_norm": 0.8408507704734802, + "learning_rate": 0.0002, + "loss": 0.3436, + "step": 4610 + }, + { + "epoch": 5.675675675675675, + "grad_norm": 1.0293926000595093, + "learning_rate": 0.0002, + "loss": 0.3619, + "step": 4620 + }, + { + "epoch": 5.687960687960688, + "grad_norm": 0.7198245525360107, + "learning_rate": 0.0002, + "loss": 0.3484, + "step": 4630 + }, + { + "epoch": 5.7002457002457, + "grad_norm": 0.7564466595649719, + "learning_rate": 0.0002, + "loss": 0.3563, + "step": 4640 + }, + { + "epoch": 5.712530712530713, + "grad_norm": 0.7980002760887146, + "learning_rate": 0.0002, + "loss": 0.3435, + "step": 4650 + }, + { + "epoch": 5.724815724815725, + "grad_norm": 0.8685088753700256, + "learning_rate": 0.0002, + "loss": 0.3478, + "step": 4660 + }, + { + "epoch": 5.737100737100737, + "grad_norm": 0.8816949129104614, + "learning_rate": 0.0002, + "loss": 0.3692, + "step": 4670 + }, + { + "epoch": 5.749385749385749, + "grad_norm": 0.7154731750488281, + "learning_rate": 0.0002, + "loss": 0.3462, + "step": 4680 + }, + { + "epoch": 5.761670761670762, + "grad_norm": 0.9430679678916931, + "learning_rate": 0.0002, + "loss": 0.3503, + "step": 4690 + }, + { + "epoch": 5.773955773955774, + "grad_norm": 0.7640151381492615, + "learning_rate": 0.0002, + "loss": 0.3439, + "step": 4700 + }, + { + "epoch": 5.7862407862407865, + "grad_norm": 1.0920690298080444, + "learning_rate": 0.0002, + "loss": 0.3444, + "step": 4710 + }, + { + "epoch": 5.798525798525798, + "grad_norm": 0.9362104535102844, + "learning_rate": 0.0002, + "loss": 0.3356, + "step": 4720 + }, + { + "epoch": 5.8108108108108105, + "grad_norm": 0.8392294645309448, + "learning_rate": 0.0002, + "loss": 0.339, + "step": 4730 + }, + { + "epoch": 5.823095823095823, + "grad_norm": 0.9893582463264465, + "learning_rate": 0.0002, + "loss": 0.3488, + "step": 4740 + }, + { + "epoch": 5.835380835380835, + "grad_norm": 0.6985510587692261, + "learning_rate": 0.0002, + "loss": 0.3446, + "step": 4750 + }, + { + "epoch": 5.847665847665848, + "grad_norm": 0.8906862735748291, + "learning_rate": 0.0002, + "loss": 0.3534, + "step": 4760 + }, + { + "epoch": 5.85995085995086, + "grad_norm": 0.8036413192749023, + "learning_rate": 0.0002, + "loss": 0.3481, + "step": 4770 + }, + { + "epoch": 5.872235872235873, + "grad_norm": 0.9948155283927917, + "learning_rate": 0.0002, + "loss": 0.3326, + "step": 4780 + }, + { + "epoch": 5.884520884520884, + "grad_norm": 0.8618432283401489, + "learning_rate": 0.0002, + "loss": 0.3385, + "step": 4790 + }, + { + "epoch": 5.896805896805897, + "grad_norm": 1.0422909259796143, + "learning_rate": 0.0002, + "loss": 0.3302, + "step": 4800 + }, + { + "epoch": 5.909090909090909, + "grad_norm": 1.1892569065093994, + "learning_rate": 0.0002, + "loss": 0.3448, + "step": 4810 + }, + { + "epoch": 5.921375921375922, + "grad_norm": 1.1459916830062866, + "learning_rate": 0.0002, + "loss": 0.3506, + "step": 4820 + }, + { + "epoch": 5.933660933660933, + "grad_norm": 1.056235909461975, + "learning_rate": 0.0002, + "loss": 0.3387, + "step": 4830 + }, + { + "epoch": 5.945945945945946, + "grad_norm": 0.8517277240753174, + "learning_rate": 0.0002, + "loss": 0.344, + "step": 4840 + }, + { + "epoch": 5.958230958230958, + "grad_norm": 0.8153380751609802, + "learning_rate": 0.0002, + "loss": 0.3421, + "step": 4850 + }, + { + "epoch": 5.9705159705159705, + "grad_norm": 0.7907533049583435, + "learning_rate": 0.0002, + "loss": 0.3409, + "step": 4860 + }, + { + "epoch": 5.982800982800983, + "grad_norm": 0.8443069458007812, + "learning_rate": 0.0002, + "loss": 0.3337, + "step": 4870 + }, + { + "epoch": 5.995085995085995, + "grad_norm": 0.8711344003677368, + "learning_rate": 0.0002, + "loss": 0.3351, + "step": 4880 + }, + { + "epoch": 6.0, + "eval_loss": 0.3778059184551239, + "eval_runtime": 20.6858, + "eval_samples_per_second": 16.001, + "eval_steps_per_second": 2.03, + "step": 4884 + }, + { + "epoch": 6.007371007371007, + "grad_norm": 0.7697948813438416, + "learning_rate": 0.0002, + "loss": 0.3244, + "step": 4890 + }, + { + "epoch": 6.019656019656019, + "grad_norm": 0.7734108567237854, + "learning_rate": 0.0002, + "loss": 0.3118, + "step": 4900 + }, + { + "epoch": 6.031941031941032, + "grad_norm": 0.7173922657966614, + "learning_rate": 0.0002, + "loss": 0.3242, + "step": 4910 + }, + { + "epoch": 6.044226044226044, + "grad_norm": 1.062118649482727, + "learning_rate": 0.0002, + "loss": 0.3159, + "step": 4920 + }, + { + "epoch": 6.056511056511057, + "grad_norm": 0.746422529220581, + "learning_rate": 0.0002, + "loss": 0.3361, + "step": 4930 + }, + { + "epoch": 6.068796068796069, + "grad_norm": 0.8549448251724243, + "learning_rate": 0.0002, + "loss": 0.3204, + "step": 4940 + }, + { + "epoch": 6.081081081081081, + "grad_norm": 0.9405432939529419, + "learning_rate": 0.0002, + "loss": 0.3236, + "step": 4950 + }, + { + "epoch": 6.093366093366093, + "grad_norm": 0.752382755279541, + "learning_rate": 0.0002, + "loss": 0.3278, + "step": 4960 + }, + { + "epoch": 6.105651105651106, + "grad_norm": 0.820332407951355, + "learning_rate": 0.0002, + "loss": 0.3204, + "step": 4970 + }, + { + "epoch": 6.117936117936118, + "grad_norm": 0.8701449036598206, + "learning_rate": 0.0002, + "loss": 0.3192, + "step": 4980 + }, + { + "epoch": 6.1302211302211305, + "grad_norm": 0.8192865252494812, + "learning_rate": 0.0002, + "loss": 0.321, + "step": 4990 + }, + { + "epoch": 6.142506142506143, + "grad_norm": 1.0016303062438965, + "learning_rate": 0.0002, + "loss": 0.3295, + "step": 5000 + }, + { + "epoch": 6.1547911547911545, + "grad_norm": 0.9194409251213074, + "learning_rate": 0.0002, + "loss": 0.3352, + "step": 5010 + }, + { + "epoch": 6.167076167076167, + "grad_norm": 0.9319757223129272, + "learning_rate": 0.0002, + "loss": 0.3205, + "step": 5020 + }, + { + "epoch": 6.179361179361179, + "grad_norm": 0.8737656474113464, + "learning_rate": 0.0002, + "loss": 0.3256, + "step": 5030 + }, + { + "epoch": 6.191646191646192, + "grad_norm": 0.8736537098884583, + "learning_rate": 0.0002, + "loss": 0.3221, + "step": 5040 + }, + { + "epoch": 6.203931203931204, + "grad_norm": 0.9301430583000183, + "learning_rate": 0.0002, + "loss": 0.3265, + "step": 5050 + }, + { + "epoch": 6.216216216216216, + "grad_norm": 0.7717130780220032, + "learning_rate": 0.0002, + "loss": 0.3285, + "step": 5060 + }, + { + "epoch": 6.228501228501228, + "grad_norm": 0.6709604859352112, + "learning_rate": 0.0002, + "loss": 0.3192, + "step": 5070 + }, + { + "epoch": 6.240786240786241, + "grad_norm": 0.879374086856842, + "learning_rate": 0.0002, + "loss": 0.3352, + "step": 5080 + }, + { + "epoch": 6.253071253071253, + "grad_norm": 0.9136955738067627, + "learning_rate": 0.0002, + "loss": 0.329, + "step": 5090 + }, + { + "epoch": 6.2653562653562656, + "grad_norm": 0.795177161693573, + "learning_rate": 0.0002, + "loss": 0.3228, + "step": 5100 + }, + { + "epoch": 6.277641277641278, + "grad_norm": 1.0412259101867676, + "learning_rate": 0.0002, + "loss": 0.3273, + "step": 5110 + }, + { + "epoch": 6.2899262899262895, + "grad_norm": 0.7382524013519287, + "learning_rate": 0.0002, + "loss": 0.3221, + "step": 5120 + }, + { + "epoch": 6.302211302211302, + "grad_norm": 0.8818480968475342, + "learning_rate": 0.0002, + "loss": 0.3102, + "step": 5130 + }, + { + "epoch": 6.314496314496314, + "grad_norm": 0.7865153551101685, + "learning_rate": 0.0002, + "loss": 0.3316, + "step": 5140 + }, + { + "epoch": 6.326781326781327, + "grad_norm": 0.9166486859321594, + "learning_rate": 0.0002, + "loss": 0.3264, + "step": 5150 + }, + { + "epoch": 6.339066339066339, + "grad_norm": 0.6655149459838867, + "learning_rate": 0.0002, + "loss": 0.33, + "step": 5160 + }, + { + "epoch": 6.351351351351352, + "grad_norm": 0.7762818336486816, + "learning_rate": 0.0002, + "loss": 0.3359, + "step": 5170 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 0.8057235479354858, + "learning_rate": 0.0002, + "loss": 0.3244, + "step": 5180 + }, + { + "epoch": 6.375921375921376, + "grad_norm": 0.8186984062194824, + "learning_rate": 0.0002, + "loss": 0.3167, + "step": 5190 + }, + { + "epoch": 6.388206388206388, + "grad_norm": 0.8669573068618774, + "learning_rate": 0.0002, + "loss": 0.3289, + "step": 5200 + }, + { + "epoch": 6.400491400491401, + "grad_norm": 0.8904402852058411, + "learning_rate": 0.0002, + "loss": 0.3313, + "step": 5210 + }, + { + "epoch": 6.412776412776413, + "grad_norm": 0.9250359535217285, + "learning_rate": 0.0002, + "loss": 0.3187, + "step": 5220 + }, + { + "epoch": 6.4250614250614255, + "grad_norm": 0.8718299269676208, + "learning_rate": 0.0002, + "loss": 0.3229, + "step": 5230 + }, + { + "epoch": 6.437346437346437, + "grad_norm": 0.8156430125236511, + "learning_rate": 0.0002, + "loss": 0.3214, + "step": 5240 + }, + { + "epoch": 6.4496314496314495, + "grad_norm": 0.7759218215942383, + "learning_rate": 0.0002, + "loss": 0.3244, + "step": 5250 + }, + { + "epoch": 6.461916461916462, + "grad_norm": 0.8137310743331909, + "learning_rate": 0.0002, + "loss": 0.3298, + "step": 5260 + }, + { + "epoch": 6.474201474201474, + "grad_norm": 0.8121917843818665, + "learning_rate": 0.0002, + "loss": 0.3275, + "step": 5270 + }, + { + "epoch": 6.486486486486487, + "grad_norm": 0.8178010582923889, + "learning_rate": 0.0002, + "loss": 0.3201, + "step": 5280 + }, + { + "epoch": 6.498771498771498, + "grad_norm": 1.1806302070617676, + "learning_rate": 0.0002, + "loss": 0.3271, + "step": 5290 + }, + { + "epoch": 6.511056511056511, + "grad_norm": 0.8255127668380737, + "learning_rate": 0.0002, + "loss": 0.3231, + "step": 5300 + }, + { + "epoch": 6.523341523341523, + "grad_norm": 0.8006690740585327, + "learning_rate": 0.0002, + "loss": 0.3227, + "step": 5310 + }, + { + "epoch": 6.535626535626536, + "grad_norm": 0.9932374358177185, + "learning_rate": 0.0002, + "loss": 0.3262, + "step": 5320 + }, + { + "epoch": 6.547911547911548, + "grad_norm": 0.8973969221115112, + "learning_rate": 0.0002, + "loss": 0.3291, + "step": 5330 + }, + { + "epoch": 6.560196560196561, + "grad_norm": 0.7359915971755981, + "learning_rate": 0.0002, + "loss": 0.3146, + "step": 5340 + }, + { + "epoch": 6.572481572481572, + "grad_norm": 0.9941133856773376, + "learning_rate": 0.0002, + "loss": 0.3308, + "step": 5350 + }, + { + "epoch": 6.584766584766585, + "grad_norm": 0.9008874893188477, + "learning_rate": 0.0002, + "loss": 0.3202, + "step": 5360 + }, + { + "epoch": 6.597051597051597, + "grad_norm": 1.309710144996643, + "learning_rate": 0.0002, + "loss": 0.3271, + "step": 5370 + }, + { + "epoch": 6.6093366093366095, + "grad_norm": 0.797768235206604, + "learning_rate": 0.0002, + "loss": 0.3177, + "step": 5380 + }, + { + "epoch": 6.621621621621622, + "grad_norm": 0.8507353663444519, + "learning_rate": 0.0002, + "loss": 0.3218, + "step": 5390 + }, + { + "epoch": 6.6339066339066335, + "grad_norm": 0.9628674983978271, + "learning_rate": 0.0002, + "loss": 0.3204, + "step": 5400 + }, + { + "epoch": 6.646191646191646, + "grad_norm": 0.6989983320236206, + "learning_rate": 0.0002, + "loss": 0.3155, + "step": 5410 + }, + { + "epoch": 6.658476658476658, + "grad_norm": 0.9505863189697266, + "learning_rate": 0.0002, + "loss": 0.3197, + "step": 5420 + }, + { + "epoch": 6.670761670761671, + "grad_norm": 0.8058171272277832, + "learning_rate": 0.0002, + "loss": 0.3259, + "step": 5430 + }, + { + "epoch": 6.683046683046683, + "grad_norm": 0.8476499915122986, + "learning_rate": 0.0002, + "loss": 0.3248, + "step": 5440 + }, + { + "epoch": 6.695331695331696, + "grad_norm": 0.8503309488296509, + "learning_rate": 0.0002, + "loss": 0.326, + "step": 5450 + }, + { + "epoch": 6.707616707616707, + "grad_norm": 0.919566810131073, + "learning_rate": 0.0002, + "loss": 0.3218, + "step": 5460 + }, + { + "epoch": 6.71990171990172, + "grad_norm": 0.7741201519966125, + "learning_rate": 0.0002, + "loss": 0.3218, + "step": 5470 + }, + { + "epoch": 6.732186732186732, + "grad_norm": 0.8432701826095581, + "learning_rate": 0.0002, + "loss": 0.329, + "step": 5480 + }, + { + "epoch": 6.744471744471745, + "grad_norm": 1.0183148384094238, + "learning_rate": 0.0002, + "loss": 0.3284, + "step": 5490 + }, + { + "epoch": 6.756756756756757, + "grad_norm": 0.8491143584251404, + "learning_rate": 0.0002, + "loss": 0.3312, + "step": 5500 + }, + { + "epoch": 6.769041769041769, + "grad_norm": 0.9586310386657715, + "learning_rate": 0.0002, + "loss": 0.3208, + "step": 5510 + }, + { + "epoch": 6.781326781326781, + "grad_norm": 0.7936097383499146, + "learning_rate": 0.0002, + "loss": 0.3305, + "step": 5520 + }, + { + "epoch": 6.7936117936117935, + "grad_norm": 0.7875059247016907, + "learning_rate": 0.0002, + "loss": 0.318, + "step": 5530 + }, + { + "epoch": 6.805896805896806, + "grad_norm": 0.8136157393455505, + "learning_rate": 0.0002, + "loss": 0.3234, + "step": 5540 + }, + { + "epoch": 6.818181818181818, + "grad_norm": 0.837213933467865, + "learning_rate": 0.0002, + "loss": 0.3161, + "step": 5550 + }, + { + "epoch": 6.830466830466831, + "grad_norm": 0.6812925338745117, + "learning_rate": 0.0002, + "loss": 0.3153, + "step": 5560 + }, + { + "epoch": 6.842751842751843, + "grad_norm": 0.7309592962265015, + "learning_rate": 0.0002, + "loss": 0.3139, + "step": 5570 + }, + { + "epoch": 6.855036855036855, + "grad_norm": 0.6905979514122009, + "learning_rate": 0.0002, + "loss": 0.3126, + "step": 5580 + }, + { + "epoch": 6.867321867321867, + "grad_norm": 1.1768406629562378, + "learning_rate": 0.0002, + "loss": 0.3291, + "step": 5590 + }, + { + "epoch": 6.87960687960688, + "grad_norm": 0.7618567943572998, + "learning_rate": 0.0002, + "loss": 0.3193, + "step": 5600 + }, + { + "epoch": 6.891891891891892, + "grad_norm": 0.7930929660797119, + "learning_rate": 0.0002, + "loss": 0.3296, + "step": 5610 + }, + { + "epoch": 6.9041769041769046, + "grad_norm": 0.7931787371635437, + "learning_rate": 0.0002, + "loss": 0.3241, + "step": 5620 + }, + { + "epoch": 6.916461916461916, + "grad_norm": 0.6366972923278809, + "learning_rate": 0.0002, + "loss": 0.3215, + "step": 5630 + }, + { + "epoch": 6.9287469287469285, + "grad_norm": 0.7782737612724304, + "learning_rate": 0.0002, + "loss": 0.3264, + "step": 5640 + }, + { + "epoch": 6.941031941031941, + "grad_norm": 0.8643787503242493, + "learning_rate": 0.0002, + "loss": 0.3186, + "step": 5650 + }, + { + "epoch": 6.953316953316953, + "grad_norm": 1.0843733549118042, + "learning_rate": 0.0002, + "loss": 0.3285, + "step": 5660 + }, + { + "epoch": 6.965601965601966, + "grad_norm": 0.71319180727005, + "learning_rate": 0.0002, + "loss": 0.3163, + "step": 5670 + }, + { + "epoch": 6.977886977886978, + "grad_norm": 0.976536750793457, + "learning_rate": 0.0002, + "loss": 0.3196, + "step": 5680 + }, + { + "epoch": 6.99017199017199, + "grad_norm": 0.9221968054771423, + "learning_rate": 0.0002, + "loss": 0.3255, + "step": 5690 + }, + { + "epoch": 7.0, + "eval_loss": 0.3616626560688019, + "eval_runtime": 20.8747, + "eval_samples_per_second": 15.857, + "eval_steps_per_second": 2.012, + "step": 5698 + }, + { + "epoch": 7.002457002457002, + "grad_norm": 0.6302434802055359, + "learning_rate": 0.0002, + "loss": 0.3149, + "step": 5700 + }, + { + "epoch": 7.014742014742015, + "grad_norm": 0.7077583074569702, + "learning_rate": 0.0002, + "loss": 0.3017, + "step": 5710 + }, + { + "epoch": 7.027027027027027, + "grad_norm": 0.7005309462547302, + "learning_rate": 0.0002, + "loss": 0.303, + "step": 5720 + }, + { + "epoch": 7.03931203931204, + "grad_norm": 0.7724815607070923, + "learning_rate": 0.0002, + "loss": 0.3069, + "step": 5730 + }, + { + "epoch": 7.051597051597051, + "grad_norm": 0.6469350457191467, + "learning_rate": 0.0002, + "loss": 0.3002, + "step": 5740 + }, + { + "epoch": 7.063882063882064, + "grad_norm": 0.8406739234924316, + "learning_rate": 0.0002, + "loss": 0.3046, + "step": 5750 + }, + { + "epoch": 7.076167076167076, + "grad_norm": 0.9954310059547424, + "learning_rate": 0.0002, + "loss": 0.3108, + "step": 5760 + }, + { + "epoch": 7.0884520884520885, + "grad_norm": 0.7063487768173218, + "learning_rate": 0.0002, + "loss": 0.3076, + "step": 5770 + }, + { + "epoch": 7.100737100737101, + "grad_norm": 0.8696660995483398, + "learning_rate": 0.0002, + "loss": 0.3033, + "step": 5780 + }, + { + "epoch": 7.113022113022113, + "grad_norm": 0.8088991045951843, + "learning_rate": 0.0002, + "loss": 0.3049, + "step": 5790 + }, + { + "epoch": 7.125307125307125, + "grad_norm": 0.6934662461280823, + "learning_rate": 0.0002, + "loss": 0.3042, + "step": 5800 + }, + { + "epoch": 7.137592137592137, + "grad_norm": 0.7482573390007019, + "learning_rate": 0.0002, + "loss": 0.3068, + "step": 5810 + }, + { + "epoch": 7.14987714987715, + "grad_norm": 1.0848287343978882, + "learning_rate": 0.0002, + "loss": 0.3001, + "step": 5820 + }, + { + "epoch": 7.162162162162162, + "grad_norm": 0.8017896413803101, + "learning_rate": 0.0002, + "loss": 0.3017, + "step": 5830 + }, + { + "epoch": 7.174447174447175, + "grad_norm": 0.6418949365615845, + "learning_rate": 0.0002, + "loss": 0.3051, + "step": 5840 + }, + { + "epoch": 7.186732186732186, + "grad_norm": 0.666072428226471, + "learning_rate": 0.0002, + "loss": 0.3032, + "step": 5850 + }, + { + "epoch": 7.199017199017199, + "grad_norm": 0.7549816370010376, + "learning_rate": 0.0002, + "loss": 0.3131, + "step": 5860 + }, + { + "epoch": 7.211302211302211, + "grad_norm": 0.8756735920906067, + "learning_rate": 0.0002, + "loss": 0.3088, + "step": 5870 + }, + { + "epoch": 7.223587223587224, + "grad_norm": 0.6790788769721985, + "learning_rate": 0.0002, + "loss": 0.3022, + "step": 5880 + }, + { + "epoch": 7.235872235872236, + "grad_norm": 0.8388362526893616, + "learning_rate": 0.0002, + "loss": 0.3011, + "step": 5890 + }, + { + "epoch": 7.2481572481572485, + "grad_norm": 0.8915345668792725, + "learning_rate": 0.0002, + "loss": 0.3204, + "step": 5900 + }, + { + "epoch": 7.26044226044226, + "grad_norm": 0.9234250783920288, + "learning_rate": 0.0002, + "loss": 0.3086, + "step": 5910 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 0.5452191233634949, + "learning_rate": 0.0002, + "loss": 0.2981, + "step": 5920 + }, + { + "epoch": 7.285012285012285, + "grad_norm": 0.6100478172302246, + "learning_rate": 0.0002, + "loss": 0.3054, + "step": 5930 + }, + { + "epoch": 7.297297297297297, + "grad_norm": 0.6258270740509033, + "learning_rate": 0.0002, + "loss": 0.3073, + "step": 5940 + }, + { + "epoch": 7.30958230958231, + "grad_norm": 0.8540555834770203, + "learning_rate": 0.0002, + "loss": 0.3179, + "step": 5950 + }, + { + "epoch": 7.321867321867322, + "grad_norm": 0.8662564754486084, + "learning_rate": 0.0002, + "loss": 0.3109, + "step": 5960 + }, + { + "epoch": 7.334152334152334, + "grad_norm": 0.7404284477233887, + "learning_rate": 0.0002, + "loss": 0.3039, + "step": 5970 + }, + { + "epoch": 7.346437346437346, + "grad_norm": 0.7579419612884521, + "learning_rate": 0.0002, + "loss": 0.3036, + "step": 5980 + }, + { + "epoch": 7.358722358722359, + "grad_norm": 0.7248510122299194, + "learning_rate": 0.0002, + "loss": 0.3113, + "step": 5990 + }, + { + "epoch": 7.371007371007371, + "grad_norm": 0.8882181644439697, + "learning_rate": 0.0002, + "loss": 0.2987, + "step": 6000 + }, + { + "epoch": 7.383292383292384, + "grad_norm": 0.8494889736175537, + "learning_rate": 0.0002, + "loss": 0.3026, + "step": 6010 + }, + { + "epoch": 7.395577395577396, + "grad_norm": 0.8501948118209839, + "learning_rate": 0.0002, + "loss": 0.3147, + "step": 6020 + }, + { + "epoch": 7.407862407862408, + "grad_norm": 0.7228043079376221, + "learning_rate": 0.0002, + "loss": 0.3122, + "step": 6030 + }, + { + "epoch": 7.42014742014742, + "grad_norm": 0.7471523284912109, + "learning_rate": 0.0002, + "loss": 0.3092, + "step": 6040 + }, + { + "epoch": 7.4324324324324325, + "grad_norm": 0.810962975025177, + "learning_rate": 0.0002, + "loss": 0.307, + "step": 6050 + }, + { + "epoch": 7.444717444717445, + "grad_norm": 1.0621764659881592, + "learning_rate": 0.0002, + "loss": 0.3024, + "step": 6060 + }, + { + "epoch": 7.457002457002457, + "grad_norm": 0.72637939453125, + "learning_rate": 0.0002, + "loss": 0.3019, + "step": 6070 + }, + { + "epoch": 7.469287469287469, + "grad_norm": 1.1550157070159912, + "learning_rate": 0.0002, + "loss": 0.314, + "step": 6080 + }, + { + "epoch": 7.481572481572481, + "grad_norm": 0.865250825881958, + "learning_rate": 0.0002, + "loss": 0.3088, + "step": 6090 + }, + { + "epoch": 7.493857493857494, + "grad_norm": 0.8407077789306641, + "learning_rate": 0.0002, + "loss": 0.3041, + "step": 6100 + }, + { + "epoch": 7.506142506142506, + "grad_norm": 0.7295752167701721, + "learning_rate": 0.0002, + "loss": 0.3064, + "step": 6110 + }, + { + "epoch": 7.518427518427519, + "grad_norm": 0.9728897213935852, + "learning_rate": 0.0002, + "loss": 0.3061, + "step": 6120 + }, + { + "epoch": 7.530712530712531, + "grad_norm": 0.9776952862739563, + "learning_rate": 0.0002, + "loss": 0.3107, + "step": 6130 + }, + { + "epoch": 7.542997542997543, + "grad_norm": 0.704113245010376, + "learning_rate": 0.0002, + "loss": 0.3148, + "step": 6140 + }, + { + "epoch": 7.555282555282555, + "grad_norm": 0.9030590057373047, + "learning_rate": 0.0002, + "loss": 0.3012, + "step": 6150 + }, + { + "epoch": 7.5675675675675675, + "grad_norm": 0.6629155874252319, + "learning_rate": 0.0002, + "loss": 0.3134, + "step": 6160 + }, + { + "epoch": 7.57985257985258, + "grad_norm": 0.9348171353340149, + "learning_rate": 0.0002, + "loss": 0.3068, + "step": 6170 + }, + { + "epoch": 7.592137592137592, + "grad_norm": 0.9363399744033813, + "learning_rate": 0.0002, + "loss": 0.3065, + "step": 6180 + }, + { + "epoch": 7.604422604422604, + "grad_norm": 0.902718186378479, + "learning_rate": 0.0002, + "loss": 0.3099, + "step": 6190 + }, + { + "epoch": 7.616707616707616, + "grad_norm": 0.6992074251174927, + "learning_rate": 0.0002, + "loss": 0.3082, + "step": 6200 + }, + { + "epoch": 7.628992628992629, + "grad_norm": 0.7574757933616638, + "learning_rate": 0.0002, + "loss": 0.2941, + "step": 6210 + }, + { + "epoch": 7.641277641277641, + "grad_norm": 0.7717660069465637, + "learning_rate": 0.0002, + "loss": 0.3079, + "step": 6220 + }, + { + "epoch": 7.653562653562654, + "grad_norm": 0.7789981961250305, + "learning_rate": 0.0002, + "loss": 0.3005, + "step": 6230 + }, + { + "epoch": 7.665847665847666, + "grad_norm": 1.1020026206970215, + "learning_rate": 0.0002, + "loss": 0.3112, + "step": 6240 + }, + { + "epoch": 7.678132678132678, + "grad_norm": 0.7290350794792175, + "learning_rate": 0.0002, + "loss": 0.3087, + "step": 6250 + }, + { + "epoch": 7.69041769041769, + "grad_norm": 0.7291128039360046, + "learning_rate": 0.0002, + "loss": 0.3023, + "step": 6260 + }, + { + "epoch": 7.702702702702703, + "grad_norm": 0.7766857147216797, + "learning_rate": 0.0002, + "loss": 0.3019, + "step": 6270 + }, + { + "epoch": 7.714987714987715, + "grad_norm": 0.938277542591095, + "learning_rate": 0.0002, + "loss": 0.3039, + "step": 6280 + }, + { + "epoch": 7.7272727272727275, + "grad_norm": 0.785190761089325, + "learning_rate": 0.0002, + "loss": 0.3103, + "step": 6290 + }, + { + "epoch": 7.739557739557739, + "grad_norm": 0.7140066623687744, + "learning_rate": 0.0002, + "loss": 0.2938, + "step": 6300 + }, + { + "epoch": 7.7518427518427515, + "grad_norm": 0.9476789236068726, + "learning_rate": 0.0002, + "loss": 0.3042, + "step": 6310 + }, + { + "epoch": 7.764127764127764, + "grad_norm": 0.6404930949211121, + "learning_rate": 0.0002, + "loss": 0.3035, + "step": 6320 + }, + { + "epoch": 7.776412776412776, + "grad_norm": 0.6433947682380676, + "learning_rate": 0.0002, + "loss": 0.3065, + "step": 6330 + }, + { + "epoch": 7.788697788697789, + "grad_norm": 0.8289583921432495, + "learning_rate": 0.0002, + "loss": 0.3117, + "step": 6340 + }, + { + "epoch": 7.800982800982801, + "grad_norm": 1.098555088043213, + "learning_rate": 0.0002, + "loss": 0.3057, + "step": 6350 + }, + { + "epoch": 7.813267813267814, + "grad_norm": 0.7225303053855896, + "learning_rate": 0.0002, + "loss": 0.3104, + "step": 6360 + }, + { + "epoch": 7.825552825552825, + "grad_norm": 0.845711886882782, + "learning_rate": 0.0002, + "loss": 0.3043, + "step": 6370 + }, + { + "epoch": 7.837837837837838, + "grad_norm": 0.6199421882629395, + "learning_rate": 0.0002, + "loss": 0.3099, + "step": 6380 + }, + { + "epoch": 7.85012285012285, + "grad_norm": 0.7576995491981506, + "learning_rate": 0.0002, + "loss": 0.3095, + "step": 6390 + }, + { + "epoch": 7.862407862407863, + "grad_norm": 0.6669192314147949, + "learning_rate": 0.0002, + "loss": 0.311, + "step": 6400 + }, + { + "epoch": 7.874692874692875, + "grad_norm": 0.6896083354949951, + "learning_rate": 0.0002, + "loss": 0.2953, + "step": 6410 + }, + { + "epoch": 7.886977886977887, + "grad_norm": 0.9418429732322693, + "learning_rate": 0.0002, + "loss": 0.3, + "step": 6420 + }, + { + "epoch": 7.899262899262899, + "grad_norm": 0.7120184302330017, + "learning_rate": 0.0002, + "loss": 0.3074, + "step": 6430 + }, + { + "epoch": 7.9115479115479115, + "grad_norm": 0.7420004606246948, + "learning_rate": 0.0002, + "loss": 0.3158, + "step": 6440 + }, + { + "epoch": 7.923832923832924, + "grad_norm": 0.8989502191543579, + "learning_rate": 0.0002, + "loss": 0.3068, + "step": 6450 + }, + { + "epoch": 7.936117936117936, + "grad_norm": 0.715905487537384, + "learning_rate": 0.0002, + "loss": 0.3102, + "step": 6460 + }, + { + "epoch": 7.948402948402949, + "grad_norm": 0.8890138268470764, + "learning_rate": 0.0002, + "loss": 0.3018, + "step": 6470 + }, + { + "epoch": 7.96068796068796, + "grad_norm": 0.7992095351219177, + "learning_rate": 0.0002, + "loss": 0.3179, + "step": 6480 + }, + { + "epoch": 7.972972972972973, + "grad_norm": 0.9169677495956421, + "learning_rate": 0.0002, + "loss": 0.3115, + "step": 6490 + }, + { + "epoch": 7.985257985257985, + "grad_norm": 0.7911704778671265, + "learning_rate": 0.0002, + "loss": 0.3001, + "step": 6500 + }, + { + "epoch": 7.997542997542998, + "grad_norm": 0.8787347078323364, + "learning_rate": 0.0002, + "loss": 0.3085, + "step": 6510 + }, + { + "epoch": 8.0, + "eval_loss": 0.3476468026638031, + "eval_runtime": 20.9232, + "eval_samples_per_second": 15.82, + "eval_steps_per_second": 2.007, + "step": 6512 + } + ], + "logging_steps": 10, + "max_steps": 6512, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.958964013419725e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/training_args.bin b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..73a75ebfc12aed51385aab437d91632ee4c20317 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-6512/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2809e8544b7de8b298d0b325fb6c98eb9f853fd72d7cbae286b6ee1541e6aee9 +size 5560 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/README.md b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/README.md new file mode 100644 index 0000000000000000000000000000000000000000..830a14f7db2734beb59f320973504e45a3fe87f5 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/adapter_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..600a5ae79fa5bbcdea8bd42ae99abf77134a3287 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/adapter_model.safetensors b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0557e6278d7e5434c1724923fd94e6ab372c2eb6 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b932857e7e169f54985be4155c82c5991b9ccf04f8b215a8cf1e7606fbe01d22 +size 29500848 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/optimizer.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9a5cbe56431906db3166b1329c624c9ab93850f --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a75284599f62765ab7b0ddae65d53b349bdbc499903f800fbec384703f3e0a79 +size 15064314 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/rng_state.pth b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a291025dbb6b5c8f81d0bb7959bdee7783241af --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ba23d88dd13bbc4ba62dd650687135927fc2f9b25562c44aeaad88e5492194f +size 14244 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/scheduler.pt b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d81af9d9349962f0edb5857b472dcfaebe70d51f --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eed6d80e68918f4258337b1cdd6e535070c3495c8e1af817b56814ff10e2a2a +size 1064 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/special_tokens_map.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/tokenizer.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/tokenizer.model b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/tokenizer_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/trainer_state.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..179da97e82ab42b6e59524481c9107242d96ae7c --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/trainer_state.json @@ -0,0 +1,608 @@ +{ + "best_metric": 0.703994870185852, + "best_model_checkpoint": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814", + "epoch": 1.0, + "eval_steps": 10, + "global_step": 814, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012285012285012284, + "grad_norm": 0.8178550004959106, + "learning_rate": 0.0002, + "loss": 3.5354, + "step": 10 + }, + { + "epoch": 0.02457002457002457, + "grad_norm": 1.0338047742843628, + "learning_rate": 0.0002, + "loss": 2.534, + "step": 20 + }, + { + "epoch": 0.036855036855036855, + "grad_norm": 0.8931729197502136, + "learning_rate": 0.0002, + "loss": 2.1691, + "step": 30 + }, + { + "epoch": 0.04914004914004914, + "grad_norm": 0.9666458964347839, + "learning_rate": 0.0002, + "loss": 1.8813, + "step": 40 + }, + { + "epoch": 0.06142506142506143, + "grad_norm": 1.2691702842712402, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 50 + }, + { + "epoch": 0.07371007371007371, + "grad_norm": 1.0307111740112305, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 60 + }, + { + "epoch": 0.085995085995086, + "grad_norm": 1.1837389469146729, + "learning_rate": 0.0002, + "loss": 1.2987, + "step": 70 + }, + { + "epoch": 0.09828009828009827, + "grad_norm": 1.1481467485427856, + "learning_rate": 0.0002, + "loss": 1.2325, + "step": 80 + }, + { + "epoch": 0.11056511056511056, + "grad_norm": 1.0385297536849976, + "learning_rate": 0.0002, + "loss": 1.1425, + "step": 90 + }, + { + "epoch": 0.12285012285012285, + "grad_norm": 1.125789999961853, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 100 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 0.9630613923072815, + "learning_rate": 0.0002, + "loss": 1.0477, + "step": 110 + }, + { + "epoch": 0.14742014742014742, + "grad_norm": 1.060392141342163, + "learning_rate": 0.0002, + "loss": 1.0074, + "step": 120 + }, + { + "epoch": 0.1597051597051597, + "grad_norm": 1.0986546277999878, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 130 + }, + { + "epoch": 0.171990171990172, + "grad_norm": 1.1713459491729736, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 140 + }, + { + "epoch": 0.18427518427518427, + "grad_norm": 1.1548224687576294, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 150 + }, + { + "epoch": 0.19656019656019655, + "grad_norm": 1.2662502527236938, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 160 + }, + { + "epoch": 0.20884520884520885, + "grad_norm": 1.1521110534667969, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 170 + }, + { + "epoch": 0.22113022113022113, + "grad_norm": 1.1044857501983643, + "learning_rate": 0.0002, + "loss": 0.8931, + "step": 180 + }, + { + "epoch": 0.2334152334152334, + "grad_norm": 0.9770650267601013, + "learning_rate": 0.0002, + "loss": 0.9572, + "step": 190 + }, + { + "epoch": 0.2457002457002457, + "grad_norm": 0.9710931777954102, + "learning_rate": 0.0002, + "loss": 0.881, + "step": 200 + }, + { + "epoch": 0.257985257985258, + "grad_norm": 0.9593933820724487, + "learning_rate": 0.0002, + "loss": 0.9205, + "step": 210 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.003553032875061, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 220 + }, + { + "epoch": 0.28255528255528256, + "grad_norm": 0.9187764525413513, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 230 + }, + { + "epoch": 0.29484029484029484, + "grad_norm": 0.9294946789741516, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 240 + }, + { + "epoch": 0.3071253071253071, + "grad_norm": 0.9537560939788818, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 250 + }, + { + "epoch": 0.3194103194103194, + "grad_norm": 1.00537109375, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 260 + }, + { + "epoch": 0.3316953316953317, + "grad_norm": 0.8775776028633118, + "learning_rate": 0.0002, + "loss": 0.896, + "step": 270 + }, + { + "epoch": 0.343980343980344, + "grad_norm": 0.8316839933395386, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 280 + }, + { + "epoch": 0.35626535626535627, + "grad_norm": 0.8542073965072632, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 290 + }, + { + "epoch": 0.36855036855036855, + "grad_norm": 0.848444402217865, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 300 + }, + { + "epoch": 0.3808353808353808, + "grad_norm": 0.9017520546913147, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 310 + }, + { + "epoch": 0.3931203931203931, + "grad_norm": 0.7672467231750488, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 320 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 0.9109916687011719, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 330 + }, + { + "epoch": 0.4176904176904177, + "grad_norm": 0.8750321269035339, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 340 + }, + { + "epoch": 0.42997542997543, + "grad_norm": 0.7911098599433899, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 350 + }, + { + "epoch": 0.44226044226044225, + "grad_norm": 0.871601402759552, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 360 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.9393917918205261, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 370 + }, + { + "epoch": 0.4668304668304668, + "grad_norm": 0.8260403275489807, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 380 + }, + { + "epoch": 0.47911547911547914, + "grad_norm": 0.9792159199714661, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 390 + }, + { + "epoch": 0.4914004914004914, + "grad_norm": 0.9943315982818604, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 400 + }, + { + "epoch": 0.5036855036855037, + "grad_norm": 0.8999950885772705, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 410 + }, + { + "epoch": 0.515970515970516, + "grad_norm": 0.8348393440246582, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 420 + }, + { + "epoch": 0.5282555282555282, + "grad_norm": 0.7371744513511658, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 430 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.8354107141494751, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 440 + }, + { + "epoch": 0.5528255528255528, + "grad_norm": 0.8553793430328369, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 450 + }, + { + "epoch": 0.5651105651105651, + "grad_norm": 1.0762015581130981, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 460 + }, + { + "epoch": 0.5773955773955773, + "grad_norm": 0.8350747227668762, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 470 + }, + { + "epoch": 0.5896805896805897, + "grad_norm": 0.7819945216178894, + "learning_rate": 0.0002, + "loss": 0.7922, + "step": 480 + }, + { + "epoch": 0.601965601965602, + "grad_norm": 0.8079741597175598, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 490 + }, + { + "epoch": 0.6142506142506142, + "grad_norm": 0.776435911655426, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 500 + }, + { + "epoch": 0.6265356265356266, + "grad_norm": 0.7646855115890503, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 510 + }, + { + "epoch": 0.6388206388206388, + "grad_norm": 0.786396861076355, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 520 + }, + { + "epoch": 0.6511056511056511, + "grad_norm": 0.7016594409942627, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 530 + }, + { + "epoch": 0.6633906633906634, + "grad_norm": 0.8060444593429565, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 540 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.9087467789649963, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 550 + }, + { + "epoch": 0.687960687960688, + "grad_norm": 0.8149628639221191, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 560 + }, + { + "epoch": 0.7002457002457002, + "grad_norm": 0.7493641972541809, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 570 + }, + { + "epoch": 0.7125307125307125, + "grad_norm": 0.7958765625953674, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 580 + }, + { + "epoch": 0.7248157248157249, + "grad_norm": 0.7917273640632629, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 590 + }, + { + "epoch": 0.7371007371007371, + "grad_norm": 0.8040468692779541, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 600 + }, + { + "epoch": 0.7493857493857494, + "grad_norm": 0.8696851134300232, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 610 + }, + { + "epoch": 0.7616707616707616, + "grad_norm": 0.8418059945106506, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 620 + }, + { + "epoch": 0.773955773955774, + "grad_norm": 0.7754243612289429, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 630 + }, + { + "epoch": 0.7862407862407862, + "grad_norm": 0.7639613747596741, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 640 + }, + { + "epoch": 0.7985257985257985, + "grad_norm": 0.7516646385192871, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 650 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.7840844988822937, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 660 + }, + { + "epoch": 0.8230958230958231, + "grad_norm": 0.7657070755958557, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 670 + }, + { + "epoch": 0.8353808353808354, + "grad_norm": 0.7711591720581055, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 680 + }, + { + "epoch": 0.8476658476658476, + "grad_norm": 0.8026325106620789, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 690 + }, + { + "epoch": 0.85995085995086, + "grad_norm": 0.7902713418006897, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 700 + }, + { + "epoch": 0.8722358722358723, + "grad_norm": 0.8212456107139587, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 710 + }, + { + "epoch": 0.8845208845208845, + "grad_norm": 0.7867200970649719, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 720 + }, + { + "epoch": 0.8968058968058968, + "grad_norm": 0.80084627866745, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 730 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.7203794121742249, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 740 + }, + { + "epoch": 0.9213759213759214, + "grad_norm": 0.7598419785499573, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 750 + }, + { + "epoch": 0.9336609336609336, + "grad_norm": 0.7787027359008789, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 760 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.8444012403488159, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 770 + }, + { + "epoch": 0.9582309582309583, + "grad_norm": 0.7388550639152527, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 780 + }, + { + "epoch": 0.9705159705159705, + "grad_norm": 0.7379167079925537, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 790 + }, + { + "epoch": 0.9828009828009828, + "grad_norm": 0.8291640281677246, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 800 + }, + { + "epoch": 0.995085995085995, + "grad_norm": 0.7415094375610352, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 0.703994870185852, + "eval_runtime": 20.2182, + "eval_samples_per_second": 16.371, + "eval_steps_per_second": 2.077, + "step": 814 + } + ], + "logging_steps": 10, + "max_steps": 6512, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9948705016774656.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/training_args.bin b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..73a75ebfc12aed51385aab437d91632ee4c20317 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2809e8544b7de8b298d0b325fb6c98eb9f853fd72d7cbae286b6ee1541e6aee9 +size 5560 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/special_tokens_map.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/tokenizer.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/tokenizer.model b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/tokenizer_config.json b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb4796c13b8d975555ecec45876ee75d1ae8b7 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/training_args.bin b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..73a75ebfc12aed51385aab437d91632ee4c20317 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2809e8544b7de8b298d0b325fb6c98eb9f853fd72d7cbae286b6ee1541e6aee9 +size 5560 diff --git a/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/training_log.jsonl b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60b058e92ac7e4c2cd4dc10d82be965dee8bfda6 --- /dev/null +++ b/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 1.0, "step": 814, "epoch_duration": 749.2984616756439, "total_accumulated_duration": 749.2984616756439, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 5628.7490234375}, "avg_memory_reserved": {"GPU_0": 6182.0}, "peak_memory_reserved": {"GPU_0": 6182.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.5354, "grad_norm": 0.8178550004959106, "learning_rate": 0.0002, "epoch": 0.012285012285012284, "step": 10}, {"loss": 2.534, "grad_norm": 1.0338047742843628, "learning_rate": 0.0002, "epoch": 0.02457002457002457, "step": 20}, {"loss": 2.1691, "grad_norm": 0.8931729197502136, "learning_rate": 0.0002, "epoch": 0.036855036855036855, "step": 30}, {"loss": 1.8813, "grad_norm": 0.9666458964347839, "learning_rate": 0.0002, "epoch": 0.04914004914004914, "step": 40}, {"loss": 1.6479, "grad_norm": 1.2691702842712402, "learning_rate": 0.0002, "epoch": 0.06142506142506143, "step": 50}, {"loss": 1.3831, "grad_norm": 1.0307111740112305, "learning_rate": 0.0002, "epoch": 0.07371007371007371, "step": 60}, {"loss": 1.2987, "grad_norm": 1.1837389469146729, "learning_rate": 0.0002, "epoch": 0.085995085995086, "step": 70}, {"loss": 1.2325, "grad_norm": 1.1481467485427856, "learning_rate": 0.0002, "epoch": 0.09828009828009827, "step": 80}, {"loss": 1.1425, "grad_norm": 1.0385297536849976, "learning_rate": 0.0002, "epoch": 0.11056511056511056, "step": 90}, {"loss": 1.1177, "grad_norm": 1.125789999961853, "learning_rate": 0.0002, "epoch": 0.12285012285012285, "step": 100}, {"loss": 1.0477, "grad_norm": 0.9630613923072815, "learning_rate": 0.0002, "epoch": 0.13513513513513514, "step": 110}, {"loss": 1.0074, "grad_norm": 1.060392141342163, "learning_rate": 0.0002, "epoch": 0.14742014742014742, "step": 120}, {"loss": 1.0128, "grad_norm": 1.0986546277999878, "learning_rate": 0.0002, "epoch": 0.1597051597051597, "step": 130}, {"loss": 1.0068, "grad_norm": 1.1713459491729736, "learning_rate": 0.0002, "epoch": 0.171990171990172, "step": 140}, {"loss": 0.973, "grad_norm": 1.1548224687576294, "learning_rate": 0.0002, "epoch": 0.18427518427518427, "step": 150}, {"loss": 0.941, "grad_norm": 1.2662502527236938, "learning_rate": 0.0002, "epoch": 0.19656019656019655, "step": 160}, {"loss": 0.8849, "grad_norm": 1.1521110534667969, "learning_rate": 0.0002, "epoch": 0.20884520884520885, "step": 170}, {"loss": 0.8931, "grad_norm": 1.1044857501983643, "learning_rate": 0.0002, "epoch": 0.22113022113022113, "step": 180}, {"loss": 0.9572, "grad_norm": 0.9770650267601013, "learning_rate": 0.0002, "epoch": 0.2334152334152334, "step": 190}, {"loss": 0.881, "grad_norm": 0.9710931777954102, "learning_rate": 0.0002, "epoch": 0.2457002457002457, "step": 200}, {"loss": 0.9205, "grad_norm": 0.9593933820724487, "learning_rate": 0.0002, "epoch": 0.257985257985258, "step": 210}, {"loss": 0.843, "grad_norm": 1.003553032875061, "learning_rate": 0.0002, "epoch": 0.2702702702702703, "step": 220}, {"loss": 0.9032, "grad_norm": 0.9187764525413513, "learning_rate": 0.0002, "epoch": 0.28255528255528256, "step": 230}, {"loss": 0.8572, "grad_norm": 0.9294946789741516, "learning_rate": 0.0002, "epoch": 0.29484029484029484, "step": 240}, {"loss": 0.8856, "grad_norm": 0.9537560939788818, "learning_rate": 0.0002, "epoch": 0.3071253071253071, "step": 250}, {"loss": 0.8546, "grad_norm": 1.00537109375, "learning_rate": 0.0002, "epoch": 0.3194103194103194, "step": 260}, {"loss": 0.896, "grad_norm": 0.8775776028633118, "learning_rate": 0.0002, "epoch": 0.3316953316953317, "step": 270}, {"loss": 0.808, "grad_norm": 0.8316839933395386, "learning_rate": 0.0002, "epoch": 0.343980343980344, "step": 280}, {"loss": 0.8248, "grad_norm": 0.8542073965072632, "learning_rate": 0.0002, "epoch": 0.35626535626535627, "step": 290}, {"loss": 0.8452, "grad_norm": 0.848444402217865, "learning_rate": 0.0002, "epoch": 0.36855036855036855, "step": 300}, {"loss": 0.8253, "grad_norm": 0.9017520546913147, "learning_rate": 0.0002, "epoch": 0.3808353808353808, "step": 310}, {"loss": 0.8098, "grad_norm": 0.7672467231750488, "learning_rate": 0.0002, "epoch": 0.3931203931203931, "step": 320}, {"loss": 0.8478, "grad_norm": 0.9109916687011719, "learning_rate": 0.0002, "epoch": 0.40540540540540543, "step": 330}, {"loss": 0.8041, "grad_norm": 0.8750321269035339, "learning_rate": 0.0002, "epoch": 0.4176904176904177, "step": 340}, {"loss": 0.8158, "grad_norm": 0.7911098599433899, "learning_rate": 0.0002, "epoch": 0.42997542997543, "step": 350}, {"loss": 0.8001, "grad_norm": 0.871601402759552, "learning_rate": 0.0002, "epoch": 0.44226044226044225, "step": 360}, {"loss": 0.8187, "grad_norm": 0.9393917918205261, "learning_rate": 0.0002, "epoch": 0.45454545454545453, "step": 370}, {"loss": 0.8124, "grad_norm": 0.8260403275489807, "learning_rate": 0.0002, "epoch": 0.4668304668304668, "step": 380}, {"loss": 0.7768, "grad_norm": 0.9792159199714661, "learning_rate": 0.0002, "epoch": 0.47911547911547914, "step": 390}, {"loss": 0.7981, "grad_norm": 0.9943315982818604, "learning_rate": 0.0002, "epoch": 0.4914004914004914, "step": 400}, {"loss": 0.7765, "grad_norm": 0.8999950885772705, "learning_rate": 0.0002, "epoch": 0.5036855036855037, "step": 410}, {"loss": 0.7807, "grad_norm": 0.8348393440246582, "learning_rate": 0.0002, "epoch": 0.515970515970516, "step": 420}, {"loss": 0.8269, "grad_norm": 0.7371744513511658, "learning_rate": 0.0002, "epoch": 0.5282555282555282, "step": 430}, {"loss": 0.8181, "grad_norm": 0.8354107141494751, "learning_rate": 0.0002, "epoch": 0.5405405405405406, "step": 440}, {"loss": 0.7849, "grad_norm": 0.8553793430328369, "learning_rate": 0.0002, "epoch": 0.5528255528255528, "step": 450}, {"loss": 0.8098, "grad_norm": 1.0762015581130981, "learning_rate": 0.0002, "epoch": 0.5651105651105651, "step": 460}, {"loss": 0.7942, "grad_norm": 0.8350747227668762, "learning_rate": 0.0002, "epoch": 0.5773955773955773, "step": 470}, {"loss": 0.7922, "grad_norm": 0.7819945216178894, "learning_rate": 0.0002, "epoch": 0.5896805896805897, "step": 480}, {"loss": 0.7845, "grad_norm": 0.8079741597175598, "learning_rate": 0.0002, "epoch": 0.601965601965602, "step": 490}, {"loss": 0.7417, "grad_norm": 0.776435911655426, "learning_rate": 0.0002, "epoch": 0.6142506142506142, "step": 500}, {"loss": 0.7855, "grad_norm": 0.7646855115890503, "learning_rate": 0.0002, "epoch": 0.6265356265356266, "step": 510}, {"loss": 0.7923, "grad_norm": 0.786396861076355, "learning_rate": 0.0002, "epoch": 0.6388206388206388, "step": 520}, {"loss": 0.7624, "grad_norm": 0.7016594409942627, "learning_rate": 0.0002, "epoch": 0.6511056511056511, "step": 530}, {"loss": 0.786, "grad_norm": 0.8060444593429565, "learning_rate": 0.0002, "epoch": 0.6633906633906634, "step": 540}, {"loss": 0.7417, "grad_norm": 0.9087467789649963, "learning_rate": 0.0002, "epoch": 0.6756756756756757, "step": 550}, {"loss": 0.7591, "grad_norm": 0.8149628639221191, "learning_rate": 0.0002, "epoch": 0.687960687960688, "step": 560}, {"loss": 0.8004, "grad_norm": 0.7493641972541809, "learning_rate": 0.0002, "epoch": 0.7002457002457002, "step": 570}, {"loss": 0.765, "grad_norm": 0.7958765625953674, "learning_rate": 0.0002, "epoch": 0.7125307125307125, "step": 580}, {"loss": 0.7276, "grad_norm": 0.7917273640632629, "learning_rate": 0.0002, "epoch": 0.7248157248157249, "step": 590}, {"loss": 0.758, "grad_norm": 0.8040468692779541, "learning_rate": 0.0002, "epoch": 0.7371007371007371, "step": 600}, {"loss": 0.735, "grad_norm": 0.8696851134300232, "learning_rate": 0.0002, "epoch": 0.7493857493857494, "step": 610}, {"loss": 0.7321, "grad_norm": 0.8418059945106506, "learning_rate": 0.0002, "epoch": 0.7616707616707616, "step": 620}, {"loss": 0.7395, "grad_norm": 0.7754243612289429, "learning_rate": 0.0002, "epoch": 0.773955773955774, "step": 630}, {"loss": 0.7679, "grad_norm": 0.7639613747596741, "learning_rate": 0.0002, "epoch": 0.7862407862407862, "step": 640}, {"loss": 0.7159, "grad_norm": 0.7516646385192871, "learning_rate": 0.0002, "epoch": 0.7985257985257985, "step": 650}, {"loss": 0.7349, "grad_norm": 0.7840844988822937, "learning_rate": 0.0002, "epoch": 0.8108108108108109, "step": 660}, {"loss": 0.7264, "grad_norm": 0.7657070755958557, "learning_rate": 0.0002, "epoch": 0.8230958230958231, "step": 670}, {"loss": 0.7369, "grad_norm": 0.7711591720581055, "learning_rate": 0.0002, "epoch": 0.8353808353808354, "step": 680}, {"loss": 0.759, "grad_norm": 0.8026325106620789, "learning_rate": 0.0002, "epoch": 0.8476658476658476, "step": 690}, {"loss": 0.737, "grad_norm": 0.7902713418006897, "learning_rate": 0.0002, "epoch": 0.85995085995086, "step": 700}, {"loss": 0.7349, "grad_norm": 0.8212456107139587, "learning_rate": 0.0002, "epoch": 0.8722358722358723, "step": 710}, {"loss": 0.7661, "grad_norm": 0.7867200970649719, "learning_rate": 0.0002, "epoch": 0.8845208845208845, "step": 720}, {"loss": 0.7195, "grad_norm": 0.80084627866745, "learning_rate": 0.0002, "epoch": 0.8968058968058968, "step": 730}, {"loss": 0.7641, "grad_norm": 0.7203794121742249, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 740}, {"loss": 0.7134, "grad_norm": 0.7598419785499573, "learning_rate": 0.0002, "epoch": 0.9213759213759214, "step": 750}, {"loss": 0.7208, "grad_norm": 0.7787027359008789, "learning_rate": 0.0002, "epoch": 0.9336609336609336, "step": 760}, {"loss": 0.7119, "grad_norm": 0.8444012403488159, "learning_rate": 0.0002, "epoch": 0.9459459459459459, "step": 770}, {"loss": 0.7099, "grad_norm": 0.7388550639152527, "learning_rate": 0.0002, "epoch": 0.9582309582309583, "step": 780}, {"loss": 0.7184, "grad_norm": 0.7379167079925537, "learning_rate": 0.0002, "epoch": 0.9705159705159705, "step": 790}, {"loss": 0.7143, "grad_norm": 0.8291640281677246, "learning_rate": 0.0002, "epoch": 0.9828009828009828, "step": 800}, {"loss": 0.6972, "grad_norm": 0.7415094375610352, "learning_rate": 0.0002, "epoch": 0.995085995085995, "step": 810}]} +{"epoch": 2.0, "step": 1628, "epoch_duration": 737.6845755577087, "total_accumulated_duration": 1486.9830372333527, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.17431640625}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-814", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.5354, "grad_norm": 0.8178550004959106, "learning_rate": 0.0002, "epoch": 0.012285012285012284, "step": 10}, {"loss": 2.534, "grad_norm": 1.0338047742843628, "learning_rate": 0.0002, "epoch": 0.02457002457002457, "step": 20}, {"loss": 2.1691, "grad_norm": 0.8931729197502136, "learning_rate": 0.0002, "epoch": 0.036855036855036855, "step": 30}, {"loss": 1.8813, "grad_norm": 0.9666458964347839, "learning_rate": 0.0002, "epoch": 0.04914004914004914, "step": 40}, {"loss": 1.6479, "grad_norm": 1.2691702842712402, "learning_rate": 0.0002, "epoch": 0.06142506142506143, "step": 50}, {"loss": 1.3831, "grad_norm": 1.0307111740112305, "learning_rate": 0.0002, "epoch": 0.07371007371007371, "step": 60}, {"loss": 1.2987, "grad_norm": 1.1837389469146729, "learning_rate": 0.0002, "epoch": 0.085995085995086, "step": 70}, {"loss": 1.2325, "grad_norm": 1.1481467485427856, "learning_rate": 0.0002, "epoch": 0.09828009828009827, "step": 80}, {"loss": 1.1425, "grad_norm": 1.0385297536849976, "learning_rate": 0.0002, "epoch": 0.11056511056511056, "step": 90}, {"loss": 1.1177, "grad_norm": 1.125789999961853, "learning_rate": 0.0002, "epoch": 0.12285012285012285, "step": 100}, {"loss": 1.0477, "grad_norm": 0.9630613923072815, "learning_rate": 0.0002, "epoch": 0.13513513513513514, "step": 110}, {"loss": 1.0074, "grad_norm": 1.060392141342163, "learning_rate": 0.0002, "epoch": 0.14742014742014742, "step": 120}, {"loss": 1.0128, "grad_norm": 1.0986546277999878, "learning_rate": 0.0002, "epoch": 0.1597051597051597, "step": 130}, {"loss": 1.0068, "grad_norm": 1.1713459491729736, "learning_rate": 0.0002, "epoch": 0.171990171990172, "step": 140}, {"loss": 0.973, "grad_norm": 1.1548224687576294, "learning_rate": 0.0002, "epoch": 0.18427518427518427, "step": 150}, {"loss": 0.941, "grad_norm": 1.2662502527236938, "learning_rate": 0.0002, "epoch": 0.19656019656019655, "step": 160}, {"loss": 0.8849, "grad_norm": 1.1521110534667969, "learning_rate": 0.0002, "epoch": 0.20884520884520885, "step": 170}, {"loss": 0.8931, "grad_norm": 1.1044857501983643, "learning_rate": 0.0002, "epoch": 0.22113022113022113, "step": 180}, {"loss": 0.9572, "grad_norm": 0.9770650267601013, "learning_rate": 0.0002, "epoch": 0.2334152334152334, "step": 190}, {"loss": 0.881, "grad_norm": 0.9710931777954102, "learning_rate": 0.0002, "epoch": 0.2457002457002457, "step": 200}, {"loss": 0.9205, "grad_norm": 0.9593933820724487, "learning_rate": 0.0002, "epoch": 0.257985257985258, "step": 210}, {"loss": 0.843, "grad_norm": 1.003553032875061, "learning_rate": 0.0002, "epoch": 0.2702702702702703, "step": 220}, {"loss": 0.9032, "grad_norm": 0.9187764525413513, "learning_rate": 0.0002, "epoch": 0.28255528255528256, "step": 230}, {"loss": 0.8572, "grad_norm": 0.9294946789741516, "learning_rate": 0.0002, "epoch": 0.29484029484029484, "step": 240}, {"loss": 0.8856, "grad_norm": 0.9537560939788818, "learning_rate": 0.0002, "epoch": 0.3071253071253071, "step": 250}, {"loss": 0.8546, "grad_norm": 1.00537109375, "learning_rate": 0.0002, "epoch": 0.3194103194103194, "step": 260}, {"loss": 0.896, "grad_norm": 0.8775776028633118, "learning_rate": 0.0002, "epoch": 0.3316953316953317, "step": 270}, {"loss": 0.808, "grad_norm": 0.8316839933395386, "learning_rate": 0.0002, "epoch": 0.343980343980344, "step": 280}, {"loss": 0.8248, "grad_norm": 0.8542073965072632, "learning_rate": 0.0002, "epoch": 0.35626535626535627, "step": 290}, {"loss": 0.8452, "grad_norm": 0.848444402217865, "learning_rate": 0.0002, "epoch": 0.36855036855036855, "step": 300}, {"loss": 0.8253, "grad_norm": 0.9017520546913147, "learning_rate": 0.0002, "epoch": 0.3808353808353808, "step": 310}, {"loss": 0.8098, "grad_norm": 0.7672467231750488, "learning_rate": 0.0002, "epoch": 0.3931203931203931, "step": 320}, {"loss": 0.8478, "grad_norm": 0.9109916687011719, "learning_rate": 0.0002, "epoch": 0.40540540540540543, "step": 330}, {"loss": 0.8041, "grad_norm": 0.8750321269035339, "learning_rate": 0.0002, "epoch": 0.4176904176904177, "step": 340}, {"loss": 0.8158, "grad_norm": 0.7911098599433899, "learning_rate": 0.0002, "epoch": 0.42997542997543, "step": 350}, {"loss": 0.8001, "grad_norm": 0.871601402759552, "learning_rate": 0.0002, "epoch": 0.44226044226044225, "step": 360}, {"loss": 0.8187, "grad_norm": 0.9393917918205261, "learning_rate": 0.0002, "epoch": 0.45454545454545453, "step": 370}, {"loss": 0.8124, "grad_norm": 0.8260403275489807, "learning_rate": 0.0002, "epoch": 0.4668304668304668, "step": 380}, {"loss": 0.7768, "grad_norm": 0.9792159199714661, "learning_rate": 0.0002, "epoch": 0.47911547911547914, "step": 390}, {"loss": 0.7981, "grad_norm": 0.9943315982818604, "learning_rate": 0.0002, "epoch": 0.4914004914004914, "step": 400}, {"loss": 0.7765, "grad_norm": 0.8999950885772705, "learning_rate": 0.0002, "epoch": 0.5036855036855037, "step": 410}, {"loss": 0.7807, "grad_norm": 0.8348393440246582, "learning_rate": 0.0002, "epoch": 0.515970515970516, "step": 420}, {"loss": 0.8269, "grad_norm": 0.7371744513511658, "learning_rate": 0.0002, "epoch": 0.5282555282555282, "step": 430}, {"loss": 0.8181, "grad_norm": 0.8354107141494751, "learning_rate": 0.0002, "epoch": 0.5405405405405406, "step": 440}, {"loss": 0.7849, "grad_norm": 0.8553793430328369, "learning_rate": 0.0002, "epoch": 0.5528255528255528, "step": 450}, {"loss": 0.8098, "grad_norm": 1.0762015581130981, "learning_rate": 0.0002, "epoch": 0.5651105651105651, "step": 460}, {"loss": 0.7942, "grad_norm": 0.8350747227668762, "learning_rate": 0.0002, "epoch": 0.5773955773955773, "step": 470}, {"loss": 0.7922, "grad_norm": 0.7819945216178894, "learning_rate": 0.0002, "epoch": 0.5896805896805897, "step": 480}, {"loss": 0.7845, "grad_norm": 0.8079741597175598, "learning_rate": 0.0002, "epoch": 0.601965601965602, "step": 490}, {"loss": 0.7417, "grad_norm": 0.776435911655426, "learning_rate": 0.0002, "epoch": 0.6142506142506142, "step": 500}, {"loss": 0.7855, "grad_norm": 0.7646855115890503, "learning_rate": 0.0002, "epoch": 0.6265356265356266, "step": 510}, {"loss": 0.7923, "grad_norm": 0.786396861076355, "learning_rate": 0.0002, "epoch": 0.6388206388206388, "step": 520}, {"loss": 0.7624, "grad_norm": 0.7016594409942627, "learning_rate": 0.0002, "epoch": 0.6511056511056511, "step": 530}, {"loss": 0.786, "grad_norm": 0.8060444593429565, "learning_rate": 0.0002, "epoch": 0.6633906633906634, "step": 540}, {"loss": 0.7417, "grad_norm": 0.9087467789649963, "learning_rate": 0.0002, "epoch": 0.6756756756756757, "step": 550}, {"loss": 0.7591, "grad_norm": 0.8149628639221191, "learning_rate": 0.0002, "epoch": 0.687960687960688, "step": 560}, {"loss": 0.8004, "grad_norm": 0.7493641972541809, "learning_rate": 0.0002, "epoch": 0.7002457002457002, "step": 570}, {"loss": 0.765, "grad_norm": 0.7958765625953674, "learning_rate": 0.0002, "epoch": 0.7125307125307125, "step": 580}, {"loss": 0.7276, "grad_norm": 0.7917273640632629, "learning_rate": 0.0002, "epoch": 0.7248157248157249, "step": 590}, {"loss": 0.758, "grad_norm": 0.8040468692779541, "learning_rate": 0.0002, "epoch": 0.7371007371007371, "step": 600}, {"loss": 0.735, "grad_norm": 0.8696851134300232, "learning_rate": 0.0002, "epoch": 0.7493857493857494, "step": 610}, {"loss": 0.7321, "grad_norm": 0.8418059945106506, "learning_rate": 0.0002, "epoch": 0.7616707616707616, "step": 620}, {"loss": 0.7395, "grad_norm": 0.7754243612289429, "learning_rate": 0.0002, "epoch": 0.773955773955774, "step": 630}, {"loss": 0.7679, "grad_norm": 0.7639613747596741, "learning_rate": 0.0002, "epoch": 0.7862407862407862, "step": 640}, {"loss": 0.7159, "grad_norm": 0.7516646385192871, "learning_rate": 0.0002, "epoch": 0.7985257985257985, "step": 650}, {"loss": 0.7349, "grad_norm": 0.7840844988822937, "learning_rate": 0.0002, "epoch": 0.8108108108108109, "step": 660}, {"loss": 0.7264, "grad_norm": 0.7657070755958557, "learning_rate": 0.0002, "epoch": 0.8230958230958231, "step": 670}, {"loss": 0.7369, "grad_norm": 0.7711591720581055, "learning_rate": 0.0002, "epoch": 0.8353808353808354, "step": 680}, {"loss": 0.759, "grad_norm": 0.8026325106620789, "learning_rate": 0.0002, "epoch": 0.8476658476658476, "step": 690}, {"loss": 0.737, "grad_norm": 0.7902713418006897, "learning_rate": 0.0002, "epoch": 0.85995085995086, "step": 700}, {"loss": 0.7349, "grad_norm": 0.8212456107139587, "learning_rate": 0.0002, "epoch": 0.8722358722358723, "step": 710}, {"loss": 0.7661, "grad_norm": 0.7867200970649719, "learning_rate": 0.0002, "epoch": 0.8845208845208845, "step": 720}, {"loss": 0.7195, "grad_norm": 0.80084627866745, "learning_rate": 0.0002, "epoch": 0.8968058968058968, "step": 730}, {"loss": 0.7641, "grad_norm": 0.7203794121742249, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 740}, {"loss": 0.7134, "grad_norm": 0.7598419785499573, "learning_rate": 0.0002, "epoch": 0.9213759213759214, "step": 750}, {"loss": 0.7208, "grad_norm": 0.7787027359008789, "learning_rate": 0.0002, "epoch": 0.9336609336609336, "step": 760}, {"loss": 0.7119, "grad_norm": 0.8444012403488159, "learning_rate": 0.0002, "epoch": 0.9459459459459459, "step": 770}, {"loss": 0.7099, "grad_norm": 0.7388550639152527, "learning_rate": 0.0002, "epoch": 0.9582309582309583, "step": 780}, {"loss": 0.7184, "grad_norm": 0.7379167079925537, "learning_rate": 0.0002, "epoch": 0.9705159705159705, "step": 790}, {"loss": 0.7143, "grad_norm": 0.8291640281677246, "learning_rate": 0.0002, "epoch": 0.9828009828009828, "step": 800}, {"loss": 0.6972, "grad_norm": 0.7415094375610352, "learning_rate": 0.0002, "epoch": 0.995085995085995, "step": 810}, {"eval_loss": 0.703994870185852, "eval_runtime": 20.2182, "eval_samples_per_second": 16.371, "eval_steps_per_second": 2.077, "epoch": 1.0, "step": 814}, {"loss": 0.6959, "grad_norm": 0.7405961751937866, "learning_rate": 0.0002, "epoch": 1.0073710073710074, "step": 820}, {"loss": 0.6706, "grad_norm": 0.8534344434738159, "learning_rate": 0.0002, "epoch": 1.0196560196560196, "step": 830}, {"loss": 0.6719, "grad_norm": 0.7415764331817627, "learning_rate": 0.0002, "epoch": 1.031941031941032, "step": 840}, {"loss": 0.6673, "grad_norm": 0.74293053150177, "learning_rate": 0.0002, "epoch": 1.0442260442260443, "step": 850}, {"loss": 0.6897, "grad_norm": 0.697727382183075, "learning_rate": 0.0002, "epoch": 1.0565110565110565, "step": 860}, {"loss": 0.6566, "grad_norm": 0.8022570013999939, "learning_rate": 0.0002, "epoch": 1.0687960687960687, "step": 870}, {"loss": 0.6759, "grad_norm": 0.7545800805091858, "learning_rate": 0.0002, "epoch": 1.0810810810810811, "step": 880}, {"loss": 0.6397, "grad_norm": 0.8005648255348206, "learning_rate": 0.0002, "epoch": 1.0933660933660934, "step": 890}, {"loss": 0.6499, "grad_norm": 0.7681778073310852, "learning_rate": 0.0002, "epoch": 1.1056511056511056, "step": 900}, {"loss": 0.6672, "grad_norm": 0.7822468876838684, "learning_rate": 0.0002, "epoch": 1.117936117936118, "step": 910}, {"loss": 0.6492, "grad_norm": 0.8324839472770691, "learning_rate": 0.0002, "epoch": 1.1302211302211302, "step": 920}, {"loss": 0.6659, "grad_norm": 0.8206289410591125, "learning_rate": 0.0002, "epoch": 1.1425061425061425, "step": 930}, {"loss": 0.6385, "grad_norm": 0.786461591720581, "learning_rate": 0.0002, "epoch": 1.154791154791155, "step": 940}, {"loss": 0.6493, "grad_norm": 0.8288539052009583, "learning_rate": 0.0002, "epoch": 1.1670761670761671, "step": 950}, {"loss": 0.6818, "grad_norm": 0.7566865682601929, "learning_rate": 0.0002, "epoch": 1.1793611793611793, "step": 960}, {"loss": 0.6597, "grad_norm": 0.7761894464492798, "learning_rate": 0.0002, "epoch": 1.1916461916461916, "step": 970}, {"loss": 0.6403, "grad_norm": 0.7608440518379211, "learning_rate": 0.0002, "epoch": 1.203931203931204, "step": 980}, {"loss": 0.7041, "grad_norm": 0.799745500087738, "learning_rate": 0.0002, "epoch": 1.2162162162162162, "step": 990}, {"loss": 0.6358, "grad_norm": 0.8135330677032471, "learning_rate": 0.0002, "epoch": 1.2285012285012284, "step": 1000}, {"loss": 0.6496, "grad_norm": 0.7410391569137573, "learning_rate": 0.0002, "epoch": 1.2407862407862407, "step": 1010}, {"loss": 0.63, "grad_norm": 0.7826172709465027, "learning_rate": 0.0002, "epoch": 1.253071253071253, "step": 1020}, {"loss": 0.6582, "grad_norm": 0.7210677862167358, "learning_rate": 0.0002, "epoch": 1.2653562653562653, "step": 1030}, {"loss": 0.6609, "grad_norm": 0.7571766972541809, "learning_rate": 0.0002, "epoch": 1.2776412776412776, "step": 1040}, {"loss": 0.6315, "grad_norm": 0.8602666258811951, "learning_rate": 0.0002, "epoch": 1.28992628992629, "step": 1050}, {"loss": 0.6825, "grad_norm": 0.8640648722648621, "learning_rate": 0.0002, "epoch": 1.3022113022113022, "step": 1060}, {"loss": 0.6563, "grad_norm": 0.7289374470710754, "learning_rate": 0.0002, "epoch": 1.3144963144963144, "step": 1070}, {"loss": 0.629, "grad_norm": 0.8099908828735352, "learning_rate": 0.0002, "epoch": 1.3267813267813269, "step": 1080}, {"loss": 0.6882, "grad_norm": 0.8623505234718323, "learning_rate": 0.0002, "epoch": 1.339066339066339, "step": 1090}, {"loss": 0.6368, "grad_norm": 0.900576114654541, "learning_rate": 0.0002, "epoch": 1.3513513513513513, "step": 1100}, {"loss": 0.6398, "grad_norm": 0.729603111743927, "learning_rate": 0.0002, "epoch": 1.3636363636363638, "step": 1110}, {"loss": 0.6619, "grad_norm": 0.8350434303283691, "learning_rate": 0.0002, "epoch": 1.375921375921376, "step": 1120}, {"loss": 0.6447, "grad_norm": 0.8049437999725342, "learning_rate": 0.0002, "epoch": 1.3882063882063882, "step": 1130}, {"loss": 0.6336, "grad_norm": 0.8222764134407043, "learning_rate": 0.0002, "epoch": 1.4004914004914004, "step": 1140}, {"loss": 0.6453, "grad_norm": 0.7949751019477844, "learning_rate": 0.0002, "epoch": 1.4127764127764126, "step": 1150}, {"loss": 0.6246, "grad_norm": 0.8375639915466309, "learning_rate": 0.0002, "epoch": 1.425061425061425, "step": 1160}, {"loss": 0.6358, "grad_norm": 0.7261053919792175, "learning_rate": 0.0002, "epoch": 1.4373464373464373, "step": 1170}, {"loss": 0.6709, "grad_norm": 0.6918320655822754, "learning_rate": 0.0002, "epoch": 1.4496314496314495, "step": 1180}, {"loss": 0.598, "grad_norm": 0.8148727416992188, "learning_rate": 0.0002, "epoch": 1.461916461916462, "step": 1190}, {"loss": 0.6269, "grad_norm": 0.7014724612236023, "learning_rate": 0.0002, "epoch": 1.4742014742014742, "step": 1200}, {"loss": 0.617, "grad_norm": 0.8110846281051636, "learning_rate": 0.0002, "epoch": 1.4864864864864864, "step": 1210}, {"loss": 0.6633, "grad_norm": 0.8336407542228699, "learning_rate": 0.0002, "epoch": 1.4987714987714988, "step": 1220}, {"loss": 0.6028, "grad_norm": 0.826996386051178, "learning_rate": 0.0002, "epoch": 1.511056511056511, "step": 1230}, {"loss": 0.6464, "grad_norm": 0.7503120303153992, "learning_rate": 0.0002, "epoch": 1.5233415233415233, "step": 1240}, {"loss": 0.6418, "grad_norm": 0.8297192454338074, "learning_rate": 0.0002, "epoch": 1.5356265356265357, "step": 1250}, {"loss": 0.6466, "grad_norm": 0.7585996985435486, "learning_rate": 0.0002, "epoch": 1.547911547911548, "step": 1260}, {"loss": 0.6196, "grad_norm": 0.7530493140220642, "learning_rate": 0.0002, "epoch": 1.5601965601965602, "step": 1270}, {"loss": 0.6252, "grad_norm": 0.8141939640045166, "learning_rate": 0.0002, "epoch": 1.5724815724815726, "step": 1280}, {"loss": 0.6441, "grad_norm": 0.6959931254386902, "learning_rate": 0.0002, "epoch": 1.5847665847665846, "step": 1290}, {"loss": 0.6542, "grad_norm": 0.8677428364753723, "learning_rate": 0.0002, "epoch": 1.597051597051597, "step": 1300}, {"loss": 0.633, "grad_norm": 0.8527476787567139, "learning_rate": 0.0002, "epoch": 1.6093366093366095, "step": 1310}, {"loss": 0.6393, "grad_norm": 0.8462157845497131, "learning_rate": 0.0002, "epoch": 1.6216216216216215, "step": 1320}, {"loss": 0.6265, "grad_norm": 0.9371153712272644, "learning_rate": 0.0002, "epoch": 1.633906633906634, "step": 1330}, {"loss": 0.5952, "grad_norm": 0.8408344984054565, "learning_rate": 0.0002, "epoch": 1.6461916461916462, "step": 1340}, {"loss": 0.599, "grad_norm": 0.8391859531402588, "learning_rate": 0.0002, "epoch": 1.6584766584766584, "step": 1350}, {"loss": 0.6313, "grad_norm": 0.7630598545074463, "learning_rate": 0.0002, "epoch": 1.6707616707616708, "step": 1360}, {"loss": 0.5989, "grad_norm": 0.8007895350456238, "learning_rate": 0.0002, "epoch": 1.683046683046683, "step": 1370}, {"loss": 0.6094, "grad_norm": 0.7547900080680847, "learning_rate": 0.0002, "epoch": 1.6953316953316953, "step": 1380}, {"loss": 0.6335, "grad_norm": 0.7779742479324341, "learning_rate": 0.0002, "epoch": 1.7076167076167077, "step": 1390}, {"loss": 0.6078, "grad_norm": 0.712293803691864, "learning_rate": 0.0002, "epoch": 1.71990171990172, "step": 1400}, {"loss": 0.608, "grad_norm": 0.8503297567367554, "learning_rate": 0.0002, "epoch": 1.7321867321867321, "step": 1410}, {"loss": 0.6055, "grad_norm": 0.8312245607376099, "learning_rate": 0.0002, "epoch": 1.7444717444717446, "step": 1420}, {"loss": 0.5978, "grad_norm": 0.7758049368858337, "learning_rate": 0.0002, "epoch": 1.7567567567567568, "step": 1430}, {"loss": 0.5822, "grad_norm": 0.8695956468582153, "learning_rate": 0.0002, "epoch": 1.769041769041769, "step": 1440}, {"loss": 0.5955, "grad_norm": 0.7785261273384094, "learning_rate": 0.0002, "epoch": 1.7813267813267815, "step": 1450}, {"loss": 0.6177, "grad_norm": 0.7091802358627319, "learning_rate": 0.0002, "epoch": 1.7936117936117935, "step": 1460}, {"loss": 0.5811, "grad_norm": 0.774146556854248, "learning_rate": 0.0002, "epoch": 1.805896805896806, "step": 1470}, {"loss": 0.5833, "grad_norm": 0.8342524170875549, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 1480}, {"loss": 0.634, "grad_norm": 0.8087738156318665, "learning_rate": 0.0002, "epoch": 1.8304668304668303, "step": 1490}, {"loss": 0.5961, "grad_norm": 0.9830479621887207, "learning_rate": 0.0002, "epoch": 1.8427518427518428, "step": 1500}, {"loss": 0.6211, "grad_norm": 0.8537567853927612, "learning_rate": 0.0002, "epoch": 1.855036855036855, "step": 1510}, {"loss": 0.5767, "grad_norm": 0.8004562854766846, "learning_rate": 0.0002, "epoch": 1.8673218673218672, "step": 1520}, {"loss": 0.604, "grad_norm": 0.8161284327507019, "learning_rate": 0.0002, "epoch": 1.8796068796068797, "step": 1530}, {"loss": 0.5808, "grad_norm": 0.8688093423843384, "learning_rate": 0.0002, "epoch": 1.8918918918918919, "step": 1540}, {"loss": 0.5663, "grad_norm": 0.8287379741668701, "learning_rate": 0.0002, "epoch": 1.904176904176904, "step": 1550}, {"loss": 0.5963, "grad_norm": 0.8050342202186584, "learning_rate": 0.0002, "epoch": 1.9164619164619165, "step": 1560}, {"loss": 0.5837, "grad_norm": 0.9273895621299744, "learning_rate": 0.0002, "epoch": 1.9287469287469288, "step": 1570}, {"loss": 0.5945, "grad_norm": 0.8416891694068909, "learning_rate": 0.0002, "epoch": 1.941031941031941, "step": 1580}, {"loss": 0.5838, "grad_norm": 0.7299820184707642, "learning_rate": 0.0002, "epoch": 1.9533169533169534, "step": 1590}, {"loss": 0.6025, "grad_norm": 0.7262272834777832, "learning_rate": 0.0002, "epoch": 1.9656019656019657, "step": 1600}, {"loss": 0.5873, "grad_norm": 0.8649004697799683, "learning_rate": 0.0002, "epoch": 1.9778869778869779, "step": 1610}, {"loss": 0.5764, "grad_norm": 0.8165444731712341, "learning_rate": 0.0002, "epoch": 1.9901719901719903, "step": 1620}]} +{"epoch": 3.0, "step": 2442, "epoch_duration": 735.7253043651581, "total_accumulated_duration": 2222.7083415985107, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.17431640625}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-1628", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.5354, "grad_norm": 0.8178550004959106, "learning_rate": 0.0002, "epoch": 0.012285012285012284, "step": 10}, {"loss": 2.534, "grad_norm": 1.0338047742843628, "learning_rate": 0.0002, "epoch": 0.02457002457002457, "step": 20}, {"loss": 2.1691, "grad_norm": 0.8931729197502136, "learning_rate": 0.0002, "epoch": 0.036855036855036855, "step": 30}, {"loss": 1.8813, "grad_norm": 0.9666458964347839, "learning_rate": 0.0002, "epoch": 0.04914004914004914, "step": 40}, {"loss": 1.6479, "grad_norm": 1.2691702842712402, "learning_rate": 0.0002, "epoch": 0.06142506142506143, "step": 50}, {"loss": 1.3831, "grad_norm": 1.0307111740112305, "learning_rate": 0.0002, "epoch": 0.07371007371007371, "step": 60}, {"loss": 1.2987, "grad_norm": 1.1837389469146729, "learning_rate": 0.0002, "epoch": 0.085995085995086, "step": 70}, {"loss": 1.2325, "grad_norm": 1.1481467485427856, "learning_rate": 0.0002, "epoch": 0.09828009828009827, "step": 80}, {"loss": 1.1425, "grad_norm": 1.0385297536849976, "learning_rate": 0.0002, "epoch": 0.11056511056511056, "step": 90}, {"loss": 1.1177, "grad_norm": 1.125789999961853, "learning_rate": 0.0002, "epoch": 0.12285012285012285, "step": 100}, {"loss": 1.0477, "grad_norm": 0.9630613923072815, "learning_rate": 0.0002, "epoch": 0.13513513513513514, "step": 110}, {"loss": 1.0074, "grad_norm": 1.060392141342163, "learning_rate": 0.0002, "epoch": 0.14742014742014742, "step": 120}, {"loss": 1.0128, "grad_norm": 1.0986546277999878, "learning_rate": 0.0002, "epoch": 0.1597051597051597, "step": 130}, {"loss": 1.0068, "grad_norm": 1.1713459491729736, "learning_rate": 0.0002, "epoch": 0.171990171990172, "step": 140}, {"loss": 0.973, "grad_norm": 1.1548224687576294, "learning_rate": 0.0002, "epoch": 0.18427518427518427, "step": 150}, {"loss": 0.941, "grad_norm": 1.2662502527236938, "learning_rate": 0.0002, "epoch": 0.19656019656019655, "step": 160}, {"loss": 0.8849, "grad_norm": 1.1521110534667969, "learning_rate": 0.0002, "epoch": 0.20884520884520885, "step": 170}, {"loss": 0.8931, "grad_norm": 1.1044857501983643, "learning_rate": 0.0002, "epoch": 0.22113022113022113, "step": 180}, {"loss": 0.9572, "grad_norm": 0.9770650267601013, "learning_rate": 0.0002, "epoch": 0.2334152334152334, "step": 190}, {"loss": 0.881, "grad_norm": 0.9710931777954102, "learning_rate": 0.0002, "epoch": 0.2457002457002457, "step": 200}, {"loss": 0.9205, "grad_norm": 0.9593933820724487, "learning_rate": 0.0002, "epoch": 0.257985257985258, "step": 210}, {"loss": 0.843, "grad_norm": 1.003553032875061, "learning_rate": 0.0002, "epoch": 0.2702702702702703, "step": 220}, {"loss": 0.9032, "grad_norm": 0.9187764525413513, "learning_rate": 0.0002, "epoch": 0.28255528255528256, "step": 230}, {"loss": 0.8572, "grad_norm": 0.9294946789741516, "learning_rate": 0.0002, "epoch": 0.29484029484029484, "step": 240}, {"loss": 0.8856, "grad_norm": 0.9537560939788818, "learning_rate": 0.0002, "epoch": 0.3071253071253071, "step": 250}, {"loss": 0.8546, "grad_norm": 1.00537109375, "learning_rate": 0.0002, "epoch": 0.3194103194103194, "step": 260}, {"loss": 0.896, "grad_norm": 0.8775776028633118, "learning_rate": 0.0002, "epoch": 0.3316953316953317, "step": 270}, {"loss": 0.808, "grad_norm": 0.8316839933395386, "learning_rate": 0.0002, "epoch": 0.343980343980344, "step": 280}, {"loss": 0.8248, "grad_norm": 0.8542073965072632, "learning_rate": 0.0002, "epoch": 0.35626535626535627, "step": 290}, {"loss": 0.8452, "grad_norm": 0.848444402217865, "learning_rate": 0.0002, "epoch": 0.36855036855036855, "step": 300}, {"loss": 0.8253, "grad_norm": 0.9017520546913147, "learning_rate": 0.0002, "epoch": 0.3808353808353808, "step": 310}, {"loss": 0.8098, "grad_norm": 0.7672467231750488, "learning_rate": 0.0002, "epoch": 0.3931203931203931, "step": 320}, {"loss": 0.8478, "grad_norm": 0.9109916687011719, "learning_rate": 0.0002, "epoch": 0.40540540540540543, "step": 330}, {"loss": 0.8041, "grad_norm": 0.8750321269035339, "learning_rate": 0.0002, "epoch": 0.4176904176904177, "step": 340}, {"loss": 0.8158, "grad_norm": 0.7911098599433899, "learning_rate": 0.0002, "epoch": 0.42997542997543, "step": 350}, {"loss": 0.8001, "grad_norm": 0.871601402759552, "learning_rate": 0.0002, "epoch": 0.44226044226044225, "step": 360}, {"loss": 0.8187, "grad_norm": 0.9393917918205261, "learning_rate": 0.0002, "epoch": 0.45454545454545453, "step": 370}, {"loss": 0.8124, "grad_norm": 0.8260403275489807, "learning_rate": 0.0002, "epoch": 0.4668304668304668, "step": 380}, {"loss": 0.7768, "grad_norm": 0.9792159199714661, "learning_rate": 0.0002, "epoch": 0.47911547911547914, "step": 390}, {"loss": 0.7981, "grad_norm": 0.9943315982818604, "learning_rate": 0.0002, "epoch": 0.4914004914004914, "step": 400}, {"loss": 0.7765, "grad_norm": 0.8999950885772705, "learning_rate": 0.0002, "epoch": 0.5036855036855037, "step": 410}, {"loss": 0.7807, "grad_norm": 0.8348393440246582, "learning_rate": 0.0002, "epoch": 0.515970515970516, "step": 420}, {"loss": 0.8269, "grad_norm": 0.7371744513511658, "learning_rate": 0.0002, "epoch": 0.5282555282555282, "step": 430}, {"loss": 0.8181, "grad_norm": 0.8354107141494751, "learning_rate": 0.0002, "epoch": 0.5405405405405406, "step": 440}, {"loss": 0.7849, "grad_norm": 0.8553793430328369, "learning_rate": 0.0002, "epoch": 0.5528255528255528, "step": 450}, {"loss": 0.8098, "grad_norm": 1.0762015581130981, "learning_rate": 0.0002, "epoch": 0.5651105651105651, "step": 460}, {"loss": 0.7942, "grad_norm": 0.8350747227668762, "learning_rate": 0.0002, "epoch": 0.5773955773955773, "step": 470}, {"loss": 0.7922, "grad_norm": 0.7819945216178894, "learning_rate": 0.0002, "epoch": 0.5896805896805897, "step": 480}, {"loss": 0.7845, "grad_norm": 0.8079741597175598, "learning_rate": 0.0002, "epoch": 0.601965601965602, "step": 490}, {"loss": 0.7417, "grad_norm": 0.776435911655426, "learning_rate": 0.0002, "epoch": 0.6142506142506142, "step": 500}, {"loss": 0.7855, "grad_norm": 0.7646855115890503, "learning_rate": 0.0002, "epoch": 0.6265356265356266, "step": 510}, {"loss": 0.7923, "grad_norm": 0.786396861076355, "learning_rate": 0.0002, "epoch": 0.6388206388206388, "step": 520}, {"loss": 0.7624, "grad_norm": 0.7016594409942627, "learning_rate": 0.0002, "epoch": 0.6511056511056511, "step": 530}, {"loss": 0.786, "grad_norm": 0.8060444593429565, "learning_rate": 0.0002, "epoch": 0.6633906633906634, "step": 540}, {"loss": 0.7417, "grad_norm": 0.9087467789649963, "learning_rate": 0.0002, "epoch": 0.6756756756756757, "step": 550}, {"loss": 0.7591, "grad_norm": 0.8149628639221191, "learning_rate": 0.0002, "epoch": 0.687960687960688, "step": 560}, {"loss": 0.8004, "grad_norm": 0.7493641972541809, "learning_rate": 0.0002, "epoch": 0.7002457002457002, "step": 570}, {"loss": 0.765, "grad_norm": 0.7958765625953674, "learning_rate": 0.0002, "epoch": 0.7125307125307125, "step": 580}, {"loss": 0.7276, "grad_norm": 0.7917273640632629, "learning_rate": 0.0002, "epoch": 0.7248157248157249, "step": 590}, {"loss": 0.758, "grad_norm": 0.8040468692779541, "learning_rate": 0.0002, "epoch": 0.7371007371007371, "step": 600}, {"loss": 0.735, "grad_norm": 0.8696851134300232, "learning_rate": 0.0002, "epoch": 0.7493857493857494, "step": 610}, {"loss": 0.7321, "grad_norm": 0.8418059945106506, "learning_rate": 0.0002, "epoch": 0.7616707616707616, "step": 620}, {"loss": 0.7395, "grad_norm": 0.7754243612289429, "learning_rate": 0.0002, "epoch": 0.773955773955774, "step": 630}, {"loss": 0.7679, "grad_norm": 0.7639613747596741, "learning_rate": 0.0002, "epoch": 0.7862407862407862, "step": 640}, {"loss": 0.7159, "grad_norm": 0.7516646385192871, "learning_rate": 0.0002, "epoch": 0.7985257985257985, "step": 650}, {"loss": 0.7349, "grad_norm": 0.7840844988822937, "learning_rate": 0.0002, "epoch": 0.8108108108108109, "step": 660}, {"loss": 0.7264, "grad_norm": 0.7657070755958557, "learning_rate": 0.0002, "epoch": 0.8230958230958231, "step": 670}, {"loss": 0.7369, "grad_norm": 0.7711591720581055, "learning_rate": 0.0002, "epoch": 0.8353808353808354, "step": 680}, {"loss": 0.759, "grad_norm": 0.8026325106620789, "learning_rate": 0.0002, "epoch": 0.8476658476658476, "step": 690}, {"loss": 0.737, "grad_norm": 0.7902713418006897, "learning_rate": 0.0002, "epoch": 0.85995085995086, "step": 700}, {"loss": 0.7349, "grad_norm": 0.8212456107139587, "learning_rate": 0.0002, "epoch": 0.8722358722358723, "step": 710}, {"loss": 0.7661, "grad_norm": 0.7867200970649719, "learning_rate": 0.0002, "epoch": 0.8845208845208845, "step": 720}, {"loss": 0.7195, "grad_norm": 0.80084627866745, "learning_rate": 0.0002, "epoch": 0.8968058968058968, "step": 730}, {"loss": 0.7641, "grad_norm": 0.7203794121742249, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 740}, {"loss": 0.7134, "grad_norm": 0.7598419785499573, "learning_rate": 0.0002, "epoch": 0.9213759213759214, "step": 750}, {"loss": 0.7208, "grad_norm": 0.7787027359008789, "learning_rate": 0.0002, "epoch": 0.9336609336609336, "step": 760}, {"loss": 0.7119, "grad_norm": 0.8444012403488159, "learning_rate": 0.0002, "epoch": 0.9459459459459459, "step": 770}, {"loss": 0.7099, "grad_norm": 0.7388550639152527, "learning_rate": 0.0002, "epoch": 0.9582309582309583, "step": 780}, {"loss": 0.7184, "grad_norm": 0.7379167079925537, "learning_rate": 0.0002, "epoch": 0.9705159705159705, "step": 790}, {"loss": 0.7143, "grad_norm": 0.8291640281677246, "learning_rate": 0.0002, "epoch": 0.9828009828009828, "step": 800}, {"loss": 0.6972, "grad_norm": 0.7415094375610352, "learning_rate": 0.0002, "epoch": 0.995085995085995, "step": 810}, {"eval_loss": 0.703994870185852, "eval_runtime": 20.2182, "eval_samples_per_second": 16.371, "eval_steps_per_second": 2.077, "epoch": 1.0, "step": 814}, {"loss": 0.6959, "grad_norm": 0.7405961751937866, "learning_rate": 0.0002, "epoch": 1.0073710073710074, "step": 820}, {"loss": 0.6706, "grad_norm": 0.8534344434738159, "learning_rate": 0.0002, "epoch": 1.0196560196560196, "step": 830}, {"loss": 0.6719, "grad_norm": 0.7415764331817627, "learning_rate": 0.0002, "epoch": 1.031941031941032, "step": 840}, {"loss": 0.6673, "grad_norm": 0.74293053150177, "learning_rate": 0.0002, "epoch": 1.0442260442260443, "step": 850}, {"loss": 0.6897, "grad_norm": 0.697727382183075, "learning_rate": 0.0002, "epoch": 1.0565110565110565, "step": 860}, {"loss": 0.6566, "grad_norm": 0.8022570013999939, "learning_rate": 0.0002, "epoch": 1.0687960687960687, "step": 870}, {"loss": 0.6759, "grad_norm": 0.7545800805091858, "learning_rate": 0.0002, "epoch": 1.0810810810810811, "step": 880}, {"loss": 0.6397, "grad_norm": 0.8005648255348206, "learning_rate": 0.0002, "epoch": 1.0933660933660934, "step": 890}, {"loss": 0.6499, "grad_norm": 0.7681778073310852, "learning_rate": 0.0002, "epoch": 1.1056511056511056, "step": 900}, {"loss": 0.6672, "grad_norm": 0.7822468876838684, "learning_rate": 0.0002, "epoch": 1.117936117936118, "step": 910}, {"loss": 0.6492, "grad_norm": 0.8324839472770691, "learning_rate": 0.0002, "epoch": 1.1302211302211302, "step": 920}, {"loss": 0.6659, "grad_norm": 0.8206289410591125, "learning_rate": 0.0002, "epoch": 1.1425061425061425, "step": 930}, {"loss": 0.6385, "grad_norm": 0.786461591720581, "learning_rate": 0.0002, "epoch": 1.154791154791155, "step": 940}, {"loss": 0.6493, "grad_norm": 0.8288539052009583, "learning_rate": 0.0002, "epoch": 1.1670761670761671, "step": 950}, {"loss": 0.6818, "grad_norm": 0.7566865682601929, "learning_rate": 0.0002, "epoch": 1.1793611793611793, "step": 960}, {"loss": 0.6597, "grad_norm": 0.7761894464492798, "learning_rate": 0.0002, "epoch": 1.1916461916461916, "step": 970}, {"loss": 0.6403, "grad_norm": 0.7608440518379211, "learning_rate": 0.0002, "epoch": 1.203931203931204, "step": 980}, {"loss": 0.7041, "grad_norm": 0.799745500087738, "learning_rate": 0.0002, "epoch": 1.2162162162162162, "step": 990}, {"loss": 0.6358, "grad_norm": 0.8135330677032471, "learning_rate": 0.0002, "epoch": 1.2285012285012284, "step": 1000}, {"loss": 0.6496, "grad_norm": 0.7410391569137573, "learning_rate": 0.0002, "epoch": 1.2407862407862407, "step": 1010}, {"loss": 0.63, "grad_norm": 0.7826172709465027, "learning_rate": 0.0002, "epoch": 1.253071253071253, "step": 1020}, {"loss": 0.6582, "grad_norm": 0.7210677862167358, "learning_rate": 0.0002, "epoch": 1.2653562653562653, "step": 1030}, {"loss": 0.6609, "grad_norm": 0.7571766972541809, "learning_rate": 0.0002, "epoch": 1.2776412776412776, "step": 1040}, {"loss": 0.6315, "grad_norm": 0.8602666258811951, "learning_rate": 0.0002, "epoch": 1.28992628992629, "step": 1050}, {"loss": 0.6825, "grad_norm": 0.8640648722648621, "learning_rate": 0.0002, "epoch": 1.3022113022113022, "step": 1060}, {"loss": 0.6563, "grad_norm": 0.7289374470710754, "learning_rate": 0.0002, "epoch": 1.3144963144963144, "step": 1070}, {"loss": 0.629, "grad_norm": 0.8099908828735352, "learning_rate": 0.0002, "epoch": 1.3267813267813269, "step": 1080}, {"loss": 0.6882, "grad_norm": 0.8623505234718323, "learning_rate": 0.0002, "epoch": 1.339066339066339, "step": 1090}, {"loss": 0.6368, "grad_norm": 0.900576114654541, "learning_rate": 0.0002, "epoch": 1.3513513513513513, "step": 1100}, {"loss": 0.6398, "grad_norm": 0.729603111743927, "learning_rate": 0.0002, "epoch": 1.3636363636363638, "step": 1110}, {"loss": 0.6619, "grad_norm": 0.8350434303283691, "learning_rate": 0.0002, "epoch": 1.375921375921376, "step": 1120}, {"loss": 0.6447, "grad_norm": 0.8049437999725342, "learning_rate": 0.0002, "epoch": 1.3882063882063882, "step": 1130}, {"loss": 0.6336, "grad_norm": 0.8222764134407043, "learning_rate": 0.0002, "epoch": 1.4004914004914004, "step": 1140}, {"loss": 0.6453, "grad_norm": 0.7949751019477844, "learning_rate": 0.0002, "epoch": 1.4127764127764126, "step": 1150}, {"loss": 0.6246, "grad_norm": 0.8375639915466309, "learning_rate": 0.0002, "epoch": 1.425061425061425, "step": 1160}, {"loss": 0.6358, "grad_norm": 0.7261053919792175, "learning_rate": 0.0002, "epoch": 1.4373464373464373, "step": 1170}, {"loss": 0.6709, "grad_norm": 0.6918320655822754, "learning_rate": 0.0002, "epoch": 1.4496314496314495, "step": 1180}, {"loss": 0.598, "grad_norm": 0.8148727416992188, "learning_rate": 0.0002, "epoch": 1.461916461916462, "step": 1190}, {"loss": 0.6269, "grad_norm": 0.7014724612236023, "learning_rate": 0.0002, "epoch": 1.4742014742014742, "step": 1200}, {"loss": 0.617, "grad_norm": 0.8110846281051636, "learning_rate": 0.0002, "epoch": 1.4864864864864864, "step": 1210}, {"loss": 0.6633, "grad_norm": 0.8336407542228699, "learning_rate": 0.0002, "epoch": 1.4987714987714988, "step": 1220}, {"loss": 0.6028, "grad_norm": 0.826996386051178, "learning_rate": 0.0002, "epoch": 1.511056511056511, "step": 1230}, {"loss": 0.6464, "grad_norm": 0.7503120303153992, "learning_rate": 0.0002, "epoch": 1.5233415233415233, "step": 1240}, {"loss": 0.6418, "grad_norm": 0.8297192454338074, "learning_rate": 0.0002, "epoch": 1.5356265356265357, "step": 1250}, {"loss": 0.6466, "grad_norm": 0.7585996985435486, "learning_rate": 0.0002, "epoch": 1.547911547911548, "step": 1260}, {"loss": 0.6196, "grad_norm": 0.7530493140220642, "learning_rate": 0.0002, "epoch": 1.5601965601965602, "step": 1270}, {"loss": 0.6252, "grad_norm": 0.8141939640045166, "learning_rate": 0.0002, "epoch": 1.5724815724815726, "step": 1280}, {"loss": 0.6441, "grad_norm": 0.6959931254386902, "learning_rate": 0.0002, "epoch": 1.5847665847665846, "step": 1290}, {"loss": 0.6542, "grad_norm": 0.8677428364753723, "learning_rate": 0.0002, "epoch": 1.597051597051597, "step": 1300}, {"loss": 0.633, "grad_norm": 0.8527476787567139, "learning_rate": 0.0002, "epoch": 1.6093366093366095, "step": 1310}, {"loss": 0.6393, "grad_norm": 0.8462157845497131, "learning_rate": 0.0002, "epoch": 1.6216216216216215, "step": 1320}, {"loss": 0.6265, "grad_norm": 0.9371153712272644, "learning_rate": 0.0002, "epoch": 1.633906633906634, "step": 1330}, {"loss": 0.5952, "grad_norm": 0.8408344984054565, "learning_rate": 0.0002, "epoch": 1.6461916461916462, "step": 1340}, {"loss": 0.599, "grad_norm": 0.8391859531402588, "learning_rate": 0.0002, "epoch": 1.6584766584766584, "step": 1350}, {"loss": 0.6313, "grad_norm": 0.7630598545074463, "learning_rate": 0.0002, "epoch": 1.6707616707616708, "step": 1360}, {"loss": 0.5989, "grad_norm": 0.8007895350456238, "learning_rate": 0.0002, "epoch": 1.683046683046683, "step": 1370}, {"loss": 0.6094, "grad_norm": 0.7547900080680847, "learning_rate": 0.0002, "epoch": 1.6953316953316953, "step": 1380}, {"loss": 0.6335, "grad_norm": 0.7779742479324341, "learning_rate": 0.0002, "epoch": 1.7076167076167077, "step": 1390}, {"loss": 0.6078, "grad_norm": 0.712293803691864, "learning_rate": 0.0002, "epoch": 1.71990171990172, "step": 1400}, {"loss": 0.608, "grad_norm": 0.8503297567367554, "learning_rate": 0.0002, "epoch": 1.7321867321867321, "step": 1410}, {"loss": 0.6055, "grad_norm": 0.8312245607376099, "learning_rate": 0.0002, "epoch": 1.7444717444717446, "step": 1420}, {"loss": 0.5978, "grad_norm": 0.7758049368858337, "learning_rate": 0.0002, "epoch": 1.7567567567567568, "step": 1430}, {"loss": 0.5822, "grad_norm": 0.8695956468582153, "learning_rate": 0.0002, "epoch": 1.769041769041769, "step": 1440}, {"loss": 0.5955, "grad_norm": 0.7785261273384094, "learning_rate": 0.0002, "epoch": 1.7813267813267815, "step": 1450}, {"loss": 0.6177, "grad_norm": 0.7091802358627319, "learning_rate": 0.0002, "epoch": 1.7936117936117935, "step": 1460}, {"loss": 0.5811, "grad_norm": 0.774146556854248, "learning_rate": 0.0002, "epoch": 1.805896805896806, "step": 1470}, {"loss": 0.5833, "grad_norm": 0.8342524170875549, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 1480}, {"loss": 0.634, "grad_norm": 0.8087738156318665, "learning_rate": 0.0002, "epoch": 1.8304668304668303, "step": 1490}, {"loss": 0.5961, "grad_norm": 0.9830479621887207, "learning_rate": 0.0002, "epoch": 1.8427518427518428, "step": 1500}, {"loss": 0.6211, "grad_norm": 0.8537567853927612, "learning_rate": 0.0002, "epoch": 1.855036855036855, "step": 1510}, {"loss": 0.5767, "grad_norm": 0.8004562854766846, "learning_rate": 0.0002, "epoch": 1.8673218673218672, "step": 1520}, {"loss": 0.604, "grad_norm": 0.8161284327507019, "learning_rate": 0.0002, "epoch": 1.8796068796068797, "step": 1530}, {"loss": 0.5808, "grad_norm": 0.8688093423843384, "learning_rate": 0.0002, "epoch": 1.8918918918918919, "step": 1540}, {"loss": 0.5663, "grad_norm": 0.8287379741668701, "learning_rate": 0.0002, "epoch": 1.904176904176904, "step": 1550}, {"loss": 0.5963, "grad_norm": 0.8050342202186584, "learning_rate": 0.0002, "epoch": 1.9164619164619165, "step": 1560}, {"loss": 0.5837, "grad_norm": 0.9273895621299744, "learning_rate": 0.0002, "epoch": 1.9287469287469288, "step": 1570}, {"loss": 0.5945, "grad_norm": 0.8416891694068909, "learning_rate": 0.0002, "epoch": 1.941031941031941, "step": 1580}, {"loss": 0.5838, "grad_norm": 0.7299820184707642, "learning_rate": 0.0002, "epoch": 1.9533169533169534, "step": 1590}, {"loss": 0.6025, "grad_norm": 0.7262272834777832, "learning_rate": 0.0002, "epoch": 1.9656019656019657, "step": 1600}, {"loss": 0.5873, "grad_norm": 0.8649004697799683, "learning_rate": 0.0002, "epoch": 1.9778869778869779, "step": 1610}, {"loss": 0.5764, "grad_norm": 0.8165444731712341, "learning_rate": 0.0002, "epoch": 1.9901719901719903, "step": 1620}, {"eval_loss": 0.5858802795410156, "eval_runtime": 22.6585, "eval_samples_per_second": 14.608, "eval_steps_per_second": 1.854, "epoch": 2.0, "step": 1628}, {"loss": 0.5803, "grad_norm": 0.8142582178115845, "learning_rate": 0.0002, "epoch": 2.0024570024570023, "step": 1630}, {"loss": 0.5499, "grad_norm": 1.0637224912643433, "learning_rate": 0.0002, "epoch": 2.0147420147420148, "step": 1640}, {"loss": 0.5556, "grad_norm": 0.8923280239105225, "learning_rate": 0.0002, "epoch": 2.027027027027027, "step": 1650}, {"loss": 0.5373, "grad_norm": 0.8169175386428833, "learning_rate": 0.0002, "epoch": 2.039312039312039, "step": 1660}, {"loss": 0.552, "grad_norm": 0.8124040365219116, "learning_rate": 0.0002, "epoch": 2.0515970515970516, "step": 1670}, {"loss": 0.5259, "grad_norm": 0.9228773713111877, "learning_rate": 0.0002, "epoch": 2.063882063882064, "step": 1680}, {"loss": 0.5571, "grad_norm": 0.7216871380805969, "learning_rate": 0.0002, "epoch": 2.076167076167076, "step": 1690}, {"loss": 0.523, "grad_norm": 0.8679503202438354, "learning_rate": 0.0002, "epoch": 2.0884520884520885, "step": 1700}, {"loss": 0.5379, "grad_norm": 0.8627730011940002, "learning_rate": 0.0002, "epoch": 2.100737100737101, "step": 1710}, {"loss": 0.551, "grad_norm": 0.9175152778625488, "learning_rate": 0.0002, "epoch": 2.113022113022113, "step": 1720}, {"loss": 0.5378, "grad_norm": 0.7930372953414917, "learning_rate": 0.0002, "epoch": 2.1253071253071254, "step": 1730}, {"loss": 0.5263, "grad_norm": 0.8370155692100525, "learning_rate": 0.0002, "epoch": 2.1375921375921374, "step": 1740}, {"loss": 0.5419, "grad_norm": 0.9121434688568115, "learning_rate": 0.0002, "epoch": 2.14987714987715, "step": 1750}, {"loss": 0.5499, "grad_norm": 0.8703579306602478, "learning_rate": 0.0002, "epoch": 2.1621621621621623, "step": 1760}, {"loss": 0.5333, "grad_norm": 0.9270512461662292, "learning_rate": 0.0002, "epoch": 2.1744471744471743, "step": 1770}, {"loss": 0.5165, "grad_norm": 0.9372949600219727, "learning_rate": 0.0002, "epoch": 2.1867321867321867, "step": 1780}, {"loss": 0.5327, "grad_norm": 0.8955178260803223, "learning_rate": 0.0002, "epoch": 2.199017199017199, "step": 1790}, {"loss": 0.5356, "grad_norm": 0.846102237701416, "learning_rate": 0.0002, "epoch": 2.211302211302211, "step": 1800}, {"loss": 0.5303, "grad_norm": 0.9186713099479675, "learning_rate": 0.0002, "epoch": 2.2235872235872236, "step": 1810}, {"loss": 0.5223, "grad_norm": 0.7695123553276062, "learning_rate": 0.0002, "epoch": 2.235872235872236, "step": 1820}, {"loss": 0.5161, "grad_norm": 0.7340332865715027, "learning_rate": 0.0002, "epoch": 2.248157248157248, "step": 1830}, {"loss": 0.5327, "grad_norm": 0.8933137655258179, "learning_rate": 0.0002, "epoch": 2.2604422604422605, "step": 1840}, {"loss": 0.5471, "grad_norm": 0.7705038189888, "learning_rate": 0.0002, "epoch": 2.2727272727272725, "step": 1850}, {"loss": 0.5346, "grad_norm": 0.8396083116531372, "learning_rate": 0.0002, "epoch": 2.285012285012285, "step": 1860}, {"loss": 0.5335, "grad_norm": 0.7695736289024353, "learning_rate": 0.0002, "epoch": 2.2972972972972974, "step": 1870}, {"loss": 0.5105, "grad_norm": 0.8535045385360718, "learning_rate": 0.0002, "epoch": 2.30958230958231, "step": 1880}, {"loss": 0.5202, "grad_norm": 0.8549142479896545, "learning_rate": 0.0002, "epoch": 2.321867321867322, "step": 1890}, {"loss": 0.5268, "grad_norm": 0.9124433994293213, "learning_rate": 0.0002, "epoch": 2.3341523341523343, "step": 1900}, {"loss": 0.506, "grad_norm": 0.855523943901062, "learning_rate": 0.0002, "epoch": 2.3464373464373462, "step": 1910}, {"loss": 0.5162, "grad_norm": 0.810878336429596, "learning_rate": 0.0002, "epoch": 2.3587223587223587, "step": 1920}, {"loss": 0.531, "grad_norm": 0.7409024834632874, "learning_rate": 0.0002, "epoch": 2.371007371007371, "step": 1930}, {"loss": 0.5045, "grad_norm": 0.8080927729606628, "learning_rate": 0.0002, "epoch": 2.383292383292383, "step": 1940}, {"loss": 0.5032, "grad_norm": 0.9661469459533691, "learning_rate": 0.0002, "epoch": 2.3955773955773956, "step": 1950}, {"loss": 0.5019, "grad_norm": 0.838766872882843, "learning_rate": 0.0002, "epoch": 2.407862407862408, "step": 1960}, {"loss": 0.5128, "grad_norm": 0.8737491965293884, "learning_rate": 0.0002, "epoch": 2.42014742014742, "step": 1970}, {"loss": 0.5153, "grad_norm": 0.8657792210578918, "learning_rate": 0.0002, "epoch": 2.4324324324324325, "step": 1980}, {"loss": 0.5665, "grad_norm": 0.8883858919143677, "learning_rate": 0.0002, "epoch": 2.444717444717445, "step": 1990}, {"loss": 0.5283, "grad_norm": 0.8647662997245789, "learning_rate": 0.0002, "epoch": 2.457002457002457, "step": 2000}, {"loss": 0.518, "grad_norm": 0.896037757396698, "learning_rate": 0.0002, "epoch": 2.4692874692874693, "step": 2010}, {"loss": 0.5245, "grad_norm": 0.8079167008399963, "learning_rate": 0.0002, "epoch": 2.4815724815724813, "step": 2020}, {"loss": 0.5311, "grad_norm": 1.0293292999267578, "learning_rate": 0.0002, "epoch": 2.493857493857494, "step": 2030}, {"loss": 0.5091, "grad_norm": 0.8459244966506958, "learning_rate": 0.0002, "epoch": 2.506142506142506, "step": 2040}, {"loss": 0.4922, "grad_norm": 0.9244982600212097, "learning_rate": 0.0002, "epoch": 2.5184275184275187, "step": 2050}, {"loss": 0.5006, "grad_norm": 0.8245007991790771, "learning_rate": 0.0002, "epoch": 2.5307125307125307, "step": 2060}, {"loss": 0.5229, "grad_norm": 0.8869297504425049, "learning_rate": 0.0002, "epoch": 2.542997542997543, "step": 2070}, {"loss": 0.5097, "grad_norm": 0.8620884418487549, "learning_rate": 0.0002, "epoch": 2.555282555282555, "step": 2080}, {"loss": 0.5239, "grad_norm": 0.8387904167175293, "learning_rate": 0.0002, "epoch": 2.5675675675675675, "step": 2090}, {"loss": 0.4974, "grad_norm": 0.8353935480117798, "learning_rate": 0.0002, "epoch": 2.57985257985258, "step": 2100}, {"loss": 0.5038, "grad_norm": 1.0136934518814087, "learning_rate": 0.0002, "epoch": 2.592137592137592, "step": 2110}, {"loss": 0.513, "grad_norm": 0.9387392997741699, "learning_rate": 0.0002, "epoch": 2.6044226044226044, "step": 2120}, {"loss": 0.4971, "grad_norm": 0.898697555065155, "learning_rate": 0.0002, "epoch": 2.616707616707617, "step": 2130}, {"loss": 0.4981, "grad_norm": 1.0145231485366821, "learning_rate": 0.0002, "epoch": 2.628992628992629, "step": 2140}, {"loss": 0.5151, "grad_norm": 0.8335273265838623, "learning_rate": 0.0002, "epoch": 2.6412776412776413, "step": 2150}, {"loss": 0.5129, "grad_norm": 1.0198529958724976, "learning_rate": 0.0002, "epoch": 2.6535626535626538, "step": 2160}, {"loss": 0.5156, "grad_norm": 0.8353323340415955, "learning_rate": 0.0002, "epoch": 2.6658476658476657, "step": 2170}, {"loss": 0.4818, "grad_norm": 0.8831406831741333, "learning_rate": 0.0002, "epoch": 2.678132678132678, "step": 2180}, {"loss": 0.4858, "grad_norm": 0.7182748913764954, "learning_rate": 0.0002, "epoch": 2.69041769041769, "step": 2190}, {"loss": 0.53, "grad_norm": 0.7892552614212036, "learning_rate": 0.0002, "epoch": 2.7027027027027026, "step": 2200}, {"loss": 0.5101, "grad_norm": 1.0144033432006836, "learning_rate": 0.0002, "epoch": 2.714987714987715, "step": 2210}, {"loss": 0.4909, "grad_norm": 1.0913645029067993, "learning_rate": 0.0002, "epoch": 2.7272727272727275, "step": 2220}, {"loss": 0.5069, "grad_norm": 1.014394998550415, "learning_rate": 0.0002, "epoch": 2.7395577395577395, "step": 2230}, {"loss": 0.4985, "grad_norm": 0.8118020296096802, "learning_rate": 0.0002, "epoch": 2.751842751842752, "step": 2240}, {"loss": 0.5088, "grad_norm": 0.9027737379074097, "learning_rate": 0.0002, "epoch": 2.764127764127764, "step": 2250}, {"loss": 0.5027, "grad_norm": 0.8017747402191162, "learning_rate": 0.0002, "epoch": 2.7764127764127764, "step": 2260}, {"loss": 0.4957, "grad_norm": 0.788362979888916, "learning_rate": 0.0002, "epoch": 2.788697788697789, "step": 2270}, {"loss": 0.5047, "grad_norm": 0.8338918089866638, "learning_rate": 0.0002, "epoch": 2.800982800982801, "step": 2280}, {"loss": 0.4925, "grad_norm": 0.8773167729377747, "learning_rate": 0.0002, "epoch": 2.8132678132678133, "step": 2290}, {"loss": 0.4806, "grad_norm": 0.9319674372673035, "learning_rate": 0.0002, "epoch": 2.8255528255528253, "step": 2300}, {"loss": 0.4815, "grad_norm": 0.8632726073265076, "learning_rate": 0.0002, "epoch": 2.8378378378378377, "step": 2310}, {"loss": 0.4842, "grad_norm": 0.785464882850647, "learning_rate": 0.0002, "epoch": 2.85012285012285, "step": 2320}, {"loss": 0.4867, "grad_norm": 0.8159732818603516, "learning_rate": 0.0002, "epoch": 2.8624078624078626, "step": 2330}, {"loss": 0.4796, "grad_norm": 0.8702368140220642, "learning_rate": 0.0002, "epoch": 2.8746928746928746, "step": 2340}, {"loss": 0.474, "grad_norm": 1.0456738471984863, "learning_rate": 0.0002, "epoch": 2.886977886977887, "step": 2350}, {"loss": 0.4934, "grad_norm": 1.0855203866958618, "learning_rate": 0.0002, "epoch": 2.899262899262899, "step": 2360}, {"loss": 0.4758, "grad_norm": 0.9378156065940857, "learning_rate": 0.0002, "epoch": 2.9115479115479115, "step": 2370}, {"loss": 0.4831, "grad_norm": 0.7390182018280029, "learning_rate": 0.0002, "epoch": 2.923832923832924, "step": 2380}, {"loss": 0.5066, "grad_norm": 0.7667133212089539, "learning_rate": 0.0002, "epoch": 2.9361179361179364, "step": 2390}, {"loss": 0.4722, "grad_norm": 0.8633476495742798, "learning_rate": 0.0002, "epoch": 2.9484029484029484, "step": 2400}, {"loss": 0.4993, "grad_norm": 1.0821104049682617, "learning_rate": 0.0002, "epoch": 2.960687960687961, "step": 2410}, {"loss": 0.4882, "grad_norm": 0.8911418914794922, "learning_rate": 0.0002, "epoch": 2.972972972972973, "step": 2420}, {"loss": 0.4819, "grad_norm": 0.8791135549545288, "learning_rate": 0.0002, "epoch": 2.9852579852579852, "step": 2430}, {"loss": 0.4875, "grad_norm": 0.8066530823707581, "learning_rate": 0.0002, "epoch": 2.9975429975429977, "step": 2440}]} +{"epoch": 4.0, "step": 3256, "epoch_duration": 710.208240032196, "total_accumulated_duration": 2932.916581630707, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.17431640625}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-2442", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.5354, "grad_norm": 0.8178550004959106, "learning_rate": 0.0002, "epoch": 0.012285012285012284, "step": 10}, {"loss": 2.534, "grad_norm": 1.0338047742843628, "learning_rate": 0.0002, "epoch": 0.02457002457002457, "step": 20}, {"loss": 2.1691, "grad_norm": 0.8931729197502136, "learning_rate": 0.0002, "epoch": 0.036855036855036855, "step": 30}, {"loss": 1.8813, "grad_norm": 0.9666458964347839, "learning_rate": 0.0002, "epoch": 0.04914004914004914, "step": 40}, {"loss": 1.6479, "grad_norm": 1.2691702842712402, "learning_rate": 0.0002, "epoch": 0.06142506142506143, "step": 50}, {"loss": 1.3831, "grad_norm": 1.0307111740112305, "learning_rate": 0.0002, "epoch": 0.07371007371007371, "step": 60}, {"loss": 1.2987, "grad_norm": 1.1837389469146729, "learning_rate": 0.0002, "epoch": 0.085995085995086, "step": 70}, {"loss": 1.2325, "grad_norm": 1.1481467485427856, "learning_rate": 0.0002, "epoch": 0.09828009828009827, "step": 80}, {"loss": 1.1425, "grad_norm": 1.0385297536849976, "learning_rate": 0.0002, "epoch": 0.11056511056511056, "step": 90}, {"loss": 1.1177, "grad_norm": 1.125789999961853, "learning_rate": 0.0002, "epoch": 0.12285012285012285, "step": 100}, {"loss": 1.0477, "grad_norm": 0.9630613923072815, "learning_rate": 0.0002, "epoch": 0.13513513513513514, "step": 110}, {"loss": 1.0074, "grad_norm": 1.060392141342163, "learning_rate": 0.0002, "epoch": 0.14742014742014742, "step": 120}, {"loss": 1.0128, "grad_norm": 1.0986546277999878, "learning_rate": 0.0002, "epoch": 0.1597051597051597, "step": 130}, {"loss": 1.0068, "grad_norm": 1.1713459491729736, "learning_rate": 0.0002, "epoch": 0.171990171990172, "step": 140}, {"loss": 0.973, "grad_norm": 1.1548224687576294, "learning_rate": 0.0002, "epoch": 0.18427518427518427, "step": 150}, {"loss": 0.941, "grad_norm": 1.2662502527236938, "learning_rate": 0.0002, "epoch": 0.19656019656019655, "step": 160}, {"loss": 0.8849, "grad_norm": 1.1521110534667969, "learning_rate": 0.0002, "epoch": 0.20884520884520885, "step": 170}, {"loss": 0.8931, "grad_norm": 1.1044857501983643, "learning_rate": 0.0002, "epoch": 0.22113022113022113, "step": 180}, {"loss": 0.9572, "grad_norm": 0.9770650267601013, "learning_rate": 0.0002, "epoch": 0.2334152334152334, "step": 190}, {"loss": 0.881, "grad_norm": 0.9710931777954102, "learning_rate": 0.0002, "epoch": 0.2457002457002457, "step": 200}, {"loss": 0.9205, "grad_norm": 0.9593933820724487, "learning_rate": 0.0002, "epoch": 0.257985257985258, "step": 210}, {"loss": 0.843, "grad_norm": 1.003553032875061, "learning_rate": 0.0002, "epoch": 0.2702702702702703, "step": 220}, {"loss": 0.9032, "grad_norm": 0.9187764525413513, "learning_rate": 0.0002, "epoch": 0.28255528255528256, "step": 230}, {"loss": 0.8572, "grad_norm": 0.9294946789741516, "learning_rate": 0.0002, "epoch": 0.29484029484029484, "step": 240}, {"loss": 0.8856, "grad_norm": 0.9537560939788818, "learning_rate": 0.0002, "epoch": 0.3071253071253071, "step": 250}, {"loss": 0.8546, "grad_norm": 1.00537109375, "learning_rate": 0.0002, "epoch": 0.3194103194103194, "step": 260}, {"loss": 0.896, "grad_norm": 0.8775776028633118, "learning_rate": 0.0002, "epoch": 0.3316953316953317, "step": 270}, {"loss": 0.808, "grad_norm": 0.8316839933395386, "learning_rate": 0.0002, "epoch": 0.343980343980344, "step": 280}, {"loss": 0.8248, "grad_norm": 0.8542073965072632, "learning_rate": 0.0002, "epoch": 0.35626535626535627, "step": 290}, {"loss": 0.8452, "grad_norm": 0.848444402217865, "learning_rate": 0.0002, "epoch": 0.36855036855036855, "step": 300}, {"loss": 0.8253, "grad_norm": 0.9017520546913147, "learning_rate": 0.0002, "epoch": 0.3808353808353808, "step": 310}, {"loss": 0.8098, "grad_norm": 0.7672467231750488, "learning_rate": 0.0002, "epoch": 0.3931203931203931, "step": 320}, {"loss": 0.8478, "grad_norm": 0.9109916687011719, "learning_rate": 0.0002, "epoch": 0.40540540540540543, "step": 330}, {"loss": 0.8041, "grad_norm": 0.8750321269035339, "learning_rate": 0.0002, "epoch": 0.4176904176904177, "step": 340}, {"loss": 0.8158, "grad_norm": 0.7911098599433899, "learning_rate": 0.0002, "epoch": 0.42997542997543, "step": 350}, {"loss": 0.8001, "grad_norm": 0.871601402759552, "learning_rate": 0.0002, "epoch": 0.44226044226044225, "step": 360}, {"loss": 0.8187, "grad_norm": 0.9393917918205261, "learning_rate": 0.0002, "epoch": 0.45454545454545453, "step": 370}, {"loss": 0.8124, "grad_norm": 0.8260403275489807, "learning_rate": 0.0002, "epoch": 0.4668304668304668, "step": 380}, {"loss": 0.7768, "grad_norm": 0.9792159199714661, "learning_rate": 0.0002, "epoch": 0.47911547911547914, "step": 390}, {"loss": 0.7981, "grad_norm": 0.9943315982818604, "learning_rate": 0.0002, "epoch": 0.4914004914004914, "step": 400}, {"loss": 0.7765, "grad_norm": 0.8999950885772705, "learning_rate": 0.0002, "epoch": 0.5036855036855037, "step": 410}, {"loss": 0.7807, "grad_norm": 0.8348393440246582, "learning_rate": 0.0002, "epoch": 0.515970515970516, "step": 420}, {"loss": 0.8269, "grad_norm": 0.7371744513511658, "learning_rate": 0.0002, "epoch": 0.5282555282555282, "step": 430}, {"loss": 0.8181, "grad_norm": 0.8354107141494751, "learning_rate": 0.0002, "epoch": 0.5405405405405406, "step": 440}, {"loss": 0.7849, "grad_norm": 0.8553793430328369, "learning_rate": 0.0002, "epoch": 0.5528255528255528, "step": 450}, {"loss": 0.8098, "grad_norm": 1.0762015581130981, "learning_rate": 0.0002, "epoch": 0.5651105651105651, "step": 460}, {"loss": 0.7942, "grad_norm": 0.8350747227668762, "learning_rate": 0.0002, "epoch": 0.5773955773955773, "step": 470}, {"loss": 0.7922, "grad_norm": 0.7819945216178894, "learning_rate": 0.0002, "epoch": 0.5896805896805897, "step": 480}, {"loss": 0.7845, "grad_norm": 0.8079741597175598, "learning_rate": 0.0002, "epoch": 0.601965601965602, "step": 490}, {"loss": 0.7417, "grad_norm": 0.776435911655426, "learning_rate": 0.0002, "epoch": 0.6142506142506142, "step": 500}, {"loss": 0.7855, "grad_norm": 0.7646855115890503, "learning_rate": 0.0002, "epoch": 0.6265356265356266, "step": 510}, {"loss": 0.7923, "grad_norm": 0.786396861076355, "learning_rate": 0.0002, "epoch": 0.6388206388206388, "step": 520}, {"loss": 0.7624, "grad_norm": 0.7016594409942627, "learning_rate": 0.0002, "epoch": 0.6511056511056511, "step": 530}, {"loss": 0.786, "grad_norm": 0.8060444593429565, "learning_rate": 0.0002, "epoch": 0.6633906633906634, "step": 540}, {"loss": 0.7417, "grad_norm": 0.9087467789649963, "learning_rate": 0.0002, "epoch": 0.6756756756756757, "step": 550}, {"loss": 0.7591, "grad_norm": 0.8149628639221191, "learning_rate": 0.0002, "epoch": 0.687960687960688, "step": 560}, {"loss": 0.8004, "grad_norm": 0.7493641972541809, "learning_rate": 0.0002, "epoch": 0.7002457002457002, "step": 570}, {"loss": 0.765, "grad_norm": 0.7958765625953674, "learning_rate": 0.0002, "epoch": 0.7125307125307125, "step": 580}, {"loss": 0.7276, "grad_norm": 0.7917273640632629, "learning_rate": 0.0002, "epoch": 0.7248157248157249, "step": 590}, {"loss": 0.758, "grad_norm": 0.8040468692779541, "learning_rate": 0.0002, "epoch": 0.7371007371007371, "step": 600}, {"loss": 0.735, "grad_norm": 0.8696851134300232, "learning_rate": 0.0002, "epoch": 0.7493857493857494, "step": 610}, {"loss": 0.7321, "grad_norm": 0.8418059945106506, "learning_rate": 0.0002, "epoch": 0.7616707616707616, "step": 620}, {"loss": 0.7395, "grad_norm": 0.7754243612289429, "learning_rate": 0.0002, "epoch": 0.773955773955774, "step": 630}, {"loss": 0.7679, "grad_norm": 0.7639613747596741, "learning_rate": 0.0002, "epoch": 0.7862407862407862, "step": 640}, {"loss": 0.7159, "grad_norm": 0.7516646385192871, "learning_rate": 0.0002, "epoch": 0.7985257985257985, "step": 650}, {"loss": 0.7349, "grad_norm": 0.7840844988822937, "learning_rate": 0.0002, "epoch": 0.8108108108108109, "step": 660}, {"loss": 0.7264, "grad_norm": 0.7657070755958557, "learning_rate": 0.0002, "epoch": 0.8230958230958231, "step": 670}, {"loss": 0.7369, "grad_norm": 0.7711591720581055, "learning_rate": 0.0002, "epoch": 0.8353808353808354, "step": 680}, {"loss": 0.759, "grad_norm": 0.8026325106620789, "learning_rate": 0.0002, "epoch": 0.8476658476658476, "step": 690}, {"loss": 0.737, "grad_norm": 0.7902713418006897, "learning_rate": 0.0002, "epoch": 0.85995085995086, "step": 700}, {"loss": 0.7349, "grad_norm": 0.8212456107139587, "learning_rate": 0.0002, "epoch": 0.8722358722358723, "step": 710}, {"loss": 0.7661, "grad_norm": 0.7867200970649719, "learning_rate": 0.0002, "epoch": 0.8845208845208845, "step": 720}, {"loss": 0.7195, "grad_norm": 0.80084627866745, "learning_rate": 0.0002, "epoch": 0.8968058968058968, "step": 730}, {"loss": 0.7641, "grad_norm": 0.7203794121742249, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 740}, {"loss": 0.7134, "grad_norm": 0.7598419785499573, "learning_rate": 0.0002, "epoch": 0.9213759213759214, "step": 750}, {"loss": 0.7208, "grad_norm": 0.7787027359008789, "learning_rate": 0.0002, "epoch": 0.9336609336609336, "step": 760}, {"loss": 0.7119, "grad_norm": 0.8444012403488159, "learning_rate": 0.0002, "epoch": 0.9459459459459459, "step": 770}, {"loss": 0.7099, "grad_norm": 0.7388550639152527, "learning_rate": 0.0002, "epoch": 0.9582309582309583, "step": 780}, {"loss": 0.7184, "grad_norm": 0.7379167079925537, "learning_rate": 0.0002, "epoch": 0.9705159705159705, "step": 790}, {"loss": 0.7143, "grad_norm": 0.8291640281677246, "learning_rate": 0.0002, "epoch": 0.9828009828009828, "step": 800}, {"loss": 0.6972, "grad_norm": 0.7415094375610352, "learning_rate": 0.0002, "epoch": 0.995085995085995, "step": 810}, {"eval_loss": 0.703994870185852, "eval_runtime": 20.2182, "eval_samples_per_second": 16.371, "eval_steps_per_second": 2.077, "epoch": 1.0, "step": 814}, {"loss": 0.6959, "grad_norm": 0.7405961751937866, "learning_rate": 0.0002, "epoch": 1.0073710073710074, "step": 820}, {"loss": 0.6706, "grad_norm": 0.8534344434738159, "learning_rate": 0.0002, "epoch": 1.0196560196560196, "step": 830}, {"loss": 0.6719, "grad_norm": 0.7415764331817627, "learning_rate": 0.0002, "epoch": 1.031941031941032, "step": 840}, {"loss": 0.6673, "grad_norm": 0.74293053150177, "learning_rate": 0.0002, "epoch": 1.0442260442260443, "step": 850}, {"loss": 0.6897, "grad_norm": 0.697727382183075, "learning_rate": 0.0002, "epoch": 1.0565110565110565, "step": 860}, {"loss": 0.6566, "grad_norm": 0.8022570013999939, "learning_rate": 0.0002, "epoch": 1.0687960687960687, "step": 870}, {"loss": 0.6759, "grad_norm": 0.7545800805091858, "learning_rate": 0.0002, "epoch": 1.0810810810810811, "step": 880}, {"loss": 0.6397, "grad_norm": 0.8005648255348206, "learning_rate": 0.0002, "epoch": 1.0933660933660934, "step": 890}, {"loss": 0.6499, "grad_norm": 0.7681778073310852, "learning_rate": 0.0002, "epoch": 1.1056511056511056, "step": 900}, {"loss": 0.6672, "grad_norm": 0.7822468876838684, "learning_rate": 0.0002, "epoch": 1.117936117936118, "step": 910}, {"loss": 0.6492, "grad_norm": 0.8324839472770691, "learning_rate": 0.0002, "epoch": 1.1302211302211302, "step": 920}, {"loss": 0.6659, "grad_norm": 0.8206289410591125, "learning_rate": 0.0002, "epoch": 1.1425061425061425, "step": 930}, {"loss": 0.6385, "grad_norm": 0.786461591720581, "learning_rate": 0.0002, "epoch": 1.154791154791155, "step": 940}, {"loss": 0.6493, "grad_norm": 0.8288539052009583, "learning_rate": 0.0002, "epoch": 1.1670761670761671, "step": 950}, {"loss": 0.6818, "grad_norm": 0.7566865682601929, "learning_rate": 0.0002, "epoch": 1.1793611793611793, "step": 960}, {"loss": 0.6597, "grad_norm": 0.7761894464492798, "learning_rate": 0.0002, "epoch": 1.1916461916461916, "step": 970}, {"loss": 0.6403, "grad_norm": 0.7608440518379211, "learning_rate": 0.0002, "epoch": 1.203931203931204, "step": 980}, {"loss": 0.7041, "grad_norm": 0.799745500087738, "learning_rate": 0.0002, "epoch": 1.2162162162162162, "step": 990}, {"loss": 0.6358, "grad_norm": 0.8135330677032471, "learning_rate": 0.0002, "epoch": 1.2285012285012284, "step": 1000}, {"loss": 0.6496, "grad_norm": 0.7410391569137573, "learning_rate": 0.0002, "epoch": 1.2407862407862407, "step": 1010}, {"loss": 0.63, "grad_norm": 0.7826172709465027, "learning_rate": 0.0002, "epoch": 1.253071253071253, "step": 1020}, {"loss": 0.6582, "grad_norm": 0.7210677862167358, "learning_rate": 0.0002, "epoch": 1.2653562653562653, "step": 1030}, {"loss": 0.6609, "grad_norm": 0.7571766972541809, "learning_rate": 0.0002, "epoch": 1.2776412776412776, "step": 1040}, {"loss": 0.6315, "grad_norm": 0.8602666258811951, "learning_rate": 0.0002, "epoch": 1.28992628992629, "step": 1050}, {"loss": 0.6825, "grad_norm": 0.8640648722648621, "learning_rate": 0.0002, "epoch": 1.3022113022113022, "step": 1060}, {"loss": 0.6563, "grad_norm": 0.7289374470710754, "learning_rate": 0.0002, "epoch": 1.3144963144963144, "step": 1070}, {"loss": 0.629, "grad_norm": 0.8099908828735352, "learning_rate": 0.0002, "epoch": 1.3267813267813269, "step": 1080}, {"loss": 0.6882, "grad_norm": 0.8623505234718323, "learning_rate": 0.0002, "epoch": 1.339066339066339, "step": 1090}, {"loss": 0.6368, "grad_norm": 0.900576114654541, "learning_rate": 0.0002, "epoch": 1.3513513513513513, "step": 1100}, {"loss": 0.6398, "grad_norm": 0.729603111743927, "learning_rate": 0.0002, "epoch": 1.3636363636363638, "step": 1110}, {"loss": 0.6619, "grad_norm": 0.8350434303283691, "learning_rate": 0.0002, "epoch": 1.375921375921376, "step": 1120}, {"loss": 0.6447, "grad_norm": 0.8049437999725342, "learning_rate": 0.0002, "epoch": 1.3882063882063882, "step": 1130}, {"loss": 0.6336, "grad_norm": 0.8222764134407043, "learning_rate": 0.0002, "epoch": 1.4004914004914004, "step": 1140}, {"loss": 0.6453, "grad_norm": 0.7949751019477844, "learning_rate": 0.0002, "epoch": 1.4127764127764126, "step": 1150}, {"loss": 0.6246, "grad_norm": 0.8375639915466309, "learning_rate": 0.0002, "epoch": 1.425061425061425, "step": 1160}, {"loss": 0.6358, "grad_norm": 0.7261053919792175, "learning_rate": 0.0002, "epoch": 1.4373464373464373, "step": 1170}, {"loss": 0.6709, "grad_norm": 0.6918320655822754, "learning_rate": 0.0002, "epoch": 1.4496314496314495, "step": 1180}, {"loss": 0.598, "grad_norm": 0.8148727416992188, "learning_rate": 0.0002, "epoch": 1.461916461916462, "step": 1190}, {"loss": 0.6269, "grad_norm": 0.7014724612236023, "learning_rate": 0.0002, "epoch": 1.4742014742014742, "step": 1200}, {"loss": 0.617, "grad_norm": 0.8110846281051636, "learning_rate": 0.0002, "epoch": 1.4864864864864864, "step": 1210}, {"loss": 0.6633, "grad_norm": 0.8336407542228699, "learning_rate": 0.0002, "epoch": 1.4987714987714988, "step": 1220}, {"loss": 0.6028, "grad_norm": 0.826996386051178, "learning_rate": 0.0002, "epoch": 1.511056511056511, "step": 1230}, {"loss": 0.6464, "grad_norm": 0.7503120303153992, "learning_rate": 0.0002, "epoch": 1.5233415233415233, "step": 1240}, {"loss": 0.6418, "grad_norm": 0.8297192454338074, "learning_rate": 0.0002, "epoch": 1.5356265356265357, "step": 1250}, {"loss": 0.6466, "grad_norm": 0.7585996985435486, "learning_rate": 0.0002, "epoch": 1.547911547911548, "step": 1260}, {"loss": 0.6196, "grad_norm": 0.7530493140220642, "learning_rate": 0.0002, "epoch": 1.5601965601965602, "step": 1270}, {"loss": 0.6252, "grad_norm": 0.8141939640045166, "learning_rate": 0.0002, "epoch": 1.5724815724815726, "step": 1280}, {"loss": 0.6441, "grad_norm": 0.6959931254386902, "learning_rate": 0.0002, "epoch": 1.5847665847665846, "step": 1290}, {"loss": 0.6542, "grad_norm": 0.8677428364753723, "learning_rate": 0.0002, "epoch": 1.597051597051597, "step": 1300}, {"loss": 0.633, "grad_norm": 0.8527476787567139, "learning_rate": 0.0002, "epoch": 1.6093366093366095, "step": 1310}, {"loss": 0.6393, "grad_norm": 0.8462157845497131, "learning_rate": 0.0002, "epoch": 1.6216216216216215, "step": 1320}, {"loss": 0.6265, "grad_norm": 0.9371153712272644, "learning_rate": 0.0002, "epoch": 1.633906633906634, "step": 1330}, {"loss": 0.5952, "grad_norm": 0.8408344984054565, "learning_rate": 0.0002, "epoch": 1.6461916461916462, "step": 1340}, {"loss": 0.599, "grad_norm": 0.8391859531402588, "learning_rate": 0.0002, "epoch": 1.6584766584766584, "step": 1350}, {"loss": 0.6313, "grad_norm": 0.7630598545074463, "learning_rate": 0.0002, "epoch": 1.6707616707616708, "step": 1360}, {"loss": 0.5989, "grad_norm": 0.8007895350456238, "learning_rate": 0.0002, "epoch": 1.683046683046683, "step": 1370}, {"loss": 0.6094, "grad_norm": 0.7547900080680847, "learning_rate": 0.0002, "epoch": 1.6953316953316953, "step": 1380}, {"loss": 0.6335, "grad_norm": 0.7779742479324341, "learning_rate": 0.0002, "epoch": 1.7076167076167077, "step": 1390}, {"loss": 0.6078, "grad_norm": 0.712293803691864, "learning_rate": 0.0002, "epoch": 1.71990171990172, "step": 1400}, {"loss": 0.608, "grad_norm": 0.8503297567367554, "learning_rate": 0.0002, "epoch": 1.7321867321867321, "step": 1410}, {"loss": 0.6055, "grad_norm": 0.8312245607376099, "learning_rate": 0.0002, "epoch": 1.7444717444717446, "step": 1420}, {"loss": 0.5978, "grad_norm": 0.7758049368858337, "learning_rate": 0.0002, "epoch": 1.7567567567567568, "step": 1430}, {"loss": 0.5822, "grad_norm": 0.8695956468582153, "learning_rate": 0.0002, "epoch": 1.769041769041769, "step": 1440}, {"loss": 0.5955, "grad_norm": 0.7785261273384094, "learning_rate": 0.0002, "epoch": 1.7813267813267815, "step": 1450}, {"loss": 0.6177, "grad_norm": 0.7091802358627319, "learning_rate": 0.0002, "epoch": 1.7936117936117935, "step": 1460}, {"loss": 0.5811, "grad_norm": 0.774146556854248, "learning_rate": 0.0002, "epoch": 1.805896805896806, "step": 1470}, {"loss": 0.5833, "grad_norm": 0.8342524170875549, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 1480}, {"loss": 0.634, "grad_norm": 0.8087738156318665, "learning_rate": 0.0002, "epoch": 1.8304668304668303, "step": 1490}, {"loss": 0.5961, "grad_norm": 0.9830479621887207, "learning_rate": 0.0002, "epoch": 1.8427518427518428, "step": 1500}, {"loss": 0.6211, "grad_norm": 0.8537567853927612, "learning_rate": 0.0002, "epoch": 1.855036855036855, "step": 1510}, {"loss": 0.5767, "grad_norm": 0.8004562854766846, "learning_rate": 0.0002, "epoch": 1.8673218673218672, "step": 1520}, {"loss": 0.604, "grad_norm": 0.8161284327507019, "learning_rate": 0.0002, "epoch": 1.8796068796068797, "step": 1530}, {"loss": 0.5808, "grad_norm": 0.8688093423843384, "learning_rate": 0.0002, "epoch": 1.8918918918918919, "step": 1540}, {"loss": 0.5663, "grad_norm": 0.8287379741668701, "learning_rate": 0.0002, "epoch": 1.904176904176904, "step": 1550}, {"loss": 0.5963, "grad_norm": 0.8050342202186584, "learning_rate": 0.0002, "epoch": 1.9164619164619165, "step": 1560}, {"loss": 0.5837, "grad_norm": 0.9273895621299744, "learning_rate": 0.0002, "epoch": 1.9287469287469288, "step": 1570}, {"loss": 0.5945, "grad_norm": 0.8416891694068909, "learning_rate": 0.0002, "epoch": 1.941031941031941, "step": 1580}, {"loss": 0.5838, "grad_norm": 0.7299820184707642, "learning_rate": 0.0002, "epoch": 1.9533169533169534, "step": 1590}, {"loss": 0.6025, "grad_norm": 0.7262272834777832, "learning_rate": 0.0002, "epoch": 1.9656019656019657, "step": 1600}, {"loss": 0.5873, "grad_norm": 0.8649004697799683, "learning_rate": 0.0002, "epoch": 1.9778869778869779, "step": 1610}, {"loss": 0.5764, "grad_norm": 0.8165444731712341, "learning_rate": 0.0002, "epoch": 1.9901719901719903, "step": 1620}, {"eval_loss": 0.5858802795410156, "eval_runtime": 22.6585, "eval_samples_per_second": 14.608, "eval_steps_per_second": 1.854, "epoch": 2.0, "step": 1628}, {"loss": 0.5803, "grad_norm": 0.8142582178115845, "learning_rate": 0.0002, "epoch": 2.0024570024570023, "step": 1630}, {"loss": 0.5499, "grad_norm": 1.0637224912643433, "learning_rate": 0.0002, "epoch": 2.0147420147420148, "step": 1640}, {"loss": 0.5556, "grad_norm": 0.8923280239105225, "learning_rate": 0.0002, "epoch": 2.027027027027027, "step": 1650}, {"loss": 0.5373, "grad_norm": 0.8169175386428833, "learning_rate": 0.0002, "epoch": 2.039312039312039, "step": 1660}, {"loss": 0.552, "grad_norm": 0.8124040365219116, "learning_rate": 0.0002, "epoch": 2.0515970515970516, "step": 1670}, {"loss": 0.5259, "grad_norm": 0.9228773713111877, "learning_rate": 0.0002, "epoch": 2.063882063882064, "step": 1680}, {"loss": 0.5571, "grad_norm": 0.7216871380805969, "learning_rate": 0.0002, "epoch": 2.076167076167076, "step": 1690}, {"loss": 0.523, "grad_norm": 0.8679503202438354, "learning_rate": 0.0002, "epoch": 2.0884520884520885, "step": 1700}, {"loss": 0.5379, "grad_norm": 0.8627730011940002, "learning_rate": 0.0002, "epoch": 2.100737100737101, "step": 1710}, {"loss": 0.551, "grad_norm": 0.9175152778625488, "learning_rate": 0.0002, "epoch": 2.113022113022113, "step": 1720}, {"loss": 0.5378, "grad_norm": 0.7930372953414917, "learning_rate": 0.0002, "epoch": 2.1253071253071254, "step": 1730}, {"loss": 0.5263, "grad_norm": 0.8370155692100525, "learning_rate": 0.0002, "epoch": 2.1375921375921374, "step": 1740}, {"loss": 0.5419, "grad_norm": 0.9121434688568115, "learning_rate": 0.0002, "epoch": 2.14987714987715, "step": 1750}, {"loss": 0.5499, "grad_norm": 0.8703579306602478, "learning_rate": 0.0002, "epoch": 2.1621621621621623, "step": 1760}, {"loss": 0.5333, "grad_norm": 0.9270512461662292, "learning_rate": 0.0002, "epoch": 2.1744471744471743, "step": 1770}, {"loss": 0.5165, "grad_norm": 0.9372949600219727, "learning_rate": 0.0002, "epoch": 2.1867321867321867, "step": 1780}, {"loss": 0.5327, "grad_norm": 0.8955178260803223, "learning_rate": 0.0002, "epoch": 2.199017199017199, "step": 1790}, {"loss": 0.5356, "grad_norm": 0.846102237701416, "learning_rate": 0.0002, "epoch": 2.211302211302211, "step": 1800}, {"loss": 0.5303, "grad_norm": 0.9186713099479675, "learning_rate": 0.0002, "epoch": 2.2235872235872236, "step": 1810}, {"loss": 0.5223, "grad_norm": 0.7695123553276062, "learning_rate": 0.0002, "epoch": 2.235872235872236, "step": 1820}, {"loss": 0.5161, "grad_norm": 0.7340332865715027, "learning_rate": 0.0002, "epoch": 2.248157248157248, "step": 1830}, {"loss": 0.5327, "grad_norm": 0.8933137655258179, "learning_rate": 0.0002, "epoch": 2.2604422604422605, "step": 1840}, {"loss": 0.5471, "grad_norm": 0.7705038189888, "learning_rate": 0.0002, "epoch": 2.2727272727272725, "step": 1850}, {"loss": 0.5346, "grad_norm": 0.8396083116531372, "learning_rate": 0.0002, "epoch": 2.285012285012285, "step": 1860}, {"loss": 0.5335, "grad_norm": 0.7695736289024353, "learning_rate": 0.0002, "epoch": 2.2972972972972974, "step": 1870}, {"loss": 0.5105, "grad_norm": 0.8535045385360718, "learning_rate": 0.0002, "epoch": 2.30958230958231, "step": 1880}, {"loss": 0.5202, "grad_norm": 0.8549142479896545, "learning_rate": 0.0002, "epoch": 2.321867321867322, "step": 1890}, {"loss": 0.5268, "grad_norm": 0.9124433994293213, "learning_rate": 0.0002, "epoch": 2.3341523341523343, "step": 1900}, {"loss": 0.506, "grad_norm": 0.855523943901062, "learning_rate": 0.0002, "epoch": 2.3464373464373462, "step": 1910}, {"loss": 0.5162, "grad_norm": 0.810878336429596, "learning_rate": 0.0002, "epoch": 2.3587223587223587, "step": 1920}, {"loss": 0.531, "grad_norm": 0.7409024834632874, "learning_rate": 0.0002, "epoch": 2.371007371007371, "step": 1930}, {"loss": 0.5045, "grad_norm": 0.8080927729606628, "learning_rate": 0.0002, "epoch": 2.383292383292383, "step": 1940}, {"loss": 0.5032, "grad_norm": 0.9661469459533691, "learning_rate": 0.0002, "epoch": 2.3955773955773956, "step": 1950}, {"loss": 0.5019, "grad_norm": 0.838766872882843, "learning_rate": 0.0002, "epoch": 2.407862407862408, "step": 1960}, {"loss": 0.5128, "grad_norm": 0.8737491965293884, "learning_rate": 0.0002, "epoch": 2.42014742014742, "step": 1970}, {"loss": 0.5153, "grad_norm": 0.8657792210578918, "learning_rate": 0.0002, "epoch": 2.4324324324324325, "step": 1980}, {"loss": 0.5665, "grad_norm": 0.8883858919143677, "learning_rate": 0.0002, "epoch": 2.444717444717445, "step": 1990}, {"loss": 0.5283, "grad_norm": 0.8647662997245789, "learning_rate": 0.0002, "epoch": 2.457002457002457, "step": 2000}, {"loss": 0.518, "grad_norm": 0.896037757396698, "learning_rate": 0.0002, "epoch": 2.4692874692874693, "step": 2010}, {"loss": 0.5245, "grad_norm": 0.8079167008399963, "learning_rate": 0.0002, "epoch": 2.4815724815724813, "step": 2020}, {"loss": 0.5311, "grad_norm": 1.0293292999267578, "learning_rate": 0.0002, "epoch": 2.493857493857494, "step": 2030}, {"loss": 0.5091, "grad_norm": 0.8459244966506958, "learning_rate": 0.0002, "epoch": 2.506142506142506, "step": 2040}, {"loss": 0.4922, "grad_norm": 0.9244982600212097, "learning_rate": 0.0002, "epoch": 2.5184275184275187, "step": 2050}, {"loss": 0.5006, "grad_norm": 0.8245007991790771, "learning_rate": 0.0002, "epoch": 2.5307125307125307, "step": 2060}, {"loss": 0.5229, "grad_norm": 0.8869297504425049, "learning_rate": 0.0002, "epoch": 2.542997542997543, "step": 2070}, {"loss": 0.5097, "grad_norm": 0.8620884418487549, "learning_rate": 0.0002, "epoch": 2.555282555282555, "step": 2080}, {"loss": 0.5239, "grad_norm": 0.8387904167175293, "learning_rate": 0.0002, "epoch": 2.5675675675675675, "step": 2090}, {"loss": 0.4974, "grad_norm": 0.8353935480117798, "learning_rate": 0.0002, "epoch": 2.57985257985258, "step": 2100}, {"loss": 0.5038, "grad_norm": 1.0136934518814087, "learning_rate": 0.0002, "epoch": 2.592137592137592, "step": 2110}, {"loss": 0.513, "grad_norm": 0.9387392997741699, "learning_rate": 0.0002, "epoch": 2.6044226044226044, "step": 2120}, {"loss": 0.4971, "grad_norm": 0.898697555065155, "learning_rate": 0.0002, "epoch": 2.616707616707617, "step": 2130}, {"loss": 0.4981, "grad_norm": 1.0145231485366821, "learning_rate": 0.0002, "epoch": 2.628992628992629, "step": 2140}, {"loss": 0.5151, "grad_norm": 0.8335273265838623, "learning_rate": 0.0002, "epoch": 2.6412776412776413, "step": 2150}, {"loss": 0.5129, "grad_norm": 1.0198529958724976, "learning_rate": 0.0002, "epoch": 2.6535626535626538, "step": 2160}, {"loss": 0.5156, "grad_norm": 0.8353323340415955, "learning_rate": 0.0002, "epoch": 2.6658476658476657, "step": 2170}, {"loss": 0.4818, "grad_norm": 0.8831406831741333, "learning_rate": 0.0002, "epoch": 2.678132678132678, "step": 2180}, {"loss": 0.4858, "grad_norm": 0.7182748913764954, "learning_rate": 0.0002, "epoch": 2.69041769041769, "step": 2190}, {"loss": 0.53, "grad_norm": 0.7892552614212036, "learning_rate": 0.0002, "epoch": 2.7027027027027026, "step": 2200}, {"loss": 0.5101, "grad_norm": 1.0144033432006836, "learning_rate": 0.0002, "epoch": 2.714987714987715, "step": 2210}, {"loss": 0.4909, "grad_norm": 1.0913645029067993, "learning_rate": 0.0002, "epoch": 2.7272727272727275, "step": 2220}, {"loss": 0.5069, "grad_norm": 1.014394998550415, "learning_rate": 0.0002, "epoch": 2.7395577395577395, "step": 2230}, {"loss": 0.4985, "grad_norm": 0.8118020296096802, "learning_rate": 0.0002, "epoch": 2.751842751842752, "step": 2240}, {"loss": 0.5088, "grad_norm": 0.9027737379074097, "learning_rate": 0.0002, "epoch": 2.764127764127764, "step": 2250}, {"loss": 0.5027, "grad_norm": 0.8017747402191162, "learning_rate": 0.0002, "epoch": 2.7764127764127764, "step": 2260}, {"loss": 0.4957, "grad_norm": 0.788362979888916, "learning_rate": 0.0002, "epoch": 2.788697788697789, "step": 2270}, {"loss": 0.5047, "grad_norm": 0.8338918089866638, "learning_rate": 0.0002, "epoch": 2.800982800982801, "step": 2280}, {"loss": 0.4925, "grad_norm": 0.8773167729377747, "learning_rate": 0.0002, "epoch": 2.8132678132678133, "step": 2290}, {"loss": 0.4806, "grad_norm": 0.9319674372673035, "learning_rate": 0.0002, "epoch": 2.8255528255528253, "step": 2300}, {"loss": 0.4815, "grad_norm": 0.8632726073265076, "learning_rate": 0.0002, "epoch": 2.8378378378378377, "step": 2310}, {"loss": 0.4842, "grad_norm": 0.785464882850647, "learning_rate": 0.0002, "epoch": 2.85012285012285, "step": 2320}, {"loss": 0.4867, "grad_norm": 0.8159732818603516, "learning_rate": 0.0002, "epoch": 2.8624078624078626, "step": 2330}, {"loss": 0.4796, "grad_norm": 0.8702368140220642, "learning_rate": 0.0002, "epoch": 2.8746928746928746, "step": 2340}, {"loss": 0.474, "grad_norm": 1.0456738471984863, "learning_rate": 0.0002, "epoch": 2.886977886977887, "step": 2350}, {"loss": 0.4934, "grad_norm": 1.0855203866958618, "learning_rate": 0.0002, "epoch": 2.899262899262899, "step": 2360}, {"loss": 0.4758, "grad_norm": 0.9378156065940857, "learning_rate": 0.0002, "epoch": 2.9115479115479115, "step": 2370}, {"loss": 0.4831, "grad_norm": 0.7390182018280029, "learning_rate": 0.0002, "epoch": 2.923832923832924, "step": 2380}, {"loss": 0.5066, "grad_norm": 0.7667133212089539, "learning_rate": 0.0002, "epoch": 2.9361179361179364, "step": 2390}, {"loss": 0.4722, "grad_norm": 0.8633476495742798, "learning_rate": 0.0002, "epoch": 2.9484029484029484, "step": 2400}, {"loss": 0.4993, "grad_norm": 1.0821104049682617, "learning_rate": 0.0002, "epoch": 2.960687960687961, "step": 2410}, {"loss": 0.4882, "grad_norm": 0.8911418914794922, "learning_rate": 0.0002, "epoch": 2.972972972972973, "step": 2420}, {"loss": 0.4819, "grad_norm": 0.8791135549545288, "learning_rate": 0.0002, "epoch": 2.9852579852579852, "step": 2430}, {"loss": 0.4875, "grad_norm": 0.8066530823707581, "learning_rate": 0.0002, "epoch": 2.9975429975429977, "step": 2440}, {"eval_loss": 0.49752503633499146, "eval_runtime": 20.2911, "eval_samples_per_second": 16.313, "eval_steps_per_second": 2.07, "epoch": 3.0, "step": 2442}, {"loss": 0.4362, "grad_norm": 0.7644656896591187, "learning_rate": 0.0002, "epoch": 3.0098280098280097, "step": 2450}, {"loss": 0.4363, "grad_norm": 0.9077525734901428, "learning_rate": 0.0002, "epoch": 3.022113022113022, "step": 2460}, {"loss": 0.422, "grad_norm": 0.7859287261962891, "learning_rate": 0.0002, "epoch": 3.0343980343980346, "step": 2470}, {"loss": 0.4574, "grad_norm": 1.1200323104858398, "learning_rate": 0.0002, "epoch": 3.0466830466830466, "step": 2480}, {"loss": 0.4519, "grad_norm": 0.7570453882217407, "learning_rate": 0.0002, "epoch": 3.058968058968059, "step": 2490}, {"loss": 0.4351, "grad_norm": 0.9450915455818176, "learning_rate": 0.0002, "epoch": 3.0712530712530715, "step": 2500}, {"loss": 0.4343, "grad_norm": 0.8303545117378235, "learning_rate": 0.0002, "epoch": 3.0835380835380835, "step": 2510}, {"loss": 0.4308, "grad_norm": 0.8864443898200989, "learning_rate": 0.0002, "epoch": 3.095823095823096, "step": 2520}, {"loss": 0.4601, "grad_norm": 0.945324718952179, "learning_rate": 0.0002, "epoch": 3.108108108108108, "step": 2530}, {"loss": 0.4345, "grad_norm": 1.0562494993209839, "learning_rate": 0.0002, "epoch": 3.1203931203931203, "step": 2540}, {"loss": 0.4375, "grad_norm": 0.8607500195503235, "learning_rate": 0.0002, "epoch": 3.1326781326781328, "step": 2550}, {"loss": 0.456, "grad_norm": 0.8719640374183655, "learning_rate": 0.0002, "epoch": 3.1449631449631448, "step": 2560}, {"loss": 0.4469, "grad_norm": 0.8647059202194214, "learning_rate": 0.0002, "epoch": 3.157248157248157, "step": 2570}, {"loss": 0.4483, "grad_norm": 0.8346507549285889, "learning_rate": 0.0002, "epoch": 3.1695331695331697, "step": 2580}, {"loss": 0.4331, "grad_norm": 1.0208854675292969, "learning_rate": 0.0002, "epoch": 3.1818181818181817, "step": 2590}, {"loss": 0.435, "grad_norm": 0.7064385414123535, "learning_rate": 0.0002, "epoch": 3.194103194103194, "step": 2600}, {"loss": 0.4541, "grad_norm": 0.927347719669342, "learning_rate": 0.0002, "epoch": 3.2063882063882065, "step": 2610}, {"loss": 0.4561, "grad_norm": 0.943517804145813, "learning_rate": 0.0002, "epoch": 3.2186732186732185, "step": 2620}, {"loss": 0.4225, "grad_norm": 0.7837198376655579, "learning_rate": 0.0002, "epoch": 3.230958230958231, "step": 2630}, {"loss": 0.4494, "grad_norm": 0.7752765417098999, "learning_rate": 0.0002, "epoch": 3.2432432432432434, "step": 2640}, {"loss": 0.4468, "grad_norm": 0.8578953146934509, "learning_rate": 0.0002, "epoch": 3.2555282555282554, "step": 2650}, {"loss": 0.4393, "grad_norm": 1.0209529399871826, "learning_rate": 0.0002, "epoch": 3.267813267813268, "step": 2660}, {"loss": 0.4517, "grad_norm": 0.9069030284881592, "learning_rate": 0.0002, "epoch": 3.2800982800982803, "step": 2670}, {"loss": 0.4262, "grad_norm": 0.8454729318618774, "learning_rate": 0.0002, "epoch": 3.2923832923832923, "step": 2680}, {"loss": 0.4349, "grad_norm": 0.8253099322319031, "learning_rate": 0.0002, "epoch": 3.3046683046683047, "step": 2690}, {"loss": 0.4503, "grad_norm": 0.8765934109687805, "learning_rate": 0.0002, "epoch": 3.3169533169533167, "step": 2700}, {"loss": 0.4518, "grad_norm": 0.8149126172065735, "learning_rate": 0.0002, "epoch": 3.329238329238329, "step": 2710}, {"loss": 0.4437, "grad_norm": 0.8820102214813232, "learning_rate": 0.0002, "epoch": 3.3415233415233416, "step": 2720}, {"loss": 0.4346, "grad_norm": 0.8813952803611755, "learning_rate": 0.0002, "epoch": 3.3538083538083536, "step": 2730}, {"loss": 0.4396, "grad_norm": 1.0338447093963623, "learning_rate": 0.0002, "epoch": 3.366093366093366, "step": 2740}, {"loss": 0.4468, "grad_norm": 0.8780209422111511, "learning_rate": 0.0002, "epoch": 3.3783783783783785, "step": 2750}, {"loss": 0.441, "grad_norm": 0.9017151594161987, "learning_rate": 0.0002, "epoch": 3.3906633906633905, "step": 2760}, {"loss": 0.446, "grad_norm": 0.8647638559341431, "learning_rate": 0.0002, "epoch": 3.402948402948403, "step": 2770}, {"loss": 0.4131, "grad_norm": 0.8298183679580688, "learning_rate": 0.0002, "epoch": 3.4152334152334154, "step": 2780}, {"loss": 0.4406, "grad_norm": 0.9298108816146851, "learning_rate": 0.0002, "epoch": 3.4275184275184274, "step": 2790}, {"loss": 0.4145, "grad_norm": 0.8909980058670044, "learning_rate": 0.0002, "epoch": 3.43980343980344, "step": 2800}, {"loss": 0.4148, "grad_norm": 0.8027496933937073, "learning_rate": 0.0002, "epoch": 3.4520884520884523, "step": 2810}, {"loss": 0.4244, "grad_norm": 0.8766195774078369, "learning_rate": 0.0002, "epoch": 3.4643734643734643, "step": 2820}, {"loss": 0.4292, "grad_norm": 0.8194443583488464, "learning_rate": 0.0002, "epoch": 3.4766584766584767, "step": 2830}, {"loss": 0.4305, "grad_norm": 0.9862873554229736, "learning_rate": 0.0002, "epoch": 3.488943488943489, "step": 2840}, {"loss": 0.4393, "grad_norm": 0.8755377531051636, "learning_rate": 0.0002, "epoch": 3.501228501228501, "step": 2850}, {"loss": 0.4231, "grad_norm": 0.7300266027450562, "learning_rate": 0.0002, "epoch": 3.5135135135135136, "step": 2860}, {"loss": 0.4278, "grad_norm": 0.8342461585998535, "learning_rate": 0.0002, "epoch": 3.5257985257985256, "step": 2870}, {"loss": 0.4395, "grad_norm": 0.8624151349067688, "learning_rate": 0.0002, "epoch": 3.538083538083538, "step": 2880}, {"loss": 0.4064, "grad_norm": 0.8931261301040649, "learning_rate": 0.0002, "epoch": 3.5503685503685505, "step": 2890}, {"loss": 0.4358, "grad_norm": 0.8617086410522461, "learning_rate": 0.0002, "epoch": 3.562653562653563, "step": 2900}, {"loss": 0.419, "grad_norm": 0.8754099607467651, "learning_rate": 0.0002, "epoch": 3.574938574938575, "step": 2910}, {"loss": 0.4275, "grad_norm": 0.8345834612846375, "learning_rate": 0.0002, "epoch": 3.5872235872235874, "step": 2920}, {"loss": 0.4375, "grad_norm": 1.1414062976837158, "learning_rate": 0.0002, "epoch": 3.5995085995085994, "step": 2930}, {"loss": 0.4297, "grad_norm": 0.994860053062439, "learning_rate": 0.0002, "epoch": 3.611793611793612, "step": 2940}, {"loss": 0.4386, "grad_norm": 1.19268000125885, "learning_rate": 0.0002, "epoch": 3.6240786240786242, "step": 2950}, {"loss": 0.4029, "grad_norm": 0.8399543762207031, "learning_rate": 0.0002, "epoch": 3.6363636363636362, "step": 2960}, {"loss": 0.4432, "grad_norm": 0.9873217940330505, "learning_rate": 0.0002, "epoch": 3.6486486486486487, "step": 2970}, {"loss": 0.4308, "grad_norm": 0.9116013646125793, "learning_rate": 0.0002, "epoch": 3.6609336609336607, "step": 2980}, {"loss": 0.4275, "grad_norm": 0.9503833651542664, "learning_rate": 0.0002, "epoch": 3.673218673218673, "step": 2990}, {"loss": 0.4306, "grad_norm": 0.9401112794876099, "learning_rate": 0.0002, "epoch": 3.6855036855036856, "step": 3000}, {"loss": 0.4333, "grad_norm": 1.00745689868927, "learning_rate": 0.0002, "epoch": 3.697788697788698, "step": 3010}, {"loss": 0.432, "grad_norm": 1.0553191900253296, "learning_rate": 0.0002, "epoch": 3.71007371007371, "step": 3020}, {"loss": 0.4321, "grad_norm": 1.0226953029632568, "learning_rate": 0.0002, "epoch": 3.7223587223587224, "step": 3030}, {"loss": 0.418, "grad_norm": 1.085554838180542, "learning_rate": 0.0002, "epoch": 3.7346437346437344, "step": 3040}, {"loss": 0.4196, "grad_norm": 0.9948731064796448, "learning_rate": 0.0002, "epoch": 3.746928746928747, "step": 3050}, {"loss": 0.4281, "grad_norm": 0.9328727126121521, "learning_rate": 0.0002, "epoch": 3.7592137592137593, "step": 3060}, {"loss": 0.4284, "grad_norm": 1.0533266067504883, "learning_rate": 0.0002, "epoch": 3.7714987714987718, "step": 3070}, {"loss": 0.4414, "grad_norm": 0.8213809132575989, "learning_rate": 0.0002, "epoch": 3.7837837837837838, "step": 3080}, {"loss": 0.4348, "grad_norm": 0.8941594362258911, "learning_rate": 0.0002, "epoch": 3.796068796068796, "step": 3090}, {"loss": 0.4266, "grad_norm": 0.8324518203735352, "learning_rate": 0.0002, "epoch": 3.808353808353808, "step": 3100}, {"loss": 0.4227, "grad_norm": 0.8811233639717102, "learning_rate": 0.0002, "epoch": 3.8206388206388207, "step": 3110}, {"loss": 0.4195, "grad_norm": 0.8781470060348511, "learning_rate": 0.0002, "epoch": 3.832923832923833, "step": 3120}, {"loss": 0.4277, "grad_norm": 0.8994116187095642, "learning_rate": 0.0002, "epoch": 3.845208845208845, "step": 3130}, {"loss": 0.4149, "grad_norm": 0.8605017066001892, "learning_rate": 0.0002, "epoch": 3.8574938574938575, "step": 3140}, {"loss": 0.4023, "grad_norm": 0.8966400027275085, "learning_rate": 0.0002, "epoch": 3.8697788697788695, "step": 3150}, {"loss": 0.4245, "grad_norm": 0.8856554627418518, "learning_rate": 0.0002, "epoch": 3.882063882063882, "step": 3160}, {"loss": 0.4101, "grad_norm": 0.8971620798110962, "learning_rate": 0.0002, "epoch": 3.8943488943488944, "step": 3170}, {"loss": 0.3993, "grad_norm": 0.9807813167572021, "learning_rate": 0.0002, "epoch": 3.906633906633907, "step": 3180}, {"loss": 0.4258, "grad_norm": 0.8614121675491333, "learning_rate": 0.0002, "epoch": 3.918918918918919, "step": 3190}, {"loss": 0.4115, "grad_norm": 0.989171028137207, "learning_rate": 0.0002, "epoch": 3.9312039312039313, "step": 3200}, {"loss": 0.4182, "grad_norm": 0.8168872594833374, "learning_rate": 0.0002, "epoch": 3.9434889434889433, "step": 3210}, {"loss": 0.4112, "grad_norm": 0.8109386563301086, "learning_rate": 0.0002, "epoch": 3.9557739557739557, "step": 3220}, {"loss": 0.4165, "grad_norm": 1.0175853967666626, "learning_rate": 0.0002, "epoch": 3.968058968058968, "step": 3230}, {"loss": 0.4146, "grad_norm": 0.936143159866333, "learning_rate": 0.0002, "epoch": 3.98034398034398, "step": 3240}, {"loss": 0.4163, "grad_norm": 0.9557915925979614, "learning_rate": 0.0002, "epoch": 3.9926289926289926, "step": 3250}]} +{"epoch": 5.0, "step": 4070, "epoch_duration": 777.9073820114136, "total_accumulated_duration": 3710.8239636421204, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.17431640625}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-3256", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.5354, "grad_norm": 0.8178550004959106, "learning_rate": 0.0002, "epoch": 0.012285012285012284, "step": 10}, {"loss": 2.534, "grad_norm": 1.0338047742843628, "learning_rate": 0.0002, "epoch": 0.02457002457002457, "step": 20}, {"loss": 2.1691, "grad_norm": 0.8931729197502136, "learning_rate": 0.0002, "epoch": 0.036855036855036855, "step": 30}, {"loss": 1.8813, "grad_norm": 0.9666458964347839, "learning_rate": 0.0002, "epoch": 0.04914004914004914, "step": 40}, {"loss": 1.6479, "grad_norm": 1.2691702842712402, "learning_rate": 0.0002, "epoch": 0.06142506142506143, "step": 50}, {"loss": 1.3831, "grad_norm": 1.0307111740112305, "learning_rate": 0.0002, "epoch": 0.07371007371007371, "step": 60}, {"loss": 1.2987, "grad_norm": 1.1837389469146729, "learning_rate": 0.0002, "epoch": 0.085995085995086, "step": 70}, {"loss": 1.2325, "grad_norm": 1.1481467485427856, "learning_rate": 0.0002, "epoch": 0.09828009828009827, "step": 80}, {"loss": 1.1425, "grad_norm": 1.0385297536849976, "learning_rate": 0.0002, "epoch": 0.11056511056511056, "step": 90}, {"loss": 1.1177, "grad_norm": 1.125789999961853, "learning_rate": 0.0002, "epoch": 0.12285012285012285, "step": 100}, {"loss": 1.0477, "grad_norm": 0.9630613923072815, "learning_rate": 0.0002, "epoch": 0.13513513513513514, "step": 110}, {"loss": 1.0074, "grad_norm": 1.060392141342163, "learning_rate": 0.0002, "epoch": 0.14742014742014742, "step": 120}, {"loss": 1.0128, "grad_norm": 1.0986546277999878, "learning_rate": 0.0002, "epoch": 0.1597051597051597, "step": 130}, {"loss": 1.0068, "grad_norm": 1.1713459491729736, "learning_rate": 0.0002, "epoch": 0.171990171990172, "step": 140}, {"loss": 0.973, "grad_norm": 1.1548224687576294, "learning_rate": 0.0002, "epoch": 0.18427518427518427, "step": 150}, {"loss": 0.941, "grad_norm": 1.2662502527236938, "learning_rate": 0.0002, "epoch": 0.19656019656019655, "step": 160}, {"loss": 0.8849, "grad_norm": 1.1521110534667969, "learning_rate": 0.0002, "epoch": 0.20884520884520885, "step": 170}, {"loss": 0.8931, "grad_norm": 1.1044857501983643, "learning_rate": 0.0002, "epoch": 0.22113022113022113, "step": 180}, {"loss": 0.9572, "grad_norm": 0.9770650267601013, "learning_rate": 0.0002, "epoch": 0.2334152334152334, "step": 190}, {"loss": 0.881, "grad_norm": 0.9710931777954102, "learning_rate": 0.0002, "epoch": 0.2457002457002457, "step": 200}, {"loss": 0.9205, "grad_norm": 0.9593933820724487, "learning_rate": 0.0002, "epoch": 0.257985257985258, "step": 210}, {"loss": 0.843, "grad_norm": 1.003553032875061, "learning_rate": 0.0002, "epoch": 0.2702702702702703, "step": 220}, {"loss": 0.9032, "grad_norm": 0.9187764525413513, "learning_rate": 0.0002, "epoch": 0.28255528255528256, "step": 230}, {"loss": 0.8572, "grad_norm": 0.9294946789741516, "learning_rate": 0.0002, "epoch": 0.29484029484029484, "step": 240}, {"loss": 0.8856, "grad_norm": 0.9537560939788818, "learning_rate": 0.0002, "epoch": 0.3071253071253071, "step": 250}, {"loss": 0.8546, "grad_norm": 1.00537109375, "learning_rate": 0.0002, "epoch": 0.3194103194103194, "step": 260}, {"loss": 0.896, "grad_norm": 0.8775776028633118, "learning_rate": 0.0002, "epoch": 0.3316953316953317, "step": 270}, {"loss": 0.808, "grad_norm": 0.8316839933395386, "learning_rate": 0.0002, "epoch": 0.343980343980344, "step": 280}, {"loss": 0.8248, "grad_norm": 0.8542073965072632, "learning_rate": 0.0002, "epoch": 0.35626535626535627, "step": 290}, {"loss": 0.8452, "grad_norm": 0.848444402217865, "learning_rate": 0.0002, "epoch": 0.36855036855036855, "step": 300}, {"loss": 0.8253, "grad_norm": 0.9017520546913147, "learning_rate": 0.0002, "epoch": 0.3808353808353808, "step": 310}, {"loss": 0.8098, "grad_norm": 0.7672467231750488, "learning_rate": 0.0002, "epoch": 0.3931203931203931, "step": 320}, {"loss": 0.8478, "grad_norm": 0.9109916687011719, "learning_rate": 0.0002, "epoch": 0.40540540540540543, "step": 330}, {"loss": 0.8041, "grad_norm": 0.8750321269035339, "learning_rate": 0.0002, "epoch": 0.4176904176904177, "step": 340}, {"loss": 0.8158, "grad_norm": 0.7911098599433899, "learning_rate": 0.0002, "epoch": 0.42997542997543, "step": 350}, {"loss": 0.8001, "grad_norm": 0.871601402759552, "learning_rate": 0.0002, "epoch": 0.44226044226044225, "step": 360}, {"loss": 0.8187, "grad_norm": 0.9393917918205261, "learning_rate": 0.0002, "epoch": 0.45454545454545453, "step": 370}, {"loss": 0.8124, "grad_norm": 0.8260403275489807, "learning_rate": 0.0002, "epoch": 0.4668304668304668, "step": 380}, {"loss": 0.7768, "grad_norm": 0.9792159199714661, "learning_rate": 0.0002, "epoch": 0.47911547911547914, "step": 390}, {"loss": 0.7981, "grad_norm": 0.9943315982818604, "learning_rate": 0.0002, "epoch": 0.4914004914004914, "step": 400}, {"loss": 0.7765, "grad_norm": 0.8999950885772705, "learning_rate": 0.0002, "epoch": 0.5036855036855037, "step": 410}, {"loss": 0.7807, "grad_norm": 0.8348393440246582, "learning_rate": 0.0002, "epoch": 0.515970515970516, "step": 420}, {"loss": 0.8269, "grad_norm": 0.7371744513511658, "learning_rate": 0.0002, "epoch": 0.5282555282555282, "step": 430}, {"loss": 0.8181, "grad_norm": 0.8354107141494751, "learning_rate": 0.0002, "epoch": 0.5405405405405406, "step": 440}, {"loss": 0.7849, "grad_norm": 0.8553793430328369, "learning_rate": 0.0002, "epoch": 0.5528255528255528, "step": 450}, {"loss": 0.8098, "grad_norm": 1.0762015581130981, "learning_rate": 0.0002, "epoch": 0.5651105651105651, "step": 460}, {"loss": 0.7942, "grad_norm": 0.8350747227668762, "learning_rate": 0.0002, "epoch": 0.5773955773955773, "step": 470}, {"loss": 0.7922, "grad_norm": 0.7819945216178894, "learning_rate": 0.0002, "epoch": 0.5896805896805897, "step": 480}, {"loss": 0.7845, "grad_norm": 0.8079741597175598, "learning_rate": 0.0002, "epoch": 0.601965601965602, "step": 490}, {"loss": 0.7417, "grad_norm": 0.776435911655426, "learning_rate": 0.0002, "epoch": 0.6142506142506142, "step": 500}, {"loss": 0.7855, "grad_norm": 0.7646855115890503, "learning_rate": 0.0002, "epoch": 0.6265356265356266, "step": 510}, {"loss": 0.7923, "grad_norm": 0.786396861076355, "learning_rate": 0.0002, "epoch": 0.6388206388206388, "step": 520}, {"loss": 0.7624, "grad_norm": 0.7016594409942627, "learning_rate": 0.0002, "epoch": 0.6511056511056511, "step": 530}, {"loss": 0.786, "grad_norm": 0.8060444593429565, "learning_rate": 0.0002, "epoch": 0.6633906633906634, "step": 540}, {"loss": 0.7417, "grad_norm": 0.9087467789649963, "learning_rate": 0.0002, "epoch": 0.6756756756756757, "step": 550}, {"loss": 0.7591, "grad_norm": 0.8149628639221191, "learning_rate": 0.0002, "epoch": 0.687960687960688, "step": 560}, {"loss": 0.8004, "grad_norm": 0.7493641972541809, "learning_rate": 0.0002, "epoch": 0.7002457002457002, "step": 570}, {"loss": 0.765, "grad_norm": 0.7958765625953674, "learning_rate": 0.0002, "epoch": 0.7125307125307125, "step": 580}, {"loss": 0.7276, "grad_norm": 0.7917273640632629, "learning_rate": 0.0002, "epoch": 0.7248157248157249, "step": 590}, {"loss": 0.758, "grad_norm": 0.8040468692779541, "learning_rate": 0.0002, "epoch": 0.7371007371007371, "step": 600}, {"loss": 0.735, "grad_norm": 0.8696851134300232, "learning_rate": 0.0002, "epoch": 0.7493857493857494, "step": 610}, {"loss": 0.7321, "grad_norm": 0.8418059945106506, "learning_rate": 0.0002, "epoch": 0.7616707616707616, "step": 620}, {"loss": 0.7395, "grad_norm": 0.7754243612289429, "learning_rate": 0.0002, "epoch": 0.773955773955774, "step": 630}, {"loss": 0.7679, "grad_norm": 0.7639613747596741, "learning_rate": 0.0002, "epoch": 0.7862407862407862, "step": 640}, {"loss": 0.7159, "grad_norm": 0.7516646385192871, "learning_rate": 0.0002, "epoch": 0.7985257985257985, "step": 650}, {"loss": 0.7349, "grad_norm": 0.7840844988822937, "learning_rate": 0.0002, "epoch": 0.8108108108108109, "step": 660}, {"loss": 0.7264, "grad_norm": 0.7657070755958557, "learning_rate": 0.0002, "epoch": 0.8230958230958231, "step": 670}, {"loss": 0.7369, "grad_norm": 0.7711591720581055, "learning_rate": 0.0002, "epoch": 0.8353808353808354, "step": 680}, {"loss": 0.759, "grad_norm": 0.8026325106620789, "learning_rate": 0.0002, "epoch": 0.8476658476658476, "step": 690}, {"loss": 0.737, "grad_norm": 0.7902713418006897, "learning_rate": 0.0002, "epoch": 0.85995085995086, "step": 700}, {"loss": 0.7349, "grad_norm": 0.8212456107139587, "learning_rate": 0.0002, "epoch": 0.8722358722358723, "step": 710}, {"loss": 0.7661, "grad_norm": 0.7867200970649719, "learning_rate": 0.0002, "epoch": 0.8845208845208845, "step": 720}, {"loss": 0.7195, "grad_norm": 0.80084627866745, "learning_rate": 0.0002, "epoch": 0.8968058968058968, "step": 730}, {"loss": 0.7641, "grad_norm": 0.7203794121742249, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 740}, {"loss": 0.7134, "grad_norm": 0.7598419785499573, "learning_rate": 0.0002, "epoch": 0.9213759213759214, "step": 750}, {"loss": 0.7208, "grad_norm": 0.7787027359008789, "learning_rate": 0.0002, "epoch": 0.9336609336609336, "step": 760}, {"loss": 0.7119, "grad_norm": 0.8444012403488159, "learning_rate": 0.0002, "epoch": 0.9459459459459459, "step": 770}, {"loss": 0.7099, "grad_norm": 0.7388550639152527, "learning_rate": 0.0002, "epoch": 0.9582309582309583, "step": 780}, {"loss": 0.7184, "grad_norm": 0.7379167079925537, "learning_rate": 0.0002, "epoch": 0.9705159705159705, "step": 790}, {"loss": 0.7143, "grad_norm": 0.8291640281677246, "learning_rate": 0.0002, "epoch": 0.9828009828009828, "step": 800}, {"loss": 0.6972, "grad_norm": 0.7415094375610352, "learning_rate": 0.0002, "epoch": 0.995085995085995, "step": 810}, {"eval_loss": 0.703994870185852, "eval_runtime": 20.2182, "eval_samples_per_second": 16.371, "eval_steps_per_second": 2.077, "epoch": 1.0, "step": 814}, {"loss": 0.6959, "grad_norm": 0.7405961751937866, "learning_rate": 0.0002, "epoch": 1.0073710073710074, "step": 820}, {"loss": 0.6706, "grad_norm": 0.8534344434738159, "learning_rate": 0.0002, "epoch": 1.0196560196560196, "step": 830}, {"loss": 0.6719, "grad_norm": 0.7415764331817627, "learning_rate": 0.0002, "epoch": 1.031941031941032, "step": 840}, {"loss": 0.6673, "grad_norm": 0.74293053150177, "learning_rate": 0.0002, "epoch": 1.0442260442260443, "step": 850}, {"loss": 0.6897, "grad_norm": 0.697727382183075, "learning_rate": 0.0002, "epoch": 1.0565110565110565, "step": 860}, {"loss": 0.6566, "grad_norm": 0.8022570013999939, "learning_rate": 0.0002, "epoch": 1.0687960687960687, "step": 870}, {"loss": 0.6759, "grad_norm": 0.7545800805091858, "learning_rate": 0.0002, "epoch": 1.0810810810810811, "step": 880}, {"loss": 0.6397, "grad_norm": 0.8005648255348206, "learning_rate": 0.0002, "epoch": 1.0933660933660934, "step": 890}, {"loss": 0.6499, "grad_norm": 0.7681778073310852, "learning_rate": 0.0002, "epoch": 1.1056511056511056, "step": 900}, {"loss": 0.6672, "grad_norm": 0.7822468876838684, "learning_rate": 0.0002, "epoch": 1.117936117936118, "step": 910}, {"loss": 0.6492, "grad_norm": 0.8324839472770691, "learning_rate": 0.0002, "epoch": 1.1302211302211302, "step": 920}, {"loss": 0.6659, "grad_norm": 0.8206289410591125, "learning_rate": 0.0002, "epoch": 1.1425061425061425, "step": 930}, {"loss": 0.6385, "grad_norm": 0.786461591720581, "learning_rate": 0.0002, "epoch": 1.154791154791155, "step": 940}, {"loss": 0.6493, "grad_norm": 0.8288539052009583, "learning_rate": 0.0002, "epoch": 1.1670761670761671, "step": 950}, {"loss": 0.6818, "grad_norm": 0.7566865682601929, "learning_rate": 0.0002, "epoch": 1.1793611793611793, "step": 960}, {"loss": 0.6597, "grad_norm": 0.7761894464492798, "learning_rate": 0.0002, "epoch": 1.1916461916461916, "step": 970}, {"loss": 0.6403, "grad_norm": 0.7608440518379211, "learning_rate": 0.0002, "epoch": 1.203931203931204, "step": 980}, {"loss": 0.7041, "grad_norm": 0.799745500087738, "learning_rate": 0.0002, "epoch": 1.2162162162162162, "step": 990}, {"loss": 0.6358, "grad_norm": 0.8135330677032471, "learning_rate": 0.0002, "epoch": 1.2285012285012284, "step": 1000}, {"loss": 0.6496, "grad_norm": 0.7410391569137573, "learning_rate": 0.0002, "epoch": 1.2407862407862407, "step": 1010}, {"loss": 0.63, "grad_norm": 0.7826172709465027, "learning_rate": 0.0002, "epoch": 1.253071253071253, "step": 1020}, {"loss": 0.6582, "grad_norm": 0.7210677862167358, "learning_rate": 0.0002, "epoch": 1.2653562653562653, "step": 1030}, {"loss": 0.6609, "grad_norm": 0.7571766972541809, "learning_rate": 0.0002, "epoch": 1.2776412776412776, "step": 1040}, {"loss": 0.6315, "grad_norm": 0.8602666258811951, "learning_rate": 0.0002, "epoch": 1.28992628992629, "step": 1050}, {"loss": 0.6825, "grad_norm": 0.8640648722648621, "learning_rate": 0.0002, "epoch": 1.3022113022113022, "step": 1060}, {"loss": 0.6563, "grad_norm": 0.7289374470710754, "learning_rate": 0.0002, "epoch": 1.3144963144963144, "step": 1070}, {"loss": 0.629, "grad_norm": 0.8099908828735352, "learning_rate": 0.0002, "epoch": 1.3267813267813269, "step": 1080}, {"loss": 0.6882, "grad_norm": 0.8623505234718323, "learning_rate": 0.0002, "epoch": 1.339066339066339, "step": 1090}, {"loss": 0.6368, "grad_norm": 0.900576114654541, "learning_rate": 0.0002, "epoch": 1.3513513513513513, "step": 1100}, {"loss": 0.6398, "grad_norm": 0.729603111743927, "learning_rate": 0.0002, "epoch": 1.3636363636363638, "step": 1110}, {"loss": 0.6619, "grad_norm": 0.8350434303283691, "learning_rate": 0.0002, "epoch": 1.375921375921376, "step": 1120}, {"loss": 0.6447, "grad_norm": 0.8049437999725342, "learning_rate": 0.0002, "epoch": 1.3882063882063882, "step": 1130}, {"loss": 0.6336, "grad_norm": 0.8222764134407043, "learning_rate": 0.0002, "epoch": 1.4004914004914004, "step": 1140}, {"loss": 0.6453, "grad_norm": 0.7949751019477844, "learning_rate": 0.0002, "epoch": 1.4127764127764126, "step": 1150}, {"loss": 0.6246, "grad_norm": 0.8375639915466309, "learning_rate": 0.0002, "epoch": 1.425061425061425, "step": 1160}, {"loss": 0.6358, "grad_norm": 0.7261053919792175, "learning_rate": 0.0002, "epoch": 1.4373464373464373, "step": 1170}, {"loss": 0.6709, "grad_norm": 0.6918320655822754, "learning_rate": 0.0002, "epoch": 1.4496314496314495, "step": 1180}, {"loss": 0.598, "grad_norm": 0.8148727416992188, "learning_rate": 0.0002, "epoch": 1.461916461916462, "step": 1190}, {"loss": 0.6269, "grad_norm": 0.7014724612236023, "learning_rate": 0.0002, "epoch": 1.4742014742014742, "step": 1200}, {"loss": 0.617, "grad_norm": 0.8110846281051636, "learning_rate": 0.0002, "epoch": 1.4864864864864864, "step": 1210}, {"loss": 0.6633, "grad_norm": 0.8336407542228699, "learning_rate": 0.0002, "epoch": 1.4987714987714988, "step": 1220}, {"loss": 0.6028, "grad_norm": 0.826996386051178, "learning_rate": 0.0002, "epoch": 1.511056511056511, "step": 1230}, {"loss": 0.6464, "grad_norm": 0.7503120303153992, "learning_rate": 0.0002, "epoch": 1.5233415233415233, "step": 1240}, {"loss": 0.6418, "grad_norm": 0.8297192454338074, "learning_rate": 0.0002, "epoch": 1.5356265356265357, "step": 1250}, {"loss": 0.6466, "grad_norm": 0.7585996985435486, "learning_rate": 0.0002, "epoch": 1.547911547911548, "step": 1260}, {"loss": 0.6196, "grad_norm": 0.7530493140220642, "learning_rate": 0.0002, "epoch": 1.5601965601965602, "step": 1270}, {"loss": 0.6252, "grad_norm": 0.8141939640045166, "learning_rate": 0.0002, "epoch": 1.5724815724815726, "step": 1280}, {"loss": 0.6441, "grad_norm": 0.6959931254386902, "learning_rate": 0.0002, "epoch": 1.5847665847665846, "step": 1290}, {"loss": 0.6542, "grad_norm": 0.8677428364753723, "learning_rate": 0.0002, "epoch": 1.597051597051597, "step": 1300}, {"loss": 0.633, "grad_norm": 0.8527476787567139, "learning_rate": 0.0002, "epoch": 1.6093366093366095, "step": 1310}, {"loss": 0.6393, "grad_norm": 0.8462157845497131, "learning_rate": 0.0002, "epoch": 1.6216216216216215, "step": 1320}, {"loss": 0.6265, "grad_norm": 0.9371153712272644, "learning_rate": 0.0002, "epoch": 1.633906633906634, "step": 1330}, {"loss": 0.5952, "grad_norm": 0.8408344984054565, "learning_rate": 0.0002, "epoch": 1.6461916461916462, "step": 1340}, {"loss": 0.599, "grad_norm": 0.8391859531402588, "learning_rate": 0.0002, "epoch": 1.6584766584766584, "step": 1350}, {"loss": 0.6313, "grad_norm": 0.7630598545074463, "learning_rate": 0.0002, "epoch": 1.6707616707616708, "step": 1360}, {"loss": 0.5989, "grad_norm": 0.8007895350456238, "learning_rate": 0.0002, "epoch": 1.683046683046683, "step": 1370}, {"loss": 0.6094, "grad_norm": 0.7547900080680847, "learning_rate": 0.0002, "epoch": 1.6953316953316953, "step": 1380}, {"loss": 0.6335, "grad_norm": 0.7779742479324341, "learning_rate": 0.0002, "epoch": 1.7076167076167077, "step": 1390}, {"loss": 0.6078, "grad_norm": 0.712293803691864, "learning_rate": 0.0002, "epoch": 1.71990171990172, "step": 1400}, {"loss": 0.608, "grad_norm": 0.8503297567367554, "learning_rate": 0.0002, "epoch": 1.7321867321867321, "step": 1410}, {"loss": 0.6055, "grad_norm": 0.8312245607376099, "learning_rate": 0.0002, "epoch": 1.7444717444717446, "step": 1420}, {"loss": 0.5978, "grad_norm": 0.7758049368858337, "learning_rate": 0.0002, "epoch": 1.7567567567567568, "step": 1430}, {"loss": 0.5822, "grad_norm": 0.8695956468582153, "learning_rate": 0.0002, "epoch": 1.769041769041769, "step": 1440}, {"loss": 0.5955, "grad_norm": 0.7785261273384094, "learning_rate": 0.0002, "epoch": 1.7813267813267815, "step": 1450}, {"loss": 0.6177, "grad_norm": 0.7091802358627319, "learning_rate": 0.0002, "epoch": 1.7936117936117935, "step": 1460}, {"loss": 0.5811, "grad_norm": 0.774146556854248, "learning_rate": 0.0002, "epoch": 1.805896805896806, "step": 1470}, {"loss": 0.5833, "grad_norm": 0.8342524170875549, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 1480}, {"loss": 0.634, "grad_norm": 0.8087738156318665, "learning_rate": 0.0002, "epoch": 1.8304668304668303, "step": 1490}, {"loss": 0.5961, "grad_norm": 0.9830479621887207, "learning_rate": 0.0002, "epoch": 1.8427518427518428, "step": 1500}, {"loss": 0.6211, "grad_norm": 0.8537567853927612, "learning_rate": 0.0002, "epoch": 1.855036855036855, "step": 1510}, {"loss": 0.5767, "grad_norm": 0.8004562854766846, "learning_rate": 0.0002, "epoch": 1.8673218673218672, "step": 1520}, {"loss": 0.604, "grad_norm": 0.8161284327507019, "learning_rate": 0.0002, "epoch": 1.8796068796068797, "step": 1530}, {"loss": 0.5808, "grad_norm": 0.8688093423843384, "learning_rate": 0.0002, "epoch": 1.8918918918918919, "step": 1540}, {"loss": 0.5663, "grad_norm": 0.8287379741668701, "learning_rate": 0.0002, "epoch": 1.904176904176904, "step": 1550}, {"loss": 0.5963, "grad_norm": 0.8050342202186584, "learning_rate": 0.0002, "epoch": 1.9164619164619165, "step": 1560}, {"loss": 0.5837, "grad_norm": 0.9273895621299744, "learning_rate": 0.0002, "epoch": 1.9287469287469288, "step": 1570}, {"loss": 0.5945, "grad_norm": 0.8416891694068909, "learning_rate": 0.0002, "epoch": 1.941031941031941, "step": 1580}, {"loss": 0.5838, "grad_norm": 0.7299820184707642, "learning_rate": 0.0002, "epoch": 1.9533169533169534, "step": 1590}, {"loss": 0.6025, "grad_norm": 0.7262272834777832, "learning_rate": 0.0002, "epoch": 1.9656019656019657, "step": 1600}, {"loss": 0.5873, "grad_norm": 0.8649004697799683, "learning_rate": 0.0002, "epoch": 1.9778869778869779, "step": 1610}, {"loss": 0.5764, "grad_norm": 0.8165444731712341, "learning_rate": 0.0002, "epoch": 1.9901719901719903, "step": 1620}, {"eval_loss": 0.5858802795410156, "eval_runtime": 22.6585, "eval_samples_per_second": 14.608, "eval_steps_per_second": 1.854, "epoch": 2.0, "step": 1628}, {"loss": 0.5803, "grad_norm": 0.8142582178115845, "learning_rate": 0.0002, "epoch": 2.0024570024570023, "step": 1630}, {"loss": 0.5499, "grad_norm": 1.0637224912643433, "learning_rate": 0.0002, "epoch": 2.0147420147420148, "step": 1640}, {"loss": 0.5556, "grad_norm": 0.8923280239105225, "learning_rate": 0.0002, "epoch": 2.027027027027027, "step": 1650}, {"loss": 0.5373, "grad_norm": 0.8169175386428833, "learning_rate": 0.0002, "epoch": 2.039312039312039, "step": 1660}, {"loss": 0.552, "grad_norm": 0.8124040365219116, "learning_rate": 0.0002, "epoch": 2.0515970515970516, "step": 1670}, {"loss": 0.5259, "grad_norm": 0.9228773713111877, "learning_rate": 0.0002, "epoch": 2.063882063882064, "step": 1680}, {"loss": 0.5571, "grad_norm": 0.7216871380805969, "learning_rate": 0.0002, "epoch": 2.076167076167076, "step": 1690}, {"loss": 0.523, "grad_norm": 0.8679503202438354, "learning_rate": 0.0002, "epoch": 2.0884520884520885, "step": 1700}, {"loss": 0.5379, "grad_norm": 0.8627730011940002, "learning_rate": 0.0002, "epoch": 2.100737100737101, "step": 1710}, {"loss": 0.551, "grad_norm": 0.9175152778625488, "learning_rate": 0.0002, "epoch": 2.113022113022113, "step": 1720}, {"loss": 0.5378, "grad_norm": 0.7930372953414917, "learning_rate": 0.0002, "epoch": 2.1253071253071254, "step": 1730}, {"loss": 0.5263, "grad_norm": 0.8370155692100525, "learning_rate": 0.0002, "epoch": 2.1375921375921374, "step": 1740}, {"loss": 0.5419, "grad_norm": 0.9121434688568115, "learning_rate": 0.0002, "epoch": 2.14987714987715, "step": 1750}, {"loss": 0.5499, "grad_norm": 0.8703579306602478, "learning_rate": 0.0002, "epoch": 2.1621621621621623, "step": 1760}, {"loss": 0.5333, "grad_norm": 0.9270512461662292, "learning_rate": 0.0002, "epoch": 2.1744471744471743, "step": 1770}, {"loss": 0.5165, "grad_norm": 0.9372949600219727, "learning_rate": 0.0002, "epoch": 2.1867321867321867, "step": 1780}, {"loss": 0.5327, "grad_norm": 0.8955178260803223, "learning_rate": 0.0002, "epoch": 2.199017199017199, "step": 1790}, {"loss": 0.5356, "grad_norm": 0.846102237701416, "learning_rate": 0.0002, "epoch": 2.211302211302211, "step": 1800}, {"loss": 0.5303, "grad_norm": 0.9186713099479675, "learning_rate": 0.0002, "epoch": 2.2235872235872236, "step": 1810}, {"loss": 0.5223, "grad_norm": 0.7695123553276062, "learning_rate": 0.0002, "epoch": 2.235872235872236, "step": 1820}, {"loss": 0.5161, "grad_norm": 0.7340332865715027, "learning_rate": 0.0002, "epoch": 2.248157248157248, "step": 1830}, {"loss": 0.5327, "grad_norm": 0.8933137655258179, "learning_rate": 0.0002, "epoch": 2.2604422604422605, "step": 1840}, {"loss": 0.5471, "grad_norm": 0.7705038189888, "learning_rate": 0.0002, "epoch": 2.2727272727272725, "step": 1850}, {"loss": 0.5346, "grad_norm": 0.8396083116531372, "learning_rate": 0.0002, "epoch": 2.285012285012285, "step": 1860}, {"loss": 0.5335, "grad_norm": 0.7695736289024353, "learning_rate": 0.0002, "epoch": 2.2972972972972974, "step": 1870}, {"loss": 0.5105, "grad_norm": 0.8535045385360718, "learning_rate": 0.0002, "epoch": 2.30958230958231, "step": 1880}, {"loss": 0.5202, "grad_norm": 0.8549142479896545, "learning_rate": 0.0002, "epoch": 2.321867321867322, "step": 1890}, {"loss": 0.5268, "grad_norm": 0.9124433994293213, "learning_rate": 0.0002, "epoch": 2.3341523341523343, "step": 1900}, {"loss": 0.506, "grad_norm": 0.855523943901062, "learning_rate": 0.0002, "epoch": 2.3464373464373462, "step": 1910}, {"loss": 0.5162, "grad_norm": 0.810878336429596, "learning_rate": 0.0002, "epoch": 2.3587223587223587, "step": 1920}, {"loss": 0.531, "grad_norm": 0.7409024834632874, "learning_rate": 0.0002, "epoch": 2.371007371007371, "step": 1930}, {"loss": 0.5045, "grad_norm": 0.8080927729606628, "learning_rate": 0.0002, "epoch": 2.383292383292383, "step": 1940}, {"loss": 0.5032, "grad_norm": 0.9661469459533691, "learning_rate": 0.0002, "epoch": 2.3955773955773956, "step": 1950}, {"loss": 0.5019, "grad_norm": 0.838766872882843, "learning_rate": 0.0002, "epoch": 2.407862407862408, "step": 1960}, {"loss": 0.5128, "grad_norm": 0.8737491965293884, "learning_rate": 0.0002, "epoch": 2.42014742014742, "step": 1970}, {"loss": 0.5153, "grad_norm": 0.8657792210578918, "learning_rate": 0.0002, "epoch": 2.4324324324324325, "step": 1980}, {"loss": 0.5665, "grad_norm": 0.8883858919143677, "learning_rate": 0.0002, "epoch": 2.444717444717445, "step": 1990}, {"loss": 0.5283, "grad_norm": 0.8647662997245789, "learning_rate": 0.0002, "epoch": 2.457002457002457, "step": 2000}, {"loss": 0.518, "grad_norm": 0.896037757396698, "learning_rate": 0.0002, "epoch": 2.4692874692874693, "step": 2010}, {"loss": 0.5245, "grad_norm": 0.8079167008399963, "learning_rate": 0.0002, "epoch": 2.4815724815724813, "step": 2020}, {"loss": 0.5311, "grad_norm": 1.0293292999267578, "learning_rate": 0.0002, "epoch": 2.493857493857494, "step": 2030}, {"loss": 0.5091, "grad_norm": 0.8459244966506958, "learning_rate": 0.0002, "epoch": 2.506142506142506, "step": 2040}, {"loss": 0.4922, "grad_norm": 0.9244982600212097, "learning_rate": 0.0002, "epoch": 2.5184275184275187, "step": 2050}, {"loss": 0.5006, "grad_norm": 0.8245007991790771, "learning_rate": 0.0002, "epoch": 2.5307125307125307, "step": 2060}, {"loss": 0.5229, "grad_norm": 0.8869297504425049, "learning_rate": 0.0002, "epoch": 2.542997542997543, "step": 2070}, {"loss": 0.5097, "grad_norm": 0.8620884418487549, "learning_rate": 0.0002, "epoch": 2.555282555282555, "step": 2080}, {"loss": 0.5239, "grad_norm": 0.8387904167175293, "learning_rate": 0.0002, "epoch": 2.5675675675675675, "step": 2090}, {"loss": 0.4974, "grad_norm": 0.8353935480117798, "learning_rate": 0.0002, "epoch": 2.57985257985258, "step": 2100}, {"loss": 0.5038, "grad_norm": 1.0136934518814087, "learning_rate": 0.0002, "epoch": 2.592137592137592, "step": 2110}, {"loss": 0.513, "grad_norm": 0.9387392997741699, "learning_rate": 0.0002, "epoch": 2.6044226044226044, "step": 2120}, {"loss": 0.4971, "grad_norm": 0.898697555065155, "learning_rate": 0.0002, "epoch": 2.616707616707617, "step": 2130}, {"loss": 0.4981, "grad_norm": 1.0145231485366821, "learning_rate": 0.0002, "epoch": 2.628992628992629, "step": 2140}, {"loss": 0.5151, "grad_norm": 0.8335273265838623, "learning_rate": 0.0002, "epoch": 2.6412776412776413, "step": 2150}, {"loss": 0.5129, "grad_norm": 1.0198529958724976, "learning_rate": 0.0002, "epoch": 2.6535626535626538, "step": 2160}, {"loss": 0.5156, "grad_norm": 0.8353323340415955, "learning_rate": 0.0002, "epoch": 2.6658476658476657, "step": 2170}, {"loss": 0.4818, "grad_norm": 0.8831406831741333, "learning_rate": 0.0002, "epoch": 2.678132678132678, "step": 2180}, {"loss": 0.4858, "grad_norm": 0.7182748913764954, "learning_rate": 0.0002, "epoch": 2.69041769041769, "step": 2190}, {"loss": 0.53, "grad_norm": 0.7892552614212036, "learning_rate": 0.0002, "epoch": 2.7027027027027026, "step": 2200}, {"loss": 0.5101, "grad_norm": 1.0144033432006836, "learning_rate": 0.0002, "epoch": 2.714987714987715, "step": 2210}, {"loss": 0.4909, "grad_norm": 1.0913645029067993, "learning_rate": 0.0002, "epoch": 2.7272727272727275, "step": 2220}, {"loss": 0.5069, "grad_norm": 1.014394998550415, "learning_rate": 0.0002, "epoch": 2.7395577395577395, "step": 2230}, {"loss": 0.4985, "grad_norm": 0.8118020296096802, "learning_rate": 0.0002, "epoch": 2.751842751842752, "step": 2240}, {"loss": 0.5088, "grad_norm": 0.9027737379074097, "learning_rate": 0.0002, "epoch": 2.764127764127764, "step": 2250}, {"loss": 0.5027, "grad_norm": 0.8017747402191162, "learning_rate": 0.0002, "epoch": 2.7764127764127764, "step": 2260}, {"loss": 0.4957, "grad_norm": 0.788362979888916, "learning_rate": 0.0002, "epoch": 2.788697788697789, "step": 2270}, {"loss": 0.5047, "grad_norm": 0.8338918089866638, "learning_rate": 0.0002, "epoch": 2.800982800982801, "step": 2280}, {"loss": 0.4925, "grad_norm": 0.8773167729377747, "learning_rate": 0.0002, "epoch": 2.8132678132678133, "step": 2290}, {"loss": 0.4806, "grad_norm": 0.9319674372673035, "learning_rate": 0.0002, "epoch": 2.8255528255528253, "step": 2300}, {"loss": 0.4815, "grad_norm": 0.8632726073265076, "learning_rate": 0.0002, "epoch": 2.8378378378378377, "step": 2310}, {"loss": 0.4842, "grad_norm": 0.785464882850647, "learning_rate": 0.0002, "epoch": 2.85012285012285, "step": 2320}, {"loss": 0.4867, "grad_norm": 0.8159732818603516, "learning_rate": 0.0002, "epoch": 2.8624078624078626, "step": 2330}, {"loss": 0.4796, "grad_norm": 0.8702368140220642, "learning_rate": 0.0002, "epoch": 2.8746928746928746, "step": 2340}, {"loss": 0.474, "grad_norm": 1.0456738471984863, "learning_rate": 0.0002, "epoch": 2.886977886977887, "step": 2350}, {"loss": 0.4934, "grad_norm": 1.0855203866958618, "learning_rate": 0.0002, "epoch": 2.899262899262899, "step": 2360}, {"loss": 0.4758, "grad_norm": 0.9378156065940857, "learning_rate": 0.0002, "epoch": 2.9115479115479115, "step": 2370}, {"loss": 0.4831, "grad_norm": 0.7390182018280029, "learning_rate": 0.0002, "epoch": 2.923832923832924, "step": 2380}, {"loss": 0.5066, "grad_norm": 0.7667133212089539, "learning_rate": 0.0002, "epoch": 2.9361179361179364, "step": 2390}, {"loss": 0.4722, "grad_norm": 0.8633476495742798, "learning_rate": 0.0002, "epoch": 2.9484029484029484, "step": 2400}, {"loss": 0.4993, "grad_norm": 1.0821104049682617, "learning_rate": 0.0002, "epoch": 2.960687960687961, "step": 2410}, {"loss": 0.4882, "grad_norm": 0.8911418914794922, "learning_rate": 0.0002, "epoch": 2.972972972972973, "step": 2420}, {"loss": 0.4819, "grad_norm": 0.8791135549545288, "learning_rate": 0.0002, "epoch": 2.9852579852579852, "step": 2430}, {"loss": 0.4875, "grad_norm": 0.8066530823707581, "learning_rate": 0.0002, "epoch": 2.9975429975429977, "step": 2440}, {"eval_loss": 0.49752503633499146, "eval_runtime": 20.2911, "eval_samples_per_second": 16.313, "eval_steps_per_second": 2.07, "epoch": 3.0, "step": 2442}, {"loss": 0.4362, "grad_norm": 0.7644656896591187, "learning_rate": 0.0002, "epoch": 3.0098280098280097, "step": 2450}, {"loss": 0.4363, "grad_norm": 0.9077525734901428, "learning_rate": 0.0002, "epoch": 3.022113022113022, "step": 2460}, {"loss": 0.422, "grad_norm": 0.7859287261962891, "learning_rate": 0.0002, "epoch": 3.0343980343980346, "step": 2470}, {"loss": 0.4574, "grad_norm": 1.1200323104858398, "learning_rate": 0.0002, "epoch": 3.0466830466830466, "step": 2480}, {"loss": 0.4519, "grad_norm": 0.7570453882217407, "learning_rate": 0.0002, "epoch": 3.058968058968059, "step": 2490}, {"loss": 0.4351, "grad_norm": 0.9450915455818176, "learning_rate": 0.0002, "epoch": 3.0712530712530715, "step": 2500}, {"loss": 0.4343, "grad_norm": 0.8303545117378235, "learning_rate": 0.0002, "epoch": 3.0835380835380835, "step": 2510}, {"loss": 0.4308, "grad_norm": 0.8864443898200989, "learning_rate": 0.0002, "epoch": 3.095823095823096, "step": 2520}, {"loss": 0.4601, "grad_norm": 0.945324718952179, "learning_rate": 0.0002, "epoch": 3.108108108108108, "step": 2530}, {"loss": 0.4345, "grad_norm": 1.0562494993209839, "learning_rate": 0.0002, "epoch": 3.1203931203931203, "step": 2540}, {"loss": 0.4375, "grad_norm": 0.8607500195503235, "learning_rate": 0.0002, "epoch": 3.1326781326781328, "step": 2550}, {"loss": 0.456, "grad_norm": 0.8719640374183655, "learning_rate": 0.0002, "epoch": 3.1449631449631448, "step": 2560}, {"loss": 0.4469, "grad_norm": 0.8647059202194214, "learning_rate": 0.0002, "epoch": 3.157248157248157, "step": 2570}, {"loss": 0.4483, "grad_norm": 0.8346507549285889, "learning_rate": 0.0002, "epoch": 3.1695331695331697, "step": 2580}, {"loss": 0.4331, "grad_norm": 1.0208854675292969, "learning_rate": 0.0002, "epoch": 3.1818181818181817, "step": 2590}, {"loss": 0.435, "grad_norm": 0.7064385414123535, "learning_rate": 0.0002, "epoch": 3.194103194103194, "step": 2600}, {"loss": 0.4541, "grad_norm": 0.927347719669342, "learning_rate": 0.0002, "epoch": 3.2063882063882065, "step": 2610}, {"loss": 0.4561, "grad_norm": 0.943517804145813, "learning_rate": 0.0002, "epoch": 3.2186732186732185, "step": 2620}, {"loss": 0.4225, "grad_norm": 0.7837198376655579, "learning_rate": 0.0002, "epoch": 3.230958230958231, "step": 2630}, {"loss": 0.4494, "grad_norm": 0.7752765417098999, "learning_rate": 0.0002, "epoch": 3.2432432432432434, "step": 2640}, {"loss": 0.4468, "grad_norm": 0.8578953146934509, "learning_rate": 0.0002, "epoch": 3.2555282555282554, "step": 2650}, {"loss": 0.4393, "grad_norm": 1.0209529399871826, "learning_rate": 0.0002, "epoch": 3.267813267813268, "step": 2660}, {"loss": 0.4517, "grad_norm": 0.9069030284881592, "learning_rate": 0.0002, "epoch": 3.2800982800982803, "step": 2670}, {"loss": 0.4262, "grad_norm": 0.8454729318618774, "learning_rate": 0.0002, "epoch": 3.2923832923832923, "step": 2680}, {"loss": 0.4349, "grad_norm": 0.8253099322319031, "learning_rate": 0.0002, "epoch": 3.3046683046683047, "step": 2690}, {"loss": 0.4503, "grad_norm": 0.8765934109687805, "learning_rate": 0.0002, "epoch": 3.3169533169533167, "step": 2700}, {"loss": 0.4518, "grad_norm": 0.8149126172065735, "learning_rate": 0.0002, "epoch": 3.329238329238329, "step": 2710}, {"loss": 0.4437, "grad_norm": 0.8820102214813232, "learning_rate": 0.0002, "epoch": 3.3415233415233416, "step": 2720}, {"loss": 0.4346, "grad_norm": 0.8813952803611755, "learning_rate": 0.0002, "epoch": 3.3538083538083536, "step": 2730}, {"loss": 0.4396, "grad_norm": 1.0338447093963623, "learning_rate": 0.0002, "epoch": 3.366093366093366, "step": 2740}, {"loss": 0.4468, "grad_norm": 0.8780209422111511, "learning_rate": 0.0002, "epoch": 3.3783783783783785, "step": 2750}, {"loss": 0.441, "grad_norm": 0.9017151594161987, "learning_rate": 0.0002, "epoch": 3.3906633906633905, "step": 2760}, {"loss": 0.446, "grad_norm": 0.8647638559341431, "learning_rate": 0.0002, "epoch": 3.402948402948403, "step": 2770}, {"loss": 0.4131, "grad_norm": 0.8298183679580688, "learning_rate": 0.0002, "epoch": 3.4152334152334154, "step": 2780}, {"loss": 0.4406, "grad_norm": 0.9298108816146851, "learning_rate": 0.0002, "epoch": 3.4275184275184274, "step": 2790}, {"loss": 0.4145, "grad_norm": 0.8909980058670044, "learning_rate": 0.0002, "epoch": 3.43980343980344, "step": 2800}, {"loss": 0.4148, "grad_norm": 0.8027496933937073, "learning_rate": 0.0002, "epoch": 3.4520884520884523, "step": 2810}, {"loss": 0.4244, "grad_norm": 0.8766195774078369, "learning_rate": 0.0002, "epoch": 3.4643734643734643, "step": 2820}, {"loss": 0.4292, "grad_norm": 0.8194443583488464, "learning_rate": 0.0002, "epoch": 3.4766584766584767, "step": 2830}, {"loss": 0.4305, "grad_norm": 0.9862873554229736, "learning_rate": 0.0002, "epoch": 3.488943488943489, "step": 2840}, {"loss": 0.4393, "grad_norm": 0.8755377531051636, "learning_rate": 0.0002, "epoch": 3.501228501228501, "step": 2850}, {"loss": 0.4231, "grad_norm": 0.7300266027450562, "learning_rate": 0.0002, "epoch": 3.5135135135135136, "step": 2860}, {"loss": 0.4278, "grad_norm": 0.8342461585998535, "learning_rate": 0.0002, "epoch": 3.5257985257985256, "step": 2870}, {"loss": 0.4395, "grad_norm": 0.8624151349067688, "learning_rate": 0.0002, "epoch": 3.538083538083538, "step": 2880}, {"loss": 0.4064, "grad_norm": 0.8931261301040649, "learning_rate": 0.0002, "epoch": 3.5503685503685505, "step": 2890}, {"loss": 0.4358, "grad_norm": 0.8617086410522461, "learning_rate": 0.0002, "epoch": 3.562653562653563, "step": 2900}, {"loss": 0.419, "grad_norm": 0.8754099607467651, "learning_rate": 0.0002, "epoch": 3.574938574938575, "step": 2910}, {"loss": 0.4275, "grad_norm": 0.8345834612846375, "learning_rate": 0.0002, "epoch": 3.5872235872235874, "step": 2920}, {"loss": 0.4375, "grad_norm": 1.1414062976837158, "learning_rate": 0.0002, "epoch": 3.5995085995085994, "step": 2930}, {"loss": 0.4297, "grad_norm": 0.994860053062439, "learning_rate": 0.0002, "epoch": 3.611793611793612, "step": 2940}, {"loss": 0.4386, "grad_norm": 1.19268000125885, "learning_rate": 0.0002, "epoch": 3.6240786240786242, "step": 2950}, {"loss": 0.4029, "grad_norm": 0.8399543762207031, "learning_rate": 0.0002, "epoch": 3.6363636363636362, "step": 2960}, {"loss": 0.4432, "grad_norm": 0.9873217940330505, "learning_rate": 0.0002, "epoch": 3.6486486486486487, "step": 2970}, {"loss": 0.4308, "grad_norm": 0.9116013646125793, "learning_rate": 0.0002, "epoch": 3.6609336609336607, "step": 2980}, {"loss": 0.4275, "grad_norm": 0.9503833651542664, "learning_rate": 0.0002, "epoch": 3.673218673218673, "step": 2990}, {"loss": 0.4306, "grad_norm": 0.9401112794876099, "learning_rate": 0.0002, "epoch": 3.6855036855036856, "step": 3000}, {"loss": 0.4333, "grad_norm": 1.00745689868927, "learning_rate": 0.0002, "epoch": 3.697788697788698, "step": 3010}, {"loss": 0.432, "grad_norm": 1.0553191900253296, "learning_rate": 0.0002, "epoch": 3.71007371007371, "step": 3020}, {"loss": 0.4321, "grad_norm": 1.0226953029632568, "learning_rate": 0.0002, "epoch": 3.7223587223587224, "step": 3030}, {"loss": 0.418, "grad_norm": 1.085554838180542, "learning_rate": 0.0002, "epoch": 3.7346437346437344, "step": 3040}, {"loss": 0.4196, "grad_norm": 0.9948731064796448, "learning_rate": 0.0002, "epoch": 3.746928746928747, "step": 3050}, {"loss": 0.4281, "grad_norm": 0.9328727126121521, "learning_rate": 0.0002, "epoch": 3.7592137592137593, "step": 3060}, {"loss": 0.4284, "grad_norm": 1.0533266067504883, "learning_rate": 0.0002, "epoch": 3.7714987714987718, "step": 3070}, {"loss": 0.4414, "grad_norm": 0.8213809132575989, "learning_rate": 0.0002, "epoch": 3.7837837837837838, "step": 3080}, {"loss": 0.4348, "grad_norm": 0.8941594362258911, "learning_rate": 0.0002, "epoch": 3.796068796068796, "step": 3090}, {"loss": 0.4266, "grad_norm": 0.8324518203735352, "learning_rate": 0.0002, "epoch": 3.808353808353808, "step": 3100}, {"loss": 0.4227, "grad_norm": 0.8811233639717102, "learning_rate": 0.0002, "epoch": 3.8206388206388207, "step": 3110}, {"loss": 0.4195, "grad_norm": 0.8781470060348511, "learning_rate": 0.0002, "epoch": 3.832923832923833, "step": 3120}, {"loss": 0.4277, "grad_norm": 0.8994116187095642, "learning_rate": 0.0002, "epoch": 3.845208845208845, "step": 3130}, {"loss": 0.4149, "grad_norm": 0.8605017066001892, "learning_rate": 0.0002, "epoch": 3.8574938574938575, "step": 3140}, {"loss": 0.4023, "grad_norm": 0.8966400027275085, "learning_rate": 0.0002, "epoch": 3.8697788697788695, "step": 3150}, {"loss": 0.4245, "grad_norm": 0.8856554627418518, "learning_rate": 0.0002, "epoch": 3.882063882063882, "step": 3160}, {"loss": 0.4101, "grad_norm": 0.8971620798110962, "learning_rate": 0.0002, "epoch": 3.8943488943488944, "step": 3170}, {"loss": 0.3993, "grad_norm": 0.9807813167572021, "learning_rate": 0.0002, "epoch": 3.906633906633907, "step": 3180}, {"loss": 0.4258, "grad_norm": 0.8614121675491333, "learning_rate": 0.0002, "epoch": 3.918918918918919, "step": 3190}, {"loss": 0.4115, "grad_norm": 0.989171028137207, "learning_rate": 0.0002, "epoch": 3.9312039312039313, "step": 3200}, {"loss": 0.4182, "grad_norm": 0.8168872594833374, "learning_rate": 0.0002, "epoch": 3.9434889434889433, "step": 3210}, {"loss": 0.4112, "grad_norm": 0.8109386563301086, "learning_rate": 0.0002, "epoch": 3.9557739557739557, "step": 3220}, {"loss": 0.4165, "grad_norm": 1.0175853967666626, "learning_rate": 0.0002, "epoch": 3.968058968058968, "step": 3230}, {"loss": 0.4146, "grad_norm": 0.936143159866333, "learning_rate": 0.0002, "epoch": 3.98034398034398, "step": 3240}, {"loss": 0.4163, "grad_norm": 0.9557915925979614, "learning_rate": 0.0002, "epoch": 3.9926289926289926, "step": 3250}, {"eval_loss": 0.4401616156101227, "eval_runtime": 20.8047, "eval_samples_per_second": 15.91, "eval_steps_per_second": 2.019, "epoch": 4.0, "step": 3256}, {"loss": 0.408, "grad_norm": 0.7590614557266235, "learning_rate": 0.0002, "epoch": 4.004914004914005, "step": 3260}, {"loss": 0.4001, "grad_norm": 0.8920791149139404, "learning_rate": 0.0002, "epoch": 4.017199017199017, "step": 3270}, {"loss": 0.3789, "grad_norm": 0.8640421628952026, "learning_rate": 0.0002, "epoch": 4.0294840294840295, "step": 3280}, {"loss": 0.3791, "grad_norm": 0.9074113965034485, "learning_rate": 0.0002, "epoch": 4.041769041769042, "step": 3290}, {"loss": 0.3728, "grad_norm": 1.0600885152816772, "learning_rate": 0.0002, "epoch": 4.054054054054054, "step": 3300}, {"loss": 0.3857, "grad_norm": 0.9682773351669312, "learning_rate": 0.0002, "epoch": 4.066339066339066, "step": 3310}, {"loss": 0.4007, "grad_norm": 0.9326395392417908, "learning_rate": 0.0002, "epoch": 4.078624078624078, "step": 3320}, {"loss": 0.3823, "grad_norm": 0.8886597156524658, "learning_rate": 0.0002, "epoch": 4.090909090909091, "step": 3330}, {"loss": 0.3929, "grad_norm": 1.032205581665039, "learning_rate": 0.0002, "epoch": 4.103194103194103, "step": 3340}, {"loss": 0.3836, "grad_norm": 0.8669408559799194, "learning_rate": 0.0002, "epoch": 4.115479115479116, "step": 3350}, {"loss": 0.3866, "grad_norm": 0.8250347971916199, "learning_rate": 0.0002, "epoch": 4.127764127764128, "step": 3360}, {"loss": 0.3826, "grad_norm": 0.7919842600822449, "learning_rate": 0.0002, "epoch": 4.14004914004914, "step": 3370}, {"loss": 0.3838, "grad_norm": 1.045682430267334, "learning_rate": 0.0002, "epoch": 4.152334152334152, "step": 3380}, {"loss": 0.3796, "grad_norm": 0.6873571276664734, "learning_rate": 0.0002, "epoch": 4.164619164619165, "step": 3390}, {"loss": 0.3942, "grad_norm": 1.0227675437927246, "learning_rate": 0.0002, "epoch": 4.176904176904177, "step": 3400}, {"loss": 0.3788, "grad_norm": 0.9167711734771729, "learning_rate": 0.0002, "epoch": 4.1891891891891895, "step": 3410}, {"loss": 0.3792, "grad_norm": 1.0598796606063843, "learning_rate": 0.0002, "epoch": 4.201474201474202, "step": 3420}, {"loss": 0.3955, "grad_norm": 0.8581843972206116, "learning_rate": 0.0002, "epoch": 4.2137592137592135, "step": 3430}, {"loss": 0.3761, "grad_norm": 0.8862360119819641, "learning_rate": 0.0002, "epoch": 4.226044226044226, "step": 3440}, {"loss": 0.3889, "grad_norm": 1.0248323678970337, "learning_rate": 0.0002, "epoch": 4.238329238329238, "step": 3450}, {"loss": 0.3827, "grad_norm": 0.8746261596679688, "learning_rate": 0.0002, "epoch": 4.250614250614251, "step": 3460}, {"loss": 0.3949, "grad_norm": 0.7442536354064941, "learning_rate": 0.0002, "epoch": 4.262899262899263, "step": 3470}, {"loss": 0.3761, "grad_norm": 0.8295119404792786, "learning_rate": 0.0002, "epoch": 4.275184275184275, "step": 3480}, {"loss": 0.3895, "grad_norm": 1.0634245872497559, "learning_rate": 0.0002, "epoch": 4.287469287469287, "step": 3490}, {"loss": 0.3955, "grad_norm": 0.9554621577262878, "learning_rate": 0.0002, "epoch": 4.2997542997543, "step": 3500}, {"loss": 0.3826, "grad_norm": 1.0191723108291626, "learning_rate": 0.0002, "epoch": 4.312039312039312, "step": 3510}, {"loss": 0.3828, "grad_norm": 0.8573611378669739, "learning_rate": 0.0002, "epoch": 4.324324324324325, "step": 3520}, {"loss": 0.3869, "grad_norm": 0.9082390069961548, "learning_rate": 0.0002, "epoch": 4.336609336609337, "step": 3530}, {"loss": 0.3902, "grad_norm": 0.8650212287902832, "learning_rate": 0.0002, "epoch": 4.348894348894349, "step": 3540}, {"loss": 0.3915, "grad_norm": 0.7186297178268433, "learning_rate": 0.0002, "epoch": 4.361179361179361, "step": 3550}, {"loss": 0.3861, "grad_norm": 0.9750986695289612, "learning_rate": 0.0002, "epoch": 4.3734643734643734, "step": 3560}, {"loss": 0.3967, "grad_norm": 1.0710467100143433, "learning_rate": 0.0002, "epoch": 4.385749385749386, "step": 3570}, {"loss": 0.3774, "grad_norm": 0.7974869012832642, "learning_rate": 0.0002, "epoch": 4.398034398034398, "step": 3580}, {"loss": 0.3738, "grad_norm": 0.9405913949012756, "learning_rate": 0.0002, "epoch": 4.41031941031941, "step": 3590}, {"loss": 0.3982, "grad_norm": 0.9393602609634399, "learning_rate": 0.0002, "epoch": 4.422604422604422, "step": 3600}, {"loss": 0.3913, "grad_norm": 1.0798007249832153, "learning_rate": 0.0002, "epoch": 4.434889434889435, "step": 3610}, {"loss": 0.3682, "grad_norm": 0.9226186275482178, "learning_rate": 0.0002, "epoch": 4.447174447174447, "step": 3620}, {"loss": 0.3742, "grad_norm": 1.1046524047851562, "learning_rate": 0.0002, "epoch": 4.45945945945946, "step": 3630}, {"loss": 0.3886, "grad_norm": 0.8848567605018616, "learning_rate": 0.0002, "epoch": 4.471744471744472, "step": 3640}, {"loss": 0.3848, "grad_norm": 0.8913224339485168, "learning_rate": 0.0002, "epoch": 4.484029484029484, "step": 3650}, {"loss": 0.3731, "grad_norm": 0.8497583270072937, "learning_rate": 0.0002, "epoch": 4.496314496314496, "step": 3660}, {"loss": 0.3804, "grad_norm": 0.8263831734657288, "learning_rate": 0.0002, "epoch": 4.5085995085995085, "step": 3670}, {"loss": 0.3815, "grad_norm": 0.8470269441604614, "learning_rate": 0.0002, "epoch": 4.520884520884521, "step": 3680}, {"loss": 0.3774, "grad_norm": 0.860038161277771, "learning_rate": 0.0002, "epoch": 4.533169533169533, "step": 3690}, {"loss": 0.3817, "grad_norm": 0.8898552656173706, "learning_rate": 0.0002, "epoch": 4.545454545454545, "step": 3700}, {"loss": 0.3776, "grad_norm": 0.8152070641517639, "learning_rate": 0.0002, "epoch": 4.557739557739557, "step": 3710}, {"loss": 0.383, "grad_norm": 0.7847675085067749, "learning_rate": 0.0002, "epoch": 4.57002457002457, "step": 3720}, {"loss": 0.3791, "grad_norm": 0.9625533819198608, "learning_rate": 0.0002, "epoch": 4.582309582309582, "step": 3730}, {"loss": 0.3699, "grad_norm": 0.9097456336021423, "learning_rate": 0.0002, "epoch": 4.594594594594595, "step": 3740}, {"loss": 0.3673, "grad_norm": 0.871329128742218, "learning_rate": 0.0002, "epoch": 4.606879606879607, "step": 3750}, {"loss": 0.3725, "grad_norm": 0.9879975914955139, "learning_rate": 0.0002, "epoch": 4.61916461916462, "step": 3760}, {"loss": 0.3827, "grad_norm": 0.8636731505393982, "learning_rate": 0.0002, "epoch": 4.631449631449631, "step": 3770}, {"loss": 0.3755, "grad_norm": 1.0488964319229126, "learning_rate": 0.0002, "epoch": 4.643734643734644, "step": 3780}, {"loss": 0.3738, "grad_norm": 0.7637056112289429, "learning_rate": 0.0002, "epoch": 4.656019656019656, "step": 3790}, {"loss": 0.3676, "grad_norm": 0.8507546186447144, "learning_rate": 0.0002, "epoch": 4.6683046683046685, "step": 3800}, {"loss": 0.3852, "grad_norm": 1.0216856002807617, "learning_rate": 0.0002, "epoch": 4.680589680589681, "step": 3810}, {"loss": 0.3751, "grad_norm": 1.026343822479248, "learning_rate": 0.0002, "epoch": 4.6928746928746925, "step": 3820}, {"loss": 0.3687, "grad_norm": 0.8311620950698853, "learning_rate": 0.0002, "epoch": 4.705159705159705, "step": 3830}, {"loss": 0.3771, "grad_norm": 0.7770653367042542, "learning_rate": 0.0002, "epoch": 4.717444717444717, "step": 3840}, {"loss": 0.37, "grad_norm": 0.7616215348243713, "learning_rate": 0.0002, "epoch": 4.72972972972973, "step": 3850}, {"loss": 0.3927, "grad_norm": 1.0377072095870972, "learning_rate": 0.0002, "epoch": 4.742014742014742, "step": 3860}, {"loss": 0.3832, "grad_norm": 0.9713505506515503, "learning_rate": 0.0002, "epoch": 4.754299754299755, "step": 3870}, {"loss": 0.3722, "grad_norm": 0.8803321719169617, "learning_rate": 0.0002, "epoch": 4.766584766584766, "step": 3880}, {"loss": 0.3756, "grad_norm": 0.885535478591919, "learning_rate": 0.0002, "epoch": 4.778869778869779, "step": 3890}, {"loss": 0.3714, "grad_norm": 1.0877983570098877, "learning_rate": 0.0002, "epoch": 4.791154791154791, "step": 3900}, {"loss": 0.3879, "grad_norm": 0.7875366806983948, "learning_rate": 0.0002, "epoch": 4.803439803439804, "step": 3910}, {"loss": 0.3591, "grad_norm": 0.8550102114677429, "learning_rate": 0.0002, "epoch": 4.815724815724816, "step": 3920}, {"loss": 0.3716, "grad_norm": 1.0217846632003784, "learning_rate": 0.0002, "epoch": 4.828009828009828, "step": 3930}, {"loss": 0.3649, "grad_norm": 0.7315713167190552, "learning_rate": 0.0002, "epoch": 4.84029484029484, "step": 3940}, {"loss": 0.3879, "grad_norm": 0.8924923539161682, "learning_rate": 0.0002, "epoch": 4.8525798525798525, "step": 3950}, {"loss": 0.3669, "grad_norm": 0.9730218052864075, "learning_rate": 0.0002, "epoch": 4.864864864864865, "step": 3960}, {"loss": 0.3705, "grad_norm": 0.9202003479003906, "learning_rate": 0.0002, "epoch": 4.877149877149877, "step": 3970}, {"loss": 0.3617, "grad_norm": 0.8173081874847412, "learning_rate": 0.0002, "epoch": 4.88943488943489, "step": 3980}, {"loss": 0.37, "grad_norm": 0.7178564667701721, "learning_rate": 0.0002, "epoch": 4.901719901719901, "step": 3990}, {"loss": 0.3768, "grad_norm": 0.913684606552124, "learning_rate": 0.0002, "epoch": 4.914004914004914, "step": 4000}, {"loss": 0.3755, "grad_norm": 0.8817896842956543, "learning_rate": 0.0002, "epoch": 4.926289926289926, "step": 4010}, {"loss": 0.3676, "grad_norm": 0.7652186751365662, "learning_rate": 0.0002, "epoch": 4.938574938574939, "step": 4020}, {"loss": 0.3699, "grad_norm": 0.8828630447387695, "learning_rate": 0.0002, "epoch": 4.950859950859951, "step": 4030}, {"loss": 0.3672, "grad_norm": 1.0878605842590332, "learning_rate": 0.0002, "epoch": 4.963144963144963, "step": 4040}, {"loss": 0.3656, "grad_norm": 1.0845288038253784, "learning_rate": 0.0002, "epoch": 4.975429975429975, "step": 4050}, {"loss": 0.365, "grad_norm": 0.8431115746498108, "learning_rate": 0.0002, "epoch": 4.987714987714988, "step": 4060}, {"loss": 0.3693, "grad_norm": 0.8320387601852417, "learning_rate": 0.0002, "epoch": 5.0, "step": 4070}]} +{"epoch": 6.0, "step": 4884, "epoch_duration": 781.9419949054718, "total_accumulated_duration": 4492.765958547592, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.17431640625}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4070", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.5354, "grad_norm": 0.8178550004959106, "learning_rate": 0.0002, "epoch": 0.012285012285012284, "step": 10}, {"loss": 2.534, "grad_norm": 1.0338047742843628, "learning_rate": 0.0002, "epoch": 0.02457002457002457, "step": 20}, {"loss": 2.1691, "grad_norm": 0.8931729197502136, "learning_rate": 0.0002, "epoch": 0.036855036855036855, "step": 30}, {"loss": 1.8813, "grad_norm": 0.9666458964347839, "learning_rate": 0.0002, "epoch": 0.04914004914004914, "step": 40}, {"loss": 1.6479, "grad_norm": 1.2691702842712402, "learning_rate": 0.0002, "epoch": 0.06142506142506143, "step": 50}, {"loss": 1.3831, "grad_norm": 1.0307111740112305, "learning_rate": 0.0002, "epoch": 0.07371007371007371, "step": 60}, {"loss": 1.2987, "grad_norm": 1.1837389469146729, "learning_rate": 0.0002, "epoch": 0.085995085995086, "step": 70}, {"loss": 1.2325, "grad_norm": 1.1481467485427856, "learning_rate": 0.0002, "epoch": 0.09828009828009827, "step": 80}, {"loss": 1.1425, "grad_norm": 1.0385297536849976, "learning_rate": 0.0002, "epoch": 0.11056511056511056, "step": 90}, {"loss": 1.1177, "grad_norm": 1.125789999961853, "learning_rate": 0.0002, "epoch": 0.12285012285012285, "step": 100}, {"loss": 1.0477, "grad_norm": 0.9630613923072815, "learning_rate": 0.0002, "epoch": 0.13513513513513514, "step": 110}, {"loss": 1.0074, "grad_norm": 1.060392141342163, "learning_rate": 0.0002, "epoch": 0.14742014742014742, "step": 120}, {"loss": 1.0128, "grad_norm": 1.0986546277999878, "learning_rate": 0.0002, "epoch": 0.1597051597051597, "step": 130}, {"loss": 1.0068, "grad_norm": 1.1713459491729736, "learning_rate": 0.0002, "epoch": 0.171990171990172, "step": 140}, {"loss": 0.973, "grad_norm": 1.1548224687576294, "learning_rate": 0.0002, "epoch": 0.18427518427518427, "step": 150}, {"loss": 0.941, "grad_norm": 1.2662502527236938, "learning_rate": 0.0002, "epoch": 0.19656019656019655, "step": 160}, {"loss": 0.8849, "grad_norm": 1.1521110534667969, "learning_rate": 0.0002, "epoch": 0.20884520884520885, "step": 170}, {"loss": 0.8931, "grad_norm": 1.1044857501983643, "learning_rate": 0.0002, "epoch": 0.22113022113022113, "step": 180}, {"loss": 0.9572, "grad_norm": 0.9770650267601013, "learning_rate": 0.0002, "epoch": 0.2334152334152334, "step": 190}, {"loss": 0.881, "grad_norm": 0.9710931777954102, "learning_rate": 0.0002, "epoch": 0.2457002457002457, "step": 200}, {"loss": 0.9205, "grad_norm": 0.9593933820724487, "learning_rate": 0.0002, "epoch": 0.257985257985258, "step": 210}, {"loss": 0.843, "grad_norm": 1.003553032875061, "learning_rate": 0.0002, "epoch": 0.2702702702702703, "step": 220}, {"loss": 0.9032, "grad_norm": 0.9187764525413513, "learning_rate": 0.0002, "epoch": 0.28255528255528256, "step": 230}, {"loss": 0.8572, "grad_norm": 0.9294946789741516, "learning_rate": 0.0002, "epoch": 0.29484029484029484, "step": 240}, {"loss": 0.8856, "grad_norm": 0.9537560939788818, "learning_rate": 0.0002, "epoch": 0.3071253071253071, "step": 250}, {"loss": 0.8546, "grad_norm": 1.00537109375, "learning_rate": 0.0002, "epoch": 0.3194103194103194, "step": 260}, {"loss": 0.896, "grad_norm": 0.8775776028633118, "learning_rate": 0.0002, "epoch": 0.3316953316953317, "step": 270}, {"loss": 0.808, "grad_norm": 0.8316839933395386, "learning_rate": 0.0002, "epoch": 0.343980343980344, "step": 280}, {"loss": 0.8248, "grad_norm": 0.8542073965072632, "learning_rate": 0.0002, "epoch": 0.35626535626535627, "step": 290}, {"loss": 0.8452, "grad_norm": 0.848444402217865, "learning_rate": 0.0002, "epoch": 0.36855036855036855, "step": 300}, {"loss": 0.8253, "grad_norm": 0.9017520546913147, "learning_rate": 0.0002, "epoch": 0.3808353808353808, "step": 310}, {"loss": 0.8098, "grad_norm": 0.7672467231750488, "learning_rate": 0.0002, "epoch": 0.3931203931203931, "step": 320}, {"loss": 0.8478, "grad_norm": 0.9109916687011719, "learning_rate": 0.0002, "epoch": 0.40540540540540543, "step": 330}, {"loss": 0.8041, "grad_norm": 0.8750321269035339, "learning_rate": 0.0002, "epoch": 0.4176904176904177, "step": 340}, {"loss": 0.8158, "grad_norm": 0.7911098599433899, "learning_rate": 0.0002, "epoch": 0.42997542997543, "step": 350}, {"loss": 0.8001, "grad_norm": 0.871601402759552, "learning_rate": 0.0002, "epoch": 0.44226044226044225, "step": 360}, {"loss": 0.8187, "grad_norm": 0.9393917918205261, "learning_rate": 0.0002, "epoch": 0.45454545454545453, "step": 370}, {"loss": 0.8124, "grad_norm": 0.8260403275489807, "learning_rate": 0.0002, "epoch": 0.4668304668304668, "step": 380}, {"loss": 0.7768, "grad_norm": 0.9792159199714661, "learning_rate": 0.0002, "epoch": 0.47911547911547914, "step": 390}, {"loss": 0.7981, "grad_norm": 0.9943315982818604, "learning_rate": 0.0002, "epoch": 0.4914004914004914, "step": 400}, {"loss": 0.7765, "grad_norm": 0.8999950885772705, "learning_rate": 0.0002, "epoch": 0.5036855036855037, "step": 410}, {"loss": 0.7807, "grad_norm": 0.8348393440246582, "learning_rate": 0.0002, "epoch": 0.515970515970516, "step": 420}, {"loss": 0.8269, "grad_norm": 0.7371744513511658, "learning_rate": 0.0002, "epoch": 0.5282555282555282, "step": 430}, {"loss": 0.8181, "grad_norm": 0.8354107141494751, "learning_rate": 0.0002, "epoch": 0.5405405405405406, "step": 440}, {"loss": 0.7849, "grad_norm": 0.8553793430328369, "learning_rate": 0.0002, "epoch": 0.5528255528255528, "step": 450}, {"loss": 0.8098, "grad_norm": 1.0762015581130981, "learning_rate": 0.0002, "epoch": 0.5651105651105651, "step": 460}, {"loss": 0.7942, "grad_norm": 0.8350747227668762, "learning_rate": 0.0002, "epoch": 0.5773955773955773, "step": 470}, {"loss": 0.7922, "grad_norm": 0.7819945216178894, "learning_rate": 0.0002, "epoch": 0.5896805896805897, "step": 480}, {"loss": 0.7845, "grad_norm": 0.8079741597175598, "learning_rate": 0.0002, "epoch": 0.601965601965602, "step": 490}, {"loss": 0.7417, "grad_norm": 0.776435911655426, "learning_rate": 0.0002, "epoch": 0.6142506142506142, "step": 500}, {"loss": 0.7855, "grad_norm": 0.7646855115890503, "learning_rate": 0.0002, "epoch": 0.6265356265356266, "step": 510}, {"loss": 0.7923, "grad_norm": 0.786396861076355, "learning_rate": 0.0002, "epoch": 0.6388206388206388, "step": 520}, {"loss": 0.7624, "grad_norm": 0.7016594409942627, "learning_rate": 0.0002, "epoch": 0.6511056511056511, "step": 530}, {"loss": 0.786, "grad_norm": 0.8060444593429565, "learning_rate": 0.0002, "epoch": 0.6633906633906634, "step": 540}, {"loss": 0.7417, "grad_norm": 0.9087467789649963, "learning_rate": 0.0002, "epoch": 0.6756756756756757, "step": 550}, {"loss": 0.7591, "grad_norm": 0.8149628639221191, "learning_rate": 0.0002, "epoch": 0.687960687960688, "step": 560}, {"loss": 0.8004, "grad_norm": 0.7493641972541809, "learning_rate": 0.0002, "epoch": 0.7002457002457002, "step": 570}, {"loss": 0.765, "grad_norm": 0.7958765625953674, "learning_rate": 0.0002, "epoch": 0.7125307125307125, "step": 580}, {"loss": 0.7276, "grad_norm": 0.7917273640632629, "learning_rate": 0.0002, "epoch": 0.7248157248157249, "step": 590}, {"loss": 0.758, "grad_norm": 0.8040468692779541, "learning_rate": 0.0002, "epoch": 0.7371007371007371, "step": 600}, {"loss": 0.735, "grad_norm": 0.8696851134300232, "learning_rate": 0.0002, "epoch": 0.7493857493857494, "step": 610}, {"loss": 0.7321, "grad_norm": 0.8418059945106506, "learning_rate": 0.0002, "epoch": 0.7616707616707616, "step": 620}, {"loss": 0.7395, "grad_norm": 0.7754243612289429, "learning_rate": 0.0002, "epoch": 0.773955773955774, "step": 630}, {"loss": 0.7679, "grad_norm": 0.7639613747596741, "learning_rate": 0.0002, "epoch": 0.7862407862407862, "step": 640}, {"loss": 0.7159, "grad_norm": 0.7516646385192871, "learning_rate": 0.0002, "epoch": 0.7985257985257985, "step": 650}, {"loss": 0.7349, "grad_norm": 0.7840844988822937, "learning_rate": 0.0002, "epoch": 0.8108108108108109, "step": 660}, {"loss": 0.7264, "grad_norm": 0.7657070755958557, "learning_rate": 0.0002, "epoch": 0.8230958230958231, "step": 670}, {"loss": 0.7369, "grad_norm": 0.7711591720581055, "learning_rate": 0.0002, "epoch": 0.8353808353808354, "step": 680}, {"loss": 0.759, "grad_norm": 0.8026325106620789, "learning_rate": 0.0002, "epoch": 0.8476658476658476, "step": 690}, {"loss": 0.737, "grad_norm": 0.7902713418006897, "learning_rate": 0.0002, "epoch": 0.85995085995086, "step": 700}, {"loss": 0.7349, "grad_norm": 0.8212456107139587, "learning_rate": 0.0002, "epoch": 0.8722358722358723, "step": 710}, {"loss": 0.7661, "grad_norm": 0.7867200970649719, "learning_rate": 0.0002, "epoch": 0.8845208845208845, "step": 720}, {"loss": 0.7195, "grad_norm": 0.80084627866745, "learning_rate": 0.0002, "epoch": 0.8968058968058968, "step": 730}, {"loss": 0.7641, "grad_norm": 0.7203794121742249, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 740}, {"loss": 0.7134, "grad_norm": 0.7598419785499573, "learning_rate": 0.0002, "epoch": 0.9213759213759214, "step": 750}, {"loss": 0.7208, "grad_norm": 0.7787027359008789, "learning_rate": 0.0002, "epoch": 0.9336609336609336, "step": 760}, {"loss": 0.7119, "grad_norm": 0.8444012403488159, "learning_rate": 0.0002, "epoch": 0.9459459459459459, "step": 770}, {"loss": 0.7099, "grad_norm": 0.7388550639152527, "learning_rate": 0.0002, "epoch": 0.9582309582309583, "step": 780}, {"loss": 0.7184, "grad_norm": 0.7379167079925537, "learning_rate": 0.0002, "epoch": 0.9705159705159705, "step": 790}, {"loss": 0.7143, "grad_norm": 0.8291640281677246, "learning_rate": 0.0002, "epoch": 0.9828009828009828, "step": 800}, {"loss": 0.6972, "grad_norm": 0.7415094375610352, "learning_rate": 0.0002, "epoch": 0.995085995085995, "step": 810}, {"eval_loss": 0.703994870185852, "eval_runtime": 20.2182, "eval_samples_per_second": 16.371, "eval_steps_per_second": 2.077, "epoch": 1.0, "step": 814}, {"loss": 0.6959, "grad_norm": 0.7405961751937866, "learning_rate": 0.0002, "epoch": 1.0073710073710074, "step": 820}, {"loss": 0.6706, "grad_norm": 0.8534344434738159, "learning_rate": 0.0002, "epoch": 1.0196560196560196, "step": 830}, {"loss": 0.6719, "grad_norm": 0.7415764331817627, "learning_rate": 0.0002, "epoch": 1.031941031941032, "step": 840}, {"loss": 0.6673, "grad_norm": 0.74293053150177, "learning_rate": 0.0002, "epoch": 1.0442260442260443, "step": 850}, {"loss": 0.6897, "grad_norm": 0.697727382183075, "learning_rate": 0.0002, "epoch": 1.0565110565110565, "step": 860}, {"loss": 0.6566, "grad_norm": 0.8022570013999939, "learning_rate": 0.0002, "epoch": 1.0687960687960687, "step": 870}, {"loss": 0.6759, "grad_norm": 0.7545800805091858, "learning_rate": 0.0002, "epoch": 1.0810810810810811, "step": 880}, {"loss": 0.6397, "grad_norm": 0.8005648255348206, "learning_rate": 0.0002, "epoch": 1.0933660933660934, "step": 890}, {"loss": 0.6499, "grad_norm": 0.7681778073310852, "learning_rate": 0.0002, "epoch": 1.1056511056511056, "step": 900}, {"loss": 0.6672, "grad_norm": 0.7822468876838684, "learning_rate": 0.0002, "epoch": 1.117936117936118, "step": 910}, {"loss": 0.6492, "grad_norm": 0.8324839472770691, "learning_rate": 0.0002, "epoch": 1.1302211302211302, "step": 920}, {"loss": 0.6659, "grad_norm": 0.8206289410591125, "learning_rate": 0.0002, "epoch": 1.1425061425061425, "step": 930}, {"loss": 0.6385, "grad_norm": 0.786461591720581, "learning_rate": 0.0002, "epoch": 1.154791154791155, "step": 940}, {"loss": 0.6493, "grad_norm": 0.8288539052009583, "learning_rate": 0.0002, "epoch": 1.1670761670761671, "step": 950}, {"loss": 0.6818, "grad_norm": 0.7566865682601929, "learning_rate": 0.0002, "epoch": 1.1793611793611793, "step": 960}, {"loss": 0.6597, "grad_norm": 0.7761894464492798, "learning_rate": 0.0002, "epoch": 1.1916461916461916, "step": 970}, {"loss": 0.6403, "grad_norm": 0.7608440518379211, "learning_rate": 0.0002, "epoch": 1.203931203931204, "step": 980}, {"loss": 0.7041, "grad_norm": 0.799745500087738, "learning_rate": 0.0002, "epoch": 1.2162162162162162, "step": 990}, {"loss": 0.6358, "grad_norm": 0.8135330677032471, "learning_rate": 0.0002, "epoch": 1.2285012285012284, "step": 1000}, {"loss": 0.6496, "grad_norm": 0.7410391569137573, "learning_rate": 0.0002, "epoch": 1.2407862407862407, "step": 1010}, {"loss": 0.63, "grad_norm": 0.7826172709465027, "learning_rate": 0.0002, "epoch": 1.253071253071253, "step": 1020}, {"loss": 0.6582, "grad_norm": 0.7210677862167358, "learning_rate": 0.0002, "epoch": 1.2653562653562653, "step": 1030}, {"loss": 0.6609, "grad_norm": 0.7571766972541809, "learning_rate": 0.0002, "epoch": 1.2776412776412776, "step": 1040}, {"loss": 0.6315, "grad_norm": 0.8602666258811951, "learning_rate": 0.0002, "epoch": 1.28992628992629, "step": 1050}, {"loss": 0.6825, "grad_norm": 0.8640648722648621, "learning_rate": 0.0002, "epoch": 1.3022113022113022, "step": 1060}, {"loss": 0.6563, "grad_norm": 0.7289374470710754, "learning_rate": 0.0002, "epoch": 1.3144963144963144, "step": 1070}, {"loss": 0.629, "grad_norm": 0.8099908828735352, "learning_rate": 0.0002, "epoch": 1.3267813267813269, "step": 1080}, {"loss": 0.6882, "grad_norm": 0.8623505234718323, "learning_rate": 0.0002, "epoch": 1.339066339066339, "step": 1090}, {"loss": 0.6368, "grad_norm": 0.900576114654541, "learning_rate": 0.0002, "epoch": 1.3513513513513513, "step": 1100}, {"loss": 0.6398, "grad_norm": 0.729603111743927, "learning_rate": 0.0002, "epoch": 1.3636363636363638, "step": 1110}, {"loss": 0.6619, "grad_norm": 0.8350434303283691, "learning_rate": 0.0002, "epoch": 1.375921375921376, "step": 1120}, {"loss": 0.6447, "grad_norm": 0.8049437999725342, "learning_rate": 0.0002, "epoch": 1.3882063882063882, "step": 1130}, {"loss": 0.6336, "grad_norm": 0.8222764134407043, "learning_rate": 0.0002, "epoch": 1.4004914004914004, "step": 1140}, {"loss": 0.6453, "grad_norm": 0.7949751019477844, "learning_rate": 0.0002, "epoch": 1.4127764127764126, "step": 1150}, {"loss": 0.6246, "grad_norm": 0.8375639915466309, "learning_rate": 0.0002, "epoch": 1.425061425061425, "step": 1160}, {"loss": 0.6358, "grad_norm": 0.7261053919792175, "learning_rate": 0.0002, "epoch": 1.4373464373464373, "step": 1170}, {"loss": 0.6709, "grad_norm": 0.6918320655822754, "learning_rate": 0.0002, "epoch": 1.4496314496314495, "step": 1180}, {"loss": 0.598, "grad_norm": 0.8148727416992188, "learning_rate": 0.0002, "epoch": 1.461916461916462, "step": 1190}, {"loss": 0.6269, "grad_norm": 0.7014724612236023, "learning_rate": 0.0002, "epoch": 1.4742014742014742, "step": 1200}, {"loss": 0.617, "grad_norm": 0.8110846281051636, "learning_rate": 0.0002, "epoch": 1.4864864864864864, "step": 1210}, {"loss": 0.6633, "grad_norm": 0.8336407542228699, "learning_rate": 0.0002, "epoch": 1.4987714987714988, "step": 1220}, {"loss": 0.6028, "grad_norm": 0.826996386051178, "learning_rate": 0.0002, "epoch": 1.511056511056511, "step": 1230}, {"loss": 0.6464, "grad_norm": 0.7503120303153992, "learning_rate": 0.0002, "epoch": 1.5233415233415233, "step": 1240}, {"loss": 0.6418, "grad_norm": 0.8297192454338074, "learning_rate": 0.0002, "epoch": 1.5356265356265357, "step": 1250}, {"loss": 0.6466, "grad_norm": 0.7585996985435486, "learning_rate": 0.0002, "epoch": 1.547911547911548, "step": 1260}, {"loss": 0.6196, "grad_norm": 0.7530493140220642, "learning_rate": 0.0002, "epoch": 1.5601965601965602, "step": 1270}, {"loss": 0.6252, "grad_norm": 0.8141939640045166, "learning_rate": 0.0002, "epoch": 1.5724815724815726, "step": 1280}, {"loss": 0.6441, "grad_norm": 0.6959931254386902, "learning_rate": 0.0002, "epoch": 1.5847665847665846, "step": 1290}, {"loss": 0.6542, "grad_norm": 0.8677428364753723, "learning_rate": 0.0002, "epoch": 1.597051597051597, "step": 1300}, {"loss": 0.633, "grad_norm": 0.8527476787567139, "learning_rate": 0.0002, "epoch": 1.6093366093366095, "step": 1310}, {"loss": 0.6393, "grad_norm": 0.8462157845497131, "learning_rate": 0.0002, "epoch": 1.6216216216216215, "step": 1320}, {"loss": 0.6265, "grad_norm": 0.9371153712272644, "learning_rate": 0.0002, "epoch": 1.633906633906634, "step": 1330}, {"loss": 0.5952, "grad_norm": 0.8408344984054565, "learning_rate": 0.0002, "epoch": 1.6461916461916462, "step": 1340}, {"loss": 0.599, "grad_norm": 0.8391859531402588, "learning_rate": 0.0002, "epoch": 1.6584766584766584, "step": 1350}, {"loss": 0.6313, "grad_norm": 0.7630598545074463, "learning_rate": 0.0002, "epoch": 1.6707616707616708, "step": 1360}, {"loss": 0.5989, "grad_norm": 0.8007895350456238, "learning_rate": 0.0002, "epoch": 1.683046683046683, "step": 1370}, {"loss": 0.6094, "grad_norm": 0.7547900080680847, "learning_rate": 0.0002, "epoch": 1.6953316953316953, "step": 1380}, {"loss": 0.6335, "grad_norm": 0.7779742479324341, "learning_rate": 0.0002, "epoch": 1.7076167076167077, "step": 1390}, {"loss": 0.6078, "grad_norm": 0.712293803691864, "learning_rate": 0.0002, "epoch": 1.71990171990172, "step": 1400}, {"loss": 0.608, "grad_norm": 0.8503297567367554, "learning_rate": 0.0002, "epoch": 1.7321867321867321, "step": 1410}, {"loss": 0.6055, "grad_norm": 0.8312245607376099, "learning_rate": 0.0002, "epoch": 1.7444717444717446, "step": 1420}, {"loss": 0.5978, "grad_norm": 0.7758049368858337, "learning_rate": 0.0002, "epoch": 1.7567567567567568, "step": 1430}, {"loss": 0.5822, "grad_norm": 0.8695956468582153, "learning_rate": 0.0002, "epoch": 1.769041769041769, "step": 1440}, {"loss": 0.5955, "grad_norm": 0.7785261273384094, "learning_rate": 0.0002, "epoch": 1.7813267813267815, "step": 1450}, {"loss": 0.6177, "grad_norm": 0.7091802358627319, "learning_rate": 0.0002, "epoch": 1.7936117936117935, "step": 1460}, {"loss": 0.5811, "grad_norm": 0.774146556854248, "learning_rate": 0.0002, "epoch": 1.805896805896806, "step": 1470}, {"loss": 0.5833, "grad_norm": 0.8342524170875549, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 1480}, {"loss": 0.634, "grad_norm": 0.8087738156318665, "learning_rate": 0.0002, "epoch": 1.8304668304668303, "step": 1490}, {"loss": 0.5961, "grad_norm": 0.9830479621887207, "learning_rate": 0.0002, "epoch": 1.8427518427518428, "step": 1500}, {"loss": 0.6211, "grad_norm": 0.8537567853927612, "learning_rate": 0.0002, "epoch": 1.855036855036855, "step": 1510}, {"loss": 0.5767, "grad_norm": 0.8004562854766846, "learning_rate": 0.0002, "epoch": 1.8673218673218672, "step": 1520}, {"loss": 0.604, "grad_norm": 0.8161284327507019, "learning_rate": 0.0002, "epoch": 1.8796068796068797, "step": 1530}, {"loss": 0.5808, "grad_norm": 0.8688093423843384, "learning_rate": 0.0002, "epoch": 1.8918918918918919, "step": 1540}, {"loss": 0.5663, "grad_norm": 0.8287379741668701, "learning_rate": 0.0002, "epoch": 1.904176904176904, "step": 1550}, {"loss": 0.5963, "grad_norm": 0.8050342202186584, "learning_rate": 0.0002, "epoch": 1.9164619164619165, "step": 1560}, {"loss": 0.5837, "grad_norm": 0.9273895621299744, "learning_rate": 0.0002, "epoch": 1.9287469287469288, "step": 1570}, {"loss": 0.5945, "grad_norm": 0.8416891694068909, "learning_rate": 0.0002, "epoch": 1.941031941031941, "step": 1580}, {"loss": 0.5838, "grad_norm": 0.7299820184707642, "learning_rate": 0.0002, "epoch": 1.9533169533169534, "step": 1590}, {"loss": 0.6025, "grad_norm": 0.7262272834777832, "learning_rate": 0.0002, "epoch": 1.9656019656019657, "step": 1600}, {"loss": 0.5873, "grad_norm": 0.8649004697799683, "learning_rate": 0.0002, "epoch": 1.9778869778869779, "step": 1610}, {"loss": 0.5764, "grad_norm": 0.8165444731712341, "learning_rate": 0.0002, "epoch": 1.9901719901719903, "step": 1620}, {"eval_loss": 0.5858802795410156, "eval_runtime": 22.6585, "eval_samples_per_second": 14.608, "eval_steps_per_second": 1.854, "epoch": 2.0, "step": 1628}, {"loss": 0.5803, "grad_norm": 0.8142582178115845, "learning_rate": 0.0002, "epoch": 2.0024570024570023, "step": 1630}, {"loss": 0.5499, "grad_norm": 1.0637224912643433, "learning_rate": 0.0002, "epoch": 2.0147420147420148, "step": 1640}, {"loss": 0.5556, "grad_norm": 0.8923280239105225, "learning_rate": 0.0002, "epoch": 2.027027027027027, "step": 1650}, {"loss": 0.5373, "grad_norm": 0.8169175386428833, "learning_rate": 0.0002, "epoch": 2.039312039312039, "step": 1660}, {"loss": 0.552, "grad_norm": 0.8124040365219116, "learning_rate": 0.0002, "epoch": 2.0515970515970516, "step": 1670}, {"loss": 0.5259, "grad_norm": 0.9228773713111877, "learning_rate": 0.0002, "epoch": 2.063882063882064, "step": 1680}, {"loss": 0.5571, "grad_norm": 0.7216871380805969, "learning_rate": 0.0002, "epoch": 2.076167076167076, "step": 1690}, {"loss": 0.523, "grad_norm": 0.8679503202438354, "learning_rate": 0.0002, "epoch": 2.0884520884520885, "step": 1700}, {"loss": 0.5379, "grad_norm": 0.8627730011940002, "learning_rate": 0.0002, "epoch": 2.100737100737101, "step": 1710}, {"loss": 0.551, "grad_norm": 0.9175152778625488, "learning_rate": 0.0002, "epoch": 2.113022113022113, "step": 1720}, {"loss": 0.5378, "grad_norm": 0.7930372953414917, "learning_rate": 0.0002, "epoch": 2.1253071253071254, "step": 1730}, {"loss": 0.5263, "grad_norm": 0.8370155692100525, "learning_rate": 0.0002, "epoch": 2.1375921375921374, "step": 1740}, {"loss": 0.5419, "grad_norm": 0.9121434688568115, "learning_rate": 0.0002, "epoch": 2.14987714987715, "step": 1750}, {"loss": 0.5499, "grad_norm": 0.8703579306602478, "learning_rate": 0.0002, "epoch": 2.1621621621621623, "step": 1760}, {"loss": 0.5333, "grad_norm": 0.9270512461662292, "learning_rate": 0.0002, "epoch": 2.1744471744471743, "step": 1770}, {"loss": 0.5165, "grad_norm": 0.9372949600219727, "learning_rate": 0.0002, "epoch": 2.1867321867321867, "step": 1780}, {"loss": 0.5327, "grad_norm": 0.8955178260803223, "learning_rate": 0.0002, "epoch": 2.199017199017199, "step": 1790}, {"loss": 0.5356, "grad_norm": 0.846102237701416, "learning_rate": 0.0002, "epoch": 2.211302211302211, "step": 1800}, {"loss": 0.5303, "grad_norm": 0.9186713099479675, "learning_rate": 0.0002, "epoch": 2.2235872235872236, "step": 1810}, {"loss": 0.5223, "grad_norm": 0.7695123553276062, "learning_rate": 0.0002, "epoch": 2.235872235872236, "step": 1820}, {"loss": 0.5161, "grad_norm": 0.7340332865715027, "learning_rate": 0.0002, "epoch": 2.248157248157248, "step": 1830}, {"loss": 0.5327, "grad_norm": 0.8933137655258179, "learning_rate": 0.0002, "epoch": 2.2604422604422605, "step": 1840}, {"loss": 0.5471, "grad_norm": 0.7705038189888, "learning_rate": 0.0002, "epoch": 2.2727272727272725, "step": 1850}, {"loss": 0.5346, "grad_norm": 0.8396083116531372, "learning_rate": 0.0002, "epoch": 2.285012285012285, "step": 1860}, {"loss": 0.5335, "grad_norm": 0.7695736289024353, "learning_rate": 0.0002, "epoch": 2.2972972972972974, "step": 1870}, {"loss": 0.5105, "grad_norm": 0.8535045385360718, "learning_rate": 0.0002, "epoch": 2.30958230958231, "step": 1880}, {"loss": 0.5202, "grad_norm": 0.8549142479896545, "learning_rate": 0.0002, "epoch": 2.321867321867322, "step": 1890}, {"loss": 0.5268, "grad_norm": 0.9124433994293213, "learning_rate": 0.0002, "epoch": 2.3341523341523343, "step": 1900}, {"loss": 0.506, "grad_norm": 0.855523943901062, "learning_rate": 0.0002, "epoch": 2.3464373464373462, "step": 1910}, {"loss": 0.5162, "grad_norm": 0.810878336429596, "learning_rate": 0.0002, "epoch": 2.3587223587223587, "step": 1920}, {"loss": 0.531, "grad_norm": 0.7409024834632874, "learning_rate": 0.0002, "epoch": 2.371007371007371, "step": 1930}, {"loss": 0.5045, "grad_norm": 0.8080927729606628, "learning_rate": 0.0002, "epoch": 2.383292383292383, "step": 1940}, {"loss": 0.5032, "grad_norm": 0.9661469459533691, "learning_rate": 0.0002, "epoch": 2.3955773955773956, "step": 1950}, {"loss": 0.5019, "grad_norm": 0.838766872882843, "learning_rate": 0.0002, "epoch": 2.407862407862408, "step": 1960}, {"loss": 0.5128, "grad_norm": 0.8737491965293884, "learning_rate": 0.0002, "epoch": 2.42014742014742, "step": 1970}, {"loss": 0.5153, "grad_norm": 0.8657792210578918, "learning_rate": 0.0002, "epoch": 2.4324324324324325, "step": 1980}, {"loss": 0.5665, "grad_norm": 0.8883858919143677, "learning_rate": 0.0002, "epoch": 2.444717444717445, "step": 1990}, {"loss": 0.5283, "grad_norm": 0.8647662997245789, "learning_rate": 0.0002, "epoch": 2.457002457002457, "step": 2000}, {"loss": 0.518, "grad_norm": 0.896037757396698, "learning_rate": 0.0002, "epoch": 2.4692874692874693, "step": 2010}, {"loss": 0.5245, "grad_norm": 0.8079167008399963, "learning_rate": 0.0002, "epoch": 2.4815724815724813, "step": 2020}, {"loss": 0.5311, "grad_norm": 1.0293292999267578, "learning_rate": 0.0002, "epoch": 2.493857493857494, "step": 2030}, {"loss": 0.5091, "grad_norm": 0.8459244966506958, "learning_rate": 0.0002, "epoch": 2.506142506142506, "step": 2040}, {"loss": 0.4922, "grad_norm": 0.9244982600212097, "learning_rate": 0.0002, "epoch": 2.5184275184275187, "step": 2050}, {"loss": 0.5006, "grad_norm": 0.8245007991790771, "learning_rate": 0.0002, "epoch": 2.5307125307125307, "step": 2060}, {"loss": 0.5229, "grad_norm": 0.8869297504425049, "learning_rate": 0.0002, "epoch": 2.542997542997543, "step": 2070}, {"loss": 0.5097, "grad_norm": 0.8620884418487549, "learning_rate": 0.0002, "epoch": 2.555282555282555, "step": 2080}, {"loss": 0.5239, "grad_norm": 0.8387904167175293, "learning_rate": 0.0002, "epoch": 2.5675675675675675, "step": 2090}, {"loss": 0.4974, "grad_norm": 0.8353935480117798, "learning_rate": 0.0002, "epoch": 2.57985257985258, "step": 2100}, {"loss": 0.5038, "grad_norm": 1.0136934518814087, "learning_rate": 0.0002, "epoch": 2.592137592137592, "step": 2110}, {"loss": 0.513, "grad_norm": 0.9387392997741699, "learning_rate": 0.0002, "epoch": 2.6044226044226044, "step": 2120}, {"loss": 0.4971, "grad_norm": 0.898697555065155, "learning_rate": 0.0002, "epoch": 2.616707616707617, "step": 2130}, {"loss": 0.4981, "grad_norm": 1.0145231485366821, "learning_rate": 0.0002, "epoch": 2.628992628992629, "step": 2140}, {"loss": 0.5151, "grad_norm": 0.8335273265838623, "learning_rate": 0.0002, "epoch": 2.6412776412776413, "step": 2150}, {"loss": 0.5129, "grad_norm": 1.0198529958724976, "learning_rate": 0.0002, "epoch": 2.6535626535626538, "step": 2160}, {"loss": 0.5156, "grad_norm": 0.8353323340415955, "learning_rate": 0.0002, "epoch": 2.6658476658476657, "step": 2170}, {"loss": 0.4818, "grad_norm": 0.8831406831741333, "learning_rate": 0.0002, "epoch": 2.678132678132678, "step": 2180}, {"loss": 0.4858, "grad_norm": 0.7182748913764954, "learning_rate": 0.0002, "epoch": 2.69041769041769, "step": 2190}, {"loss": 0.53, "grad_norm": 0.7892552614212036, "learning_rate": 0.0002, "epoch": 2.7027027027027026, "step": 2200}, {"loss": 0.5101, "grad_norm": 1.0144033432006836, "learning_rate": 0.0002, "epoch": 2.714987714987715, "step": 2210}, {"loss": 0.4909, "grad_norm": 1.0913645029067993, "learning_rate": 0.0002, "epoch": 2.7272727272727275, "step": 2220}, {"loss": 0.5069, "grad_norm": 1.014394998550415, "learning_rate": 0.0002, "epoch": 2.7395577395577395, "step": 2230}, {"loss": 0.4985, "grad_norm": 0.8118020296096802, "learning_rate": 0.0002, "epoch": 2.751842751842752, "step": 2240}, {"loss": 0.5088, "grad_norm": 0.9027737379074097, "learning_rate": 0.0002, "epoch": 2.764127764127764, "step": 2250}, {"loss": 0.5027, "grad_norm": 0.8017747402191162, "learning_rate": 0.0002, "epoch": 2.7764127764127764, "step": 2260}, {"loss": 0.4957, "grad_norm": 0.788362979888916, "learning_rate": 0.0002, "epoch": 2.788697788697789, "step": 2270}, {"loss": 0.5047, "grad_norm": 0.8338918089866638, "learning_rate": 0.0002, "epoch": 2.800982800982801, "step": 2280}, {"loss": 0.4925, "grad_norm": 0.8773167729377747, "learning_rate": 0.0002, "epoch": 2.8132678132678133, "step": 2290}, {"loss": 0.4806, "grad_norm": 0.9319674372673035, "learning_rate": 0.0002, "epoch": 2.8255528255528253, "step": 2300}, {"loss": 0.4815, "grad_norm": 0.8632726073265076, "learning_rate": 0.0002, "epoch": 2.8378378378378377, "step": 2310}, {"loss": 0.4842, "grad_norm": 0.785464882850647, "learning_rate": 0.0002, "epoch": 2.85012285012285, "step": 2320}, {"loss": 0.4867, "grad_norm": 0.8159732818603516, "learning_rate": 0.0002, "epoch": 2.8624078624078626, "step": 2330}, {"loss": 0.4796, "grad_norm": 0.8702368140220642, "learning_rate": 0.0002, "epoch": 2.8746928746928746, "step": 2340}, {"loss": 0.474, "grad_norm": 1.0456738471984863, "learning_rate": 0.0002, "epoch": 2.886977886977887, "step": 2350}, {"loss": 0.4934, "grad_norm": 1.0855203866958618, "learning_rate": 0.0002, "epoch": 2.899262899262899, "step": 2360}, {"loss": 0.4758, "grad_norm": 0.9378156065940857, "learning_rate": 0.0002, "epoch": 2.9115479115479115, "step": 2370}, {"loss": 0.4831, "grad_norm": 0.7390182018280029, "learning_rate": 0.0002, "epoch": 2.923832923832924, "step": 2380}, {"loss": 0.5066, "grad_norm": 0.7667133212089539, "learning_rate": 0.0002, "epoch": 2.9361179361179364, "step": 2390}, {"loss": 0.4722, "grad_norm": 0.8633476495742798, "learning_rate": 0.0002, "epoch": 2.9484029484029484, "step": 2400}, {"loss": 0.4993, "grad_norm": 1.0821104049682617, "learning_rate": 0.0002, "epoch": 2.960687960687961, "step": 2410}, {"loss": 0.4882, "grad_norm": 0.8911418914794922, "learning_rate": 0.0002, "epoch": 2.972972972972973, "step": 2420}, {"loss": 0.4819, "grad_norm": 0.8791135549545288, "learning_rate": 0.0002, "epoch": 2.9852579852579852, "step": 2430}, {"loss": 0.4875, "grad_norm": 0.8066530823707581, "learning_rate": 0.0002, "epoch": 2.9975429975429977, "step": 2440}, {"eval_loss": 0.49752503633499146, "eval_runtime": 20.2911, "eval_samples_per_second": 16.313, "eval_steps_per_second": 2.07, "epoch": 3.0, "step": 2442}, {"loss": 0.4362, "grad_norm": 0.7644656896591187, "learning_rate": 0.0002, "epoch": 3.0098280098280097, "step": 2450}, {"loss": 0.4363, "grad_norm": 0.9077525734901428, "learning_rate": 0.0002, "epoch": 3.022113022113022, "step": 2460}, {"loss": 0.422, "grad_norm": 0.7859287261962891, "learning_rate": 0.0002, "epoch": 3.0343980343980346, "step": 2470}, {"loss": 0.4574, "grad_norm": 1.1200323104858398, "learning_rate": 0.0002, "epoch": 3.0466830466830466, "step": 2480}, {"loss": 0.4519, "grad_norm": 0.7570453882217407, "learning_rate": 0.0002, "epoch": 3.058968058968059, "step": 2490}, {"loss": 0.4351, "grad_norm": 0.9450915455818176, "learning_rate": 0.0002, "epoch": 3.0712530712530715, "step": 2500}, {"loss": 0.4343, "grad_norm": 0.8303545117378235, "learning_rate": 0.0002, "epoch": 3.0835380835380835, "step": 2510}, {"loss": 0.4308, "grad_norm": 0.8864443898200989, "learning_rate": 0.0002, "epoch": 3.095823095823096, "step": 2520}, {"loss": 0.4601, "grad_norm": 0.945324718952179, "learning_rate": 0.0002, "epoch": 3.108108108108108, "step": 2530}, {"loss": 0.4345, "grad_norm": 1.0562494993209839, "learning_rate": 0.0002, "epoch": 3.1203931203931203, "step": 2540}, {"loss": 0.4375, "grad_norm": 0.8607500195503235, "learning_rate": 0.0002, "epoch": 3.1326781326781328, "step": 2550}, {"loss": 0.456, "grad_norm": 0.8719640374183655, "learning_rate": 0.0002, "epoch": 3.1449631449631448, "step": 2560}, {"loss": 0.4469, "grad_norm": 0.8647059202194214, "learning_rate": 0.0002, "epoch": 3.157248157248157, "step": 2570}, {"loss": 0.4483, "grad_norm": 0.8346507549285889, "learning_rate": 0.0002, "epoch": 3.1695331695331697, "step": 2580}, {"loss": 0.4331, "grad_norm": 1.0208854675292969, "learning_rate": 0.0002, "epoch": 3.1818181818181817, "step": 2590}, {"loss": 0.435, "grad_norm": 0.7064385414123535, "learning_rate": 0.0002, "epoch": 3.194103194103194, "step": 2600}, {"loss": 0.4541, "grad_norm": 0.927347719669342, "learning_rate": 0.0002, "epoch": 3.2063882063882065, "step": 2610}, {"loss": 0.4561, "grad_norm": 0.943517804145813, "learning_rate": 0.0002, "epoch": 3.2186732186732185, "step": 2620}, {"loss": 0.4225, "grad_norm": 0.7837198376655579, "learning_rate": 0.0002, "epoch": 3.230958230958231, "step": 2630}, {"loss": 0.4494, "grad_norm": 0.7752765417098999, "learning_rate": 0.0002, "epoch": 3.2432432432432434, "step": 2640}, {"loss": 0.4468, "grad_norm": 0.8578953146934509, "learning_rate": 0.0002, "epoch": 3.2555282555282554, "step": 2650}, {"loss": 0.4393, "grad_norm": 1.0209529399871826, "learning_rate": 0.0002, "epoch": 3.267813267813268, "step": 2660}, {"loss": 0.4517, "grad_norm": 0.9069030284881592, "learning_rate": 0.0002, "epoch": 3.2800982800982803, "step": 2670}, {"loss": 0.4262, "grad_norm": 0.8454729318618774, "learning_rate": 0.0002, "epoch": 3.2923832923832923, "step": 2680}, {"loss": 0.4349, "grad_norm": 0.8253099322319031, "learning_rate": 0.0002, "epoch": 3.3046683046683047, "step": 2690}, {"loss": 0.4503, "grad_norm": 0.8765934109687805, "learning_rate": 0.0002, "epoch": 3.3169533169533167, "step": 2700}, {"loss": 0.4518, "grad_norm": 0.8149126172065735, "learning_rate": 0.0002, "epoch": 3.329238329238329, "step": 2710}, {"loss": 0.4437, "grad_norm": 0.8820102214813232, "learning_rate": 0.0002, "epoch": 3.3415233415233416, "step": 2720}, {"loss": 0.4346, "grad_norm": 0.8813952803611755, "learning_rate": 0.0002, "epoch": 3.3538083538083536, "step": 2730}, {"loss": 0.4396, "grad_norm": 1.0338447093963623, "learning_rate": 0.0002, "epoch": 3.366093366093366, "step": 2740}, {"loss": 0.4468, "grad_norm": 0.8780209422111511, "learning_rate": 0.0002, "epoch": 3.3783783783783785, "step": 2750}, {"loss": 0.441, "grad_norm": 0.9017151594161987, "learning_rate": 0.0002, "epoch": 3.3906633906633905, "step": 2760}, {"loss": 0.446, "grad_norm": 0.8647638559341431, "learning_rate": 0.0002, "epoch": 3.402948402948403, "step": 2770}, {"loss": 0.4131, "grad_norm": 0.8298183679580688, "learning_rate": 0.0002, "epoch": 3.4152334152334154, "step": 2780}, {"loss": 0.4406, "grad_norm": 0.9298108816146851, "learning_rate": 0.0002, "epoch": 3.4275184275184274, "step": 2790}, {"loss": 0.4145, "grad_norm": 0.8909980058670044, "learning_rate": 0.0002, "epoch": 3.43980343980344, "step": 2800}, {"loss": 0.4148, "grad_norm": 0.8027496933937073, "learning_rate": 0.0002, "epoch": 3.4520884520884523, "step": 2810}, {"loss": 0.4244, "grad_norm": 0.8766195774078369, "learning_rate": 0.0002, "epoch": 3.4643734643734643, "step": 2820}, {"loss": 0.4292, "grad_norm": 0.8194443583488464, "learning_rate": 0.0002, "epoch": 3.4766584766584767, "step": 2830}, {"loss": 0.4305, "grad_norm": 0.9862873554229736, "learning_rate": 0.0002, "epoch": 3.488943488943489, "step": 2840}, {"loss": 0.4393, "grad_norm": 0.8755377531051636, "learning_rate": 0.0002, "epoch": 3.501228501228501, "step": 2850}, {"loss": 0.4231, "grad_norm": 0.7300266027450562, "learning_rate": 0.0002, "epoch": 3.5135135135135136, "step": 2860}, {"loss": 0.4278, "grad_norm": 0.8342461585998535, "learning_rate": 0.0002, "epoch": 3.5257985257985256, "step": 2870}, {"loss": 0.4395, "grad_norm": 0.8624151349067688, "learning_rate": 0.0002, "epoch": 3.538083538083538, "step": 2880}, {"loss": 0.4064, "grad_norm": 0.8931261301040649, "learning_rate": 0.0002, "epoch": 3.5503685503685505, "step": 2890}, {"loss": 0.4358, "grad_norm": 0.8617086410522461, "learning_rate": 0.0002, "epoch": 3.562653562653563, "step": 2900}, {"loss": 0.419, "grad_norm": 0.8754099607467651, "learning_rate": 0.0002, "epoch": 3.574938574938575, "step": 2910}, {"loss": 0.4275, "grad_norm": 0.8345834612846375, "learning_rate": 0.0002, "epoch": 3.5872235872235874, "step": 2920}, {"loss": 0.4375, "grad_norm": 1.1414062976837158, "learning_rate": 0.0002, "epoch": 3.5995085995085994, "step": 2930}, {"loss": 0.4297, "grad_norm": 0.994860053062439, "learning_rate": 0.0002, "epoch": 3.611793611793612, "step": 2940}, {"loss": 0.4386, "grad_norm": 1.19268000125885, "learning_rate": 0.0002, "epoch": 3.6240786240786242, "step": 2950}, {"loss": 0.4029, "grad_norm": 0.8399543762207031, "learning_rate": 0.0002, "epoch": 3.6363636363636362, "step": 2960}, {"loss": 0.4432, "grad_norm": 0.9873217940330505, "learning_rate": 0.0002, "epoch": 3.6486486486486487, "step": 2970}, {"loss": 0.4308, "grad_norm": 0.9116013646125793, "learning_rate": 0.0002, "epoch": 3.6609336609336607, "step": 2980}, {"loss": 0.4275, "grad_norm": 0.9503833651542664, "learning_rate": 0.0002, "epoch": 3.673218673218673, "step": 2990}, {"loss": 0.4306, "grad_norm": 0.9401112794876099, "learning_rate": 0.0002, "epoch": 3.6855036855036856, "step": 3000}, {"loss": 0.4333, "grad_norm": 1.00745689868927, "learning_rate": 0.0002, "epoch": 3.697788697788698, "step": 3010}, {"loss": 0.432, "grad_norm": 1.0553191900253296, "learning_rate": 0.0002, "epoch": 3.71007371007371, "step": 3020}, {"loss": 0.4321, "grad_norm": 1.0226953029632568, "learning_rate": 0.0002, "epoch": 3.7223587223587224, "step": 3030}, {"loss": 0.418, "grad_norm": 1.085554838180542, "learning_rate": 0.0002, "epoch": 3.7346437346437344, "step": 3040}, {"loss": 0.4196, "grad_norm": 0.9948731064796448, "learning_rate": 0.0002, "epoch": 3.746928746928747, "step": 3050}, {"loss": 0.4281, "grad_norm": 0.9328727126121521, "learning_rate": 0.0002, "epoch": 3.7592137592137593, "step": 3060}, {"loss": 0.4284, "grad_norm": 1.0533266067504883, "learning_rate": 0.0002, "epoch": 3.7714987714987718, "step": 3070}, {"loss": 0.4414, "grad_norm": 0.8213809132575989, "learning_rate": 0.0002, "epoch": 3.7837837837837838, "step": 3080}, {"loss": 0.4348, "grad_norm": 0.8941594362258911, "learning_rate": 0.0002, "epoch": 3.796068796068796, "step": 3090}, {"loss": 0.4266, "grad_norm": 0.8324518203735352, "learning_rate": 0.0002, "epoch": 3.808353808353808, "step": 3100}, {"loss": 0.4227, "grad_norm": 0.8811233639717102, "learning_rate": 0.0002, "epoch": 3.8206388206388207, "step": 3110}, {"loss": 0.4195, "grad_norm": 0.8781470060348511, "learning_rate": 0.0002, "epoch": 3.832923832923833, "step": 3120}, {"loss": 0.4277, "grad_norm": 0.8994116187095642, "learning_rate": 0.0002, "epoch": 3.845208845208845, "step": 3130}, {"loss": 0.4149, "grad_norm": 0.8605017066001892, "learning_rate": 0.0002, "epoch": 3.8574938574938575, "step": 3140}, {"loss": 0.4023, "grad_norm": 0.8966400027275085, "learning_rate": 0.0002, "epoch": 3.8697788697788695, "step": 3150}, {"loss": 0.4245, "grad_norm": 0.8856554627418518, "learning_rate": 0.0002, "epoch": 3.882063882063882, "step": 3160}, {"loss": 0.4101, "grad_norm": 0.8971620798110962, "learning_rate": 0.0002, "epoch": 3.8943488943488944, "step": 3170}, {"loss": 0.3993, "grad_norm": 0.9807813167572021, "learning_rate": 0.0002, "epoch": 3.906633906633907, "step": 3180}, {"loss": 0.4258, "grad_norm": 0.8614121675491333, "learning_rate": 0.0002, "epoch": 3.918918918918919, "step": 3190}, {"loss": 0.4115, "grad_norm": 0.989171028137207, "learning_rate": 0.0002, "epoch": 3.9312039312039313, "step": 3200}, {"loss": 0.4182, "grad_norm": 0.8168872594833374, "learning_rate": 0.0002, "epoch": 3.9434889434889433, "step": 3210}, {"loss": 0.4112, "grad_norm": 0.8109386563301086, "learning_rate": 0.0002, "epoch": 3.9557739557739557, "step": 3220}, {"loss": 0.4165, "grad_norm": 1.0175853967666626, "learning_rate": 0.0002, "epoch": 3.968058968058968, "step": 3230}, {"loss": 0.4146, "grad_norm": 0.936143159866333, "learning_rate": 0.0002, "epoch": 3.98034398034398, "step": 3240}, {"loss": 0.4163, "grad_norm": 0.9557915925979614, "learning_rate": 0.0002, "epoch": 3.9926289926289926, "step": 3250}, {"eval_loss": 0.4401616156101227, "eval_runtime": 20.8047, "eval_samples_per_second": 15.91, "eval_steps_per_second": 2.019, "epoch": 4.0, "step": 3256}, {"loss": 0.408, "grad_norm": 0.7590614557266235, "learning_rate": 0.0002, "epoch": 4.004914004914005, "step": 3260}, {"loss": 0.4001, "grad_norm": 0.8920791149139404, "learning_rate": 0.0002, "epoch": 4.017199017199017, "step": 3270}, {"loss": 0.3789, "grad_norm": 0.8640421628952026, "learning_rate": 0.0002, "epoch": 4.0294840294840295, "step": 3280}, {"loss": 0.3791, "grad_norm": 0.9074113965034485, "learning_rate": 0.0002, "epoch": 4.041769041769042, "step": 3290}, {"loss": 0.3728, "grad_norm": 1.0600885152816772, "learning_rate": 0.0002, "epoch": 4.054054054054054, "step": 3300}, {"loss": 0.3857, "grad_norm": 0.9682773351669312, "learning_rate": 0.0002, "epoch": 4.066339066339066, "step": 3310}, {"loss": 0.4007, "grad_norm": 0.9326395392417908, "learning_rate": 0.0002, "epoch": 4.078624078624078, "step": 3320}, {"loss": 0.3823, "grad_norm": 0.8886597156524658, "learning_rate": 0.0002, "epoch": 4.090909090909091, "step": 3330}, {"loss": 0.3929, "grad_norm": 1.032205581665039, "learning_rate": 0.0002, "epoch": 4.103194103194103, "step": 3340}, {"loss": 0.3836, "grad_norm": 0.8669408559799194, "learning_rate": 0.0002, "epoch": 4.115479115479116, "step": 3350}, {"loss": 0.3866, "grad_norm": 0.8250347971916199, "learning_rate": 0.0002, "epoch": 4.127764127764128, "step": 3360}, {"loss": 0.3826, "grad_norm": 0.7919842600822449, "learning_rate": 0.0002, "epoch": 4.14004914004914, "step": 3370}, {"loss": 0.3838, "grad_norm": 1.045682430267334, "learning_rate": 0.0002, "epoch": 4.152334152334152, "step": 3380}, {"loss": 0.3796, "grad_norm": 0.6873571276664734, "learning_rate": 0.0002, "epoch": 4.164619164619165, "step": 3390}, {"loss": 0.3942, "grad_norm": 1.0227675437927246, "learning_rate": 0.0002, "epoch": 4.176904176904177, "step": 3400}, {"loss": 0.3788, "grad_norm": 0.9167711734771729, "learning_rate": 0.0002, "epoch": 4.1891891891891895, "step": 3410}, {"loss": 0.3792, "grad_norm": 1.0598796606063843, "learning_rate": 0.0002, "epoch": 4.201474201474202, "step": 3420}, {"loss": 0.3955, "grad_norm": 0.8581843972206116, "learning_rate": 0.0002, "epoch": 4.2137592137592135, "step": 3430}, {"loss": 0.3761, "grad_norm": 0.8862360119819641, "learning_rate": 0.0002, "epoch": 4.226044226044226, "step": 3440}, {"loss": 0.3889, "grad_norm": 1.0248323678970337, "learning_rate": 0.0002, "epoch": 4.238329238329238, "step": 3450}, {"loss": 0.3827, "grad_norm": 0.8746261596679688, "learning_rate": 0.0002, "epoch": 4.250614250614251, "step": 3460}, {"loss": 0.3949, "grad_norm": 0.7442536354064941, "learning_rate": 0.0002, "epoch": 4.262899262899263, "step": 3470}, {"loss": 0.3761, "grad_norm": 0.8295119404792786, "learning_rate": 0.0002, "epoch": 4.275184275184275, "step": 3480}, {"loss": 0.3895, "grad_norm": 1.0634245872497559, "learning_rate": 0.0002, "epoch": 4.287469287469287, "step": 3490}, {"loss": 0.3955, "grad_norm": 0.9554621577262878, "learning_rate": 0.0002, "epoch": 4.2997542997543, "step": 3500}, {"loss": 0.3826, "grad_norm": 1.0191723108291626, "learning_rate": 0.0002, "epoch": 4.312039312039312, "step": 3510}, {"loss": 0.3828, "grad_norm": 0.8573611378669739, "learning_rate": 0.0002, "epoch": 4.324324324324325, "step": 3520}, {"loss": 0.3869, "grad_norm": 0.9082390069961548, "learning_rate": 0.0002, "epoch": 4.336609336609337, "step": 3530}, {"loss": 0.3902, "grad_norm": 0.8650212287902832, "learning_rate": 0.0002, "epoch": 4.348894348894349, "step": 3540}, {"loss": 0.3915, "grad_norm": 0.7186297178268433, "learning_rate": 0.0002, "epoch": 4.361179361179361, "step": 3550}, {"loss": 0.3861, "grad_norm": 0.9750986695289612, "learning_rate": 0.0002, "epoch": 4.3734643734643734, "step": 3560}, {"loss": 0.3967, "grad_norm": 1.0710467100143433, "learning_rate": 0.0002, "epoch": 4.385749385749386, "step": 3570}, {"loss": 0.3774, "grad_norm": 0.7974869012832642, "learning_rate": 0.0002, "epoch": 4.398034398034398, "step": 3580}, {"loss": 0.3738, "grad_norm": 0.9405913949012756, "learning_rate": 0.0002, "epoch": 4.41031941031941, "step": 3590}, {"loss": 0.3982, "grad_norm": 0.9393602609634399, "learning_rate": 0.0002, "epoch": 4.422604422604422, "step": 3600}, {"loss": 0.3913, "grad_norm": 1.0798007249832153, "learning_rate": 0.0002, "epoch": 4.434889434889435, "step": 3610}, {"loss": 0.3682, "grad_norm": 0.9226186275482178, "learning_rate": 0.0002, "epoch": 4.447174447174447, "step": 3620}, {"loss": 0.3742, "grad_norm": 1.1046524047851562, "learning_rate": 0.0002, "epoch": 4.45945945945946, "step": 3630}, {"loss": 0.3886, "grad_norm": 0.8848567605018616, "learning_rate": 0.0002, "epoch": 4.471744471744472, "step": 3640}, {"loss": 0.3848, "grad_norm": 0.8913224339485168, "learning_rate": 0.0002, "epoch": 4.484029484029484, "step": 3650}, {"loss": 0.3731, "grad_norm": 0.8497583270072937, "learning_rate": 0.0002, "epoch": 4.496314496314496, "step": 3660}, {"loss": 0.3804, "grad_norm": 0.8263831734657288, "learning_rate": 0.0002, "epoch": 4.5085995085995085, "step": 3670}, {"loss": 0.3815, "grad_norm": 0.8470269441604614, "learning_rate": 0.0002, "epoch": 4.520884520884521, "step": 3680}, {"loss": 0.3774, "grad_norm": 0.860038161277771, "learning_rate": 0.0002, "epoch": 4.533169533169533, "step": 3690}, {"loss": 0.3817, "grad_norm": 0.8898552656173706, "learning_rate": 0.0002, "epoch": 4.545454545454545, "step": 3700}, {"loss": 0.3776, "grad_norm": 0.8152070641517639, "learning_rate": 0.0002, "epoch": 4.557739557739557, "step": 3710}, {"loss": 0.383, "grad_norm": 0.7847675085067749, "learning_rate": 0.0002, "epoch": 4.57002457002457, "step": 3720}, {"loss": 0.3791, "grad_norm": 0.9625533819198608, "learning_rate": 0.0002, "epoch": 4.582309582309582, "step": 3730}, {"loss": 0.3699, "grad_norm": 0.9097456336021423, "learning_rate": 0.0002, "epoch": 4.594594594594595, "step": 3740}, {"loss": 0.3673, "grad_norm": 0.871329128742218, "learning_rate": 0.0002, "epoch": 4.606879606879607, "step": 3750}, {"loss": 0.3725, "grad_norm": 0.9879975914955139, "learning_rate": 0.0002, "epoch": 4.61916461916462, "step": 3760}, {"loss": 0.3827, "grad_norm": 0.8636731505393982, "learning_rate": 0.0002, "epoch": 4.631449631449631, "step": 3770}, {"loss": 0.3755, "grad_norm": 1.0488964319229126, "learning_rate": 0.0002, "epoch": 4.643734643734644, "step": 3780}, {"loss": 0.3738, "grad_norm": 0.7637056112289429, "learning_rate": 0.0002, "epoch": 4.656019656019656, "step": 3790}, {"loss": 0.3676, "grad_norm": 0.8507546186447144, "learning_rate": 0.0002, "epoch": 4.6683046683046685, "step": 3800}, {"loss": 0.3852, "grad_norm": 1.0216856002807617, "learning_rate": 0.0002, "epoch": 4.680589680589681, "step": 3810}, {"loss": 0.3751, "grad_norm": 1.026343822479248, "learning_rate": 0.0002, "epoch": 4.6928746928746925, "step": 3820}, {"loss": 0.3687, "grad_norm": 0.8311620950698853, "learning_rate": 0.0002, "epoch": 4.705159705159705, "step": 3830}, {"loss": 0.3771, "grad_norm": 0.7770653367042542, "learning_rate": 0.0002, "epoch": 4.717444717444717, "step": 3840}, {"loss": 0.37, "grad_norm": 0.7616215348243713, "learning_rate": 0.0002, "epoch": 4.72972972972973, "step": 3850}, {"loss": 0.3927, "grad_norm": 1.0377072095870972, "learning_rate": 0.0002, "epoch": 4.742014742014742, "step": 3860}, {"loss": 0.3832, "grad_norm": 0.9713505506515503, "learning_rate": 0.0002, "epoch": 4.754299754299755, "step": 3870}, {"loss": 0.3722, "grad_norm": 0.8803321719169617, "learning_rate": 0.0002, "epoch": 4.766584766584766, "step": 3880}, {"loss": 0.3756, "grad_norm": 0.885535478591919, "learning_rate": 0.0002, "epoch": 4.778869778869779, "step": 3890}, {"loss": 0.3714, "grad_norm": 1.0877983570098877, "learning_rate": 0.0002, "epoch": 4.791154791154791, "step": 3900}, {"loss": 0.3879, "grad_norm": 0.7875366806983948, "learning_rate": 0.0002, "epoch": 4.803439803439804, "step": 3910}, {"loss": 0.3591, "grad_norm": 0.8550102114677429, "learning_rate": 0.0002, "epoch": 4.815724815724816, "step": 3920}, {"loss": 0.3716, "grad_norm": 1.0217846632003784, "learning_rate": 0.0002, "epoch": 4.828009828009828, "step": 3930}, {"loss": 0.3649, "grad_norm": 0.7315713167190552, "learning_rate": 0.0002, "epoch": 4.84029484029484, "step": 3940}, {"loss": 0.3879, "grad_norm": 0.8924923539161682, "learning_rate": 0.0002, "epoch": 4.8525798525798525, "step": 3950}, {"loss": 0.3669, "grad_norm": 0.9730218052864075, "learning_rate": 0.0002, "epoch": 4.864864864864865, "step": 3960}, {"loss": 0.3705, "grad_norm": 0.9202003479003906, "learning_rate": 0.0002, "epoch": 4.877149877149877, "step": 3970}, {"loss": 0.3617, "grad_norm": 0.8173081874847412, "learning_rate": 0.0002, "epoch": 4.88943488943489, "step": 3980}, {"loss": 0.37, "grad_norm": 0.7178564667701721, "learning_rate": 0.0002, "epoch": 4.901719901719901, "step": 3990}, {"loss": 0.3768, "grad_norm": 0.913684606552124, "learning_rate": 0.0002, "epoch": 4.914004914004914, "step": 4000}, {"loss": 0.3755, "grad_norm": 0.8817896842956543, "learning_rate": 0.0002, "epoch": 4.926289926289926, "step": 4010}, {"loss": 0.3676, "grad_norm": 0.7652186751365662, "learning_rate": 0.0002, "epoch": 4.938574938574939, "step": 4020}, {"loss": 0.3699, "grad_norm": 0.8828630447387695, "learning_rate": 0.0002, "epoch": 4.950859950859951, "step": 4030}, {"loss": 0.3672, "grad_norm": 1.0878605842590332, "learning_rate": 0.0002, "epoch": 4.963144963144963, "step": 4040}, {"loss": 0.3656, "grad_norm": 1.0845288038253784, "learning_rate": 0.0002, "epoch": 4.975429975429975, "step": 4050}, {"loss": 0.365, "grad_norm": 0.8431115746498108, "learning_rate": 0.0002, "epoch": 4.987714987714988, "step": 4060}, {"loss": 0.3693, "grad_norm": 0.8320387601852417, "learning_rate": 0.0002, "epoch": 5.0, "step": 4070}, {"eval_loss": 0.4017423093318939, "eval_runtime": 20.8466, "eval_samples_per_second": 15.878, "eval_steps_per_second": 2.015, "epoch": 5.0, "step": 4070}, {"loss": 0.3425, "grad_norm": 0.8639023900032043, "learning_rate": 0.0002, "epoch": 5.012285012285012, "step": 4080}, {"loss": 0.3458, "grad_norm": 0.7123713493347168, "learning_rate": 0.0002, "epoch": 5.024570024570025, "step": 4090}, {"loss": 0.3404, "grad_norm": 0.9886922836303711, "learning_rate": 0.0002, "epoch": 5.036855036855036, "step": 4100}, {"loss": 0.3529, "grad_norm": 0.7880306243896484, "learning_rate": 0.0002, "epoch": 5.049140049140049, "step": 4110}, {"loss": 0.3406, "grad_norm": 0.7488741874694824, "learning_rate": 0.0002, "epoch": 5.061425061425061, "step": 4120}, {"loss": 0.3542, "grad_norm": 0.9359086751937866, "learning_rate": 0.0002, "epoch": 5.073710073710074, "step": 4130}, {"loss": 0.3471, "grad_norm": 0.9401527047157288, "learning_rate": 0.0002, "epoch": 5.085995085995086, "step": 4140}, {"loss": 0.3566, "grad_norm": 0.8396275043487549, "learning_rate": 0.0002, "epoch": 5.098280098280099, "step": 4150}, {"loss": 0.3416, "grad_norm": 0.7132664918899536, "learning_rate": 0.0002, "epoch": 5.11056511056511, "step": 4160}, {"loss": 0.3457, "grad_norm": 0.843708872795105, "learning_rate": 0.0002, "epoch": 5.122850122850123, "step": 4170}, {"loss": 0.3399, "grad_norm": 0.8733304738998413, "learning_rate": 0.0002, "epoch": 5.135135135135135, "step": 4180}, {"loss": 0.3501, "grad_norm": 0.9064375162124634, "learning_rate": 0.0002, "epoch": 5.1474201474201475, "step": 4190}, {"loss": 0.3455, "grad_norm": 0.900770902633667, "learning_rate": 0.0002, "epoch": 5.15970515970516, "step": 4200}, {"loss": 0.3475, "grad_norm": 0.863853394985199, "learning_rate": 0.0002, "epoch": 5.171990171990172, "step": 4210}, {"loss": 0.3497, "grad_norm": 0.767134964466095, "learning_rate": 0.0002, "epoch": 5.184275184275184, "step": 4220}, {"loss": 0.3527, "grad_norm": 0.7518735527992249, "learning_rate": 0.0002, "epoch": 5.196560196560196, "step": 4230}, {"loss": 0.3369, "grad_norm": 0.8040947914123535, "learning_rate": 0.0002, "epoch": 5.208845208845209, "step": 4240}, {"loss": 0.3496, "grad_norm": 0.7827144265174866, "learning_rate": 0.0002, "epoch": 5.221130221130221, "step": 4250}, {"loss": 0.3442, "grad_norm": 0.7306333184242249, "learning_rate": 0.0002, "epoch": 5.233415233415234, "step": 4260}, {"loss": 0.3553, "grad_norm": 1.0963380336761475, "learning_rate": 0.0002, "epoch": 5.245700245700245, "step": 4270}, {"loss": 0.3462, "grad_norm": 0.8200454711914062, "learning_rate": 0.0002, "epoch": 5.257985257985258, "step": 4280}, {"loss": 0.3509, "grad_norm": 0.8666796684265137, "learning_rate": 0.0002, "epoch": 5.27027027027027, "step": 4290}, {"loss": 0.3423, "grad_norm": 0.7862894535064697, "learning_rate": 0.0002, "epoch": 5.282555282555283, "step": 4300}, {"loss": 0.3623, "grad_norm": 0.8163095712661743, "learning_rate": 0.0002, "epoch": 5.294840294840295, "step": 4310}, {"loss": 0.34, "grad_norm": 0.8069050908088684, "learning_rate": 0.0002, "epoch": 5.3071253071253075, "step": 4320}, {"loss": 0.3532, "grad_norm": 0.7858486175537109, "learning_rate": 0.0002, "epoch": 5.319410319410319, "step": 4330}, {"loss": 0.3435, "grad_norm": 0.950339674949646, "learning_rate": 0.0002, "epoch": 5.3316953316953315, "step": 4340}, {"loss": 0.3498, "grad_norm": 0.9056477546691895, "learning_rate": 0.0002, "epoch": 5.343980343980344, "step": 4350}, {"loss": 0.3538, "grad_norm": 0.9619399905204773, "learning_rate": 0.0002, "epoch": 5.356265356265356, "step": 4360}, {"loss": 0.3455, "grad_norm": 0.9778652191162109, "learning_rate": 0.0002, "epoch": 5.368550368550369, "step": 4370}, {"loss": 0.3498, "grad_norm": 0.6919555068016052, "learning_rate": 0.0002, "epoch": 5.38083538083538, "step": 4380}, {"loss": 0.3426, "grad_norm": 0.8121668696403503, "learning_rate": 0.0002, "epoch": 5.393120393120393, "step": 4390}, {"loss": 0.3442, "grad_norm": 0.8481289148330688, "learning_rate": 0.0002, "epoch": 5.405405405405405, "step": 4400}, {"loss": 0.345, "grad_norm": 0.8727408647537231, "learning_rate": 0.0002, "epoch": 5.417690417690418, "step": 4410}, {"loss": 0.3554, "grad_norm": 0.8920271396636963, "learning_rate": 0.0002, "epoch": 5.42997542997543, "step": 4420}, {"loss": 0.3409, "grad_norm": 0.7758749723434448, "learning_rate": 0.0002, "epoch": 5.442260442260443, "step": 4430}, {"loss": 0.3483, "grad_norm": 0.8847506642341614, "learning_rate": 0.0002, "epoch": 5.454545454545454, "step": 4440}, {"loss": 0.3557, "grad_norm": 0.9760470390319824, "learning_rate": 0.0002, "epoch": 5.466830466830467, "step": 4450}, {"loss": 0.3536, "grad_norm": 0.8940271139144897, "learning_rate": 0.0002, "epoch": 5.479115479115479, "step": 4460}, {"loss": 0.3577, "grad_norm": 0.8668502569198608, "learning_rate": 0.0002, "epoch": 5.4914004914004915, "step": 4470}, {"loss": 0.3462, "grad_norm": 0.9097439050674438, "learning_rate": 0.0002, "epoch": 5.503685503685504, "step": 4480}, {"loss": 0.3417, "grad_norm": 0.8217208981513977, "learning_rate": 0.0002, "epoch": 5.515970515970516, "step": 4490}, {"loss": 0.3482, "grad_norm": 0.7853189706802368, "learning_rate": 0.0002, "epoch": 5.528255528255528, "step": 4500}, {"loss": 0.3479, "grad_norm": 1.1113477945327759, "learning_rate": 0.0002, "epoch": 5.54054054054054, "step": 4510}, {"loss": 0.3553, "grad_norm": 0.8637538552284241, "learning_rate": 0.0002, "epoch": 5.552825552825553, "step": 4520}, {"loss": 0.3403, "grad_norm": 1.0230066776275635, "learning_rate": 0.0002, "epoch": 5.565110565110565, "step": 4530}, {"loss": 0.3588, "grad_norm": 0.8972793817520142, "learning_rate": 0.0002, "epoch": 5.577395577395578, "step": 4540}, {"loss": 0.3428, "grad_norm": 0.7950642704963684, "learning_rate": 0.0002, "epoch": 5.58968058968059, "step": 4550}, {"loss": 0.3468, "grad_norm": 1.113753318786621, "learning_rate": 0.0002, "epoch": 5.601965601965602, "step": 4560}, {"loss": 0.3354, "grad_norm": 0.7842669486999512, "learning_rate": 0.0002, "epoch": 5.614250614250614, "step": 4570}, {"loss": 0.3419, "grad_norm": 0.9713512063026428, "learning_rate": 0.0002, "epoch": 5.6265356265356266, "step": 4580}, {"loss": 0.3502, "grad_norm": 0.9451650977134705, "learning_rate": 0.0002, "epoch": 5.638820638820639, "step": 4590}, {"loss": 0.3416, "grad_norm": 1.055484414100647, "learning_rate": 0.0002, "epoch": 5.651105651105651, "step": 4600}, {"loss": 0.3436, "grad_norm": 0.8408507704734802, "learning_rate": 0.0002, "epoch": 5.663390663390663, "step": 4610}, {"loss": 0.3619, "grad_norm": 1.0293926000595093, "learning_rate": 0.0002, "epoch": 5.675675675675675, "step": 4620}, {"loss": 0.3484, "grad_norm": 0.7198245525360107, "learning_rate": 0.0002, "epoch": 5.687960687960688, "step": 4630}, {"loss": 0.3563, "grad_norm": 0.7564466595649719, "learning_rate": 0.0002, "epoch": 5.7002457002457, "step": 4640}, {"loss": 0.3435, "grad_norm": 0.7980002760887146, "learning_rate": 0.0002, "epoch": 5.712530712530713, "step": 4650}, {"loss": 0.3478, "grad_norm": 0.8685088753700256, "learning_rate": 0.0002, "epoch": 5.724815724815725, "step": 4660}, {"loss": 0.3692, "grad_norm": 0.8816949129104614, "learning_rate": 0.0002, "epoch": 5.737100737100737, "step": 4670}, {"loss": 0.3462, "grad_norm": 0.7154731750488281, "learning_rate": 0.0002, "epoch": 5.749385749385749, "step": 4680}, {"loss": 0.3503, "grad_norm": 0.9430679678916931, "learning_rate": 0.0002, "epoch": 5.761670761670762, "step": 4690}, {"loss": 0.3439, "grad_norm": 0.7640151381492615, "learning_rate": 0.0002, "epoch": 5.773955773955774, "step": 4700}, {"loss": 0.3444, "grad_norm": 1.0920690298080444, "learning_rate": 0.0002, "epoch": 5.7862407862407865, "step": 4710}, {"loss": 0.3356, "grad_norm": 0.9362104535102844, "learning_rate": 0.0002, "epoch": 5.798525798525798, "step": 4720}, {"loss": 0.339, "grad_norm": 0.8392294645309448, "learning_rate": 0.0002, "epoch": 5.8108108108108105, "step": 4730}, {"loss": 0.3488, "grad_norm": 0.9893582463264465, "learning_rate": 0.0002, "epoch": 5.823095823095823, "step": 4740}, {"loss": 0.3446, "grad_norm": 0.6985510587692261, "learning_rate": 0.0002, "epoch": 5.835380835380835, "step": 4750}, {"loss": 0.3534, "grad_norm": 0.8906862735748291, "learning_rate": 0.0002, "epoch": 5.847665847665848, "step": 4760}, {"loss": 0.3481, "grad_norm": 0.8036413192749023, "learning_rate": 0.0002, "epoch": 5.85995085995086, "step": 4770}, {"loss": 0.3326, "grad_norm": 0.9948155283927917, "learning_rate": 0.0002, "epoch": 5.872235872235873, "step": 4780}, {"loss": 0.3385, "grad_norm": 0.8618432283401489, "learning_rate": 0.0002, "epoch": 5.884520884520884, "step": 4790}, {"loss": 0.3302, "grad_norm": 1.0422909259796143, "learning_rate": 0.0002, "epoch": 5.896805896805897, "step": 4800}, {"loss": 0.3448, "grad_norm": 1.1892569065093994, "learning_rate": 0.0002, "epoch": 5.909090909090909, "step": 4810}, {"loss": 0.3506, "grad_norm": 1.1459916830062866, "learning_rate": 0.0002, "epoch": 5.921375921375922, "step": 4820}, {"loss": 0.3387, "grad_norm": 1.056235909461975, "learning_rate": 0.0002, "epoch": 5.933660933660933, "step": 4830}, {"loss": 0.344, "grad_norm": 0.8517277240753174, "learning_rate": 0.0002, "epoch": 5.945945945945946, "step": 4840}, {"loss": 0.3421, "grad_norm": 0.8153380751609802, "learning_rate": 0.0002, "epoch": 5.958230958230958, "step": 4850}, {"loss": 0.3409, "grad_norm": 0.7907533049583435, "learning_rate": 0.0002, "epoch": 5.9705159705159705, "step": 4860}, {"loss": 0.3337, "grad_norm": 0.8443069458007812, "learning_rate": 0.0002, "epoch": 5.982800982800983, "step": 4870}, {"loss": 0.3351, "grad_norm": 0.8711344003677368, "learning_rate": 0.0002, "epoch": 5.995085995085995, "step": 4880}]} +{"epoch": 7.0, "step": 5698, "epoch_duration": 779.5834319591522, "total_accumulated_duration": 5272.349390506744, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.17431640625}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-4884", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.5354, "grad_norm": 0.8178550004959106, "learning_rate": 0.0002, "epoch": 0.012285012285012284, "step": 10}, {"loss": 2.534, "grad_norm": 1.0338047742843628, "learning_rate": 0.0002, "epoch": 0.02457002457002457, "step": 20}, {"loss": 2.1691, "grad_norm": 0.8931729197502136, "learning_rate": 0.0002, "epoch": 0.036855036855036855, "step": 30}, {"loss": 1.8813, "grad_norm": 0.9666458964347839, "learning_rate": 0.0002, "epoch": 0.04914004914004914, "step": 40}, {"loss": 1.6479, "grad_norm": 1.2691702842712402, "learning_rate": 0.0002, "epoch": 0.06142506142506143, "step": 50}, {"loss": 1.3831, "grad_norm": 1.0307111740112305, "learning_rate": 0.0002, "epoch": 0.07371007371007371, "step": 60}, {"loss": 1.2987, "grad_norm": 1.1837389469146729, "learning_rate": 0.0002, "epoch": 0.085995085995086, "step": 70}, {"loss": 1.2325, "grad_norm": 1.1481467485427856, "learning_rate": 0.0002, "epoch": 0.09828009828009827, "step": 80}, {"loss": 1.1425, "grad_norm": 1.0385297536849976, "learning_rate": 0.0002, "epoch": 0.11056511056511056, "step": 90}, {"loss": 1.1177, "grad_norm": 1.125789999961853, "learning_rate": 0.0002, "epoch": 0.12285012285012285, "step": 100}, {"loss": 1.0477, "grad_norm": 0.9630613923072815, "learning_rate": 0.0002, "epoch": 0.13513513513513514, "step": 110}, {"loss": 1.0074, "grad_norm": 1.060392141342163, "learning_rate": 0.0002, "epoch": 0.14742014742014742, "step": 120}, {"loss": 1.0128, "grad_norm": 1.0986546277999878, "learning_rate": 0.0002, "epoch": 0.1597051597051597, "step": 130}, {"loss": 1.0068, "grad_norm": 1.1713459491729736, "learning_rate": 0.0002, "epoch": 0.171990171990172, "step": 140}, {"loss": 0.973, "grad_norm": 1.1548224687576294, "learning_rate": 0.0002, "epoch": 0.18427518427518427, "step": 150}, {"loss": 0.941, "grad_norm": 1.2662502527236938, "learning_rate": 0.0002, "epoch": 0.19656019656019655, "step": 160}, {"loss": 0.8849, "grad_norm": 1.1521110534667969, "learning_rate": 0.0002, "epoch": 0.20884520884520885, "step": 170}, {"loss": 0.8931, "grad_norm": 1.1044857501983643, "learning_rate": 0.0002, "epoch": 0.22113022113022113, "step": 180}, {"loss": 0.9572, "grad_norm": 0.9770650267601013, "learning_rate": 0.0002, "epoch": 0.2334152334152334, "step": 190}, {"loss": 0.881, "grad_norm": 0.9710931777954102, "learning_rate": 0.0002, "epoch": 0.2457002457002457, "step": 200}, {"loss": 0.9205, "grad_norm": 0.9593933820724487, "learning_rate": 0.0002, "epoch": 0.257985257985258, "step": 210}, {"loss": 0.843, "grad_norm": 1.003553032875061, "learning_rate": 0.0002, "epoch": 0.2702702702702703, "step": 220}, {"loss": 0.9032, "grad_norm": 0.9187764525413513, "learning_rate": 0.0002, "epoch": 0.28255528255528256, "step": 230}, {"loss": 0.8572, "grad_norm": 0.9294946789741516, "learning_rate": 0.0002, "epoch": 0.29484029484029484, "step": 240}, {"loss": 0.8856, "grad_norm": 0.9537560939788818, "learning_rate": 0.0002, "epoch": 0.3071253071253071, "step": 250}, {"loss": 0.8546, "grad_norm": 1.00537109375, "learning_rate": 0.0002, "epoch": 0.3194103194103194, "step": 260}, {"loss": 0.896, "grad_norm": 0.8775776028633118, "learning_rate": 0.0002, "epoch": 0.3316953316953317, "step": 270}, {"loss": 0.808, "grad_norm": 0.8316839933395386, "learning_rate": 0.0002, "epoch": 0.343980343980344, "step": 280}, {"loss": 0.8248, "grad_norm": 0.8542073965072632, "learning_rate": 0.0002, "epoch": 0.35626535626535627, "step": 290}, {"loss": 0.8452, "grad_norm": 0.848444402217865, "learning_rate": 0.0002, "epoch": 0.36855036855036855, "step": 300}, {"loss": 0.8253, "grad_norm": 0.9017520546913147, "learning_rate": 0.0002, "epoch": 0.3808353808353808, "step": 310}, {"loss": 0.8098, "grad_norm": 0.7672467231750488, "learning_rate": 0.0002, "epoch": 0.3931203931203931, "step": 320}, {"loss": 0.8478, "grad_norm": 0.9109916687011719, "learning_rate": 0.0002, "epoch": 0.40540540540540543, "step": 330}, {"loss": 0.8041, "grad_norm": 0.8750321269035339, "learning_rate": 0.0002, "epoch": 0.4176904176904177, "step": 340}, {"loss": 0.8158, "grad_norm": 0.7911098599433899, "learning_rate": 0.0002, "epoch": 0.42997542997543, "step": 350}, {"loss": 0.8001, "grad_norm": 0.871601402759552, "learning_rate": 0.0002, "epoch": 0.44226044226044225, "step": 360}, {"loss": 0.8187, "grad_norm": 0.9393917918205261, "learning_rate": 0.0002, "epoch": 0.45454545454545453, "step": 370}, {"loss": 0.8124, "grad_norm": 0.8260403275489807, "learning_rate": 0.0002, "epoch": 0.4668304668304668, "step": 380}, {"loss": 0.7768, "grad_norm": 0.9792159199714661, "learning_rate": 0.0002, "epoch": 0.47911547911547914, "step": 390}, {"loss": 0.7981, "grad_norm": 0.9943315982818604, "learning_rate": 0.0002, "epoch": 0.4914004914004914, "step": 400}, {"loss": 0.7765, "grad_norm": 0.8999950885772705, "learning_rate": 0.0002, "epoch": 0.5036855036855037, "step": 410}, {"loss": 0.7807, "grad_norm": 0.8348393440246582, "learning_rate": 0.0002, "epoch": 0.515970515970516, "step": 420}, {"loss": 0.8269, "grad_norm": 0.7371744513511658, "learning_rate": 0.0002, "epoch": 0.5282555282555282, "step": 430}, {"loss": 0.8181, "grad_norm": 0.8354107141494751, "learning_rate": 0.0002, "epoch": 0.5405405405405406, "step": 440}, {"loss": 0.7849, "grad_norm": 0.8553793430328369, "learning_rate": 0.0002, "epoch": 0.5528255528255528, "step": 450}, {"loss": 0.8098, "grad_norm": 1.0762015581130981, "learning_rate": 0.0002, "epoch": 0.5651105651105651, "step": 460}, {"loss": 0.7942, "grad_norm": 0.8350747227668762, "learning_rate": 0.0002, "epoch": 0.5773955773955773, "step": 470}, {"loss": 0.7922, "grad_norm": 0.7819945216178894, "learning_rate": 0.0002, "epoch": 0.5896805896805897, "step": 480}, {"loss": 0.7845, "grad_norm": 0.8079741597175598, "learning_rate": 0.0002, "epoch": 0.601965601965602, "step": 490}, {"loss": 0.7417, "grad_norm": 0.776435911655426, "learning_rate": 0.0002, "epoch": 0.6142506142506142, "step": 500}, {"loss": 0.7855, "grad_norm": 0.7646855115890503, "learning_rate": 0.0002, "epoch": 0.6265356265356266, "step": 510}, {"loss": 0.7923, "grad_norm": 0.786396861076355, "learning_rate": 0.0002, "epoch": 0.6388206388206388, "step": 520}, {"loss": 0.7624, "grad_norm": 0.7016594409942627, "learning_rate": 0.0002, "epoch": 0.6511056511056511, "step": 530}, {"loss": 0.786, "grad_norm": 0.8060444593429565, "learning_rate": 0.0002, "epoch": 0.6633906633906634, "step": 540}, {"loss": 0.7417, "grad_norm": 0.9087467789649963, "learning_rate": 0.0002, "epoch": 0.6756756756756757, "step": 550}, {"loss": 0.7591, "grad_norm": 0.8149628639221191, "learning_rate": 0.0002, "epoch": 0.687960687960688, "step": 560}, {"loss": 0.8004, "grad_norm": 0.7493641972541809, "learning_rate": 0.0002, "epoch": 0.7002457002457002, "step": 570}, {"loss": 0.765, "grad_norm": 0.7958765625953674, "learning_rate": 0.0002, "epoch": 0.7125307125307125, "step": 580}, {"loss": 0.7276, "grad_norm": 0.7917273640632629, "learning_rate": 0.0002, "epoch": 0.7248157248157249, "step": 590}, {"loss": 0.758, "grad_norm": 0.8040468692779541, "learning_rate": 0.0002, "epoch": 0.7371007371007371, "step": 600}, {"loss": 0.735, "grad_norm": 0.8696851134300232, "learning_rate": 0.0002, "epoch": 0.7493857493857494, "step": 610}, {"loss": 0.7321, "grad_norm": 0.8418059945106506, "learning_rate": 0.0002, "epoch": 0.7616707616707616, "step": 620}, {"loss": 0.7395, "grad_norm": 0.7754243612289429, "learning_rate": 0.0002, "epoch": 0.773955773955774, "step": 630}, {"loss": 0.7679, "grad_norm": 0.7639613747596741, "learning_rate": 0.0002, "epoch": 0.7862407862407862, "step": 640}, {"loss": 0.7159, "grad_norm": 0.7516646385192871, "learning_rate": 0.0002, "epoch": 0.7985257985257985, "step": 650}, {"loss": 0.7349, "grad_norm": 0.7840844988822937, "learning_rate": 0.0002, "epoch": 0.8108108108108109, "step": 660}, {"loss": 0.7264, "grad_norm": 0.7657070755958557, "learning_rate": 0.0002, "epoch": 0.8230958230958231, "step": 670}, {"loss": 0.7369, "grad_norm": 0.7711591720581055, "learning_rate": 0.0002, "epoch": 0.8353808353808354, "step": 680}, {"loss": 0.759, "grad_norm": 0.8026325106620789, "learning_rate": 0.0002, "epoch": 0.8476658476658476, "step": 690}, {"loss": 0.737, "grad_norm": 0.7902713418006897, "learning_rate": 0.0002, "epoch": 0.85995085995086, "step": 700}, {"loss": 0.7349, "grad_norm": 0.8212456107139587, "learning_rate": 0.0002, "epoch": 0.8722358722358723, "step": 710}, {"loss": 0.7661, "grad_norm": 0.7867200970649719, "learning_rate": 0.0002, "epoch": 0.8845208845208845, "step": 720}, {"loss": 0.7195, "grad_norm": 0.80084627866745, "learning_rate": 0.0002, "epoch": 0.8968058968058968, "step": 730}, {"loss": 0.7641, "grad_norm": 0.7203794121742249, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 740}, {"loss": 0.7134, "grad_norm": 0.7598419785499573, "learning_rate": 0.0002, "epoch": 0.9213759213759214, "step": 750}, {"loss": 0.7208, "grad_norm": 0.7787027359008789, "learning_rate": 0.0002, "epoch": 0.9336609336609336, "step": 760}, {"loss": 0.7119, "grad_norm": 0.8444012403488159, "learning_rate": 0.0002, "epoch": 0.9459459459459459, "step": 770}, {"loss": 0.7099, "grad_norm": 0.7388550639152527, "learning_rate": 0.0002, "epoch": 0.9582309582309583, "step": 780}, {"loss": 0.7184, "grad_norm": 0.7379167079925537, "learning_rate": 0.0002, "epoch": 0.9705159705159705, "step": 790}, {"loss": 0.7143, "grad_norm": 0.8291640281677246, "learning_rate": 0.0002, "epoch": 0.9828009828009828, "step": 800}, {"loss": 0.6972, "grad_norm": 0.7415094375610352, "learning_rate": 0.0002, "epoch": 0.995085995085995, "step": 810}, {"eval_loss": 0.703994870185852, "eval_runtime": 20.2182, "eval_samples_per_second": 16.371, "eval_steps_per_second": 2.077, "epoch": 1.0, "step": 814}, {"loss": 0.6959, "grad_norm": 0.7405961751937866, "learning_rate": 0.0002, "epoch": 1.0073710073710074, "step": 820}, {"loss": 0.6706, "grad_norm": 0.8534344434738159, "learning_rate": 0.0002, "epoch": 1.0196560196560196, "step": 830}, {"loss": 0.6719, "grad_norm": 0.7415764331817627, "learning_rate": 0.0002, "epoch": 1.031941031941032, "step": 840}, {"loss": 0.6673, "grad_norm": 0.74293053150177, "learning_rate": 0.0002, "epoch": 1.0442260442260443, "step": 850}, {"loss": 0.6897, "grad_norm": 0.697727382183075, "learning_rate": 0.0002, "epoch": 1.0565110565110565, "step": 860}, {"loss": 0.6566, "grad_norm": 0.8022570013999939, "learning_rate": 0.0002, "epoch": 1.0687960687960687, "step": 870}, {"loss": 0.6759, "grad_norm": 0.7545800805091858, "learning_rate": 0.0002, "epoch": 1.0810810810810811, "step": 880}, {"loss": 0.6397, "grad_norm": 0.8005648255348206, "learning_rate": 0.0002, "epoch": 1.0933660933660934, "step": 890}, {"loss": 0.6499, "grad_norm": 0.7681778073310852, "learning_rate": 0.0002, "epoch": 1.1056511056511056, "step": 900}, {"loss": 0.6672, "grad_norm": 0.7822468876838684, "learning_rate": 0.0002, "epoch": 1.117936117936118, "step": 910}, {"loss": 0.6492, "grad_norm": 0.8324839472770691, "learning_rate": 0.0002, "epoch": 1.1302211302211302, "step": 920}, {"loss": 0.6659, "grad_norm": 0.8206289410591125, "learning_rate": 0.0002, "epoch": 1.1425061425061425, "step": 930}, {"loss": 0.6385, "grad_norm": 0.786461591720581, "learning_rate": 0.0002, "epoch": 1.154791154791155, "step": 940}, {"loss": 0.6493, "grad_norm": 0.8288539052009583, "learning_rate": 0.0002, "epoch": 1.1670761670761671, "step": 950}, {"loss": 0.6818, "grad_norm": 0.7566865682601929, "learning_rate": 0.0002, "epoch": 1.1793611793611793, "step": 960}, {"loss": 0.6597, "grad_norm": 0.7761894464492798, "learning_rate": 0.0002, "epoch": 1.1916461916461916, "step": 970}, {"loss": 0.6403, "grad_norm": 0.7608440518379211, "learning_rate": 0.0002, "epoch": 1.203931203931204, "step": 980}, {"loss": 0.7041, "grad_norm": 0.799745500087738, "learning_rate": 0.0002, "epoch": 1.2162162162162162, "step": 990}, {"loss": 0.6358, "grad_norm": 0.8135330677032471, "learning_rate": 0.0002, "epoch": 1.2285012285012284, "step": 1000}, {"loss": 0.6496, "grad_norm": 0.7410391569137573, "learning_rate": 0.0002, "epoch": 1.2407862407862407, "step": 1010}, {"loss": 0.63, "grad_norm": 0.7826172709465027, "learning_rate": 0.0002, "epoch": 1.253071253071253, "step": 1020}, {"loss": 0.6582, "grad_norm": 0.7210677862167358, "learning_rate": 0.0002, "epoch": 1.2653562653562653, "step": 1030}, {"loss": 0.6609, "grad_norm": 0.7571766972541809, "learning_rate": 0.0002, "epoch": 1.2776412776412776, "step": 1040}, {"loss": 0.6315, "grad_norm": 0.8602666258811951, "learning_rate": 0.0002, "epoch": 1.28992628992629, "step": 1050}, {"loss": 0.6825, "grad_norm": 0.8640648722648621, "learning_rate": 0.0002, "epoch": 1.3022113022113022, "step": 1060}, {"loss": 0.6563, "grad_norm": 0.7289374470710754, "learning_rate": 0.0002, "epoch": 1.3144963144963144, "step": 1070}, {"loss": 0.629, "grad_norm": 0.8099908828735352, "learning_rate": 0.0002, "epoch": 1.3267813267813269, "step": 1080}, {"loss": 0.6882, "grad_norm": 0.8623505234718323, "learning_rate": 0.0002, "epoch": 1.339066339066339, "step": 1090}, {"loss": 0.6368, "grad_norm": 0.900576114654541, "learning_rate": 0.0002, "epoch": 1.3513513513513513, "step": 1100}, {"loss": 0.6398, "grad_norm": 0.729603111743927, "learning_rate": 0.0002, "epoch": 1.3636363636363638, "step": 1110}, {"loss": 0.6619, "grad_norm": 0.8350434303283691, "learning_rate": 0.0002, "epoch": 1.375921375921376, "step": 1120}, {"loss": 0.6447, "grad_norm": 0.8049437999725342, "learning_rate": 0.0002, "epoch": 1.3882063882063882, "step": 1130}, {"loss": 0.6336, "grad_norm": 0.8222764134407043, "learning_rate": 0.0002, "epoch": 1.4004914004914004, "step": 1140}, {"loss": 0.6453, "grad_norm": 0.7949751019477844, "learning_rate": 0.0002, "epoch": 1.4127764127764126, "step": 1150}, {"loss": 0.6246, "grad_norm": 0.8375639915466309, "learning_rate": 0.0002, "epoch": 1.425061425061425, "step": 1160}, {"loss": 0.6358, "grad_norm": 0.7261053919792175, "learning_rate": 0.0002, "epoch": 1.4373464373464373, "step": 1170}, {"loss": 0.6709, "grad_norm": 0.6918320655822754, "learning_rate": 0.0002, "epoch": 1.4496314496314495, "step": 1180}, {"loss": 0.598, "grad_norm": 0.8148727416992188, "learning_rate": 0.0002, "epoch": 1.461916461916462, "step": 1190}, {"loss": 0.6269, "grad_norm": 0.7014724612236023, "learning_rate": 0.0002, "epoch": 1.4742014742014742, "step": 1200}, {"loss": 0.617, "grad_norm": 0.8110846281051636, "learning_rate": 0.0002, "epoch": 1.4864864864864864, "step": 1210}, {"loss": 0.6633, "grad_norm": 0.8336407542228699, "learning_rate": 0.0002, "epoch": 1.4987714987714988, "step": 1220}, {"loss": 0.6028, "grad_norm": 0.826996386051178, "learning_rate": 0.0002, "epoch": 1.511056511056511, "step": 1230}, {"loss": 0.6464, "grad_norm": 0.7503120303153992, "learning_rate": 0.0002, "epoch": 1.5233415233415233, "step": 1240}, {"loss": 0.6418, "grad_norm": 0.8297192454338074, "learning_rate": 0.0002, "epoch": 1.5356265356265357, "step": 1250}, {"loss": 0.6466, "grad_norm": 0.7585996985435486, "learning_rate": 0.0002, "epoch": 1.547911547911548, "step": 1260}, {"loss": 0.6196, "grad_norm": 0.7530493140220642, "learning_rate": 0.0002, "epoch": 1.5601965601965602, "step": 1270}, {"loss": 0.6252, "grad_norm": 0.8141939640045166, "learning_rate": 0.0002, "epoch": 1.5724815724815726, "step": 1280}, {"loss": 0.6441, "grad_norm": 0.6959931254386902, "learning_rate": 0.0002, "epoch": 1.5847665847665846, "step": 1290}, {"loss": 0.6542, "grad_norm": 0.8677428364753723, "learning_rate": 0.0002, "epoch": 1.597051597051597, "step": 1300}, {"loss": 0.633, "grad_norm": 0.8527476787567139, "learning_rate": 0.0002, "epoch": 1.6093366093366095, "step": 1310}, {"loss": 0.6393, "grad_norm": 0.8462157845497131, "learning_rate": 0.0002, "epoch": 1.6216216216216215, "step": 1320}, {"loss": 0.6265, "grad_norm": 0.9371153712272644, "learning_rate": 0.0002, "epoch": 1.633906633906634, "step": 1330}, {"loss": 0.5952, "grad_norm": 0.8408344984054565, "learning_rate": 0.0002, "epoch": 1.6461916461916462, "step": 1340}, {"loss": 0.599, "grad_norm": 0.8391859531402588, "learning_rate": 0.0002, "epoch": 1.6584766584766584, "step": 1350}, {"loss": 0.6313, "grad_norm": 0.7630598545074463, "learning_rate": 0.0002, "epoch": 1.6707616707616708, "step": 1360}, {"loss": 0.5989, "grad_norm": 0.8007895350456238, "learning_rate": 0.0002, "epoch": 1.683046683046683, "step": 1370}, {"loss": 0.6094, "grad_norm": 0.7547900080680847, "learning_rate": 0.0002, "epoch": 1.6953316953316953, "step": 1380}, {"loss": 0.6335, "grad_norm": 0.7779742479324341, "learning_rate": 0.0002, "epoch": 1.7076167076167077, "step": 1390}, {"loss": 0.6078, "grad_norm": 0.712293803691864, "learning_rate": 0.0002, "epoch": 1.71990171990172, "step": 1400}, {"loss": 0.608, "grad_norm": 0.8503297567367554, "learning_rate": 0.0002, "epoch": 1.7321867321867321, "step": 1410}, {"loss": 0.6055, "grad_norm": 0.8312245607376099, "learning_rate": 0.0002, "epoch": 1.7444717444717446, "step": 1420}, {"loss": 0.5978, "grad_norm": 0.7758049368858337, "learning_rate": 0.0002, "epoch": 1.7567567567567568, "step": 1430}, {"loss": 0.5822, "grad_norm": 0.8695956468582153, "learning_rate": 0.0002, "epoch": 1.769041769041769, "step": 1440}, {"loss": 0.5955, "grad_norm": 0.7785261273384094, "learning_rate": 0.0002, "epoch": 1.7813267813267815, "step": 1450}, {"loss": 0.6177, "grad_norm": 0.7091802358627319, "learning_rate": 0.0002, "epoch": 1.7936117936117935, "step": 1460}, {"loss": 0.5811, "grad_norm": 0.774146556854248, "learning_rate": 0.0002, "epoch": 1.805896805896806, "step": 1470}, {"loss": 0.5833, "grad_norm": 0.8342524170875549, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 1480}, {"loss": 0.634, "grad_norm": 0.8087738156318665, "learning_rate": 0.0002, "epoch": 1.8304668304668303, "step": 1490}, {"loss": 0.5961, "grad_norm": 0.9830479621887207, "learning_rate": 0.0002, "epoch": 1.8427518427518428, "step": 1500}, {"loss": 0.6211, "grad_norm": 0.8537567853927612, "learning_rate": 0.0002, "epoch": 1.855036855036855, "step": 1510}, {"loss": 0.5767, "grad_norm": 0.8004562854766846, "learning_rate": 0.0002, "epoch": 1.8673218673218672, "step": 1520}, {"loss": 0.604, "grad_norm": 0.8161284327507019, "learning_rate": 0.0002, "epoch": 1.8796068796068797, "step": 1530}, {"loss": 0.5808, "grad_norm": 0.8688093423843384, "learning_rate": 0.0002, "epoch": 1.8918918918918919, "step": 1540}, {"loss": 0.5663, "grad_norm": 0.8287379741668701, "learning_rate": 0.0002, "epoch": 1.904176904176904, "step": 1550}, {"loss": 0.5963, "grad_norm": 0.8050342202186584, "learning_rate": 0.0002, "epoch": 1.9164619164619165, "step": 1560}, {"loss": 0.5837, "grad_norm": 0.9273895621299744, "learning_rate": 0.0002, "epoch": 1.9287469287469288, "step": 1570}, {"loss": 0.5945, "grad_norm": 0.8416891694068909, "learning_rate": 0.0002, "epoch": 1.941031941031941, "step": 1580}, {"loss": 0.5838, "grad_norm": 0.7299820184707642, "learning_rate": 0.0002, "epoch": 1.9533169533169534, "step": 1590}, {"loss": 0.6025, "grad_norm": 0.7262272834777832, "learning_rate": 0.0002, "epoch": 1.9656019656019657, "step": 1600}, {"loss": 0.5873, "grad_norm": 0.8649004697799683, "learning_rate": 0.0002, "epoch": 1.9778869778869779, "step": 1610}, {"loss": 0.5764, "grad_norm": 0.8165444731712341, "learning_rate": 0.0002, "epoch": 1.9901719901719903, "step": 1620}, {"eval_loss": 0.5858802795410156, "eval_runtime": 22.6585, "eval_samples_per_second": 14.608, "eval_steps_per_second": 1.854, "epoch": 2.0, "step": 1628}, {"loss": 0.5803, "grad_norm": 0.8142582178115845, "learning_rate": 0.0002, "epoch": 2.0024570024570023, "step": 1630}, {"loss": 0.5499, "grad_norm": 1.0637224912643433, "learning_rate": 0.0002, "epoch": 2.0147420147420148, "step": 1640}, {"loss": 0.5556, "grad_norm": 0.8923280239105225, "learning_rate": 0.0002, "epoch": 2.027027027027027, "step": 1650}, {"loss": 0.5373, "grad_norm": 0.8169175386428833, "learning_rate": 0.0002, "epoch": 2.039312039312039, "step": 1660}, {"loss": 0.552, "grad_norm": 0.8124040365219116, "learning_rate": 0.0002, "epoch": 2.0515970515970516, "step": 1670}, {"loss": 0.5259, "grad_norm": 0.9228773713111877, "learning_rate": 0.0002, "epoch": 2.063882063882064, "step": 1680}, {"loss": 0.5571, "grad_norm": 0.7216871380805969, "learning_rate": 0.0002, "epoch": 2.076167076167076, "step": 1690}, {"loss": 0.523, "grad_norm": 0.8679503202438354, "learning_rate": 0.0002, "epoch": 2.0884520884520885, "step": 1700}, {"loss": 0.5379, "grad_norm": 0.8627730011940002, "learning_rate": 0.0002, "epoch": 2.100737100737101, "step": 1710}, {"loss": 0.551, "grad_norm": 0.9175152778625488, "learning_rate": 0.0002, "epoch": 2.113022113022113, "step": 1720}, {"loss": 0.5378, "grad_norm": 0.7930372953414917, "learning_rate": 0.0002, "epoch": 2.1253071253071254, "step": 1730}, {"loss": 0.5263, "grad_norm": 0.8370155692100525, "learning_rate": 0.0002, "epoch": 2.1375921375921374, "step": 1740}, {"loss": 0.5419, "grad_norm": 0.9121434688568115, "learning_rate": 0.0002, "epoch": 2.14987714987715, "step": 1750}, {"loss": 0.5499, "grad_norm": 0.8703579306602478, "learning_rate": 0.0002, "epoch": 2.1621621621621623, "step": 1760}, {"loss": 0.5333, "grad_norm": 0.9270512461662292, "learning_rate": 0.0002, "epoch": 2.1744471744471743, "step": 1770}, {"loss": 0.5165, "grad_norm": 0.9372949600219727, "learning_rate": 0.0002, "epoch": 2.1867321867321867, "step": 1780}, {"loss": 0.5327, "grad_norm": 0.8955178260803223, "learning_rate": 0.0002, "epoch": 2.199017199017199, "step": 1790}, {"loss": 0.5356, "grad_norm": 0.846102237701416, "learning_rate": 0.0002, "epoch": 2.211302211302211, "step": 1800}, {"loss": 0.5303, "grad_norm": 0.9186713099479675, "learning_rate": 0.0002, "epoch": 2.2235872235872236, "step": 1810}, {"loss": 0.5223, "grad_norm": 0.7695123553276062, "learning_rate": 0.0002, "epoch": 2.235872235872236, "step": 1820}, {"loss": 0.5161, "grad_norm": 0.7340332865715027, "learning_rate": 0.0002, "epoch": 2.248157248157248, "step": 1830}, {"loss": 0.5327, "grad_norm": 0.8933137655258179, "learning_rate": 0.0002, "epoch": 2.2604422604422605, "step": 1840}, {"loss": 0.5471, "grad_norm": 0.7705038189888, "learning_rate": 0.0002, "epoch": 2.2727272727272725, "step": 1850}, {"loss": 0.5346, "grad_norm": 0.8396083116531372, "learning_rate": 0.0002, "epoch": 2.285012285012285, "step": 1860}, {"loss": 0.5335, "grad_norm": 0.7695736289024353, "learning_rate": 0.0002, "epoch": 2.2972972972972974, "step": 1870}, {"loss": 0.5105, "grad_norm": 0.8535045385360718, "learning_rate": 0.0002, "epoch": 2.30958230958231, "step": 1880}, {"loss": 0.5202, "grad_norm": 0.8549142479896545, "learning_rate": 0.0002, "epoch": 2.321867321867322, "step": 1890}, {"loss": 0.5268, "grad_norm": 0.9124433994293213, "learning_rate": 0.0002, "epoch": 2.3341523341523343, "step": 1900}, {"loss": 0.506, "grad_norm": 0.855523943901062, "learning_rate": 0.0002, "epoch": 2.3464373464373462, "step": 1910}, {"loss": 0.5162, "grad_norm": 0.810878336429596, "learning_rate": 0.0002, "epoch": 2.3587223587223587, "step": 1920}, {"loss": 0.531, "grad_norm": 0.7409024834632874, "learning_rate": 0.0002, "epoch": 2.371007371007371, "step": 1930}, {"loss": 0.5045, "grad_norm": 0.8080927729606628, "learning_rate": 0.0002, "epoch": 2.383292383292383, "step": 1940}, {"loss": 0.5032, "grad_norm": 0.9661469459533691, "learning_rate": 0.0002, "epoch": 2.3955773955773956, "step": 1950}, {"loss": 0.5019, "grad_norm": 0.838766872882843, "learning_rate": 0.0002, "epoch": 2.407862407862408, "step": 1960}, {"loss": 0.5128, "grad_norm": 0.8737491965293884, "learning_rate": 0.0002, "epoch": 2.42014742014742, "step": 1970}, {"loss": 0.5153, "grad_norm": 0.8657792210578918, "learning_rate": 0.0002, "epoch": 2.4324324324324325, "step": 1980}, {"loss": 0.5665, "grad_norm": 0.8883858919143677, "learning_rate": 0.0002, "epoch": 2.444717444717445, "step": 1990}, {"loss": 0.5283, "grad_norm": 0.8647662997245789, "learning_rate": 0.0002, "epoch": 2.457002457002457, "step": 2000}, {"loss": 0.518, "grad_norm": 0.896037757396698, "learning_rate": 0.0002, "epoch": 2.4692874692874693, "step": 2010}, {"loss": 0.5245, "grad_norm": 0.8079167008399963, "learning_rate": 0.0002, "epoch": 2.4815724815724813, "step": 2020}, {"loss": 0.5311, "grad_norm": 1.0293292999267578, "learning_rate": 0.0002, "epoch": 2.493857493857494, "step": 2030}, {"loss": 0.5091, "grad_norm": 0.8459244966506958, "learning_rate": 0.0002, "epoch": 2.506142506142506, "step": 2040}, {"loss": 0.4922, "grad_norm": 0.9244982600212097, "learning_rate": 0.0002, "epoch": 2.5184275184275187, "step": 2050}, {"loss": 0.5006, "grad_norm": 0.8245007991790771, "learning_rate": 0.0002, "epoch": 2.5307125307125307, "step": 2060}, {"loss": 0.5229, "grad_norm": 0.8869297504425049, "learning_rate": 0.0002, "epoch": 2.542997542997543, "step": 2070}, {"loss": 0.5097, "grad_norm": 0.8620884418487549, "learning_rate": 0.0002, "epoch": 2.555282555282555, "step": 2080}, {"loss": 0.5239, "grad_norm": 0.8387904167175293, "learning_rate": 0.0002, "epoch": 2.5675675675675675, "step": 2090}, {"loss": 0.4974, "grad_norm": 0.8353935480117798, "learning_rate": 0.0002, "epoch": 2.57985257985258, "step": 2100}, {"loss": 0.5038, "grad_norm": 1.0136934518814087, "learning_rate": 0.0002, "epoch": 2.592137592137592, "step": 2110}, {"loss": 0.513, "grad_norm": 0.9387392997741699, "learning_rate": 0.0002, "epoch": 2.6044226044226044, "step": 2120}, {"loss": 0.4971, "grad_norm": 0.898697555065155, "learning_rate": 0.0002, "epoch": 2.616707616707617, "step": 2130}, {"loss": 0.4981, "grad_norm": 1.0145231485366821, "learning_rate": 0.0002, "epoch": 2.628992628992629, "step": 2140}, {"loss": 0.5151, "grad_norm": 0.8335273265838623, "learning_rate": 0.0002, "epoch": 2.6412776412776413, "step": 2150}, {"loss": 0.5129, "grad_norm": 1.0198529958724976, "learning_rate": 0.0002, "epoch": 2.6535626535626538, "step": 2160}, {"loss": 0.5156, "grad_norm": 0.8353323340415955, "learning_rate": 0.0002, "epoch": 2.6658476658476657, "step": 2170}, {"loss": 0.4818, "grad_norm": 0.8831406831741333, "learning_rate": 0.0002, "epoch": 2.678132678132678, "step": 2180}, {"loss": 0.4858, "grad_norm": 0.7182748913764954, "learning_rate": 0.0002, "epoch": 2.69041769041769, "step": 2190}, {"loss": 0.53, "grad_norm": 0.7892552614212036, "learning_rate": 0.0002, "epoch": 2.7027027027027026, "step": 2200}, {"loss": 0.5101, "grad_norm": 1.0144033432006836, "learning_rate": 0.0002, "epoch": 2.714987714987715, "step": 2210}, {"loss": 0.4909, "grad_norm": 1.0913645029067993, "learning_rate": 0.0002, "epoch": 2.7272727272727275, "step": 2220}, {"loss": 0.5069, "grad_norm": 1.014394998550415, "learning_rate": 0.0002, "epoch": 2.7395577395577395, "step": 2230}, {"loss": 0.4985, "grad_norm": 0.8118020296096802, "learning_rate": 0.0002, "epoch": 2.751842751842752, "step": 2240}, {"loss": 0.5088, "grad_norm": 0.9027737379074097, "learning_rate": 0.0002, "epoch": 2.764127764127764, "step": 2250}, {"loss": 0.5027, "grad_norm": 0.8017747402191162, "learning_rate": 0.0002, "epoch": 2.7764127764127764, "step": 2260}, {"loss": 0.4957, "grad_norm": 0.788362979888916, "learning_rate": 0.0002, "epoch": 2.788697788697789, "step": 2270}, {"loss": 0.5047, "grad_norm": 0.8338918089866638, "learning_rate": 0.0002, "epoch": 2.800982800982801, "step": 2280}, {"loss": 0.4925, "grad_norm": 0.8773167729377747, "learning_rate": 0.0002, "epoch": 2.8132678132678133, "step": 2290}, {"loss": 0.4806, "grad_norm": 0.9319674372673035, "learning_rate": 0.0002, "epoch": 2.8255528255528253, "step": 2300}, {"loss": 0.4815, "grad_norm": 0.8632726073265076, "learning_rate": 0.0002, "epoch": 2.8378378378378377, "step": 2310}, {"loss": 0.4842, "grad_norm": 0.785464882850647, "learning_rate": 0.0002, "epoch": 2.85012285012285, "step": 2320}, {"loss": 0.4867, "grad_norm": 0.8159732818603516, "learning_rate": 0.0002, "epoch": 2.8624078624078626, "step": 2330}, {"loss": 0.4796, "grad_norm": 0.8702368140220642, "learning_rate": 0.0002, "epoch": 2.8746928746928746, "step": 2340}, {"loss": 0.474, "grad_norm": 1.0456738471984863, "learning_rate": 0.0002, "epoch": 2.886977886977887, "step": 2350}, {"loss": 0.4934, "grad_norm": 1.0855203866958618, "learning_rate": 0.0002, "epoch": 2.899262899262899, "step": 2360}, {"loss": 0.4758, "grad_norm": 0.9378156065940857, "learning_rate": 0.0002, "epoch": 2.9115479115479115, "step": 2370}, {"loss": 0.4831, "grad_norm": 0.7390182018280029, "learning_rate": 0.0002, "epoch": 2.923832923832924, "step": 2380}, {"loss": 0.5066, "grad_norm": 0.7667133212089539, "learning_rate": 0.0002, "epoch": 2.9361179361179364, "step": 2390}, {"loss": 0.4722, "grad_norm": 0.8633476495742798, "learning_rate": 0.0002, "epoch": 2.9484029484029484, "step": 2400}, {"loss": 0.4993, "grad_norm": 1.0821104049682617, "learning_rate": 0.0002, "epoch": 2.960687960687961, "step": 2410}, {"loss": 0.4882, "grad_norm": 0.8911418914794922, "learning_rate": 0.0002, "epoch": 2.972972972972973, "step": 2420}, {"loss": 0.4819, "grad_norm": 0.8791135549545288, "learning_rate": 0.0002, "epoch": 2.9852579852579852, "step": 2430}, {"loss": 0.4875, "grad_norm": 0.8066530823707581, "learning_rate": 0.0002, "epoch": 2.9975429975429977, "step": 2440}, {"eval_loss": 0.49752503633499146, "eval_runtime": 20.2911, "eval_samples_per_second": 16.313, "eval_steps_per_second": 2.07, "epoch": 3.0, "step": 2442}, {"loss": 0.4362, "grad_norm": 0.7644656896591187, "learning_rate": 0.0002, "epoch": 3.0098280098280097, "step": 2450}, {"loss": 0.4363, "grad_norm": 0.9077525734901428, "learning_rate": 0.0002, "epoch": 3.022113022113022, "step": 2460}, {"loss": 0.422, "grad_norm": 0.7859287261962891, "learning_rate": 0.0002, "epoch": 3.0343980343980346, "step": 2470}, {"loss": 0.4574, "grad_norm": 1.1200323104858398, "learning_rate": 0.0002, "epoch": 3.0466830466830466, "step": 2480}, {"loss": 0.4519, "grad_norm": 0.7570453882217407, "learning_rate": 0.0002, "epoch": 3.058968058968059, "step": 2490}, {"loss": 0.4351, "grad_norm": 0.9450915455818176, "learning_rate": 0.0002, "epoch": 3.0712530712530715, "step": 2500}, {"loss": 0.4343, "grad_norm": 0.8303545117378235, "learning_rate": 0.0002, "epoch": 3.0835380835380835, "step": 2510}, {"loss": 0.4308, "grad_norm": 0.8864443898200989, "learning_rate": 0.0002, "epoch": 3.095823095823096, "step": 2520}, {"loss": 0.4601, "grad_norm": 0.945324718952179, "learning_rate": 0.0002, "epoch": 3.108108108108108, "step": 2530}, {"loss": 0.4345, "grad_norm": 1.0562494993209839, "learning_rate": 0.0002, "epoch": 3.1203931203931203, "step": 2540}, {"loss": 0.4375, "grad_norm": 0.8607500195503235, "learning_rate": 0.0002, "epoch": 3.1326781326781328, "step": 2550}, {"loss": 0.456, "grad_norm": 0.8719640374183655, "learning_rate": 0.0002, "epoch": 3.1449631449631448, "step": 2560}, {"loss": 0.4469, "grad_norm": 0.8647059202194214, "learning_rate": 0.0002, "epoch": 3.157248157248157, "step": 2570}, {"loss": 0.4483, "grad_norm": 0.8346507549285889, "learning_rate": 0.0002, "epoch": 3.1695331695331697, "step": 2580}, {"loss": 0.4331, "grad_norm": 1.0208854675292969, "learning_rate": 0.0002, "epoch": 3.1818181818181817, "step": 2590}, {"loss": 0.435, "grad_norm": 0.7064385414123535, "learning_rate": 0.0002, "epoch": 3.194103194103194, "step": 2600}, {"loss": 0.4541, "grad_norm": 0.927347719669342, "learning_rate": 0.0002, "epoch": 3.2063882063882065, "step": 2610}, {"loss": 0.4561, "grad_norm": 0.943517804145813, "learning_rate": 0.0002, "epoch": 3.2186732186732185, "step": 2620}, {"loss": 0.4225, "grad_norm": 0.7837198376655579, "learning_rate": 0.0002, "epoch": 3.230958230958231, "step": 2630}, {"loss": 0.4494, "grad_norm": 0.7752765417098999, "learning_rate": 0.0002, "epoch": 3.2432432432432434, "step": 2640}, {"loss": 0.4468, "grad_norm": 0.8578953146934509, "learning_rate": 0.0002, "epoch": 3.2555282555282554, "step": 2650}, {"loss": 0.4393, "grad_norm": 1.0209529399871826, "learning_rate": 0.0002, "epoch": 3.267813267813268, "step": 2660}, {"loss": 0.4517, "grad_norm": 0.9069030284881592, "learning_rate": 0.0002, "epoch": 3.2800982800982803, "step": 2670}, {"loss": 0.4262, "grad_norm": 0.8454729318618774, "learning_rate": 0.0002, "epoch": 3.2923832923832923, "step": 2680}, {"loss": 0.4349, "grad_norm": 0.8253099322319031, "learning_rate": 0.0002, "epoch": 3.3046683046683047, "step": 2690}, {"loss": 0.4503, "grad_norm": 0.8765934109687805, "learning_rate": 0.0002, "epoch": 3.3169533169533167, "step": 2700}, {"loss": 0.4518, "grad_norm": 0.8149126172065735, "learning_rate": 0.0002, "epoch": 3.329238329238329, "step": 2710}, {"loss": 0.4437, "grad_norm": 0.8820102214813232, "learning_rate": 0.0002, "epoch": 3.3415233415233416, "step": 2720}, {"loss": 0.4346, "grad_norm": 0.8813952803611755, "learning_rate": 0.0002, "epoch": 3.3538083538083536, "step": 2730}, {"loss": 0.4396, "grad_norm": 1.0338447093963623, "learning_rate": 0.0002, "epoch": 3.366093366093366, "step": 2740}, {"loss": 0.4468, "grad_norm": 0.8780209422111511, "learning_rate": 0.0002, "epoch": 3.3783783783783785, "step": 2750}, {"loss": 0.441, "grad_norm": 0.9017151594161987, "learning_rate": 0.0002, "epoch": 3.3906633906633905, "step": 2760}, {"loss": 0.446, "grad_norm": 0.8647638559341431, "learning_rate": 0.0002, "epoch": 3.402948402948403, "step": 2770}, {"loss": 0.4131, "grad_norm": 0.8298183679580688, "learning_rate": 0.0002, "epoch": 3.4152334152334154, "step": 2780}, {"loss": 0.4406, "grad_norm": 0.9298108816146851, "learning_rate": 0.0002, "epoch": 3.4275184275184274, "step": 2790}, {"loss": 0.4145, "grad_norm": 0.8909980058670044, "learning_rate": 0.0002, "epoch": 3.43980343980344, "step": 2800}, {"loss": 0.4148, "grad_norm": 0.8027496933937073, "learning_rate": 0.0002, "epoch": 3.4520884520884523, "step": 2810}, {"loss": 0.4244, "grad_norm": 0.8766195774078369, "learning_rate": 0.0002, "epoch": 3.4643734643734643, "step": 2820}, {"loss": 0.4292, "grad_norm": 0.8194443583488464, "learning_rate": 0.0002, "epoch": 3.4766584766584767, "step": 2830}, {"loss": 0.4305, "grad_norm": 0.9862873554229736, "learning_rate": 0.0002, "epoch": 3.488943488943489, "step": 2840}, {"loss": 0.4393, "grad_norm": 0.8755377531051636, "learning_rate": 0.0002, "epoch": 3.501228501228501, "step": 2850}, {"loss": 0.4231, "grad_norm": 0.7300266027450562, "learning_rate": 0.0002, "epoch": 3.5135135135135136, "step": 2860}, {"loss": 0.4278, "grad_norm": 0.8342461585998535, "learning_rate": 0.0002, "epoch": 3.5257985257985256, "step": 2870}, {"loss": 0.4395, "grad_norm": 0.8624151349067688, "learning_rate": 0.0002, "epoch": 3.538083538083538, "step": 2880}, {"loss": 0.4064, "grad_norm": 0.8931261301040649, "learning_rate": 0.0002, "epoch": 3.5503685503685505, "step": 2890}, {"loss": 0.4358, "grad_norm": 0.8617086410522461, "learning_rate": 0.0002, "epoch": 3.562653562653563, "step": 2900}, {"loss": 0.419, "grad_norm": 0.8754099607467651, "learning_rate": 0.0002, "epoch": 3.574938574938575, "step": 2910}, {"loss": 0.4275, "grad_norm": 0.8345834612846375, "learning_rate": 0.0002, "epoch": 3.5872235872235874, "step": 2920}, {"loss": 0.4375, "grad_norm": 1.1414062976837158, "learning_rate": 0.0002, "epoch": 3.5995085995085994, "step": 2930}, {"loss": 0.4297, "grad_norm": 0.994860053062439, "learning_rate": 0.0002, "epoch": 3.611793611793612, "step": 2940}, {"loss": 0.4386, "grad_norm": 1.19268000125885, "learning_rate": 0.0002, "epoch": 3.6240786240786242, "step": 2950}, {"loss": 0.4029, "grad_norm": 0.8399543762207031, "learning_rate": 0.0002, "epoch": 3.6363636363636362, "step": 2960}, {"loss": 0.4432, "grad_norm": 0.9873217940330505, "learning_rate": 0.0002, "epoch": 3.6486486486486487, "step": 2970}, {"loss": 0.4308, "grad_norm": 0.9116013646125793, "learning_rate": 0.0002, "epoch": 3.6609336609336607, "step": 2980}, {"loss": 0.4275, "grad_norm": 0.9503833651542664, "learning_rate": 0.0002, "epoch": 3.673218673218673, "step": 2990}, {"loss": 0.4306, "grad_norm": 0.9401112794876099, "learning_rate": 0.0002, "epoch": 3.6855036855036856, "step": 3000}, {"loss": 0.4333, "grad_norm": 1.00745689868927, "learning_rate": 0.0002, "epoch": 3.697788697788698, "step": 3010}, {"loss": 0.432, "grad_norm": 1.0553191900253296, "learning_rate": 0.0002, "epoch": 3.71007371007371, "step": 3020}, {"loss": 0.4321, "grad_norm": 1.0226953029632568, "learning_rate": 0.0002, "epoch": 3.7223587223587224, "step": 3030}, {"loss": 0.418, "grad_norm": 1.085554838180542, "learning_rate": 0.0002, "epoch": 3.7346437346437344, "step": 3040}, {"loss": 0.4196, "grad_norm": 0.9948731064796448, "learning_rate": 0.0002, "epoch": 3.746928746928747, "step": 3050}, {"loss": 0.4281, "grad_norm": 0.9328727126121521, "learning_rate": 0.0002, "epoch": 3.7592137592137593, "step": 3060}, {"loss": 0.4284, "grad_norm": 1.0533266067504883, "learning_rate": 0.0002, "epoch": 3.7714987714987718, "step": 3070}, {"loss": 0.4414, "grad_norm": 0.8213809132575989, "learning_rate": 0.0002, "epoch": 3.7837837837837838, "step": 3080}, {"loss": 0.4348, "grad_norm": 0.8941594362258911, "learning_rate": 0.0002, "epoch": 3.796068796068796, "step": 3090}, {"loss": 0.4266, "grad_norm": 0.8324518203735352, "learning_rate": 0.0002, "epoch": 3.808353808353808, "step": 3100}, {"loss": 0.4227, "grad_norm": 0.8811233639717102, "learning_rate": 0.0002, "epoch": 3.8206388206388207, "step": 3110}, {"loss": 0.4195, "grad_norm": 0.8781470060348511, "learning_rate": 0.0002, "epoch": 3.832923832923833, "step": 3120}, {"loss": 0.4277, "grad_norm": 0.8994116187095642, "learning_rate": 0.0002, "epoch": 3.845208845208845, "step": 3130}, {"loss": 0.4149, "grad_norm": 0.8605017066001892, "learning_rate": 0.0002, "epoch": 3.8574938574938575, "step": 3140}, {"loss": 0.4023, "grad_norm": 0.8966400027275085, "learning_rate": 0.0002, "epoch": 3.8697788697788695, "step": 3150}, {"loss": 0.4245, "grad_norm": 0.8856554627418518, "learning_rate": 0.0002, "epoch": 3.882063882063882, "step": 3160}, {"loss": 0.4101, "grad_norm": 0.8971620798110962, "learning_rate": 0.0002, "epoch": 3.8943488943488944, "step": 3170}, {"loss": 0.3993, "grad_norm": 0.9807813167572021, "learning_rate": 0.0002, "epoch": 3.906633906633907, "step": 3180}, {"loss": 0.4258, "grad_norm": 0.8614121675491333, "learning_rate": 0.0002, "epoch": 3.918918918918919, "step": 3190}, {"loss": 0.4115, "grad_norm": 0.989171028137207, "learning_rate": 0.0002, "epoch": 3.9312039312039313, "step": 3200}, {"loss": 0.4182, "grad_norm": 0.8168872594833374, "learning_rate": 0.0002, "epoch": 3.9434889434889433, "step": 3210}, {"loss": 0.4112, "grad_norm": 0.8109386563301086, "learning_rate": 0.0002, "epoch": 3.9557739557739557, "step": 3220}, {"loss": 0.4165, "grad_norm": 1.0175853967666626, "learning_rate": 0.0002, "epoch": 3.968058968058968, "step": 3230}, {"loss": 0.4146, "grad_norm": 0.936143159866333, "learning_rate": 0.0002, "epoch": 3.98034398034398, "step": 3240}, {"loss": 0.4163, "grad_norm": 0.9557915925979614, "learning_rate": 0.0002, "epoch": 3.9926289926289926, "step": 3250}, {"eval_loss": 0.4401616156101227, "eval_runtime": 20.8047, "eval_samples_per_second": 15.91, "eval_steps_per_second": 2.019, "epoch": 4.0, "step": 3256}, {"loss": 0.408, "grad_norm": 0.7590614557266235, "learning_rate": 0.0002, "epoch": 4.004914004914005, "step": 3260}, {"loss": 0.4001, "grad_norm": 0.8920791149139404, "learning_rate": 0.0002, "epoch": 4.017199017199017, "step": 3270}, {"loss": 0.3789, "grad_norm": 0.8640421628952026, "learning_rate": 0.0002, "epoch": 4.0294840294840295, "step": 3280}, {"loss": 0.3791, "grad_norm": 0.9074113965034485, "learning_rate": 0.0002, "epoch": 4.041769041769042, "step": 3290}, {"loss": 0.3728, "grad_norm": 1.0600885152816772, "learning_rate": 0.0002, "epoch": 4.054054054054054, "step": 3300}, {"loss": 0.3857, "grad_norm": 0.9682773351669312, "learning_rate": 0.0002, "epoch": 4.066339066339066, "step": 3310}, {"loss": 0.4007, "grad_norm": 0.9326395392417908, "learning_rate": 0.0002, "epoch": 4.078624078624078, "step": 3320}, {"loss": 0.3823, "grad_norm": 0.8886597156524658, "learning_rate": 0.0002, "epoch": 4.090909090909091, "step": 3330}, {"loss": 0.3929, "grad_norm": 1.032205581665039, "learning_rate": 0.0002, "epoch": 4.103194103194103, "step": 3340}, {"loss": 0.3836, "grad_norm": 0.8669408559799194, "learning_rate": 0.0002, "epoch": 4.115479115479116, "step": 3350}, {"loss": 0.3866, "grad_norm": 0.8250347971916199, "learning_rate": 0.0002, "epoch": 4.127764127764128, "step": 3360}, {"loss": 0.3826, "grad_norm": 0.7919842600822449, "learning_rate": 0.0002, "epoch": 4.14004914004914, "step": 3370}, {"loss": 0.3838, "grad_norm": 1.045682430267334, "learning_rate": 0.0002, "epoch": 4.152334152334152, "step": 3380}, {"loss": 0.3796, "grad_norm": 0.6873571276664734, "learning_rate": 0.0002, "epoch": 4.164619164619165, "step": 3390}, {"loss": 0.3942, "grad_norm": 1.0227675437927246, "learning_rate": 0.0002, "epoch": 4.176904176904177, "step": 3400}, {"loss": 0.3788, "grad_norm": 0.9167711734771729, "learning_rate": 0.0002, "epoch": 4.1891891891891895, "step": 3410}, {"loss": 0.3792, "grad_norm": 1.0598796606063843, "learning_rate": 0.0002, "epoch": 4.201474201474202, "step": 3420}, {"loss": 0.3955, "grad_norm": 0.8581843972206116, "learning_rate": 0.0002, "epoch": 4.2137592137592135, "step": 3430}, {"loss": 0.3761, "grad_norm": 0.8862360119819641, "learning_rate": 0.0002, "epoch": 4.226044226044226, "step": 3440}, {"loss": 0.3889, "grad_norm": 1.0248323678970337, "learning_rate": 0.0002, "epoch": 4.238329238329238, "step": 3450}, {"loss": 0.3827, "grad_norm": 0.8746261596679688, "learning_rate": 0.0002, "epoch": 4.250614250614251, "step": 3460}, {"loss": 0.3949, "grad_norm": 0.7442536354064941, "learning_rate": 0.0002, "epoch": 4.262899262899263, "step": 3470}, {"loss": 0.3761, "grad_norm": 0.8295119404792786, "learning_rate": 0.0002, "epoch": 4.275184275184275, "step": 3480}, {"loss": 0.3895, "grad_norm": 1.0634245872497559, "learning_rate": 0.0002, "epoch": 4.287469287469287, "step": 3490}, {"loss": 0.3955, "grad_norm": 0.9554621577262878, "learning_rate": 0.0002, "epoch": 4.2997542997543, "step": 3500}, {"loss": 0.3826, "grad_norm": 1.0191723108291626, "learning_rate": 0.0002, "epoch": 4.312039312039312, "step": 3510}, {"loss": 0.3828, "grad_norm": 0.8573611378669739, "learning_rate": 0.0002, "epoch": 4.324324324324325, "step": 3520}, {"loss": 0.3869, "grad_norm": 0.9082390069961548, "learning_rate": 0.0002, "epoch": 4.336609336609337, "step": 3530}, {"loss": 0.3902, "grad_norm": 0.8650212287902832, "learning_rate": 0.0002, "epoch": 4.348894348894349, "step": 3540}, {"loss": 0.3915, "grad_norm": 0.7186297178268433, "learning_rate": 0.0002, "epoch": 4.361179361179361, "step": 3550}, {"loss": 0.3861, "grad_norm": 0.9750986695289612, "learning_rate": 0.0002, "epoch": 4.3734643734643734, "step": 3560}, {"loss": 0.3967, "grad_norm": 1.0710467100143433, "learning_rate": 0.0002, "epoch": 4.385749385749386, "step": 3570}, {"loss": 0.3774, "grad_norm": 0.7974869012832642, "learning_rate": 0.0002, "epoch": 4.398034398034398, "step": 3580}, {"loss": 0.3738, "grad_norm": 0.9405913949012756, "learning_rate": 0.0002, "epoch": 4.41031941031941, "step": 3590}, {"loss": 0.3982, "grad_norm": 0.9393602609634399, "learning_rate": 0.0002, "epoch": 4.422604422604422, "step": 3600}, {"loss": 0.3913, "grad_norm": 1.0798007249832153, "learning_rate": 0.0002, "epoch": 4.434889434889435, "step": 3610}, {"loss": 0.3682, "grad_norm": 0.9226186275482178, "learning_rate": 0.0002, "epoch": 4.447174447174447, "step": 3620}, {"loss": 0.3742, "grad_norm": 1.1046524047851562, "learning_rate": 0.0002, "epoch": 4.45945945945946, "step": 3630}, {"loss": 0.3886, "grad_norm": 0.8848567605018616, "learning_rate": 0.0002, "epoch": 4.471744471744472, "step": 3640}, {"loss": 0.3848, "grad_norm": 0.8913224339485168, "learning_rate": 0.0002, "epoch": 4.484029484029484, "step": 3650}, {"loss": 0.3731, "grad_norm": 0.8497583270072937, "learning_rate": 0.0002, "epoch": 4.496314496314496, "step": 3660}, {"loss": 0.3804, "grad_norm": 0.8263831734657288, "learning_rate": 0.0002, "epoch": 4.5085995085995085, "step": 3670}, {"loss": 0.3815, "grad_norm": 0.8470269441604614, "learning_rate": 0.0002, "epoch": 4.520884520884521, "step": 3680}, {"loss": 0.3774, "grad_norm": 0.860038161277771, "learning_rate": 0.0002, "epoch": 4.533169533169533, "step": 3690}, {"loss": 0.3817, "grad_norm": 0.8898552656173706, "learning_rate": 0.0002, "epoch": 4.545454545454545, "step": 3700}, {"loss": 0.3776, "grad_norm": 0.8152070641517639, "learning_rate": 0.0002, "epoch": 4.557739557739557, "step": 3710}, {"loss": 0.383, "grad_norm": 0.7847675085067749, "learning_rate": 0.0002, "epoch": 4.57002457002457, "step": 3720}, {"loss": 0.3791, "grad_norm": 0.9625533819198608, "learning_rate": 0.0002, "epoch": 4.582309582309582, "step": 3730}, {"loss": 0.3699, "grad_norm": 0.9097456336021423, "learning_rate": 0.0002, "epoch": 4.594594594594595, "step": 3740}, {"loss": 0.3673, "grad_norm": 0.871329128742218, "learning_rate": 0.0002, "epoch": 4.606879606879607, "step": 3750}, {"loss": 0.3725, "grad_norm": 0.9879975914955139, "learning_rate": 0.0002, "epoch": 4.61916461916462, "step": 3760}, {"loss": 0.3827, "grad_norm": 0.8636731505393982, "learning_rate": 0.0002, "epoch": 4.631449631449631, "step": 3770}, {"loss": 0.3755, "grad_norm": 1.0488964319229126, "learning_rate": 0.0002, "epoch": 4.643734643734644, "step": 3780}, {"loss": 0.3738, "grad_norm": 0.7637056112289429, "learning_rate": 0.0002, "epoch": 4.656019656019656, "step": 3790}, {"loss": 0.3676, "grad_norm": 0.8507546186447144, "learning_rate": 0.0002, "epoch": 4.6683046683046685, "step": 3800}, {"loss": 0.3852, "grad_norm": 1.0216856002807617, "learning_rate": 0.0002, "epoch": 4.680589680589681, "step": 3810}, {"loss": 0.3751, "grad_norm": 1.026343822479248, "learning_rate": 0.0002, "epoch": 4.6928746928746925, "step": 3820}, {"loss": 0.3687, "grad_norm": 0.8311620950698853, "learning_rate": 0.0002, "epoch": 4.705159705159705, "step": 3830}, {"loss": 0.3771, "grad_norm": 0.7770653367042542, "learning_rate": 0.0002, "epoch": 4.717444717444717, "step": 3840}, {"loss": 0.37, "grad_norm": 0.7616215348243713, "learning_rate": 0.0002, "epoch": 4.72972972972973, "step": 3850}, {"loss": 0.3927, "grad_norm": 1.0377072095870972, "learning_rate": 0.0002, "epoch": 4.742014742014742, "step": 3860}, {"loss": 0.3832, "grad_norm": 0.9713505506515503, "learning_rate": 0.0002, "epoch": 4.754299754299755, "step": 3870}, {"loss": 0.3722, "grad_norm": 0.8803321719169617, "learning_rate": 0.0002, "epoch": 4.766584766584766, "step": 3880}, {"loss": 0.3756, "grad_norm": 0.885535478591919, "learning_rate": 0.0002, "epoch": 4.778869778869779, "step": 3890}, {"loss": 0.3714, "grad_norm": 1.0877983570098877, "learning_rate": 0.0002, "epoch": 4.791154791154791, "step": 3900}, {"loss": 0.3879, "grad_norm": 0.7875366806983948, "learning_rate": 0.0002, "epoch": 4.803439803439804, "step": 3910}, {"loss": 0.3591, "grad_norm": 0.8550102114677429, "learning_rate": 0.0002, "epoch": 4.815724815724816, "step": 3920}, {"loss": 0.3716, "grad_norm": 1.0217846632003784, "learning_rate": 0.0002, "epoch": 4.828009828009828, "step": 3930}, {"loss": 0.3649, "grad_norm": 0.7315713167190552, "learning_rate": 0.0002, "epoch": 4.84029484029484, "step": 3940}, {"loss": 0.3879, "grad_norm": 0.8924923539161682, "learning_rate": 0.0002, "epoch": 4.8525798525798525, "step": 3950}, {"loss": 0.3669, "grad_norm": 0.9730218052864075, "learning_rate": 0.0002, "epoch": 4.864864864864865, "step": 3960}, {"loss": 0.3705, "grad_norm": 0.9202003479003906, "learning_rate": 0.0002, "epoch": 4.877149877149877, "step": 3970}, {"loss": 0.3617, "grad_norm": 0.8173081874847412, "learning_rate": 0.0002, "epoch": 4.88943488943489, "step": 3980}, {"loss": 0.37, "grad_norm": 0.7178564667701721, "learning_rate": 0.0002, "epoch": 4.901719901719901, "step": 3990}, {"loss": 0.3768, "grad_norm": 0.913684606552124, "learning_rate": 0.0002, "epoch": 4.914004914004914, "step": 4000}, {"loss": 0.3755, "grad_norm": 0.8817896842956543, "learning_rate": 0.0002, "epoch": 4.926289926289926, "step": 4010}, {"loss": 0.3676, "grad_norm": 0.7652186751365662, "learning_rate": 0.0002, "epoch": 4.938574938574939, "step": 4020}, {"loss": 0.3699, "grad_norm": 0.8828630447387695, "learning_rate": 0.0002, "epoch": 4.950859950859951, "step": 4030}, {"loss": 0.3672, "grad_norm": 1.0878605842590332, "learning_rate": 0.0002, "epoch": 4.963144963144963, "step": 4040}, {"loss": 0.3656, "grad_norm": 1.0845288038253784, "learning_rate": 0.0002, "epoch": 4.975429975429975, "step": 4050}, {"loss": 0.365, "grad_norm": 0.8431115746498108, "learning_rate": 0.0002, "epoch": 4.987714987714988, "step": 4060}, {"loss": 0.3693, "grad_norm": 0.8320387601852417, "learning_rate": 0.0002, "epoch": 5.0, "step": 4070}, {"eval_loss": 0.4017423093318939, "eval_runtime": 20.8466, "eval_samples_per_second": 15.878, "eval_steps_per_second": 2.015, "epoch": 5.0, "step": 4070}, {"loss": 0.3425, "grad_norm": 0.8639023900032043, "learning_rate": 0.0002, "epoch": 5.012285012285012, "step": 4080}, {"loss": 0.3458, "grad_norm": 0.7123713493347168, "learning_rate": 0.0002, "epoch": 5.024570024570025, "step": 4090}, {"loss": 0.3404, "grad_norm": 0.9886922836303711, "learning_rate": 0.0002, "epoch": 5.036855036855036, "step": 4100}, {"loss": 0.3529, "grad_norm": 0.7880306243896484, "learning_rate": 0.0002, "epoch": 5.049140049140049, "step": 4110}, {"loss": 0.3406, "grad_norm": 0.7488741874694824, "learning_rate": 0.0002, "epoch": 5.061425061425061, "step": 4120}, {"loss": 0.3542, "grad_norm": 0.9359086751937866, "learning_rate": 0.0002, "epoch": 5.073710073710074, "step": 4130}, {"loss": 0.3471, "grad_norm": 0.9401527047157288, "learning_rate": 0.0002, "epoch": 5.085995085995086, "step": 4140}, {"loss": 0.3566, "grad_norm": 0.8396275043487549, "learning_rate": 0.0002, "epoch": 5.098280098280099, "step": 4150}, {"loss": 0.3416, "grad_norm": 0.7132664918899536, "learning_rate": 0.0002, "epoch": 5.11056511056511, "step": 4160}, {"loss": 0.3457, "grad_norm": 0.843708872795105, "learning_rate": 0.0002, "epoch": 5.122850122850123, "step": 4170}, {"loss": 0.3399, "grad_norm": 0.8733304738998413, "learning_rate": 0.0002, "epoch": 5.135135135135135, "step": 4180}, {"loss": 0.3501, "grad_norm": 0.9064375162124634, "learning_rate": 0.0002, "epoch": 5.1474201474201475, "step": 4190}, {"loss": 0.3455, "grad_norm": 0.900770902633667, "learning_rate": 0.0002, "epoch": 5.15970515970516, "step": 4200}, {"loss": 0.3475, "grad_norm": 0.863853394985199, "learning_rate": 0.0002, "epoch": 5.171990171990172, "step": 4210}, {"loss": 0.3497, "grad_norm": 0.767134964466095, "learning_rate": 0.0002, "epoch": 5.184275184275184, "step": 4220}, {"loss": 0.3527, "grad_norm": 0.7518735527992249, "learning_rate": 0.0002, "epoch": 5.196560196560196, "step": 4230}, {"loss": 0.3369, "grad_norm": 0.8040947914123535, "learning_rate": 0.0002, "epoch": 5.208845208845209, "step": 4240}, {"loss": 0.3496, "grad_norm": 0.7827144265174866, "learning_rate": 0.0002, "epoch": 5.221130221130221, "step": 4250}, {"loss": 0.3442, "grad_norm": 0.7306333184242249, "learning_rate": 0.0002, "epoch": 5.233415233415234, "step": 4260}, {"loss": 0.3553, "grad_norm": 1.0963380336761475, "learning_rate": 0.0002, "epoch": 5.245700245700245, "step": 4270}, {"loss": 0.3462, "grad_norm": 0.8200454711914062, "learning_rate": 0.0002, "epoch": 5.257985257985258, "step": 4280}, {"loss": 0.3509, "grad_norm": 0.8666796684265137, "learning_rate": 0.0002, "epoch": 5.27027027027027, "step": 4290}, {"loss": 0.3423, "grad_norm": 0.7862894535064697, "learning_rate": 0.0002, "epoch": 5.282555282555283, "step": 4300}, {"loss": 0.3623, "grad_norm": 0.8163095712661743, "learning_rate": 0.0002, "epoch": 5.294840294840295, "step": 4310}, {"loss": 0.34, "grad_norm": 0.8069050908088684, "learning_rate": 0.0002, "epoch": 5.3071253071253075, "step": 4320}, {"loss": 0.3532, "grad_norm": 0.7858486175537109, "learning_rate": 0.0002, "epoch": 5.319410319410319, "step": 4330}, {"loss": 0.3435, "grad_norm": 0.950339674949646, "learning_rate": 0.0002, "epoch": 5.3316953316953315, "step": 4340}, {"loss": 0.3498, "grad_norm": 0.9056477546691895, "learning_rate": 0.0002, "epoch": 5.343980343980344, "step": 4350}, {"loss": 0.3538, "grad_norm": 0.9619399905204773, "learning_rate": 0.0002, "epoch": 5.356265356265356, "step": 4360}, {"loss": 0.3455, "grad_norm": 0.9778652191162109, "learning_rate": 0.0002, "epoch": 5.368550368550369, "step": 4370}, {"loss": 0.3498, "grad_norm": 0.6919555068016052, "learning_rate": 0.0002, "epoch": 5.38083538083538, "step": 4380}, {"loss": 0.3426, "grad_norm": 0.8121668696403503, "learning_rate": 0.0002, "epoch": 5.393120393120393, "step": 4390}, {"loss": 0.3442, "grad_norm": 0.8481289148330688, "learning_rate": 0.0002, "epoch": 5.405405405405405, "step": 4400}, {"loss": 0.345, "grad_norm": 0.8727408647537231, "learning_rate": 0.0002, "epoch": 5.417690417690418, "step": 4410}, {"loss": 0.3554, "grad_norm": 0.8920271396636963, "learning_rate": 0.0002, "epoch": 5.42997542997543, "step": 4420}, {"loss": 0.3409, "grad_norm": 0.7758749723434448, "learning_rate": 0.0002, "epoch": 5.442260442260443, "step": 4430}, {"loss": 0.3483, "grad_norm": 0.8847506642341614, "learning_rate": 0.0002, "epoch": 5.454545454545454, "step": 4440}, {"loss": 0.3557, "grad_norm": 0.9760470390319824, "learning_rate": 0.0002, "epoch": 5.466830466830467, "step": 4450}, {"loss": 0.3536, "grad_norm": 0.8940271139144897, "learning_rate": 0.0002, "epoch": 5.479115479115479, "step": 4460}, {"loss": 0.3577, "grad_norm": 0.8668502569198608, "learning_rate": 0.0002, "epoch": 5.4914004914004915, "step": 4470}, {"loss": 0.3462, "grad_norm": 0.9097439050674438, "learning_rate": 0.0002, "epoch": 5.503685503685504, "step": 4480}, {"loss": 0.3417, "grad_norm": 0.8217208981513977, "learning_rate": 0.0002, "epoch": 5.515970515970516, "step": 4490}, {"loss": 0.3482, "grad_norm": 0.7853189706802368, "learning_rate": 0.0002, "epoch": 5.528255528255528, "step": 4500}, {"loss": 0.3479, "grad_norm": 1.1113477945327759, "learning_rate": 0.0002, "epoch": 5.54054054054054, "step": 4510}, {"loss": 0.3553, "grad_norm": 0.8637538552284241, "learning_rate": 0.0002, "epoch": 5.552825552825553, "step": 4520}, {"loss": 0.3403, "grad_norm": 1.0230066776275635, "learning_rate": 0.0002, "epoch": 5.565110565110565, "step": 4530}, {"loss": 0.3588, "grad_norm": 0.8972793817520142, "learning_rate": 0.0002, "epoch": 5.577395577395578, "step": 4540}, {"loss": 0.3428, "grad_norm": 0.7950642704963684, "learning_rate": 0.0002, "epoch": 5.58968058968059, "step": 4550}, {"loss": 0.3468, "grad_norm": 1.113753318786621, "learning_rate": 0.0002, "epoch": 5.601965601965602, "step": 4560}, {"loss": 0.3354, "grad_norm": 0.7842669486999512, "learning_rate": 0.0002, "epoch": 5.614250614250614, "step": 4570}, {"loss": 0.3419, "grad_norm": 0.9713512063026428, "learning_rate": 0.0002, "epoch": 5.6265356265356266, "step": 4580}, {"loss": 0.3502, "grad_norm": 0.9451650977134705, "learning_rate": 0.0002, "epoch": 5.638820638820639, "step": 4590}, {"loss": 0.3416, "grad_norm": 1.055484414100647, "learning_rate": 0.0002, "epoch": 5.651105651105651, "step": 4600}, {"loss": 0.3436, "grad_norm": 0.8408507704734802, "learning_rate": 0.0002, "epoch": 5.663390663390663, "step": 4610}, {"loss": 0.3619, "grad_norm": 1.0293926000595093, "learning_rate": 0.0002, "epoch": 5.675675675675675, "step": 4620}, {"loss": 0.3484, "grad_norm": 0.7198245525360107, "learning_rate": 0.0002, "epoch": 5.687960687960688, "step": 4630}, {"loss": 0.3563, "grad_norm": 0.7564466595649719, "learning_rate": 0.0002, "epoch": 5.7002457002457, "step": 4640}, {"loss": 0.3435, "grad_norm": 0.7980002760887146, "learning_rate": 0.0002, "epoch": 5.712530712530713, "step": 4650}, {"loss": 0.3478, "grad_norm": 0.8685088753700256, "learning_rate": 0.0002, "epoch": 5.724815724815725, "step": 4660}, {"loss": 0.3692, "grad_norm": 0.8816949129104614, "learning_rate": 0.0002, "epoch": 5.737100737100737, "step": 4670}, {"loss": 0.3462, "grad_norm": 0.7154731750488281, "learning_rate": 0.0002, "epoch": 5.749385749385749, "step": 4680}, {"loss": 0.3503, "grad_norm": 0.9430679678916931, "learning_rate": 0.0002, "epoch": 5.761670761670762, "step": 4690}, {"loss": 0.3439, "grad_norm": 0.7640151381492615, "learning_rate": 0.0002, "epoch": 5.773955773955774, "step": 4700}, {"loss": 0.3444, "grad_norm": 1.0920690298080444, "learning_rate": 0.0002, "epoch": 5.7862407862407865, "step": 4710}, {"loss": 0.3356, "grad_norm": 0.9362104535102844, "learning_rate": 0.0002, "epoch": 5.798525798525798, "step": 4720}, {"loss": 0.339, "grad_norm": 0.8392294645309448, "learning_rate": 0.0002, "epoch": 5.8108108108108105, "step": 4730}, {"loss": 0.3488, "grad_norm": 0.9893582463264465, "learning_rate": 0.0002, "epoch": 5.823095823095823, "step": 4740}, {"loss": 0.3446, "grad_norm": 0.6985510587692261, "learning_rate": 0.0002, "epoch": 5.835380835380835, "step": 4750}, {"loss": 0.3534, "grad_norm": 0.8906862735748291, "learning_rate": 0.0002, "epoch": 5.847665847665848, "step": 4760}, {"loss": 0.3481, "grad_norm": 0.8036413192749023, "learning_rate": 0.0002, "epoch": 5.85995085995086, "step": 4770}, {"loss": 0.3326, "grad_norm": 0.9948155283927917, "learning_rate": 0.0002, "epoch": 5.872235872235873, "step": 4780}, {"loss": 0.3385, "grad_norm": 0.8618432283401489, "learning_rate": 0.0002, "epoch": 5.884520884520884, "step": 4790}, {"loss": 0.3302, "grad_norm": 1.0422909259796143, "learning_rate": 0.0002, "epoch": 5.896805896805897, "step": 4800}, {"loss": 0.3448, "grad_norm": 1.1892569065093994, "learning_rate": 0.0002, "epoch": 5.909090909090909, "step": 4810}, {"loss": 0.3506, "grad_norm": 1.1459916830062866, "learning_rate": 0.0002, "epoch": 5.921375921375922, "step": 4820}, {"loss": 0.3387, "grad_norm": 1.056235909461975, "learning_rate": 0.0002, "epoch": 5.933660933660933, "step": 4830}, {"loss": 0.344, "grad_norm": 0.8517277240753174, "learning_rate": 0.0002, "epoch": 5.945945945945946, "step": 4840}, {"loss": 0.3421, "grad_norm": 0.8153380751609802, "learning_rate": 0.0002, "epoch": 5.958230958230958, "step": 4850}, {"loss": 0.3409, "grad_norm": 0.7907533049583435, "learning_rate": 0.0002, "epoch": 5.9705159705159705, "step": 4860}, {"loss": 0.3337, "grad_norm": 0.8443069458007812, "learning_rate": 0.0002, "epoch": 5.982800982800983, "step": 4870}, {"loss": 0.3351, "grad_norm": 0.8711344003677368, "learning_rate": 0.0002, "epoch": 5.995085995085995, "step": 4880}, {"eval_loss": 0.3778059184551239, "eval_runtime": 20.6858, "eval_samples_per_second": 16.001, "eval_steps_per_second": 2.03, "epoch": 6.0, "step": 4884}, {"loss": 0.3244, "grad_norm": 0.7697948813438416, "learning_rate": 0.0002, "epoch": 6.007371007371007, "step": 4890}, {"loss": 0.3118, "grad_norm": 0.7734108567237854, "learning_rate": 0.0002, "epoch": 6.019656019656019, "step": 4900}, {"loss": 0.3242, "grad_norm": 0.7173922657966614, "learning_rate": 0.0002, "epoch": 6.031941031941032, "step": 4910}, {"loss": 0.3159, "grad_norm": 1.062118649482727, "learning_rate": 0.0002, "epoch": 6.044226044226044, "step": 4920}, {"loss": 0.3361, "grad_norm": 0.746422529220581, "learning_rate": 0.0002, "epoch": 6.056511056511057, "step": 4930}, {"loss": 0.3204, "grad_norm": 0.8549448251724243, "learning_rate": 0.0002, "epoch": 6.068796068796069, "step": 4940}, {"loss": 0.3236, "grad_norm": 0.9405432939529419, "learning_rate": 0.0002, "epoch": 6.081081081081081, "step": 4950}, {"loss": 0.3278, "grad_norm": 0.752382755279541, "learning_rate": 0.0002, "epoch": 6.093366093366093, "step": 4960}, {"loss": 0.3204, "grad_norm": 0.820332407951355, "learning_rate": 0.0002, "epoch": 6.105651105651106, "step": 4970}, {"loss": 0.3192, "grad_norm": 0.8701449036598206, "learning_rate": 0.0002, "epoch": 6.117936117936118, "step": 4980}, {"loss": 0.321, "grad_norm": 0.8192865252494812, "learning_rate": 0.0002, "epoch": 6.1302211302211305, "step": 4990}, {"loss": 0.3295, "grad_norm": 1.0016303062438965, "learning_rate": 0.0002, "epoch": 6.142506142506143, "step": 5000}, {"loss": 0.3352, "grad_norm": 0.9194409251213074, "learning_rate": 0.0002, "epoch": 6.1547911547911545, "step": 5010}, {"loss": 0.3205, "grad_norm": 0.9319757223129272, "learning_rate": 0.0002, "epoch": 6.167076167076167, "step": 5020}, {"loss": 0.3256, "grad_norm": 0.8737656474113464, "learning_rate": 0.0002, "epoch": 6.179361179361179, "step": 5030}, {"loss": 0.3221, "grad_norm": 0.8736537098884583, "learning_rate": 0.0002, "epoch": 6.191646191646192, "step": 5040}, {"loss": 0.3265, "grad_norm": 0.9301430583000183, "learning_rate": 0.0002, "epoch": 6.203931203931204, "step": 5050}, {"loss": 0.3285, "grad_norm": 0.7717130780220032, "learning_rate": 0.0002, "epoch": 6.216216216216216, "step": 5060}, {"loss": 0.3192, "grad_norm": 0.6709604859352112, "learning_rate": 0.0002, "epoch": 6.228501228501228, "step": 5070}, {"loss": 0.3352, "grad_norm": 0.879374086856842, "learning_rate": 0.0002, "epoch": 6.240786240786241, "step": 5080}, {"loss": 0.329, "grad_norm": 0.9136955738067627, "learning_rate": 0.0002, "epoch": 6.253071253071253, "step": 5090}, {"loss": 0.3228, "grad_norm": 0.795177161693573, "learning_rate": 0.0002, "epoch": 6.2653562653562656, "step": 5100}, {"loss": 0.3273, "grad_norm": 1.0412259101867676, "learning_rate": 0.0002, "epoch": 6.277641277641278, "step": 5110}, {"loss": 0.3221, "grad_norm": 0.7382524013519287, "learning_rate": 0.0002, "epoch": 6.2899262899262895, "step": 5120}, {"loss": 0.3102, "grad_norm": 0.8818480968475342, "learning_rate": 0.0002, "epoch": 6.302211302211302, "step": 5130}, {"loss": 0.3316, "grad_norm": 0.7865153551101685, "learning_rate": 0.0002, "epoch": 6.314496314496314, "step": 5140}, {"loss": 0.3264, "grad_norm": 0.9166486859321594, "learning_rate": 0.0002, "epoch": 6.326781326781327, "step": 5150}, {"loss": 0.33, "grad_norm": 0.6655149459838867, "learning_rate": 0.0002, "epoch": 6.339066339066339, "step": 5160}, {"loss": 0.3359, "grad_norm": 0.7762818336486816, "learning_rate": 0.0002, "epoch": 6.351351351351352, "step": 5170}, {"loss": 0.3244, "grad_norm": 0.8057235479354858, "learning_rate": 0.0002, "epoch": 6.363636363636363, "step": 5180}, {"loss": 0.3167, "grad_norm": 0.8186984062194824, "learning_rate": 0.0002, "epoch": 6.375921375921376, "step": 5190}, {"loss": 0.3289, "grad_norm": 0.8669573068618774, "learning_rate": 0.0002, "epoch": 6.388206388206388, "step": 5200}, {"loss": 0.3313, "grad_norm": 0.8904402852058411, "learning_rate": 0.0002, "epoch": 6.400491400491401, "step": 5210}, {"loss": 0.3187, "grad_norm": 0.9250359535217285, "learning_rate": 0.0002, "epoch": 6.412776412776413, "step": 5220}, {"loss": 0.3229, "grad_norm": 0.8718299269676208, "learning_rate": 0.0002, "epoch": 6.4250614250614255, "step": 5230}, {"loss": 0.3214, "grad_norm": 0.8156430125236511, "learning_rate": 0.0002, "epoch": 6.437346437346437, "step": 5240}, {"loss": 0.3244, "grad_norm": 0.7759218215942383, "learning_rate": 0.0002, "epoch": 6.4496314496314495, "step": 5250}, {"loss": 0.3298, "grad_norm": 0.8137310743331909, "learning_rate": 0.0002, "epoch": 6.461916461916462, "step": 5260}, {"loss": 0.3275, "grad_norm": 0.8121917843818665, "learning_rate": 0.0002, "epoch": 6.474201474201474, "step": 5270}, {"loss": 0.3201, "grad_norm": 0.8178010582923889, "learning_rate": 0.0002, "epoch": 6.486486486486487, "step": 5280}, {"loss": 0.3271, "grad_norm": 1.1806302070617676, "learning_rate": 0.0002, "epoch": 6.498771498771498, "step": 5290}, {"loss": 0.3231, "grad_norm": 0.8255127668380737, "learning_rate": 0.0002, "epoch": 6.511056511056511, "step": 5300}, {"loss": 0.3227, "grad_norm": 0.8006690740585327, "learning_rate": 0.0002, "epoch": 6.523341523341523, "step": 5310}, {"loss": 0.3262, "grad_norm": 0.9932374358177185, "learning_rate": 0.0002, "epoch": 6.535626535626536, "step": 5320}, {"loss": 0.3291, "grad_norm": 0.8973969221115112, "learning_rate": 0.0002, "epoch": 6.547911547911548, "step": 5330}, {"loss": 0.3146, "grad_norm": 0.7359915971755981, "learning_rate": 0.0002, "epoch": 6.560196560196561, "step": 5340}, {"loss": 0.3308, "grad_norm": 0.9941133856773376, "learning_rate": 0.0002, "epoch": 6.572481572481572, "step": 5350}, {"loss": 0.3202, "grad_norm": 0.9008874893188477, "learning_rate": 0.0002, "epoch": 6.584766584766585, "step": 5360}, {"loss": 0.3271, "grad_norm": 1.309710144996643, "learning_rate": 0.0002, "epoch": 6.597051597051597, "step": 5370}, {"loss": 0.3177, "grad_norm": 0.797768235206604, "learning_rate": 0.0002, "epoch": 6.6093366093366095, "step": 5380}, {"loss": 0.3218, "grad_norm": 0.8507353663444519, "learning_rate": 0.0002, "epoch": 6.621621621621622, "step": 5390}, {"loss": 0.3204, "grad_norm": 0.9628674983978271, "learning_rate": 0.0002, "epoch": 6.6339066339066335, "step": 5400}, {"loss": 0.3155, "grad_norm": 0.6989983320236206, "learning_rate": 0.0002, "epoch": 6.646191646191646, "step": 5410}, {"loss": 0.3197, "grad_norm": 0.9505863189697266, "learning_rate": 0.0002, "epoch": 6.658476658476658, "step": 5420}, {"loss": 0.3259, "grad_norm": 0.8058171272277832, "learning_rate": 0.0002, "epoch": 6.670761670761671, "step": 5430}, {"loss": 0.3248, "grad_norm": 0.8476499915122986, "learning_rate": 0.0002, "epoch": 6.683046683046683, "step": 5440}, {"loss": 0.326, "grad_norm": 0.8503309488296509, "learning_rate": 0.0002, "epoch": 6.695331695331696, "step": 5450}, {"loss": 0.3218, "grad_norm": 0.919566810131073, "learning_rate": 0.0002, "epoch": 6.707616707616707, "step": 5460}, {"loss": 0.3218, "grad_norm": 0.7741201519966125, "learning_rate": 0.0002, "epoch": 6.71990171990172, "step": 5470}, {"loss": 0.329, "grad_norm": 0.8432701826095581, "learning_rate": 0.0002, "epoch": 6.732186732186732, "step": 5480}, {"loss": 0.3284, "grad_norm": 1.0183148384094238, "learning_rate": 0.0002, "epoch": 6.744471744471745, "step": 5490}, {"loss": 0.3312, "grad_norm": 0.8491143584251404, "learning_rate": 0.0002, "epoch": 6.756756756756757, "step": 5500}, {"loss": 0.3208, "grad_norm": 0.9586310386657715, "learning_rate": 0.0002, "epoch": 6.769041769041769, "step": 5510}, {"loss": 0.3305, "grad_norm": 0.7936097383499146, "learning_rate": 0.0002, "epoch": 6.781326781326781, "step": 5520}, {"loss": 0.318, "grad_norm": 0.7875059247016907, "learning_rate": 0.0002, "epoch": 6.7936117936117935, "step": 5530}, {"loss": 0.3234, "grad_norm": 0.8136157393455505, "learning_rate": 0.0002, "epoch": 6.805896805896806, "step": 5540}, {"loss": 0.3161, "grad_norm": 0.837213933467865, "learning_rate": 0.0002, "epoch": 6.818181818181818, "step": 5550}, {"loss": 0.3153, "grad_norm": 0.6812925338745117, "learning_rate": 0.0002, "epoch": 6.830466830466831, "step": 5560}, {"loss": 0.3139, "grad_norm": 0.7309592962265015, "learning_rate": 0.0002, "epoch": 6.842751842751843, "step": 5570}, {"loss": 0.3126, "grad_norm": 0.6905979514122009, "learning_rate": 0.0002, "epoch": 6.855036855036855, "step": 5580}, {"loss": 0.3291, "grad_norm": 1.1768406629562378, "learning_rate": 0.0002, "epoch": 6.867321867321867, "step": 5590}, {"loss": 0.3193, "grad_norm": 0.7618567943572998, "learning_rate": 0.0002, "epoch": 6.87960687960688, "step": 5600}, {"loss": 0.3296, "grad_norm": 0.7930929660797119, "learning_rate": 0.0002, "epoch": 6.891891891891892, "step": 5610}, {"loss": 0.3241, "grad_norm": 0.7931787371635437, "learning_rate": 0.0002, "epoch": 6.9041769041769046, "step": 5620}, {"loss": 0.3215, "grad_norm": 0.6366972923278809, "learning_rate": 0.0002, "epoch": 6.916461916461916, "step": 5630}, {"loss": 0.3264, "grad_norm": 0.7782737612724304, "learning_rate": 0.0002, "epoch": 6.9287469287469285, "step": 5640}, {"loss": 0.3186, "grad_norm": 0.8643787503242493, "learning_rate": 0.0002, "epoch": 6.941031941031941, "step": 5650}, {"loss": 0.3285, "grad_norm": 1.0843733549118042, "learning_rate": 0.0002, "epoch": 6.953316953316953, "step": 5660}, {"loss": 0.3163, "grad_norm": 0.71319180727005, "learning_rate": 0.0002, "epoch": 6.965601965601966, "step": 5670}, {"loss": 0.3196, "grad_norm": 0.976536750793457, "learning_rate": 0.0002, "epoch": 6.977886977886978, "step": 5680}, {"loss": 0.3255, "grad_norm": 0.9221968054771423, "learning_rate": 0.0002, "epoch": 6.99017199017199, "step": 5690}]} +{"epoch": 8.0, "step": 6512, "epoch_duration": 733.1801314353943, "total_accumulated_duration": 6005.529521942139, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 3020.60888671875}, "peak_memory_usage": {"GPU_0": 15051.17431640625}, "avg_memory_reserved": {"GPU_0": 15256.0}, "peak_memory_reserved": {"GPU_0": 16176.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2b-it_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-15098-sd-1/checkpoint-5698", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.5354, "grad_norm": 0.8178550004959106, "learning_rate": 0.0002, "epoch": 0.012285012285012284, "step": 10}, {"loss": 2.534, "grad_norm": 1.0338047742843628, "learning_rate": 0.0002, "epoch": 0.02457002457002457, "step": 20}, {"loss": 2.1691, "grad_norm": 0.8931729197502136, "learning_rate": 0.0002, "epoch": 0.036855036855036855, "step": 30}, {"loss": 1.8813, "grad_norm": 0.9666458964347839, "learning_rate": 0.0002, "epoch": 0.04914004914004914, "step": 40}, {"loss": 1.6479, "grad_norm": 1.2691702842712402, "learning_rate": 0.0002, "epoch": 0.06142506142506143, "step": 50}, {"loss": 1.3831, "grad_norm": 1.0307111740112305, "learning_rate": 0.0002, "epoch": 0.07371007371007371, "step": 60}, {"loss": 1.2987, "grad_norm": 1.1837389469146729, "learning_rate": 0.0002, "epoch": 0.085995085995086, "step": 70}, {"loss": 1.2325, "grad_norm": 1.1481467485427856, "learning_rate": 0.0002, "epoch": 0.09828009828009827, "step": 80}, {"loss": 1.1425, "grad_norm": 1.0385297536849976, "learning_rate": 0.0002, "epoch": 0.11056511056511056, "step": 90}, {"loss": 1.1177, "grad_norm": 1.125789999961853, "learning_rate": 0.0002, "epoch": 0.12285012285012285, "step": 100}, {"loss": 1.0477, "grad_norm": 0.9630613923072815, "learning_rate": 0.0002, "epoch": 0.13513513513513514, "step": 110}, {"loss": 1.0074, "grad_norm": 1.060392141342163, "learning_rate": 0.0002, "epoch": 0.14742014742014742, "step": 120}, {"loss": 1.0128, "grad_norm": 1.0986546277999878, "learning_rate": 0.0002, "epoch": 0.1597051597051597, "step": 130}, {"loss": 1.0068, "grad_norm": 1.1713459491729736, "learning_rate": 0.0002, "epoch": 0.171990171990172, "step": 140}, {"loss": 0.973, "grad_norm": 1.1548224687576294, "learning_rate": 0.0002, "epoch": 0.18427518427518427, "step": 150}, {"loss": 0.941, "grad_norm": 1.2662502527236938, "learning_rate": 0.0002, "epoch": 0.19656019656019655, "step": 160}, {"loss": 0.8849, "grad_norm": 1.1521110534667969, "learning_rate": 0.0002, "epoch": 0.20884520884520885, "step": 170}, {"loss": 0.8931, "grad_norm": 1.1044857501983643, "learning_rate": 0.0002, "epoch": 0.22113022113022113, "step": 180}, {"loss": 0.9572, "grad_norm": 0.9770650267601013, "learning_rate": 0.0002, "epoch": 0.2334152334152334, "step": 190}, {"loss": 0.881, "grad_norm": 0.9710931777954102, "learning_rate": 0.0002, "epoch": 0.2457002457002457, "step": 200}, {"loss": 0.9205, "grad_norm": 0.9593933820724487, "learning_rate": 0.0002, "epoch": 0.257985257985258, "step": 210}, {"loss": 0.843, "grad_norm": 1.003553032875061, "learning_rate": 0.0002, "epoch": 0.2702702702702703, "step": 220}, {"loss": 0.9032, "grad_norm": 0.9187764525413513, "learning_rate": 0.0002, "epoch": 0.28255528255528256, "step": 230}, {"loss": 0.8572, "grad_norm": 0.9294946789741516, "learning_rate": 0.0002, "epoch": 0.29484029484029484, "step": 240}, {"loss": 0.8856, "grad_norm": 0.9537560939788818, "learning_rate": 0.0002, "epoch": 0.3071253071253071, "step": 250}, {"loss": 0.8546, "grad_norm": 1.00537109375, "learning_rate": 0.0002, "epoch": 0.3194103194103194, "step": 260}, {"loss": 0.896, "grad_norm": 0.8775776028633118, "learning_rate": 0.0002, "epoch": 0.3316953316953317, "step": 270}, {"loss": 0.808, "grad_norm": 0.8316839933395386, "learning_rate": 0.0002, "epoch": 0.343980343980344, "step": 280}, {"loss": 0.8248, "grad_norm": 0.8542073965072632, "learning_rate": 0.0002, "epoch": 0.35626535626535627, "step": 290}, {"loss": 0.8452, "grad_norm": 0.848444402217865, "learning_rate": 0.0002, "epoch": 0.36855036855036855, "step": 300}, {"loss": 0.8253, "grad_norm": 0.9017520546913147, "learning_rate": 0.0002, "epoch": 0.3808353808353808, "step": 310}, {"loss": 0.8098, "grad_norm": 0.7672467231750488, "learning_rate": 0.0002, "epoch": 0.3931203931203931, "step": 320}, {"loss": 0.8478, "grad_norm": 0.9109916687011719, "learning_rate": 0.0002, "epoch": 0.40540540540540543, "step": 330}, {"loss": 0.8041, "grad_norm": 0.8750321269035339, "learning_rate": 0.0002, "epoch": 0.4176904176904177, "step": 340}, {"loss": 0.8158, "grad_norm": 0.7911098599433899, "learning_rate": 0.0002, "epoch": 0.42997542997543, "step": 350}, {"loss": 0.8001, "grad_norm": 0.871601402759552, "learning_rate": 0.0002, "epoch": 0.44226044226044225, "step": 360}, {"loss": 0.8187, "grad_norm": 0.9393917918205261, "learning_rate": 0.0002, "epoch": 0.45454545454545453, "step": 370}, {"loss": 0.8124, "grad_norm": 0.8260403275489807, "learning_rate": 0.0002, "epoch": 0.4668304668304668, "step": 380}, {"loss": 0.7768, "grad_norm": 0.9792159199714661, "learning_rate": 0.0002, "epoch": 0.47911547911547914, "step": 390}, {"loss": 0.7981, "grad_norm": 0.9943315982818604, "learning_rate": 0.0002, "epoch": 0.4914004914004914, "step": 400}, {"loss": 0.7765, "grad_norm": 0.8999950885772705, "learning_rate": 0.0002, "epoch": 0.5036855036855037, "step": 410}, {"loss": 0.7807, "grad_norm": 0.8348393440246582, "learning_rate": 0.0002, "epoch": 0.515970515970516, "step": 420}, {"loss": 0.8269, "grad_norm": 0.7371744513511658, "learning_rate": 0.0002, "epoch": 0.5282555282555282, "step": 430}, {"loss": 0.8181, "grad_norm": 0.8354107141494751, "learning_rate": 0.0002, "epoch": 0.5405405405405406, "step": 440}, {"loss": 0.7849, "grad_norm": 0.8553793430328369, "learning_rate": 0.0002, "epoch": 0.5528255528255528, "step": 450}, {"loss": 0.8098, "grad_norm": 1.0762015581130981, "learning_rate": 0.0002, "epoch": 0.5651105651105651, "step": 460}, {"loss": 0.7942, "grad_norm": 0.8350747227668762, "learning_rate": 0.0002, "epoch": 0.5773955773955773, "step": 470}, {"loss": 0.7922, "grad_norm": 0.7819945216178894, "learning_rate": 0.0002, "epoch": 0.5896805896805897, "step": 480}, {"loss": 0.7845, "grad_norm": 0.8079741597175598, "learning_rate": 0.0002, "epoch": 0.601965601965602, "step": 490}, {"loss": 0.7417, "grad_norm": 0.776435911655426, "learning_rate": 0.0002, "epoch": 0.6142506142506142, "step": 500}, {"loss": 0.7855, "grad_norm": 0.7646855115890503, "learning_rate": 0.0002, "epoch": 0.6265356265356266, "step": 510}, {"loss": 0.7923, "grad_norm": 0.786396861076355, "learning_rate": 0.0002, "epoch": 0.6388206388206388, "step": 520}, {"loss": 0.7624, "grad_norm": 0.7016594409942627, "learning_rate": 0.0002, "epoch": 0.6511056511056511, "step": 530}, {"loss": 0.786, "grad_norm": 0.8060444593429565, "learning_rate": 0.0002, "epoch": 0.6633906633906634, "step": 540}, {"loss": 0.7417, "grad_norm": 0.9087467789649963, "learning_rate": 0.0002, "epoch": 0.6756756756756757, "step": 550}, {"loss": 0.7591, "grad_norm": 0.8149628639221191, "learning_rate": 0.0002, "epoch": 0.687960687960688, "step": 560}, {"loss": 0.8004, "grad_norm": 0.7493641972541809, "learning_rate": 0.0002, "epoch": 0.7002457002457002, "step": 570}, {"loss": 0.765, "grad_norm": 0.7958765625953674, "learning_rate": 0.0002, "epoch": 0.7125307125307125, "step": 580}, {"loss": 0.7276, "grad_norm": 0.7917273640632629, "learning_rate": 0.0002, "epoch": 0.7248157248157249, "step": 590}, {"loss": 0.758, "grad_norm": 0.8040468692779541, "learning_rate": 0.0002, "epoch": 0.7371007371007371, "step": 600}, {"loss": 0.735, "grad_norm": 0.8696851134300232, "learning_rate": 0.0002, "epoch": 0.7493857493857494, "step": 610}, {"loss": 0.7321, "grad_norm": 0.8418059945106506, "learning_rate": 0.0002, "epoch": 0.7616707616707616, "step": 620}, {"loss": 0.7395, "grad_norm": 0.7754243612289429, "learning_rate": 0.0002, "epoch": 0.773955773955774, "step": 630}, {"loss": 0.7679, "grad_norm": 0.7639613747596741, "learning_rate": 0.0002, "epoch": 0.7862407862407862, "step": 640}, {"loss": 0.7159, "grad_norm": 0.7516646385192871, "learning_rate": 0.0002, "epoch": 0.7985257985257985, "step": 650}, {"loss": 0.7349, "grad_norm": 0.7840844988822937, "learning_rate": 0.0002, "epoch": 0.8108108108108109, "step": 660}, {"loss": 0.7264, "grad_norm": 0.7657070755958557, "learning_rate": 0.0002, "epoch": 0.8230958230958231, "step": 670}, {"loss": 0.7369, "grad_norm": 0.7711591720581055, "learning_rate": 0.0002, "epoch": 0.8353808353808354, "step": 680}, {"loss": 0.759, "grad_norm": 0.8026325106620789, "learning_rate": 0.0002, "epoch": 0.8476658476658476, "step": 690}, {"loss": 0.737, "grad_norm": 0.7902713418006897, "learning_rate": 0.0002, "epoch": 0.85995085995086, "step": 700}, {"loss": 0.7349, "grad_norm": 0.8212456107139587, "learning_rate": 0.0002, "epoch": 0.8722358722358723, "step": 710}, {"loss": 0.7661, "grad_norm": 0.7867200970649719, "learning_rate": 0.0002, "epoch": 0.8845208845208845, "step": 720}, {"loss": 0.7195, "grad_norm": 0.80084627866745, "learning_rate": 0.0002, "epoch": 0.8968058968058968, "step": 730}, {"loss": 0.7641, "grad_norm": 0.7203794121742249, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 740}, {"loss": 0.7134, "grad_norm": 0.7598419785499573, "learning_rate": 0.0002, "epoch": 0.9213759213759214, "step": 750}, {"loss": 0.7208, "grad_norm": 0.7787027359008789, "learning_rate": 0.0002, "epoch": 0.9336609336609336, "step": 760}, {"loss": 0.7119, "grad_norm": 0.8444012403488159, "learning_rate": 0.0002, "epoch": 0.9459459459459459, "step": 770}, {"loss": 0.7099, "grad_norm": 0.7388550639152527, "learning_rate": 0.0002, "epoch": 0.9582309582309583, "step": 780}, {"loss": 0.7184, "grad_norm": 0.7379167079925537, "learning_rate": 0.0002, "epoch": 0.9705159705159705, "step": 790}, {"loss": 0.7143, "grad_norm": 0.8291640281677246, "learning_rate": 0.0002, "epoch": 0.9828009828009828, "step": 800}, {"loss": 0.6972, "grad_norm": 0.7415094375610352, "learning_rate": 0.0002, "epoch": 0.995085995085995, "step": 810}, {"eval_loss": 0.703994870185852, "eval_runtime": 20.2182, "eval_samples_per_second": 16.371, "eval_steps_per_second": 2.077, "epoch": 1.0, "step": 814}, {"loss": 0.6959, "grad_norm": 0.7405961751937866, "learning_rate": 0.0002, "epoch": 1.0073710073710074, "step": 820}, {"loss": 0.6706, "grad_norm": 0.8534344434738159, "learning_rate": 0.0002, "epoch": 1.0196560196560196, "step": 830}, {"loss": 0.6719, "grad_norm": 0.7415764331817627, "learning_rate": 0.0002, "epoch": 1.031941031941032, "step": 840}, {"loss": 0.6673, "grad_norm": 0.74293053150177, "learning_rate": 0.0002, "epoch": 1.0442260442260443, "step": 850}, {"loss": 0.6897, "grad_norm": 0.697727382183075, "learning_rate": 0.0002, "epoch": 1.0565110565110565, "step": 860}, {"loss": 0.6566, "grad_norm": 0.8022570013999939, "learning_rate": 0.0002, "epoch": 1.0687960687960687, "step": 870}, {"loss": 0.6759, "grad_norm": 0.7545800805091858, "learning_rate": 0.0002, "epoch": 1.0810810810810811, "step": 880}, {"loss": 0.6397, "grad_norm": 0.8005648255348206, "learning_rate": 0.0002, "epoch": 1.0933660933660934, "step": 890}, {"loss": 0.6499, "grad_norm": 0.7681778073310852, "learning_rate": 0.0002, "epoch": 1.1056511056511056, "step": 900}, {"loss": 0.6672, "grad_norm": 0.7822468876838684, "learning_rate": 0.0002, "epoch": 1.117936117936118, "step": 910}, {"loss": 0.6492, "grad_norm": 0.8324839472770691, "learning_rate": 0.0002, "epoch": 1.1302211302211302, "step": 920}, {"loss": 0.6659, "grad_norm": 0.8206289410591125, "learning_rate": 0.0002, "epoch": 1.1425061425061425, "step": 930}, {"loss": 0.6385, "grad_norm": 0.786461591720581, "learning_rate": 0.0002, "epoch": 1.154791154791155, "step": 940}, {"loss": 0.6493, "grad_norm": 0.8288539052009583, "learning_rate": 0.0002, "epoch": 1.1670761670761671, "step": 950}, {"loss": 0.6818, "grad_norm": 0.7566865682601929, "learning_rate": 0.0002, "epoch": 1.1793611793611793, "step": 960}, {"loss": 0.6597, "grad_norm": 0.7761894464492798, "learning_rate": 0.0002, "epoch": 1.1916461916461916, "step": 970}, {"loss": 0.6403, "grad_norm": 0.7608440518379211, "learning_rate": 0.0002, "epoch": 1.203931203931204, "step": 980}, {"loss": 0.7041, "grad_norm": 0.799745500087738, "learning_rate": 0.0002, "epoch": 1.2162162162162162, "step": 990}, {"loss": 0.6358, "grad_norm": 0.8135330677032471, "learning_rate": 0.0002, "epoch": 1.2285012285012284, "step": 1000}, {"loss": 0.6496, "grad_norm": 0.7410391569137573, "learning_rate": 0.0002, "epoch": 1.2407862407862407, "step": 1010}, {"loss": 0.63, "grad_norm": 0.7826172709465027, "learning_rate": 0.0002, "epoch": 1.253071253071253, "step": 1020}, {"loss": 0.6582, "grad_norm": 0.7210677862167358, "learning_rate": 0.0002, "epoch": 1.2653562653562653, "step": 1030}, {"loss": 0.6609, "grad_norm": 0.7571766972541809, "learning_rate": 0.0002, "epoch": 1.2776412776412776, "step": 1040}, {"loss": 0.6315, "grad_norm": 0.8602666258811951, "learning_rate": 0.0002, "epoch": 1.28992628992629, "step": 1050}, {"loss": 0.6825, "grad_norm": 0.8640648722648621, "learning_rate": 0.0002, "epoch": 1.3022113022113022, "step": 1060}, {"loss": 0.6563, "grad_norm": 0.7289374470710754, "learning_rate": 0.0002, "epoch": 1.3144963144963144, "step": 1070}, {"loss": 0.629, "grad_norm": 0.8099908828735352, "learning_rate": 0.0002, "epoch": 1.3267813267813269, "step": 1080}, {"loss": 0.6882, "grad_norm": 0.8623505234718323, "learning_rate": 0.0002, "epoch": 1.339066339066339, "step": 1090}, {"loss": 0.6368, "grad_norm": 0.900576114654541, "learning_rate": 0.0002, "epoch": 1.3513513513513513, "step": 1100}, {"loss": 0.6398, "grad_norm": 0.729603111743927, "learning_rate": 0.0002, "epoch": 1.3636363636363638, "step": 1110}, {"loss": 0.6619, "grad_norm": 0.8350434303283691, "learning_rate": 0.0002, "epoch": 1.375921375921376, "step": 1120}, {"loss": 0.6447, "grad_norm": 0.8049437999725342, "learning_rate": 0.0002, "epoch": 1.3882063882063882, "step": 1130}, {"loss": 0.6336, "grad_norm": 0.8222764134407043, "learning_rate": 0.0002, "epoch": 1.4004914004914004, "step": 1140}, {"loss": 0.6453, "grad_norm": 0.7949751019477844, "learning_rate": 0.0002, "epoch": 1.4127764127764126, "step": 1150}, {"loss": 0.6246, "grad_norm": 0.8375639915466309, "learning_rate": 0.0002, "epoch": 1.425061425061425, "step": 1160}, {"loss": 0.6358, "grad_norm": 0.7261053919792175, "learning_rate": 0.0002, "epoch": 1.4373464373464373, "step": 1170}, {"loss": 0.6709, "grad_norm": 0.6918320655822754, "learning_rate": 0.0002, "epoch": 1.4496314496314495, "step": 1180}, {"loss": 0.598, "grad_norm": 0.8148727416992188, "learning_rate": 0.0002, "epoch": 1.461916461916462, "step": 1190}, {"loss": 0.6269, "grad_norm": 0.7014724612236023, "learning_rate": 0.0002, "epoch": 1.4742014742014742, "step": 1200}, {"loss": 0.617, "grad_norm": 0.8110846281051636, "learning_rate": 0.0002, "epoch": 1.4864864864864864, "step": 1210}, {"loss": 0.6633, "grad_norm": 0.8336407542228699, "learning_rate": 0.0002, "epoch": 1.4987714987714988, "step": 1220}, {"loss": 0.6028, "grad_norm": 0.826996386051178, "learning_rate": 0.0002, "epoch": 1.511056511056511, "step": 1230}, {"loss": 0.6464, "grad_norm": 0.7503120303153992, "learning_rate": 0.0002, "epoch": 1.5233415233415233, "step": 1240}, {"loss": 0.6418, "grad_norm": 0.8297192454338074, "learning_rate": 0.0002, "epoch": 1.5356265356265357, "step": 1250}, {"loss": 0.6466, "grad_norm": 0.7585996985435486, "learning_rate": 0.0002, "epoch": 1.547911547911548, "step": 1260}, {"loss": 0.6196, "grad_norm": 0.7530493140220642, "learning_rate": 0.0002, "epoch": 1.5601965601965602, "step": 1270}, {"loss": 0.6252, "grad_norm": 0.8141939640045166, "learning_rate": 0.0002, "epoch": 1.5724815724815726, "step": 1280}, {"loss": 0.6441, "grad_norm": 0.6959931254386902, "learning_rate": 0.0002, "epoch": 1.5847665847665846, "step": 1290}, {"loss": 0.6542, "grad_norm": 0.8677428364753723, "learning_rate": 0.0002, "epoch": 1.597051597051597, "step": 1300}, {"loss": 0.633, "grad_norm": 0.8527476787567139, "learning_rate": 0.0002, "epoch": 1.6093366093366095, "step": 1310}, {"loss": 0.6393, "grad_norm": 0.8462157845497131, "learning_rate": 0.0002, "epoch": 1.6216216216216215, "step": 1320}, {"loss": 0.6265, "grad_norm": 0.9371153712272644, "learning_rate": 0.0002, "epoch": 1.633906633906634, "step": 1330}, {"loss": 0.5952, "grad_norm": 0.8408344984054565, "learning_rate": 0.0002, "epoch": 1.6461916461916462, "step": 1340}, {"loss": 0.599, "grad_norm": 0.8391859531402588, "learning_rate": 0.0002, "epoch": 1.6584766584766584, "step": 1350}, {"loss": 0.6313, "grad_norm": 0.7630598545074463, "learning_rate": 0.0002, "epoch": 1.6707616707616708, "step": 1360}, {"loss": 0.5989, "grad_norm": 0.8007895350456238, "learning_rate": 0.0002, "epoch": 1.683046683046683, "step": 1370}, {"loss": 0.6094, "grad_norm": 0.7547900080680847, "learning_rate": 0.0002, "epoch": 1.6953316953316953, "step": 1380}, {"loss": 0.6335, "grad_norm": 0.7779742479324341, "learning_rate": 0.0002, "epoch": 1.7076167076167077, "step": 1390}, {"loss": 0.6078, "grad_norm": 0.712293803691864, "learning_rate": 0.0002, "epoch": 1.71990171990172, "step": 1400}, {"loss": 0.608, "grad_norm": 0.8503297567367554, "learning_rate": 0.0002, "epoch": 1.7321867321867321, "step": 1410}, {"loss": 0.6055, "grad_norm": 0.8312245607376099, "learning_rate": 0.0002, "epoch": 1.7444717444717446, "step": 1420}, {"loss": 0.5978, "grad_norm": 0.7758049368858337, "learning_rate": 0.0002, "epoch": 1.7567567567567568, "step": 1430}, {"loss": 0.5822, "grad_norm": 0.8695956468582153, "learning_rate": 0.0002, "epoch": 1.769041769041769, "step": 1440}, {"loss": 0.5955, "grad_norm": 0.7785261273384094, "learning_rate": 0.0002, "epoch": 1.7813267813267815, "step": 1450}, {"loss": 0.6177, "grad_norm": 0.7091802358627319, "learning_rate": 0.0002, "epoch": 1.7936117936117935, "step": 1460}, {"loss": 0.5811, "grad_norm": 0.774146556854248, "learning_rate": 0.0002, "epoch": 1.805896805896806, "step": 1470}, {"loss": 0.5833, "grad_norm": 0.8342524170875549, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 1480}, {"loss": 0.634, "grad_norm": 0.8087738156318665, "learning_rate": 0.0002, "epoch": 1.8304668304668303, "step": 1490}, {"loss": 0.5961, "grad_norm": 0.9830479621887207, "learning_rate": 0.0002, "epoch": 1.8427518427518428, "step": 1500}, {"loss": 0.6211, "grad_norm": 0.8537567853927612, "learning_rate": 0.0002, "epoch": 1.855036855036855, "step": 1510}, {"loss": 0.5767, "grad_norm": 0.8004562854766846, "learning_rate": 0.0002, "epoch": 1.8673218673218672, "step": 1520}, {"loss": 0.604, "grad_norm": 0.8161284327507019, "learning_rate": 0.0002, "epoch": 1.8796068796068797, "step": 1530}, {"loss": 0.5808, "grad_norm": 0.8688093423843384, "learning_rate": 0.0002, "epoch": 1.8918918918918919, "step": 1540}, {"loss": 0.5663, "grad_norm": 0.8287379741668701, "learning_rate": 0.0002, "epoch": 1.904176904176904, "step": 1550}, {"loss": 0.5963, "grad_norm": 0.8050342202186584, "learning_rate": 0.0002, "epoch": 1.9164619164619165, "step": 1560}, {"loss": 0.5837, "grad_norm": 0.9273895621299744, "learning_rate": 0.0002, "epoch": 1.9287469287469288, "step": 1570}, {"loss": 0.5945, "grad_norm": 0.8416891694068909, "learning_rate": 0.0002, "epoch": 1.941031941031941, "step": 1580}, {"loss": 0.5838, "grad_norm": 0.7299820184707642, "learning_rate": 0.0002, "epoch": 1.9533169533169534, "step": 1590}, {"loss": 0.6025, "grad_norm": 0.7262272834777832, "learning_rate": 0.0002, "epoch": 1.9656019656019657, "step": 1600}, {"loss": 0.5873, "grad_norm": 0.8649004697799683, "learning_rate": 0.0002, "epoch": 1.9778869778869779, "step": 1610}, {"loss": 0.5764, "grad_norm": 0.8165444731712341, "learning_rate": 0.0002, "epoch": 1.9901719901719903, "step": 1620}, {"eval_loss": 0.5858802795410156, "eval_runtime": 22.6585, "eval_samples_per_second": 14.608, "eval_steps_per_second": 1.854, "epoch": 2.0, "step": 1628}, {"loss": 0.5803, "grad_norm": 0.8142582178115845, "learning_rate": 0.0002, "epoch": 2.0024570024570023, "step": 1630}, {"loss": 0.5499, "grad_norm": 1.0637224912643433, "learning_rate": 0.0002, "epoch": 2.0147420147420148, "step": 1640}, {"loss": 0.5556, "grad_norm": 0.8923280239105225, "learning_rate": 0.0002, "epoch": 2.027027027027027, "step": 1650}, {"loss": 0.5373, "grad_norm": 0.8169175386428833, "learning_rate": 0.0002, "epoch": 2.039312039312039, "step": 1660}, {"loss": 0.552, "grad_norm": 0.8124040365219116, "learning_rate": 0.0002, "epoch": 2.0515970515970516, "step": 1670}, {"loss": 0.5259, "grad_norm": 0.9228773713111877, "learning_rate": 0.0002, "epoch": 2.063882063882064, "step": 1680}, {"loss": 0.5571, "grad_norm": 0.7216871380805969, "learning_rate": 0.0002, "epoch": 2.076167076167076, "step": 1690}, {"loss": 0.523, "grad_norm": 0.8679503202438354, "learning_rate": 0.0002, "epoch": 2.0884520884520885, "step": 1700}, {"loss": 0.5379, "grad_norm": 0.8627730011940002, "learning_rate": 0.0002, "epoch": 2.100737100737101, "step": 1710}, {"loss": 0.551, "grad_norm": 0.9175152778625488, "learning_rate": 0.0002, "epoch": 2.113022113022113, "step": 1720}, {"loss": 0.5378, "grad_norm": 0.7930372953414917, "learning_rate": 0.0002, "epoch": 2.1253071253071254, "step": 1730}, {"loss": 0.5263, "grad_norm": 0.8370155692100525, "learning_rate": 0.0002, "epoch": 2.1375921375921374, "step": 1740}, {"loss": 0.5419, "grad_norm": 0.9121434688568115, "learning_rate": 0.0002, "epoch": 2.14987714987715, "step": 1750}, {"loss": 0.5499, "grad_norm": 0.8703579306602478, "learning_rate": 0.0002, "epoch": 2.1621621621621623, "step": 1760}, {"loss": 0.5333, "grad_norm": 0.9270512461662292, "learning_rate": 0.0002, "epoch": 2.1744471744471743, "step": 1770}, {"loss": 0.5165, "grad_norm": 0.9372949600219727, "learning_rate": 0.0002, "epoch": 2.1867321867321867, "step": 1780}, {"loss": 0.5327, "grad_norm": 0.8955178260803223, "learning_rate": 0.0002, "epoch": 2.199017199017199, "step": 1790}, {"loss": 0.5356, "grad_norm": 0.846102237701416, "learning_rate": 0.0002, "epoch": 2.211302211302211, "step": 1800}, {"loss": 0.5303, "grad_norm": 0.9186713099479675, "learning_rate": 0.0002, "epoch": 2.2235872235872236, "step": 1810}, {"loss": 0.5223, "grad_norm": 0.7695123553276062, "learning_rate": 0.0002, "epoch": 2.235872235872236, "step": 1820}, {"loss": 0.5161, "grad_norm": 0.7340332865715027, "learning_rate": 0.0002, "epoch": 2.248157248157248, "step": 1830}, {"loss": 0.5327, "grad_norm": 0.8933137655258179, "learning_rate": 0.0002, "epoch": 2.2604422604422605, "step": 1840}, {"loss": 0.5471, "grad_norm": 0.7705038189888, "learning_rate": 0.0002, "epoch": 2.2727272727272725, "step": 1850}, {"loss": 0.5346, "grad_norm": 0.8396083116531372, "learning_rate": 0.0002, "epoch": 2.285012285012285, "step": 1860}, {"loss": 0.5335, "grad_norm": 0.7695736289024353, "learning_rate": 0.0002, "epoch": 2.2972972972972974, "step": 1870}, {"loss": 0.5105, "grad_norm": 0.8535045385360718, "learning_rate": 0.0002, "epoch": 2.30958230958231, "step": 1880}, {"loss": 0.5202, "grad_norm": 0.8549142479896545, "learning_rate": 0.0002, "epoch": 2.321867321867322, "step": 1890}, {"loss": 0.5268, "grad_norm": 0.9124433994293213, "learning_rate": 0.0002, "epoch": 2.3341523341523343, "step": 1900}, {"loss": 0.506, "grad_norm": 0.855523943901062, "learning_rate": 0.0002, "epoch": 2.3464373464373462, "step": 1910}, {"loss": 0.5162, "grad_norm": 0.810878336429596, "learning_rate": 0.0002, "epoch": 2.3587223587223587, "step": 1920}, {"loss": 0.531, "grad_norm": 0.7409024834632874, "learning_rate": 0.0002, "epoch": 2.371007371007371, "step": 1930}, {"loss": 0.5045, "grad_norm": 0.8080927729606628, "learning_rate": 0.0002, "epoch": 2.383292383292383, "step": 1940}, {"loss": 0.5032, "grad_norm": 0.9661469459533691, "learning_rate": 0.0002, "epoch": 2.3955773955773956, "step": 1950}, {"loss": 0.5019, "grad_norm": 0.838766872882843, "learning_rate": 0.0002, "epoch": 2.407862407862408, "step": 1960}, {"loss": 0.5128, "grad_norm": 0.8737491965293884, "learning_rate": 0.0002, "epoch": 2.42014742014742, "step": 1970}, {"loss": 0.5153, "grad_norm": 0.8657792210578918, "learning_rate": 0.0002, "epoch": 2.4324324324324325, "step": 1980}, {"loss": 0.5665, "grad_norm": 0.8883858919143677, "learning_rate": 0.0002, "epoch": 2.444717444717445, "step": 1990}, {"loss": 0.5283, "grad_norm": 0.8647662997245789, "learning_rate": 0.0002, "epoch": 2.457002457002457, "step": 2000}, {"loss": 0.518, "grad_norm": 0.896037757396698, "learning_rate": 0.0002, "epoch": 2.4692874692874693, "step": 2010}, {"loss": 0.5245, "grad_norm": 0.8079167008399963, "learning_rate": 0.0002, "epoch": 2.4815724815724813, "step": 2020}, {"loss": 0.5311, "grad_norm": 1.0293292999267578, "learning_rate": 0.0002, "epoch": 2.493857493857494, "step": 2030}, {"loss": 0.5091, "grad_norm": 0.8459244966506958, "learning_rate": 0.0002, "epoch": 2.506142506142506, "step": 2040}, {"loss": 0.4922, "grad_norm": 0.9244982600212097, "learning_rate": 0.0002, "epoch": 2.5184275184275187, "step": 2050}, {"loss": 0.5006, "grad_norm": 0.8245007991790771, "learning_rate": 0.0002, "epoch": 2.5307125307125307, "step": 2060}, {"loss": 0.5229, "grad_norm": 0.8869297504425049, "learning_rate": 0.0002, "epoch": 2.542997542997543, "step": 2070}, {"loss": 0.5097, "grad_norm": 0.8620884418487549, "learning_rate": 0.0002, "epoch": 2.555282555282555, "step": 2080}, {"loss": 0.5239, "grad_norm": 0.8387904167175293, "learning_rate": 0.0002, "epoch": 2.5675675675675675, "step": 2090}, {"loss": 0.4974, "grad_norm": 0.8353935480117798, "learning_rate": 0.0002, "epoch": 2.57985257985258, "step": 2100}, {"loss": 0.5038, "grad_norm": 1.0136934518814087, "learning_rate": 0.0002, "epoch": 2.592137592137592, "step": 2110}, {"loss": 0.513, "grad_norm": 0.9387392997741699, "learning_rate": 0.0002, "epoch": 2.6044226044226044, "step": 2120}, {"loss": 0.4971, "grad_norm": 0.898697555065155, "learning_rate": 0.0002, "epoch": 2.616707616707617, "step": 2130}, {"loss": 0.4981, "grad_norm": 1.0145231485366821, "learning_rate": 0.0002, "epoch": 2.628992628992629, "step": 2140}, {"loss": 0.5151, "grad_norm": 0.8335273265838623, "learning_rate": 0.0002, "epoch": 2.6412776412776413, "step": 2150}, {"loss": 0.5129, "grad_norm": 1.0198529958724976, "learning_rate": 0.0002, "epoch": 2.6535626535626538, "step": 2160}, {"loss": 0.5156, "grad_norm": 0.8353323340415955, "learning_rate": 0.0002, "epoch": 2.6658476658476657, "step": 2170}, {"loss": 0.4818, "grad_norm": 0.8831406831741333, "learning_rate": 0.0002, "epoch": 2.678132678132678, "step": 2180}, {"loss": 0.4858, "grad_norm": 0.7182748913764954, "learning_rate": 0.0002, "epoch": 2.69041769041769, "step": 2190}, {"loss": 0.53, "grad_norm": 0.7892552614212036, "learning_rate": 0.0002, "epoch": 2.7027027027027026, "step": 2200}, {"loss": 0.5101, "grad_norm": 1.0144033432006836, "learning_rate": 0.0002, "epoch": 2.714987714987715, "step": 2210}, {"loss": 0.4909, "grad_norm": 1.0913645029067993, "learning_rate": 0.0002, "epoch": 2.7272727272727275, "step": 2220}, {"loss": 0.5069, "grad_norm": 1.014394998550415, "learning_rate": 0.0002, "epoch": 2.7395577395577395, "step": 2230}, {"loss": 0.4985, "grad_norm": 0.8118020296096802, "learning_rate": 0.0002, "epoch": 2.751842751842752, "step": 2240}, {"loss": 0.5088, "grad_norm": 0.9027737379074097, "learning_rate": 0.0002, "epoch": 2.764127764127764, "step": 2250}, {"loss": 0.5027, "grad_norm": 0.8017747402191162, "learning_rate": 0.0002, "epoch": 2.7764127764127764, "step": 2260}, {"loss": 0.4957, "grad_norm": 0.788362979888916, "learning_rate": 0.0002, "epoch": 2.788697788697789, "step": 2270}, {"loss": 0.5047, "grad_norm": 0.8338918089866638, "learning_rate": 0.0002, "epoch": 2.800982800982801, "step": 2280}, {"loss": 0.4925, "grad_norm": 0.8773167729377747, "learning_rate": 0.0002, "epoch": 2.8132678132678133, "step": 2290}, {"loss": 0.4806, "grad_norm": 0.9319674372673035, "learning_rate": 0.0002, "epoch": 2.8255528255528253, "step": 2300}, {"loss": 0.4815, "grad_norm": 0.8632726073265076, "learning_rate": 0.0002, "epoch": 2.8378378378378377, "step": 2310}, {"loss": 0.4842, "grad_norm": 0.785464882850647, "learning_rate": 0.0002, "epoch": 2.85012285012285, "step": 2320}, {"loss": 0.4867, "grad_norm": 0.8159732818603516, "learning_rate": 0.0002, "epoch": 2.8624078624078626, "step": 2330}, {"loss": 0.4796, "grad_norm": 0.8702368140220642, "learning_rate": 0.0002, "epoch": 2.8746928746928746, "step": 2340}, {"loss": 0.474, "grad_norm": 1.0456738471984863, "learning_rate": 0.0002, "epoch": 2.886977886977887, "step": 2350}, {"loss": 0.4934, "grad_norm": 1.0855203866958618, "learning_rate": 0.0002, "epoch": 2.899262899262899, "step": 2360}, {"loss": 0.4758, "grad_norm": 0.9378156065940857, "learning_rate": 0.0002, "epoch": 2.9115479115479115, "step": 2370}, {"loss": 0.4831, "grad_norm": 0.7390182018280029, "learning_rate": 0.0002, "epoch": 2.923832923832924, "step": 2380}, {"loss": 0.5066, "grad_norm": 0.7667133212089539, "learning_rate": 0.0002, "epoch": 2.9361179361179364, "step": 2390}, {"loss": 0.4722, "grad_norm": 0.8633476495742798, "learning_rate": 0.0002, "epoch": 2.9484029484029484, "step": 2400}, {"loss": 0.4993, "grad_norm": 1.0821104049682617, "learning_rate": 0.0002, "epoch": 2.960687960687961, "step": 2410}, {"loss": 0.4882, "grad_norm": 0.8911418914794922, "learning_rate": 0.0002, "epoch": 2.972972972972973, "step": 2420}, {"loss": 0.4819, "grad_norm": 0.8791135549545288, "learning_rate": 0.0002, "epoch": 2.9852579852579852, "step": 2430}, {"loss": 0.4875, "grad_norm": 0.8066530823707581, "learning_rate": 0.0002, "epoch": 2.9975429975429977, "step": 2440}, {"eval_loss": 0.49752503633499146, "eval_runtime": 20.2911, "eval_samples_per_second": 16.313, "eval_steps_per_second": 2.07, "epoch": 3.0, "step": 2442}, {"loss": 0.4362, "grad_norm": 0.7644656896591187, "learning_rate": 0.0002, "epoch": 3.0098280098280097, "step": 2450}, {"loss": 0.4363, "grad_norm": 0.9077525734901428, "learning_rate": 0.0002, "epoch": 3.022113022113022, "step": 2460}, {"loss": 0.422, "grad_norm": 0.7859287261962891, "learning_rate": 0.0002, "epoch": 3.0343980343980346, "step": 2470}, {"loss": 0.4574, "grad_norm": 1.1200323104858398, "learning_rate": 0.0002, "epoch": 3.0466830466830466, "step": 2480}, {"loss": 0.4519, "grad_norm": 0.7570453882217407, "learning_rate": 0.0002, "epoch": 3.058968058968059, "step": 2490}, {"loss": 0.4351, "grad_norm": 0.9450915455818176, "learning_rate": 0.0002, "epoch": 3.0712530712530715, "step": 2500}, {"loss": 0.4343, "grad_norm": 0.8303545117378235, "learning_rate": 0.0002, "epoch": 3.0835380835380835, "step": 2510}, {"loss": 0.4308, "grad_norm": 0.8864443898200989, "learning_rate": 0.0002, "epoch": 3.095823095823096, "step": 2520}, {"loss": 0.4601, "grad_norm": 0.945324718952179, "learning_rate": 0.0002, "epoch": 3.108108108108108, "step": 2530}, {"loss": 0.4345, "grad_norm": 1.0562494993209839, "learning_rate": 0.0002, "epoch": 3.1203931203931203, "step": 2540}, {"loss": 0.4375, "grad_norm": 0.8607500195503235, "learning_rate": 0.0002, "epoch": 3.1326781326781328, "step": 2550}, {"loss": 0.456, "grad_norm": 0.8719640374183655, "learning_rate": 0.0002, "epoch": 3.1449631449631448, "step": 2560}, {"loss": 0.4469, "grad_norm": 0.8647059202194214, "learning_rate": 0.0002, "epoch": 3.157248157248157, "step": 2570}, {"loss": 0.4483, "grad_norm": 0.8346507549285889, "learning_rate": 0.0002, "epoch": 3.1695331695331697, "step": 2580}, {"loss": 0.4331, "grad_norm": 1.0208854675292969, "learning_rate": 0.0002, "epoch": 3.1818181818181817, "step": 2590}, {"loss": 0.435, "grad_norm": 0.7064385414123535, "learning_rate": 0.0002, "epoch": 3.194103194103194, "step": 2600}, {"loss": 0.4541, "grad_norm": 0.927347719669342, "learning_rate": 0.0002, "epoch": 3.2063882063882065, "step": 2610}, {"loss": 0.4561, "grad_norm": 0.943517804145813, "learning_rate": 0.0002, "epoch": 3.2186732186732185, "step": 2620}, {"loss": 0.4225, "grad_norm": 0.7837198376655579, "learning_rate": 0.0002, "epoch": 3.230958230958231, "step": 2630}, {"loss": 0.4494, "grad_norm": 0.7752765417098999, "learning_rate": 0.0002, "epoch": 3.2432432432432434, "step": 2640}, {"loss": 0.4468, "grad_norm": 0.8578953146934509, "learning_rate": 0.0002, "epoch": 3.2555282555282554, "step": 2650}, {"loss": 0.4393, "grad_norm": 1.0209529399871826, "learning_rate": 0.0002, "epoch": 3.267813267813268, "step": 2660}, {"loss": 0.4517, "grad_norm": 0.9069030284881592, "learning_rate": 0.0002, "epoch": 3.2800982800982803, "step": 2670}, {"loss": 0.4262, "grad_norm": 0.8454729318618774, "learning_rate": 0.0002, "epoch": 3.2923832923832923, "step": 2680}, {"loss": 0.4349, "grad_norm": 0.8253099322319031, "learning_rate": 0.0002, "epoch": 3.3046683046683047, "step": 2690}, {"loss": 0.4503, "grad_norm": 0.8765934109687805, "learning_rate": 0.0002, "epoch": 3.3169533169533167, "step": 2700}, {"loss": 0.4518, "grad_norm": 0.8149126172065735, "learning_rate": 0.0002, "epoch": 3.329238329238329, "step": 2710}, {"loss": 0.4437, "grad_norm": 0.8820102214813232, "learning_rate": 0.0002, "epoch": 3.3415233415233416, "step": 2720}, {"loss": 0.4346, "grad_norm": 0.8813952803611755, "learning_rate": 0.0002, "epoch": 3.3538083538083536, "step": 2730}, {"loss": 0.4396, "grad_norm": 1.0338447093963623, "learning_rate": 0.0002, "epoch": 3.366093366093366, "step": 2740}, {"loss": 0.4468, "grad_norm": 0.8780209422111511, "learning_rate": 0.0002, "epoch": 3.3783783783783785, "step": 2750}, {"loss": 0.441, "grad_norm": 0.9017151594161987, "learning_rate": 0.0002, "epoch": 3.3906633906633905, "step": 2760}, {"loss": 0.446, "grad_norm": 0.8647638559341431, "learning_rate": 0.0002, "epoch": 3.402948402948403, "step": 2770}, {"loss": 0.4131, "grad_norm": 0.8298183679580688, "learning_rate": 0.0002, "epoch": 3.4152334152334154, "step": 2780}, {"loss": 0.4406, "grad_norm": 0.9298108816146851, "learning_rate": 0.0002, "epoch": 3.4275184275184274, "step": 2790}, {"loss": 0.4145, "grad_norm": 0.8909980058670044, "learning_rate": 0.0002, "epoch": 3.43980343980344, "step": 2800}, {"loss": 0.4148, "grad_norm": 0.8027496933937073, "learning_rate": 0.0002, "epoch": 3.4520884520884523, "step": 2810}, {"loss": 0.4244, "grad_norm": 0.8766195774078369, "learning_rate": 0.0002, "epoch": 3.4643734643734643, "step": 2820}, {"loss": 0.4292, "grad_norm": 0.8194443583488464, "learning_rate": 0.0002, "epoch": 3.4766584766584767, "step": 2830}, {"loss": 0.4305, "grad_norm": 0.9862873554229736, "learning_rate": 0.0002, "epoch": 3.488943488943489, "step": 2840}, {"loss": 0.4393, "grad_norm": 0.8755377531051636, "learning_rate": 0.0002, "epoch": 3.501228501228501, "step": 2850}, {"loss": 0.4231, "grad_norm": 0.7300266027450562, "learning_rate": 0.0002, "epoch": 3.5135135135135136, "step": 2860}, {"loss": 0.4278, "grad_norm": 0.8342461585998535, "learning_rate": 0.0002, "epoch": 3.5257985257985256, "step": 2870}, {"loss": 0.4395, "grad_norm": 0.8624151349067688, "learning_rate": 0.0002, "epoch": 3.538083538083538, "step": 2880}, {"loss": 0.4064, "grad_norm": 0.8931261301040649, "learning_rate": 0.0002, "epoch": 3.5503685503685505, "step": 2890}, {"loss": 0.4358, "grad_norm": 0.8617086410522461, "learning_rate": 0.0002, "epoch": 3.562653562653563, "step": 2900}, {"loss": 0.419, "grad_norm": 0.8754099607467651, "learning_rate": 0.0002, "epoch": 3.574938574938575, "step": 2910}, {"loss": 0.4275, "grad_norm": 0.8345834612846375, "learning_rate": 0.0002, "epoch": 3.5872235872235874, "step": 2920}, {"loss": 0.4375, "grad_norm": 1.1414062976837158, "learning_rate": 0.0002, "epoch": 3.5995085995085994, "step": 2930}, {"loss": 0.4297, "grad_norm": 0.994860053062439, "learning_rate": 0.0002, "epoch": 3.611793611793612, "step": 2940}, {"loss": 0.4386, "grad_norm": 1.19268000125885, "learning_rate": 0.0002, "epoch": 3.6240786240786242, "step": 2950}, {"loss": 0.4029, "grad_norm": 0.8399543762207031, "learning_rate": 0.0002, "epoch": 3.6363636363636362, "step": 2960}, {"loss": 0.4432, "grad_norm": 0.9873217940330505, "learning_rate": 0.0002, "epoch": 3.6486486486486487, "step": 2970}, {"loss": 0.4308, "grad_norm": 0.9116013646125793, "learning_rate": 0.0002, "epoch": 3.6609336609336607, "step": 2980}, {"loss": 0.4275, "grad_norm": 0.9503833651542664, "learning_rate": 0.0002, "epoch": 3.673218673218673, "step": 2990}, {"loss": 0.4306, "grad_norm": 0.9401112794876099, "learning_rate": 0.0002, "epoch": 3.6855036855036856, "step": 3000}, {"loss": 0.4333, "grad_norm": 1.00745689868927, "learning_rate": 0.0002, "epoch": 3.697788697788698, "step": 3010}, {"loss": 0.432, "grad_norm": 1.0553191900253296, "learning_rate": 0.0002, "epoch": 3.71007371007371, "step": 3020}, {"loss": 0.4321, "grad_norm": 1.0226953029632568, "learning_rate": 0.0002, "epoch": 3.7223587223587224, "step": 3030}, {"loss": 0.418, "grad_norm": 1.085554838180542, "learning_rate": 0.0002, "epoch": 3.7346437346437344, "step": 3040}, {"loss": 0.4196, "grad_norm": 0.9948731064796448, "learning_rate": 0.0002, "epoch": 3.746928746928747, "step": 3050}, {"loss": 0.4281, "grad_norm": 0.9328727126121521, "learning_rate": 0.0002, "epoch": 3.7592137592137593, "step": 3060}, {"loss": 0.4284, "grad_norm": 1.0533266067504883, "learning_rate": 0.0002, "epoch": 3.7714987714987718, "step": 3070}, {"loss": 0.4414, "grad_norm": 0.8213809132575989, "learning_rate": 0.0002, "epoch": 3.7837837837837838, "step": 3080}, {"loss": 0.4348, "grad_norm": 0.8941594362258911, "learning_rate": 0.0002, "epoch": 3.796068796068796, "step": 3090}, {"loss": 0.4266, "grad_norm": 0.8324518203735352, "learning_rate": 0.0002, "epoch": 3.808353808353808, "step": 3100}, {"loss": 0.4227, "grad_norm": 0.8811233639717102, "learning_rate": 0.0002, "epoch": 3.8206388206388207, "step": 3110}, {"loss": 0.4195, "grad_norm": 0.8781470060348511, "learning_rate": 0.0002, "epoch": 3.832923832923833, "step": 3120}, {"loss": 0.4277, "grad_norm": 0.8994116187095642, "learning_rate": 0.0002, "epoch": 3.845208845208845, "step": 3130}, {"loss": 0.4149, "grad_norm": 0.8605017066001892, "learning_rate": 0.0002, "epoch": 3.8574938574938575, "step": 3140}, {"loss": 0.4023, "grad_norm": 0.8966400027275085, "learning_rate": 0.0002, "epoch": 3.8697788697788695, "step": 3150}, {"loss": 0.4245, "grad_norm": 0.8856554627418518, "learning_rate": 0.0002, "epoch": 3.882063882063882, "step": 3160}, {"loss": 0.4101, "grad_norm": 0.8971620798110962, "learning_rate": 0.0002, "epoch": 3.8943488943488944, "step": 3170}, {"loss": 0.3993, "grad_norm": 0.9807813167572021, "learning_rate": 0.0002, "epoch": 3.906633906633907, "step": 3180}, {"loss": 0.4258, "grad_norm": 0.8614121675491333, "learning_rate": 0.0002, "epoch": 3.918918918918919, "step": 3190}, {"loss": 0.4115, "grad_norm": 0.989171028137207, "learning_rate": 0.0002, "epoch": 3.9312039312039313, "step": 3200}, {"loss": 0.4182, "grad_norm": 0.8168872594833374, "learning_rate": 0.0002, "epoch": 3.9434889434889433, "step": 3210}, {"loss": 0.4112, "grad_norm": 0.8109386563301086, "learning_rate": 0.0002, "epoch": 3.9557739557739557, "step": 3220}, {"loss": 0.4165, "grad_norm": 1.0175853967666626, "learning_rate": 0.0002, "epoch": 3.968058968058968, "step": 3230}, {"loss": 0.4146, "grad_norm": 0.936143159866333, "learning_rate": 0.0002, "epoch": 3.98034398034398, "step": 3240}, {"loss": 0.4163, "grad_norm": 0.9557915925979614, "learning_rate": 0.0002, "epoch": 3.9926289926289926, "step": 3250}, {"eval_loss": 0.4401616156101227, "eval_runtime": 20.8047, "eval_samples_per_second": 15.91, "eval_steps_per_second": 2.019, "epoch": 4.0, "step": 3256}, {"loss": 0.408, "grad_norm": 0.7590614557266235, "learning_rate": 0.0002, "epoch": 4.004914004914005, "step": 3260}, {"loss": 0.4001, "grad_norm": 0.8920791149139404, "learning_rate": 0.0002, "epoch": 4.017199017199017, "step": 3270}, {"loss": 0.3789, "grad_norm": 0.8640421628952026, "learning_rate": 0.0002, "epoch": 4.0294840294840295, "step": 3280}, {"loss": 0.3791, "grad_norm": 0.9074113965034485, "learning_rate": 0.0002, "epoch": 4.041769041769042, "step": 3290}, {"loss": 0.3728, "grad_norm": 1.0600885152816772, "learning_rate": 0.0002, "epoch": 4.054054054054054, "step": 3300}, {"loss": 0.3857, "grad_norm": 0.9682773351669312, "learning_rate": 0.0002, "epoch": 4.066339066339066, "step": 3310}, {"loss": 0.4007, "grad_norm": 0.9326395392417908, "learning_rate": 0.0002, "epoch": 4.078624078624078, "step": 3320}, {"loss": 0.3823, "grad_norm": 0.8886597156524658, "learning_rate": 0.0002, "epoch": 4.090909090909091, "step": 3330}, {"loss": 0.3929, "grad_norm": 1.032205581665039, "learning_rate": 0.0002, "epoch": 4.103194103194103, "step": 3340}, {"loss": 0.3836, "grad_norm": 0.8669408559799194, "learning_rate": 0.0002, "epoch": 4.115479115479116, "step": 3350}, {"loss": 0.3866, "grad_norm": 0.8250347971916199, "learning_rate": 0.0002, "epoch": 4.127764127764128, "step": 3360}, {"loss": 0.3826, "grad_norm": 0.7919842600822449, "learning_rate": 0.0002, "epoch": 4.14004914004914, "step": 3370}, {"loss": 0.3838, "grad_norm": 1.045682430267334, "learning_rate": 0.0002, "epoch": 4.152334152334152, "step": 3380}, {"loss": 0.3796, "grad_norm": 0.6873571276664734, "learning_rate": 0.0002, "epoch": 4.164619164619165, "step": 3390}, {"loss": 0.3942, "grad_norm": 1.0227675437927246, "learning_rate": 0.0002, "epoch": 4.176904176904177, "step": 3400}, {"loss": 0.3788, "grad_norm": 0.9167711734771729, "learning_rate": 0.0002, "epoch": 4.1891891891891895, "step": 3410}, {"loss": 0.3792, "grad_norm": 1.0598796606063843, "learning_rate": 0.0002, "epoch": 4.201474201474202, "step": 3420}, {"loss": 0.3955, "grad_norm": 0.8581843972206116, "learning_rate": 0.0002, "epoch": 4.2137592137592135, "step": 3430}, {"loss": 0.3761, "grad_norm": 0.8862360119819641, "learning_rate": 0.0002, "epoch": 4.226044226044226, "step": 3440}, {"loss": 0.3889, "grad_norm": 1.0248323678970337, "learning_rate": 0.0002, "epoch": 4.238329238329238, "step": 3450}, {"loss": 0.3827, "grad_norm": 0.8746261596679688, "learning_rate": 0.0002, "epoch": 4.250614250614251, "step": 3460}, {"loss": 0.3949, "grad_norm": 0.7442536354064941, "learning_rate": 0.0002, "epoch": 4.262899262899263, "step": 3470}, {"loss": 0.3761, "grad_norm": 0.8295119404792786, "learning_rate": 0.0002, "epoch": 4.275184275184275, "step": 3480}, {"loss": 0.3895, "grad_norm": 1.0634245872497559, "learning_rate": 0.0002, "epoch": 4.287469287469287, "step": 3490}, {"loss": 0.3955, "grad_norm": 0.9554621577262878, "learning_rate": 0.0002, "epoch": 4.2997542997543, "step": 3500}, {"loss": 0.3826, "grad_norm": 1.0191723108291626, "learning_rate": 0.0002, "epoch": 4.312039312039312, "step": 3510}, {"loss": 0.3828, "grad_norm": 0.8573611378669739, "learning_rate": 0.0002, "epoch": 4.324324324324325, "step": 3520}, {"loss": 0.3869, "grad_norm": 0.9082390069961548, "learning_rate": 0.0002, "epoch": 4.336609336609337, "step": 3530}, {"loss": 0.3902, "grad_norm": 0.8650212287902832, "learning_rate": 0.0002, "epoch": 4.348894348894349, "step": 3540}, {"loss": 0.3915, "grad_norm": 0.7186297178268433, "learning_rate": 0.0002, "epoch": 4.361179361179361, "step": 3550}, {"loss": 0.3861, "grad_norm": 0.9750986695289612, "learning_rate": 0.0002, "epoch": 4.3734643734643734, "step": 3560}, {"loss": 0.3967, "grad_norm": 1.0710467100143433, "learning_rate": 0.0002, "epoch": 4.385749385749386, "step": 3570}, {"loss": 0.3774, "grad_norm": 0.7974869012832642, "learning_rate": 0.0002, "epoch": 4.398034398034398, "step": 3580}, {"loss": 0.3738, "grad_norm": 0.9405913949012756, "learning_rate": 0.0002, "epoch": 4.41031941031941, "step": 3590}, {"loss": 0.3982, "grad_norm": 0.9393602609634399, "learning_rate": 0.0002, "epoch": 4.422604422604422, "step": 3600}, {"loss": 0.3913, "grad_norm": 1.0798007249832153, "learning_rate": 0.0002, "epoch": 4.434889434889435, "step": 3610}, {"loss": 0.3682, "grad_norm": 0.9226186275482178, "learning_rate": 0.0002, "epoch": 4.447174447174447, "step": 3620}, {"loss": 0.3742, "grad_norm": 1.1046524047851562, "learning_rate": 0.0002, "epoch": 4.45945945945946, "step": 3630}, {"loss": 0.3886, "grad_norm": 0.8848567605018616, "learning_rate": 0.0002, "epoch": 4.471744471744472, "step": 3640}, {"loss": 0.3848, "grad_norm": 0.8913224339485168, "learning_rate": 0.0002, "epoch": 4.484029484029484, "step": 3650}, {"loss": 0.3731, "grad_norm": 0.8497583270072937, "learning_rate": 0.0002, "epoch": 4.496314496314496, "step": 3660}, {"loss": 0.3804, "grad_norm": 0.8263831734657288, "learning_rate": 0.0002, "epoch": 4.5085995085995085, "step": 3670}, {"loss": 0.3815, "grad_norm": 0.8470269441604614, "learning_rate": 0.0002, "epoch": 4.520884520884521, "step": 3680}, {"loss": 0.3774, "grad_norm": 0.860038161277771, "learning_rate": 0.0002, "epoch": 4.533169533169533, "step": 3690}, {"loss": 0.3817, "grad_norm": 0.8898552656173706, "learning_rate": 0.0002, "epoch": 4.545454545454545, "step": 3700}, {"loss": 0.3776, "grad_norm": 0.8152070641517639, "learning_rate": 0.0002, "epoch": 4.557739557739557, "step": 3710}, {"loss": 0.383, "grad_norm": 0.7847675085067749, "learning_rate": 0.0002, "epoch": 4.57002457002457, "step": 3720}, {"loss": 0.3791, "grad_norm": 0.9625533819198608, "learning_rate": 0.0002, "epoch": 4.582309582309582, "step": 3730}, {"loss": 0.3699, "grad_norm": 0.9097456336021423, "learning_rate": 0.0002, "epoch": 4.594594594594595, "step": 3740}, {"loss": 0.3673, "grad_norm": 0.871329128742218, "learning_rate": 0.0002, "epoch": 4.606879606879607, "step": 3750}, {"loss": 0.3725, "grad_norm": 0.9879975914955139, "learning_rate": 0.0002, "epoch": 4.61916461916462, "step": 3760}, {"loss": 0.3827, "grad_norm": 0.8636731505393982, "learning_rate": 0.0002, "epoch": 4.631449631449631, "step": 3770}, {"loss": 0.3755, "grad_norm": 1.0488964319229126, "learning_rate": 0.0002, "epoch": 4.643734643734644, "step": 3780}, {"loss": 0.3738, "grad_norm": 0.7637056112289429, "learning_rate": 0.0002, "epoch": 4.656019656019656, "step": 3790}, {"loss": 0.3676, "grad_norm": 0.8507546186447144, "learning_rate": 0.0002, "epoch": 4.6683046683046685, "step": 3800}, {"loss": 0.3852, "grad_norm": 1.0216856002807617, "learning_rate": 0.0002, "epoch": 4.680589680589681, "step": 3810}, {"loss": 0.3751, "grad_norm": 1.026343822479248, "learning_rate": 0.0002, "epoch": 4.6928746928746925, "step": 3820}, {"loss": 0.3687, "grad_norm": 0.8311620950698853, "learning_rate": 0.0002, "epoch": 4.705159705159705, "step": 3830}, {"loss": 0.3771, "grad_norm": 0.7770653367042542, "learning_rate": 0.0002, "epoch": 4.717444717444717, "step": 3840}, {"loss": 0.37, "grad_norm": 0.7616215348243713, "learning_rate": 0.0002, "epoch": 4.72972972972973, "step": 3850}, {"loss": 0.3927, "grad_norm": 1.0377072095870972, "learning_rate": 0.0002, "epoch": 4.742014742014742, "step": 3860}, {"loss": 0.3832, "grad_norm": 0.9713505506515503, "learning_rate": 0.0002, "epoch": 4.754299754299755, "step": 3870}, {"loss": 0.3722, "grad_norm": 0.8803321719169617, "learning_rate": 0.0002, "epoch": 4.766584766584766, "step": 3880}, {"loss": 0.3756, "grad_norm": 0.885535478591919, "learning_rate": 0.0002, "epoch": 4.778869778869779, "step": 3890}, {"loss": 0.3714, "grad_norm": 1.0877983570098877, "learning_rate": 0.0002, "epoch": 4.791154791154791, "step": 3900}, {"loss": 0.3879, "grad_norm": 0.7875366806983948, "learning_rate": 0.0002, "epoch": 4.803439803439804, "step": 3910}, {"loss": 0.3591, "grad_norm": 0.8550102114677429, "learning_rate": 0.0002, "epoch": 4.815724815724816, "step": 3920}, {"loss": 0.3716, "grad_norm": 1.0217846632003784, "learning_rate": 0.0002, "epoch": 4.828009828009828, "step": 3930}, {"loss": 0.3649, "grad_norm": 0.7315713167190552, "learning_rate": 0.0002, "epoch": 4.84029484029484, "step": 3940}, {"loss": 0.3879, "grad_norm": 0.8924923539161682, "learning_rate": 0.0002, "epoch": 4.8525798525798525, "step": 3950}, {"loss": 0.3669, "grad_norm": 0.9730218052864075, "learning_rate": 0.0002, "epoch": 4.864864864864865, "step": 3960}, {"loss": 0.3705, "grad_norm": 0.9202003479003906, "learning_rate": 0.0002, "epoch": 4.877149877149877, "step": 3970}, {"loss": 0.3617, "grad_norm": 0.8173081874847412, "learning_rate": 0.0002, "epoch": 4.88943488943489, "step": 3980}, {"loss": 0.37, "grad_norm": 0.7178564667701721, "learning_rate": 0.0002, "epoch": 4.901719901719901, "step": 3990}, {"loss": 0.3768, "grad_norm": 0.913684606552124, "learning_rate": 0.0002, "epoch": 4.914004914004914, "step": 4000}, {"loss": 0.3755, "grad_norm": 0.8817896842956543, "learning_rate": 0.0002, "epoch": 4.926289926289926, "step": 4010}, {"loss": 0.3676, "grad_norm": 0.7652186751365662, "learning_rate": 0.0002, "epoch": 4.938574938574939, "step": 4020}, {"loss": 0.3699, "grad_norm": 0.8828630447387695, "learning_rate": 0.0002, "epoch": 4.950859950859951, "step": 4030}, {"loss": 0.3672, "grad_norm": 1.0878605842590332, "learning_rate": 0.0002, "epoch": 4.963144963144963, "step": 4040}, {"loss": 0.3656, "grad_norm": 1.0845288038253784, "learning_rate": 0.0002, "epoch": 4.975429975429975, "step": 4050}, {"loss": 0.365, "grad_norm": 0.8431115746498108, "learning_rate": 0.0002, "epoch": 4.987714987714988, "step": 4060}, {"loss": 0.3693, "grad_norm": 0.8320387601852417, "learning_rate": 0.0002, "epoch": 5.0, "step": 4070}, {"eval_loss": 0.4017423093318939, "eval_runtime": 20.8466, "eval_samples_per_second": 15.878, "eval_steps_per_second": 2.015, "epoch": 5.0, "step": 4070}, {"loss": 0.3425, "grad_norm": 0.8639023900032043, "learning_rate": 0.0002, "epoch": 5.012285012285012, "step": 4080}, {"loss": 0.3458, "grad_norm": 0.7123713493347168, "learning_rate": 0.0002, "epoch": 5.024570024570025, "step": 4090}, {"loss": 0.3404, "grad_norm": 0.9886922836303711, "learning_rate": 0.0002, "epoch": 5.036855036855036, "step": 4100}, {"loss": 0.3529, "grad_norm": 0.7880306243896484, "learning_rate": 0.0002, "epoch": 5.049140049140049, "step": 4110}, {"loss": 0.3406, "grad_norm": 0.7488741874694824, "learning_rate": 0.0002, "epoch": 5.061425061425061, "step": 4120}, {"loss": 0.3542, "grad_norm": 0.9359086751937866, "learning_rate": 0.0002, "epoch": 5.073710073710074, "step": 4130}, {"loss": 0.3471, "grad_norm": 0.9401527047157288, "learning_rate": 0.0002, "epoch": 5.085995085995086, "step": 4140}, {"loss": 0.3566, "grad_norm": 0.8396275043487549, "learning_rate": 0.0002, "epoch": 5.098280098280099, "step": 4150}, {"loss": 0.3416, "grad_norm": 0.7132664918899536, "learning_rate": 0.0002, "epoch": 5.11056511056511, "step": 4160}, {"loss": 0.3457, "grad_norm": 0.843708872795105, "learning_rate": 0.0002, "epoch": 5.122850122850123, "step": 4170}, {"loss": 0.3399, "grad_norm": 0.8733304738998413, "learning_rate": 0.0002, "epoch": 5.135135135135135, "step": 4180}, {"loss": 0.3501, "grad_norm": 0.9064375162124634, "learning_rate": 0.0002, "epoch": 5.1474201474201475, "step": 4190}, {"loss": 0.3455, "grad_norm": 0.900770902633667, "learning_rate": 0.0002, "epoch": 5.15970515970516, "step": 4200}, {"loss": 0.3475, "grad_norm": 0.863853394985199, "learning_rate": 0.0002, "epoch": 5.171990171990172, "step": 4210}, {"loss": 0.3497, "grad_norm": 0.767134964466095, "learning_rate": 0.0002, "epoch": 5.184275184275184, "step": 4220}, {"loss": 0.3527, "grad_norm": 0.7518735527992249, "learning_rate": 0.0002, "epoch": 5.196560196560196, "step": 4230}, {"loss": 0.3369, "grad_norm": 0.8040947914123535, "learning_rate": 0.0002, "epoch": 5.208845208845209, "step": 4240}, {"loss": 0.3496, "grad_norm": 0.7827144265174866, "learning_rate": 0.0002, "epoch": 5.221130221130221, "step": 4250}, {"loss": 0.3442, "grad_norm": 0.7306333184242249, "learning_rate": 0.0002, "epoch": 5.233415233415234, "step": 4260}, {"loss": 0.3553, "grad_norm": 1.0963380336761475, "learning_rate": 0.0002, "epoch": 5.245700245700245, "step": 4270}, {"loss": 0.3462, "grad_norm": 0.8200454711914062, "learning_rate": 0.0002, "epoch": 5.257985257985258, "step": 4280}, {"loss": 0.3509, "grad_norm": 0.8666796684265137, "learning_rate": 0.0002, "epoch": 5.27027027027027, "step": 4290}, {"loss": 0.3423, "grad_norm": 0.7862894535064697, "learning_rate": 0.0002, "epoch": 5.282555282555283, "step": 4300}, {"loss": 0.3623, "grad_norm": 0.8163095712661743, "learning_rate": 0.0002, "epoch": 5.294840294840295, "step": 4310}, {"loss": 0.34, "grad_norm": 0.8069050908088684, "learning_rate": 0.0002, "epoch": 5.3071253071253075, "step": 4320}, {"loss": 0.3532, "grad_norm": 0.7858486175537109, "learning_rate": 0.0002, "epoch": 5.319410319410319, "step": 4330}, {"loss": 0.3435, "grad_norm": 0.950339674949646, "learning_rate": 0.0002, "epoch": 5.3316953316953315, "step": 4340}, {"loss": 0.3498, "grad_norm": 0.9056477546691895, "learning_rate": 0.0002, "epoch": 5.343980343980344, "step": 4350}, {"loss": 0.3538, "grad_norm": 0.9619399905204773, "learning_rate": 0.0002, "epoch": 5.356265356265356, "step": 4360}, {"loss": 0.3455, "grad_norm": 0.9778652191162109, "learning_rate": 0.0002, "epoch": 5.368550368550369, "step": 4370}, {"loss": 0.3498, "grad_norm": 0.6919555068016052, "learning_rate": 0.0002, "epoch": 5.38083538083538, "step": 4380}, {"loss": 0.3426, "grad_norm": 0.8121668696403503, "learning_rate": 0.0002, "epoch": 5.393120393120393, "step": 4390}, {"loss": 0.3442, "grad_norm": 0.8481289148330688, "learning_rate": 0.0002, "epoch": 5.405405405405405, "step": 4400}, {"loss": 0.345, "grad_norm": 0.8727408647537231, "learning_rate": 0.0002, "epoch": 5.417690417690418, "step": 4410}, {"loss": 0.3554, "grad_norm": 0.8920271396636963, "learning_rate": 0.0002, "epoch": 5.42997542997543, "step": 4420}, {"loss": 0.3409, "grad_norm": 0.7758749723434448, "learning_rate": 0.0002, "epoch": 5.442260442260443, "step": 4430}, {"loss": 0.3483, "grad_norm": 0.8847506642341614, "learning_rate": 0.0002, "epoch": 5.454545454545454, "step": 4440}, {"loss": 0.3557, "grad_norm": 0.9760470390319824, "learning_rate": 0.0002, "epoch": 5.466830466830467, "step": 4450}, {"loss": 0.3536, "grad_norm": 0.8940271139144897, "learning_rate": 0.0002, "epoch": 5.479115479115479, "step": 4460}, {"loss": 0.3577, "grad_norm": 0.8668502569198608, "learning_rate": 0.0002, "epoch": 5.4914004914004915, "step": 4470}, {"loss": 0.3462, "grad_norm": 0.9097439050674438, "learning_rate": 0.0002, "epoch": 5.503685503685504, "step": 4480}, {"loss": 0.3417, "grad_norm": 0.8217208981513977, "learning_rate": 0.0002, "epoch": 5.515970515970516, "step": 4490}, {"loss": 0.3482, "grad_norm": 0.7853189706802368, "learning_rate": 0.0002, "epoch": 5.528255528255528, "step": 4500}, {"loss": 0.3479, "grad_norm": 1.1113477945327759, "learning_rate": 0.0002, "epoch": 5.54054054054054, "step": 4510}, {"loss": 0.3553, "grad_norm": 0.8637538552284241, "learning_rate": 0.0002, "epoch": 5.552825552825553, "step": 4520}, {"loss": 0.3403, "grad_norm": 1.0230066776275635, "learning_rate": 0.0002, "epoch": 5.565110565110565, "step": 4530}, {"loss": 0.3588, "grad_norm": 0.8972793817520142, "learning_rate": 0.0002, "epoch": 5.577395577395578, "step": 4540}, {"loss": 0.3428, "grad_norm": 0.7950642704963684, "learning_rate": 0.0002, "epoch": 5.58968058968059, "step": 4550}, {"loss": 0.3468, "grad_norm": 1.113753318786621, "learning_rate": 0.0002, "epoch": 5.601965601965602, "step": 4560}, {"loss": 0.3354, "grad_norm": 0.7842669486999512, "learning_rate": 0.0002, "epoch": 5.614250614250614, "step": 4570}, {"loss": 0.3419, "grad_norm": 0.9713512063026428, "learning_rate": 0.0002, "epoch": 5.6265356265356266, "step": 4580}, {"loss": 0.3502, "grad_norm": 0.9451650977134705, "learning_rate": 0.0002, "epoch": 5.638820638820639, "step": 4590}, {"loss": 0.3416, "grad_norm": 1.055484414100647, "learning_rate": 0.0002, "epoch": 5.651105651105651, "step": 4600}, {"loss": 0.3436, "grad_norm": 0.8408507704734802, "learning_rate": 0.0002, "epoch": 5.663390663390663, "step": 4610}, {"loss": 0.3619, "grad_norm": 1.0293926000595093, "learning_rate": 0.0002, "epoch": 5.675675675675675, "step": 4620}, {"loss": 0.3484, "grad_norm": 0.7198245525360107, "learning_rate": 0.0002, "epoch": 5.687960687960688, "step": 4630}, {"loss": 0.3563, "grad_norm": 0.7564466595649719, "learning_rate": 0.0002, "epoch": 5.7002457002457, "step": 4640}, {"loss": 0.3435, "grad_norm": 0.7980002760887146, "learning_rate": 0.0002, "epoch": 5.712530712530713, "step": 4650}, {"loss": 0.3478, "grad_norm": 0.8685088753700256, "learning_rate": 0.0002, "epoch": 5.724815724815725, "step": 4660}, {"loss": 0.3692, "grad_norm": 0.8816949129104614, "learning_rate": 0.0002, "epoch": 5.737100737100737, "step": 4670}, {"loss": 0.3462, "grad_norm": 0.7154731750488281, "learning_rate": 0.0002, "epoch": 5.749385749385749, "step": 4680}, {"loss": 0.3503, "grad_norm": 0.9430679678916931, "learning_rate": 0.0002, "epoch": 5.761670761670762, "step": 4690}, {"loss": 0.3439, "grad_norm": 0.7640151381492615, "learning_rate": 0.0002, "epoch": 5.773955773955774, "step": 4700}, {"loss": 0.3444, "grad_norm": 1.0920690298080444, "learning_rate": 0.0002, "epoch": 5.7862407862407865, "step": 4710}, {"loss": 0.3356, "grad_norm": 0.9362104535102844, "learning_rate": 0.0002, "epoch": 5.798525798525798, "step": 4720}, {"loss": 0.339, "grad_norm": 0.8392294645309448, "learning_rate": 0.0002, "epoch": 5.8108108108108105, "step": 4730}, {"loss": 0.3488, "grad_norm": 0.9893582463264465, "learning_rate": 0.0002, "epoch": 5.823095823095823, "step": 4740}, {"loss": 0.3446, "grad_norm": 0.6985510587692261, "learning_rate": 0.0002, "epoch": 5.835380835380835, "step": 4750}, {"loss": 0.3534, "grad_norm": 0.8906862735748291, "learning_rate": 0.0002, "epoch": 5.847665847665848, "step": 4760}, {"loss": 0.3481, "grad_norm": 0.8036413192749023, "learning_rate": 0.0002, "epoch": 5.85995085995086, "step": 4770}, {"loss": 0.3326, "grad_norm": 0.9948155283927917, "learning_rate": 0.0002, "epoch": 5.872235872235873, "step": 4780}, {"loss": 0.3385, "grad_norm": 0.8618432283401489, "learning_rate": 0.0002, "epoch": 5.884520884520884, "step": 4790}, {"loss": 0.3302, "grad_norm": 1.0422909259796143, "learning_rate": 0.0002, "epoch": 5.896805896805897, "step": 4800}, {"loss": 0.3448, "grad_norm": 1.1892569065093994, "learning_rate": 0.0002, "epoch": 5.909090909090909, "step": 4810}, {"loss": 0.3506, "grad_norm": 1.1459916830062866, "learning_rate": 0.0002, "epoch": 5.921375921375922, "step": 4820}, {"loss": 0.3387, "grad_norm": 1.056235909461975, "learning_rate": 0.0002, "epoch": 5.933660933660933, "step": 4830}, {"loss": 0.344, "grad_norm": 0.8517277240753174, "learning_rate": 0.0002, "epoch": 5.945945945945946, "step": 4840}, {"loss": 0.3421, "grad_norm": 0.8153380751609802, "learning_rate": 0.0002, "epoch": 5.958230958230958, "step": 4850}, {"loss": 0.3409, "grad_norm": 0.7907533049583435, "learning_rate": 0.0002, "epoch": 5.9705159705159705, "step": 4860}, {"loss": 0.3337, "grad_norm": 0.8443069458007812, "learning_rate": 0.0002, "epoch": 5.982800982800983, "step": 4870}, {"loss": 0.3351, "grad_norm": 0.8711344003677368, "learning_rate": 0.0002, "epoch": 5.995085995085995, "step": 4880}, {"eval_loss": 0.3778059184551239, "eval_runtime": 20.6858, "eval_samples_per_second": 16.001, "eval_steps_per_second": 2.03, "epoch": 6.0, "step": 4884}, {"loss": 0.3244, "grad_norm": 0.7697948813438416, "learning_rate": 0.0002, "epoch": 6.007371007371007, "step": 4890}, {"loss": 0.3118, "grad_norm": 0.7734108567237854, "learning_rate": 0.0002, "epoch": 6.019656019656019, "step": 4900}, {"loss": 0.3242, "grad_norm": 0.7173922657966614, "learning_rate": 0.0002, "epoch": 6.031941031941032, "step": 4910}, {"loss": 0.3159, "grad_norm": 1.062118649482727, "learning_rate": 0.0002, "epoch": 6.044226044226044, "step": 4920}, {"loss": 0.3361, "grad_norm": 0.746422529220581, "learning_rate": 0.0002, "epoch": 6.056511056511057, "step": 4930}, {"loss": 0.3204, "grad_norm": 0.8549448251724243, "learning_rate": 0.0002, "epoch": 6.068796068796069, "step": 4940}, {"loss": 0.3236, "grad_norm": 0.9405432939529419, "learning_rate": 0.0002, "epoch": 6.081081081081081, "step": 4950}, {"loss": 0.3278, "grad_norm": 0.752382755279541, "learning_rate": 0.0002, "epoch": 6.093366093366093, "step": 4960}, {"loss": 0.3204, "grad_norm": 0.820332407951355, "learning_rate": 0.0002, "epoch": 6.105651105651106, "step": 4970}, {"loss": 0.3192, "grad_norm": 0.8701449036598206, "learning_rate": 0.0002, "epoch": 6.117936117936118, "step": 4980}, {"loss": 0.321, "grad_norm": 0.8192865252494812, "learning_rate": 0.0002, "epoch": 6.1302211302211305, "step": 4990}, {"loss": 0.3295, "grad_norm": 1.0016303062438965, "learning_rate": 0.0002, "epoch": 6.142506142506143, "step": 5000}, {"loss": 0.3352, "grad_norm": 0.9194409251213074, "learning_rate": 0.0002, "epoch": 6.1547911547911545, "step": 5010}, {"loss": 0.3205, "grad_norm": 0.9319757223129272, "learning_rate": 0.0002, "epoch": 6.167076167076167, "step": 5020}, {"loss": 0.3256, "grad_norm": 0.8737656474113464, "learning_rate": 0.0002, "epoch": 6.179361179361179, "step": 5030}, {"loss": 0.3221, "grad_norm": 0.8736537098884583, "learning_rate": 0.0002, "epoch": 6.191646191646192, "step": 5040}, {"loss": 0.3265, "grad_norm": 0.9301430583000183, "learning_rate": 0.0002, "epoch": 6.203931203931204, "step": 5050}, {"loss": 0.3285, "grad_norm": 0.7717130780220032, "learning_rate": 0.0002, "epoch": 6.216216216216216, "step": 5060}, {"loss": 0.3192, "grad_norm": 0.6709604859352112, "learning_rate": 0.0002, "epoch": 6.228501228501228, "step": 5070}, {"loss": 0.3352, "grad_norm": 0.879374086856842, "learning_rate": 0.0002, "epoch": 6.240786240786241, "step": 5080}, {"loss": 0.329, "grad_norm": 0.9136955738067627, "learning_rate": 0.0002, "epoch": 6.253071253071253, "step": 5090}, {"loss": 0.3228, "grad_norm": 0.795177161693573, "learning_rate": 0.0002, "epoch": 6.2653562653562656, "step": 5100}, {"loss": 0.3273, "grad_norm": 1.0412259101867676, "learning_rate": 0.0002, "epoch": 6.277641277641278, "step": 5110}, {"loss": 0.3221, "grad_norm": 0.7382524013519287, "learning_rate": 0.0002, "epoch": 6.2899262899262895, "step": 5120}, {"loss": 0.3102, "grad_norm": 0.8818480968475342, "learning_rate": 0.0002, "epoch": 6.302211302211302, "step": 5130}, {"loss": 0.3316, "grad_norm": 0.7865153551101685, "learning_rate": 0.0002, "epoch": 6.314496314496314, "step": 5140}, {"loss": 0.3264, "grad_norm": 0.9166486859321594, "learning_rate": 0.0002, "epoch": 6.326781326781327, "step": 5150}, {"loss": 0.33, "grad_norm": 0.6655149459838867, "learning_rate": 0.0002, "epoch": 6.339066339066339, "step": 5160}, {"loss": 0.3359, "grad_norm": 0.7762818336486816, "learning_rate": 0.0002, "epoch": 6.351351351351352, "step": 5170}, {"loss": 0.3244, "grad_norm": 0.8057235479354858, "learning_rate": 0.0002, "epoch": 6.363636363636363, "step": 5180}, {"loss": 0.3167, "grad_norm": 0.8186984062194824, "learning_rate": 0.0002, "epoch": 6.375921375921376, "step": 5190}, {"loss": 0.3289, "grad_norm": 0.8669573068618774, "learning_rate": 0.0002, "epoch": 6.388206388206388, "step": 5200}, {"loss": 0.3313, "grad_norm": 0.8904402852058411, "learning_rate": 0.0002, "epoch": 6.400491400491401, "step": 5210}, {"loss": 0.3187, "grad_norm": 0.9250359535217285, "learning_rate": 0.0002, "epoch": 6.412776412776413, "step": 5220}, {"loss": 0.3229, "grad_norm": 0.8718299269676208, "learning_rate": 0.0002, "epoch": 6.4250614250614255, "step": 5230}, {"loss": 0.3214, "grad_norm": 0.8156430125236511, "learning_rate": 0.0002, "epoch": 6.437346437346437, "step": 5240}, {"loss": 0.3244, "grad_norm": 0.7759218215942383, "learning_rate": 0.0002, "epoch": 6.4496314496314495, "step": 5250}, {"loss": 0.3298, "grad_norm": 0.8137310743331909, "learning_rate": 0.0002, "epoch": 6.461916461916462, "step": 5260}, {"loss": 0.3275, "grad_norm": 0.8121917843818665, "learning_rate": 0.0002, "epoch": 6.474201474201474, "step": 5270}, {"loss": 0.3201, "grad_norm": 0.8178010582923889, "learning_rate": 0.0002, "epoch": 6.486486486486487, "step": 5280}, {"loss": 0.3271, "grad_norm": 1.1806302070617676, "learning_rate": 0.0002, "epoch": 6.498771498771498, "step": 5290}, {"loss": 0.3231, "grad_norm": 0.8255127668380737, "learning_rate": 0.0002, "epoch": 6.511056511056511, "step": 5300}, {"loss": 0.3227, "grad_norm": 0.8006690740585327, "learning_rate": 0.0002, "epoch": 6.523341523341523, "step": 5310}, {"loss": 0.3262, "grad_norm": 0.9932374358177185, "learning_rate": 0.0002, "epoch": 6.535626535626536, "step": 5320}, {"loss": 0.3291, "grad_norm": 0.8973969221115112, "learning_rate": 0.0002, "epoch": 6.547911547911548, "step": 5330}, {"loss": 0.3146, "grad_norm": 0.7359915971755981, "learning_rate": 0.0002, "epoch": 6.560196560196561, "step": 5340}, {"loss": 0.3308, "grad_norm": 0.9941133856773376, "learning_rate": 0.0002, "epoch": 6.572481572481572, "step": 5350}, {"loss": 0.3202, "grad_norm": 0.9008874893188477, "learning_rate": 0.0002, "epoch": 6.584766584766585, "step": 5360}, {"loss": 0.3271, "grad_norm": 1.309710144996643, "learning_rate": 0.0002, "epoch": 6.597051597051597, "step": 5370}, {"loss": 0.3177, "grad_norm": 0.797768235206604, "learning_rate": 0.0002, "epoch": 6.6093366093366095, "step": 5380}, {"loss": 0.3218, "grad_norm": 0.8507353663444519, "learning_rate": 0.0002, "epoch": 6.621621621621622, "step": 5390}, {"loss": 0.3204, "grad_norm": 0.9628674983978271, "learning_rate": 0.0002, "epoch": 6.6339066339066335, "step": 5400}, {"loss": 0.3155, "grad_norm": 0.6989983320236206, "learning_rate": 0.0002, "epoch": 6.646191646191646, "step": 5410}, {"loss": 0.3197, "grad_norm": 0.9505863189697266, "learning_rate": 0.0002, "epoch": 6.658476658476658, "step": 5420}, {"loss": 0.3259, "grad_norm": 0.8058171272277832, "learning_rate": 0.0002, "epoch": 6.670761670761671, "step": 5430}, {"loss": 0.3248, "grad_norm": 0.8476499915122986, "learning_rate": 0.0002, "epoch": 6.683046683046683, "step": 5440}, {"loss": 0.326, "grad_norm": 0.8503309488296509, "learning_rate": 0.0002, "epoch": 6.695331695331696, "step": 5450}, {"loss": 0.3218, "grad_norm": 0.919566810131073, "learning_rate": 0.0002, "epoch": 6.707616707616707, "step": 5460}, {"loss": 0.3218, "grad_norm": 0.7741201519966125, "learning_rate": 0.0002, "epoch": 6.71990171990172, "step": 5470}, {"loss": 0.329, "grad_norm": 0.8432701826095581, "learning_rate": 0.0002, "epoch": 6.732186732186732, "step": 5480}, {"loss": 0.3284, "grad_norm": 1.0183148384094238, "learning_rate": 0.0002, "epoch": 6.744471744471745, "step": 5490}, {"loss": 0.3312, "grad_norm": 0.8491143584251404, "learning_rate": 0.0002, "epoch": 6.756756756756757, "step": 5500}, {"loss": 0.3208, "grad_norm": 0.9586310386657715, "learning_rate": 0.0002, "epoch": 6.769041769041769, "step": 5510}, {"loss": 0.3305, "grad_norm": 0.7936097383499146, "learning_rate": 0.0002, "epoch": 6.781326781326781, "step": 5520}, {"loss": 0.318, "grad_norm": 0.7875059247016907, "learning_rate": 0.0002, "epoch": 6.7936117936117935, "step": 5530}, {"loss": 0.3234, "grad_norm": 0.8136157393455505, "learning_rate": 0.0002, "epoch": 6.805896805896806, "step": 5540}, {"loss": 0.3161, "grad_norm": 0.837213933467865, "learning_rate": 0.0002, "epoch": 6.818181818181818, "step": 5550}, {"loss": 0.3153, "grad_norm": 0.6812925338745117, "learning_rate": 0.0002, "epoch": 6.830466830466831, "step": 5560}, {"loss": 0.3139, "grad_norm": 0.7309592962265015, "learning_rate": 0.0002, "epoch": 6.842751842751843, "step": 5570}, {"loss": 0.3126, "grad_norm": 0.6905979514122009, "learning_rate": 0.0002, "epoch": 6.855036855036855, "step": 5580}, {"loss": 0.3291, "grad_norm": 1.1768406629562378, "learning_rate": 0.0002, "epoch": 6.867321867321867, "step": 5590}, {"loss": 0.3193, "grad_norm": 0.7618567943572998, "learning_rate": 0.0002, "epoch": 6.87960687960688, "step": 5600}, {"loss": 0.3296, "grad_norm": 0.7930929660797119, "learning_rate": 0.0002, "epoch": 6.891891891891892, "step": 5610}, {"loss": 0.3241, "grad_norm": 0.7931787371635437, "learning_rate": 0.0002, "epoch": 6.9041769041769046, "step": 5620}, {"loss": 0.3215, "grad_norm": 0.6366972923278809, "learning_rate": 0.0002, "epoch": 6.916461916461916, "step": 5630}, {"loss": 0.3264, "grad_norm": 0.7782737612724304, "learning_rate": 0.0002, "epoch": 6.9287469287469285, "step": 5640}, {"loss": 0.3186, "grad_norm": 0.8643787503242493, "learning_rate": 0.0002, "epoch": 6.941031941031941, "step": 5650}, {"loss": 0.3285, "grad_norm": 1.0843733549118042, "learning_rate": 0.0002, "epoch": 6.953316953316953, "step": 5660}, {"loss": 0.3163, "grad_norm": 0.71319180727005, "learning_rate": 0.0002, "epoch": 6.965601965601966, "step": 5670}, {"loss": 0.3196, "grad_norm": 0.976536750793457, "learning_rate": 0.0002, "epoch": 6.977886977886978, "step": 5680}, {"loss": 0.3255, "grad_norm": 0.9221968054771423, "learning_rate": 0.0002, "epoch": 6.99017199017199, "step": 5690}, {"eval_loss": 0.3616626560688019, "eval_runtime": 20.8747, "eval_samples_per_second": 15.857, "eval_steps_per_second": 2.012, "epoch": 7.0, "step": 5698}, {"loss": 0.3149, "grad_norm": 0.6302434802055359, "learning_rate": 0.0002, "epoch": 7.002457002457002, "step": 5700}, {"loss": 0.3017, "grad_norm": 0.7077583074569702, "learning_rate": 0.0002, "epoch": 7.014742014742015, "step": 5710}, {"loss": 0.303, "grad_norm": 0.7005309462547302, "learning_rate": 0.0002, "epoch": 7.027027027027027, "step": 5720}, {"loss": 0.3069, "grad_norm": 0.7724815607070923, "learning_rate": 0.0002, "epoch": 7.03931203931204, "step": 5730}, {"loss": 0.3002, "grad_norm": 0.6469350457191467, "learning_rate": 0.0002, "epoch": 7.051597051597051, "step": 5740}, {"loss": 0.3046, "grad_norm": 0.8406739234924316, "learning_rate": 0.0002, "epoch": 7.063882063882064, "step": 5750}, {"loss": 0.3108, "grad_norm": 0.9954310059547424, "learning_rate": 0.0002, "epoch": 7.076167076167076, "step": 5760}, {"loss": 0.3076, "grad_norm": 0.7063487768173218, "learning_rate": 0.0002, "epoch": 7.0884520884520885, "step": 5770}, {"loss": 0.3033, "grad_norm": 0.8696660995483398, "learning_rate": 0.0002, "epoch": 7.100737100737101, "step": 5780}, {"loss": 0.3049, "grad_norm": 0.8088991045951843, "learning_rate": 0.0002, "epoch": 7.113022113022113, "step": 5790}, {"loss": 0.3042, "grad_norm": 0.6934662461280823, "learning_rate": 0.0002, "epoch": 7.125307125307125, "step": 5800}, {"loss": 0.3068, "grad_norm": 0.7482573390007019, "learning_rate": 0.0002, "epoch": 7.137592137592137, "step": 5810}, {"loss": 0.3001, "grad_norm": 1.0848287343978882, "learning_rate": 0.0002, "epoch": 7.14987714987715, "step": 5820}, {"loss": 0.3017, "grad_norm": 0.8017896413803101, "learning_rate": 0.0002, "epoch": 7.162162162162162, "step": 5830}, {"loss": 0.3051, "grad_norm": 0.6418949365615845, "learning_rate": 0.0002, "epoch": 7.174447174447175, "step": 5840}, {"loss": 0.3032, "grad_norm": 0.666072428226471, "learning_rate": 0.0002, "epoch": 7.186732186732186, "step": 5850}, {"loss": 0.3131, "grad_norm": 0.7549816370010376, "learning_rate": 0.0002, "epoch": 7.199017199017199, "step": 5860}, {"loss": 0.3088, "grad_norm": 0.8756735920906067, "learning_rate": 0.0002, "epoch": 7.211302211302211, "step": 5870}, {"loss": 0.3022, "grad_norm": 0.6790788769721985, "learning_rate": 0.0002, "epoch": 7.223587223587224, "step": 5880}, {"loss": 0.3011, "grad_norm": 0.8388362526893616, "learning_rate": 0.0002, "epoch": 7.235872235872236, "step": 5890}, {"loss": 0.3204, "grad_norm": 0.8915345668792725, "learning_rate": 0.0002, "epoch": 7.2481572481572485, "step": 5900}, {"loss": 0.3086, "grad_norm": 0.9234250783920288, "learning_rate": 0.0002, "epoch": 7.26044226044226, "step": 5910}, {"loss": 0.2981, "grad_norm": 0.5452191233634949, "learning_rate": 0.0002, "epoch": 7.2727272727272725, "step": 5920}, {"loss": 0.3054, "grad_norm": 0.6100478172302246, "learning_rate": 0.0002, "epoch": 7.285012285012285, "step": 5930}, {"loss": 0.3073, "grad_norm": 0.6258270740509033, "learning_rate": 0.0002, "epoch": 7.297297297297297, "step": 5940}, {"loss": 0.3179, "grad_norm": 0.8540555834770203, "learning_rate": 0.0002, "epoch": 7.30958230958231, "step": 5950}, {"loss": 0.3109, "grad_norm": 0.8662564754486084, "learning_rate": 0.0002, "epoch": 7.321867321867322, "step": 5960}, {"loss": 0.3039, "grad_norm": 0.7404284477233887, "learning_rate": 0.0002, "epoch": 7.334152334152334, "step": 5970}, {"loss": 0.3036, "grad_norm": 0.7579419612884521, "learning_rate": 0.0002, "epoch": 7.346437346437346, "step": 5980}, {"loss": 0.3113, "grad_norm": 0.7248510122299194, "learning_rate": 0.0002, "epoch": 7.358722358722359, "step": 5990}, {"loss": 0.2987, "grad_norm": 0.8882181644439697, "learning_rate": 0.0002, "epoch": 7.371007371007371, "step": 6000}, {"loss": 0.3026, "grad_norm": 0.8494889736175537, "learning_rate": 0.0002, "epoch": 7.383292383292384, "step": 6010}, {"loss": 0.3147, "grad_norm": 0.8501948118209839, "learning_rate": 0.0002, "epoch": 7.395577395577396, "step": 6020}, {"loss": 0.3122, "grad_norm": 0.7228043079376221, "learning_rate": 0.0002, "epoch": 7.407862407862408, "step": 6030}, {"loss": 0.3092, "grad_norm": 0.7471523284912109, "learning_rate": 0.0002, "epoch": 7.42014742014742, "step": 6040}, {"loss": 0.307, "grad_norm": 0.810962975025177, "learning_rate": 0.0002, "epoch": 7.4324324324324325, "step": 6050}, {"loss": 0.3024, "grad_norm": 1.0621764659881592, "learning_rate": 0.0002, "epoch": 7.444717444717445, "step": 6060}, {"loss": 0.3019, "grad_norm": 0.72637939453125, "learning_rate": 0.0002, "epoch": 7.457002457002457, "step": 6070}, {"loss": 0.314, "grad_norm": 1.1550157070159912, "learning_rate": 0.0002, "epoch": 7.469287469287469, "step": 6080}, {"loss": 0.3088, "grad_norm": 0.865250825881958, "learning_rate": 0.0002, "epoch": 7.481572481572481, "step": 6090}, {"loss": 0.3041, "grad_norm": 0.8407077789306641, "learning_rate": 0.0002, "epoch": 7.493857493857494, "step": 6100}, {"loss": 0.3064, "grad_norm": 0.7295752167701721, "learning_rate": 0.0002, "epoch": 7.506142506142506, "step": 6110}, {"loss": 0.3061, "grad_norm": 0.9728897213935852, "learning_rate": 0.0002, "epoch": 7.518427518427519, "step": 6120}, {"loss": 0.3107, "grad_norm": 0.9776952862739563, "learning_rate": 0.0002, "epoch": 7.530712530712531, "step": 6130}, {"loss": 0.3148, "grad_norm": 0.704113245010376, "learning_rate": 0.0002, "epoch": 7.542997542997543, "step": 6140}, {"loss": 0.3012, "grad_norm": 0.9030590057373047, "learning_rate": 0.0002, "epoch": 7.555282555282555, "step": 6150}, {"loss": 0.3134, "grad_norm": 0.6629155874252319, "learning_rate": 0.0002, "epoch": 7.5675675675675675, "step": 6160}, {"loss": 0.3068, "grad_norm": 0.9348171353340149, "learning_rate": 0.0002, "epoch": 7.57985257985258, "step": 6170}, {"loss": 0.3065, "grad_norm": 0.9363399744033813, "learning_rate": 0.0002, "epoch": 7.592137592137592, "step": 6180}, {"loss": 0.3099, "grad_norm": 0.902718186378479, "learning_rate": 0.0002, "epoch": 7.604422604422604, "step": 6190}, {"loss": 0.3082, "grad_norm": 0.6992074251174927, "learning_rate": 0.0002, "epoch": 7.616707616707616, "step": 6200}, {"loss": 0.2941, "grad_norm": 0.7574757933616638, "learning_rate": 0.0002, "epoch": 7.628992628992629, "step": 6210}, {"loss": 0.3079, "grad_norm": 0.7717660069465637, "learning_rate": 0.0002, "epoch": 7.641277641277641, "step": 6220}, {"loss": 0.3005, "grad_norm": 0.7789981961250305, "learning_rate": 0.0002, "epoch": 7.653562653562654, "step": 6230}, {"loss": 0.3112, "grad_norm": 1.1020026206970215, "learning_rate": 0.0002, "epoch": 7.665847665847666, "step": 6240}, {"loss": 0.3087, "grad_norm": 0.7290350794792175, "learning_rate": 0.0002, "epoch": 7.678132678132678, "step": 6250}, {"loss": 0.3023, "grad_norm": 0.7291128039360046, "learning_rate": 0.0002, "epoch": 7.69041769041769, "step": 6260}, {"loss": 0.3019, "grad_norm": 0.7766857147216797, "learning_rate": 0.0002, "epoch": 7.702702702702703, "step": 6270}, {"loss": 0.3039, "grad_norm": 0.938277542591095, "learning_rate": 0.0002, "epoch": 7.714987714987715, "step": 6280}, {"loss": 0.3103, "grad_norm": 0.785190761089325, "learning_rate": 0.0002, "epoch": 7.7272727272727275, "step": 6290}, {"loss": 0.2938, "grad_norm": 0.7140066623687744, "learning_rate": 0.0002, "epoch": 7.739557739557739, "step": 6300}, {"loss": 0.3042, "grad_norm": 0.9476789236068726, "learning_rate": 0.0002, "epoch": 7.7518427518427515, "step": 6310}, {"loss": 0.3035, "grad_norm": 0.6404930949211121, "learning_rate": 0.0002, "epoch": 7.764127764127764, "step": 6320}, {"loss": 0.3065, "grad_norm": 0.6433947682380676, "learning_rate": 0.0002, "epoch": 7.776412776412776, "step": 6330}, {"loss": 0.3117, "grad_norm": 0.8289583921432495, "learning_rate": 0.0002, "epoch": 7.788697788697789, "step": 6340}, {"loss": 0.3057, "grad_norm": 1.098555088043213, "learning_rate": 0.0002, "epoch": 7.800982800982801, "step": 6350}, {"loss": 0.3104, "grad_norm": 0.7225303053855896, "learning_rate": 0.0002, "epoch": 7.813267813267814, "step": 6360}, {"loss": 0.3043, "grad_norm": 0.845711886882782, "learning_rate": 0.0002, "epoch": 7.825552825552825, "step": 6370}, {"loss": 0.3099, "grad_norm": 0.6199421882629395, "learning_rate": 0.0002, "epoch": 7.837837837837838, "step": 6380}, {"loss": 0.3095, "grad_norm": 0.7576995491981506, "learning_rate": 0.0002, "epoch": 7.85012285012285, "step": 6390}, {"loss": 0.311, "grad_norm": 0.6669192314147949, "learning_rate": 0.0002, "epoch": 7.862407862407863, "step": 6400}, {"loss": 0.2953, "grad_norm": 0.6896083354949951, "learning_rate": 0.0002, "epoch": 7.874692874692875, "step": 6410}, {"loss": 0.3, "grad_norm": 0.9418429732322693, "learning_rate": 0.0002, "epoch": 7.886977886977887, "step": 6420}, {"loss": 0.3074, "grad_norm": 0.7120184302330017, "learning_rate": 0.0002, "epoch": 7.899262899262899, "step": 6430}, {"loss": 0.3158, "grad_norm": 0.7420004606246948, "learning_rate": 0.0002, "epoch": 7.9115479115479115, "step": 6440}, {"loss": 0.3068, "grad_norm": 0.8989502191543579, "learning_rate": 0.0002, "epoch": 7.923832923832924, "step": 6450}, {"loss": 0.3102, "grad_norm": 0.715905487537384, "learning_rate": 0.0002, "epoch": 7.936117936117936, "step": 6460}, {"loss": 0.3018, "grad_norm": 0.8890138268470764, "learning_rate": 0.0002, "epoch": 7.948402948402949, "step": 6470}, {"loss": 0.3179, "grad_norm": 0.7992095351219177, "learning_rate": 0.0002, "epoch": 7.96068796068796, "step": 6480}, {"loss": 0.3115, "grad_norm": 0.9169677495956421, "learning_rate": 0.0002, "epoch": 7.972972972972973, "step": 6490}, {"loss": 0.3001, "grad_norm": 0.7911704778671265, "learning_rate": 0.0002, "epoch": 7.985257985257985, "step": 6500}, {"loss": 0.3085, "grad_norm": 0.8787347078323364, "learning_rate": 0.0002, "epoch": 7.997542997542998, "step": 6510}]}